aboutsummaryrefslogtreecommitdiff
path: root/arch/mips/math-emu/dp_div.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/mips/math-emu/dp_div.c')
-rw-r--r--arch/mips/math-emu/dp_div.c94
1 files changed, 46 insertions, 48 deletions
diff --git a/arch/mips/math-emu/dp_div.c b/arch/mips/math-emu/dp_div.c
index a1bce1b7c09..bef0e55e593 100644
--- a/arch/mips/math-emu/dp_div.c
+++ b/arch/mips/math-emu/dp_div.c
@@ -5,8 +5,6 @@
* MIPS floating point support
* Copyright (C) 1994-2000 Algorithmics Ltd.
*
- * ########################################################################
- *
* This program is free software; you can distribute it and/or modify it
* under the terms of the GNU General Public License (Version 2) as
* published by the Free Software Foundation.
@@ -18,23 +16,24 @@
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * ########################################################################
+ * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*/
-
#include "ieee754dp.h"
-ieee754dp ieee754dp_div(ieee754dp x, ieee754dp y)
+union ieee754dp ieee754dp_div(union ieee754dp x, union ieee754dp y)
{
+ u64 rm;
+ int re;
+ u64 bm;
+
COMPXDP;
COMPYDP;
EXPLODEXDP;
EXPLODEYDP;
- CLEARCX;
+ ieee754_clearcx();
FLUSHXDP;
FLUSHYDP;
@@ -51,8 +50,8 @@ ieee754dp ieee754dp_div(ieee754dp x, ieee754dp y)
case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_NORM):
case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_DNORM):
case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF):
- SETCX(IEEE754_INVALID_OPERATION);
- return ieee754dp_nanxcpt(ieee754dp_indef(), "div", x, y);
+ ieee754_setcx(IEEE754_INVALID_OPERATION);
+ return ieee754dp_nanxcpt(ieee754dp_indef());
case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN):
case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN):
@@ -68,12 +67,12 @@ ieee754dp ieee754dp_div(ieee754dp x, ieee754dp y)
return x;
- /* Infinity handling
- */
-
+ /*
+ * Infinity handling
+ */
case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF):
- SETCX(IEEE754_INVALID_OPERATION);
- return ieee754dp_xcpt(ieee754dp_indef(), "div", x, y);
+ ieee754_setcx(IEEE754_INVALID_OPERATION);
+ return ieee754dp_indef();
case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF):
case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF):
@@ -85,17 +84,17 @@ ieee754dp ieee754dp_div(ieee754dp x, ieee754dp y)
case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM):
return ieee754dp_inf(xs ^ ys);
- /* Zero handling
- */
-
+ /*
+ * Zero handling
+ */
case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO):
- SETCX(IEEE754_INVALID_OPERATION);
- return ieee754dp_xcpt(ieee754dp_indef(), "div", x, y);
+ ieee754_setcx(IEEE754_INVALID_OPERATION);
+ return ieee754dp_indef();
case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_ZERO):
case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO):
- SETCX(IEEE754_ZERO_DIVIDE);
- return ieee754dp_xcpt(ieee754dp_inf(xs ^ ys), "div", x, y);
+ ieee754_setcx(IEEE754_ZERO_DIVIDE);
+ return ieee754dp_inf(xs ^ ys);
case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM):
case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_DNORM):
@@ -122,35 +121,34 @@ ieee754dp ieee754dp_div(ieee754dp x, ieee754dp y)
xm <<= 3;
ym <<= 3;
- {
- /* now the dirty work */
-
- u64 rm = 0;
- int re = xe - ye;
- u64 bm;
-
- for (bm = DP_MBIT(DP_MBITS + 2); bm; bm >>= 1) {
- if (xm >= ym) {
- xm -= ym;
- rm |= bm;
- if (xm == 0)
- break;
- }
- xm <<= 1;
- }
- rm <<= 1;
- if (xm)
- rm |= 1; /* have remainder, set sticky */
+ /* now the dirty work */
- assert(rm);
+ rm = 0;
+ re = xe - ye;
- /* normalise rm to rounding precision ?
- */
- while ((rm >> (DP_MBITS + 3)) == 0) {
- rm <<= 1;
- re--;
+ for (bm = DP_MBIT(DP_FBITS + 2); bm; bm >>= 1) {
+ if (xm >= ym) {
+ xm -= ym;
+ rm |= bm;
+ if (xm == 0)
+ break;
}
+ xm <<= 1;
+ }
+
+ rm <<= 1;
+ if (xm)
+ rm |= 1; /* have remainder, set sticky */
- DPNORMRET2(xs == ys ? 0 : 1, re, rm, "div", x, y);
+ assert(rm);
+
+ /*
+ * Normalise rm to rounding precision ?
+ */
+ while ((rm >> (DP_FBITS + 3)) == 0) {
+ rm <<= 1;
+ re--;
}
+
+ return ieee754dp_format(xs == ys ? 0 : 1, re, rm);
}
0923ff353d12194e83628bcca5a8606564'>fs/xfs/linux-2.6/xfs_iops.c860
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c1021
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.h96
-rw-r--r--fs/xfs/linux-2.6/xfs_stats.c117
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c1014
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c156
-rw-r--r--fs/xfs/linux-2.6/xfs_vfs.c313
-rw-r--r--fs/xfs/linux-2.6/xfs_vfs.h209
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.c216
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.h644
-rw-r--r--fs/xfs/mrlock.h (renamed from fs/xfs/linux-2.6/mrlock.h)62
-rw-r--r--fs/xfs/quota/xfs_dquot.c1601
-rw-r--r--fs/xfs/quota/xfs_dquot.h190
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c699
-rw-r--r--fs/xfs/quota/xfs_qm.c2859
-rw-r--r--fs/xfs/quota/xfs_qm.h216
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c380
-rw-r--r--fs/xfs/quota/xfs_qm_stats.c134
-rw-r--r--fs/xfs/quota/xfs_qm_stats.h53
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c1483
-rw-r--r--fs/xfs/quota/xfs_quota_priv.h175
-rw-r--r--fs/xfs/support/debug.c101
-rw-r--r--fs/xfs/support/debug.h53
-rw-r--r--fs/xfs/support/ktrace.c331
-rw-r--r--fs/xfs/support/ktrace.h87
-rw-r--r--fs/xfs/support/move.c51
-rw-r--r--fs/xfs/support/move.h70
-rw-r--r--fs/xfs/time.h (renamed from fs/xfs/linux-2.6/time.h)0
-rw-r--r--fs/xfs/uuid.c (renamed from fs/xfs/support/uuid.c)80
-rw-r--r--fs/xfs/uuid.h (renamed from fs/xfs/support/uuid.h)11
-rw-r--r--fs/xfs/xfs.h15
-rw-r--r--fs/xfs/xfs_acl.c995
-rw-r--r--fs/xfs/xfs_acl.h113
-rw-r--r--fs/xfs/xfs_ag.h174
-rw-r--r--fs/xfs/xfs_alloc.c1887
-rw-r--r--fs/xfs/xfs_alloc.h131
-rw-r--r--fs/xfs/xfs_alloc_btree.c2473
-rw-r--r--fs/xfs/xfs_alloc_btree.h154
-rw-r--r--fs/xfs/xfs_aops.c1770
-rw-r--r--fs/xfs/xfs_aops.h (renamed from fs/xfs/linux-2.6/xfs_aops.h)33
-rw-r--r--fs/xfs/xfs_arch.h236
-rw-r--r--fs/xfs/xfs_attr.c1964
-rw-r--r--fs/xfs/xfs_attr.h112
-rw-r--r--fs/xfs/xfs_attr_inactive.c452
-rw-r--r--fs/xfs/xfs_attr_leaf.c3127
-rw-r--r--fs/xfs/xfs_attr_leaf.h232
-rw-r--r--fs/xfs/xfs_attr_list.c653
-rw-r--r--fs/xfs/xfs_attr_remote.c628
-rw-r--r--fs/xfs/xfs_attr_remote.h (renamed from fs/xfs/linux-2.6/mutex.h)14
-rw-r--r--fs/xfs/xfs_attr_sf.h54
-rw-r--r--fs/xfs/xfs_behavior.c203
-rw-r--r--fs/xfs/xfs_behavior.h190
-rw-r--r--fs/xfs/xfs_bit.c198
-rw-r--r--fs/xfs/xfs_bit.h50
-rw-r--r--fs/xfs/xfs_bmap.c8321
-rw-r--r--fs/xfs/xfs_bmap.h381
-rw-r--r--fs/xfs/xfs_bmap_btree.c3134
-rw-r--r--fs/xfs/xfs_bmap_btree.h382
-rw-r--r--fs/xfs/xfs_bmap_util.c1897
-rw-r--r--fs/xfs/xfs_bmap_util.h112
-rw-r--r--fs/xfs/xfs_btree.c4278
-rw-r--r--fs/xfs/xfs_btree.h501
-rw-r--r--fs/xfs/xfs_buf.c1890
-rw-r--r--fs/xfs/xfs_buf.h393
-rw-r--r--fs/xfs/xfs_buf_item.c1400
-rw-r--r--fs/xfs/xfs_buf_item.h117
-rw-r--r--fs/xfs/xfs_cap.h70
-rw-r--r--fs/xfs/xfs_cksum.h63
-rw-r--r--fs/xfs/xfs_clnt.h104
-rw-r--r--fs/xfs/xfs_da_btree.c2889
-rw-r--r--fs/xfs/xfs_da_btree.h219
-rw-r--r--fs/xfs/xfs_da_format.c911
-rw-r--r--fs/xfs/xfs_da_format.h861
-rw-r--r--fs/xfs/xfs_dfrag.c375
-rw-r--r--fs/xfs/xfs_dfrag.h53
-rw-r--r--fs/xfs/xfs_dinode.h302
-rw-r--r--fs/xfs/xfs_dir.c1219
-rw-r--r--fs/xfs/xfs_dir.h142
-rw-r--r--fs/xfs/xfs_dir2.c1102
-rw-r--r--fs/xfs/xfs_dir2.h190
-rw-r--r--fs/xfs/xfs_dir2_block.c1225
-rw-r--r--fs/xfs/xfs_dir2_block.h96
-rw-r--r--fs/xfs/xfs_dir2_data.c841
-rw-r--r--fs/xfs/xfs_dir2_data.h190
-rw-r--r--fs/xfs/xfs_dir2_leaf.c1969
-rw-r--r--fs/xfs/xfs_dir2_leaf.h271
-rw-r--r--fs/xfs/xfs_dir2_node.c1940
-rw-r--r--fs/xfs/xfs_dir2_node.h104
-rw-r--r--fs/xfs/xfs_dir2_priv.h274
-rw-r--r--fs/xfs/xfs_dir2_readdir.c700
-rw-r--r--fs/xfs/xfs_dir2_sf.c712
-rw-r--r--fs/xfs/xfs_dir2_sf.h195
-rw-r--r--fs/xfs/xfs_dir2_trace.c216
-rw-r--r--fs/xfs/xfs_dir2_trace.h72
-rw-r--r--fs/xfs/xfs_dir_leaf.c2213
-rw-r--r--fs/xfs/xfs_dir_leaf.h231
-rw-r--r--fs/xfs/xfs_dir_sf.h153
-rw-r--r--fs/xfs/xfs_discard.c240
-rw-r--r--fs/xfs/xfs_discard.h10
-rw-r--r--fs/xfs/xfs_dmapi.h204
-rw-r--r--fs/xfs/xfs_dquot.c1105
-rw-r--r--fs/xfs/xfs_dquot.h173
-rw-r--r--fs/xfs/xfs_dquot_buf.c290
-rw-r--r--fs/xfs/xfs_dquot_item.c445
-rw-r--r--fs/xfs/xfs_dquot_item.h (renamed from fs/xfs/quota/xfs_dquot_item.h)7
-rw-r--r--fs/xfs/xfs_error.c227
-rw-r--r--fs/xfs/xfs_error.h61
-rw-r--r--fs/xfs/xfs_export.c249
-rw-r--r--fs/xfs/xfs_export.h (renamed from fs/xfs/linux-2.6/xfs_export.h)52
-rw-r--r--fs/xfs/xfs_extent_busy.c605
-rw-r--r--fs/xfs/xfs_extent_busy.h73
-rw-r--r--fs/xfs/xfs_extfree_item.c593
-rw-r--r--fs/xfs/xfs_extfree_item.h64
-rw-r--r--fs/xfs/xfs_file.c1433
-rw-r--r--fs/xfs/xfs_filestream.c434
-rw-r--r--fs/xfs/xfs_filestream.h40
-rw-r--r--fs/xfs/xfs_format.h428
-rw-r--r--fs/xfs/xfs_fs.h171
-rw-r--r--fs/xfs/xfs_fsops.c652
-rw-r--r--fs/xfs/xfs_fsops.h2
-rw-r--r--fs/xfs/xfs_globals.c (renamed from fs/xfs/linux-2.6/xfs_globals.c)21
-rw-r--r--fs/xfs/xfs_ialloc.c2649
-rw-r--r--fs/xfs/xfs_ialloc.h87
-rw-r--r--fs/xfs/xfs_ialloc_btree.c2260
-rw-r--r--fs/xfs/xfs_ialloc_btree.h172
-rw-r--r--fs/xfs/xfs_icache.c1327
-rw-r--r--fs/xfs/xfs_icache.h104
-rw-r--r--fs/xfs/xfs_icreate_item.c189
-rw-r--r--fs/xfs/xfs_icreate_item.h (renamed from fs/xfs/linux-2.6/xfs_ioctl32.h)22
-rw-r--r--fs/xfs/xfs_iget.c1043
-rw-r--r--fs/xfs/xfs_imap.h40
-rw-r--r--fs/xfs/xfs_inode.c5183
-rw-r--r--fs/xfs/xfs_inode.h668
-rw-r--r--fs/xfs/xfs_inode_buf.c479
-rw-r--r--fs/xfs/xfs_inode_buf.h50
-rw-r--r--fs/xfs/xfs_inode_fork.c1906
-rw-r--r--fs/xfs/xfs_inode_fork.h171
-rw-r--r--fs/xfs/xfs_inode_item.c1298
-rw-r--r--fs/xfs/xfs_inode_item.h148
-rw-r--r--fs/xfs/xfs_inum.h17
-rw-r--r--fs/xfs/xfs_iocore.c118
-rw-r--r--fs/xfs/xfs_ioctl.c1810
-rw-r--r--fs/xfs/xfs_ioctl.h95
-rw-r--r--fs/xfs/xfs_ioctl32.c681
-rw-r--r--fs/xfs/xfs_ioctl32.h237
-rw-r--r--fs/xfs/xfs_iomap.c1103
-rw-r--r--fs/xfs/xfs_iomap.h72
-rw-r--r--fs/xfs/xfs_iops.c1299
-rw-r--r--fs/xfs/xfs_iops.h (renamed from fs/xfs/linux-2.6/xfs_iops.h)25
-rw-r--r--fs/xfs/xfs_itable.c609
-rw-r--r--fs/xfs/xfs_itable.h55
-rw-r--r--fs/xfs/xfs_linux.h (renamed from fs/xfs/linux-2.6/xfs_linux.h)292
-rw-r--r--fs/xfs/xfs_log.c4092
-rw-r--r--fs/xfs/xfs_log.h221
-rw-r--r--fs/xfs/xfs_log_cil.c972
-rw-r--r--fs/xfs/xfs_log_format.h679
-rw-r--r--fs/xfs/xfs_log_priv.h618
-rw-r--r--fs/xfs/xfs_log_recover.c3883
-rw-r--r--fs/xfs/xfs_log_recover.h25
-rw-r--r--fs/xfs/xfs_log_rlimit.c150
-rw-r--r--fs/xfs/xfs_mac.h106
-rw-r--r--fs/xfs/xfs_message.c114
-rw-r--r--fs/xfs/xfs_message.h64
-rw-r--r--fs/xfs/xfs_mount.c2461
-rw-r--r--fs/xfs/xfs_mount.h593
-rw-r--r--fs/xfs/xfs_mru_cache.c551
-rw-r--r--fs/xfs/xfs_mru_cache.h46
-rw-r--r--fs/xfs/xfs_qm.c1966
-rw-r--r--fs/xfs/xfs_qm.h180
-rw-r--r--fs/xfs/xfs_qm_bhv.c151
-rw-r--r--fs/xfs/xfs_qm_syscalls.c1007
-rw-r--r--fs/xfs/xfs_qmops.c128
-rw-r--r--fs/xfs/xfs_quota.h398
-rw-r--r--fs/xfs/xfs_quota_defs.h161
-rw-r--r--fs/xfs/xfs_quotaops.c175
-rw-r--r--fs/xfs/xfs_refcache.h52
-rw-r--r--fs/xfs/xfs_rename.c635
-rw-r--r--fs/xfs/xfs_rtalloc.c1880
-rw-r--r--fs/xfs/xfs_rtalloc.h112
-rw-r--r--fs/xfs/xfs_rtbitmap.c973
-rw-r--r--fs/xfs/xfs_rw.c341
-rw-r--r--fs/xfs/xfs_rw.h97
-rw-r--r--fs/xfs/xfs_sb.c836
-rw-r--r--fs/xfs/xfs_sb.h537
-rw-r--r--fs/xfs/xfs_shared.h246
-rw-r--r--fs/xfs/xfs_stats.c198
-rw-r--r--fs/xfs/xfs_stats.h (renamed from fs/xfs/linux-2.6/xfs_stats.h)103
-rw-r--r--fs/xfs/xfs_super.c1809
-rw-r--r--fs/xfs/xfs_super.h (renamed from fs/xfs/linux-2.6/xfs_super.h)65
-rw-r--r--fs/xfs/xfs_symlink.c599
-rw-r--r--fs/xfs/xfs_symlink.h (renamed from fs/xfs/linux-2.6/xfs_version.h)20
-rw-r--r--fs/xfs/xfs_symlink_remote.c201
-rw-r--r--fs/xfs/xfs_sysctl.c261
-rw-r--r--fs/xfs/xfs_sysctl.h (renamed from fs/xfs/linux-2.6/xfs_sysctl.h)12
-rw-r--r--fs/xfs/xfs_trace.c56
-rw-r--r--fs/xfs/xfs_trace.h2073
-rw-r--r--fs/xfs/xfs_trans.c1255
-rw-r--r--fs/xfs/xfs_trans.h1001
-rw-r--r--fs/xfs/xfs_trans_ail.c1088
-rw-r--r--fs/xfs/xfs_trans_buf.c1005
-rw-r--r--fs/xfs/xfs_trans_dquot.c (renamed from fs/xfs/quota/xfs_trans_dquot.c)416
-rw-r--r--fs/xfs/xfs_trans_extfree.c41
-rw-r--r--fs/xfs/xfs_trans_inode.c293
-rw-r--r--fs/xfs/xfs_trans_item.c538
-rw-r--r--fs/xfs/xfs_trans_priv.h152
-rw-r--r--fs/xfs/xfs_trans_resv.c894
-rw-r--r--fs/xfs/xfs_trans_resv.h117
-rw-r--r--fs/xfs/xfs_trans_space.h21
-rw-r--r--fs/xfs/xfs_types.h109
-rw-r--r--fs/xfs/xfs_utils.c472
-rw-r--r--fs/xfs/xfs_utils.h38
-rw-r--r--fs/xfs/xfs_vfsops.c1987
-rw-r--r--fs/xfs/xfs_vnode.h (renamed from fs/xfs/xfs_dmops.c)47
-rw-r--r--fs/xfs/xfs_vnodeops.c4682
-rw-r--r--fs/xfs/xfs_xattr.c245
237 files changed, 75393 insertions, 79814 deletions
diff --git a/fs/xfs/Kbuild b/fs/xfs/Kbuild
deleted file mode 100644
index 2566e96706f..00000000000
--- a/fs/xfs/Kbuild
+++ /dev/null
@@ -1,6 +0,0 @@
-#
-# The xfs people like to share Makefile with 2.6 and 2.4.
-# Utilise file named Kbuild file which has precedence over Makefile.
-#
-
-include $(srctree)/$(obj)/Makefile-linux-2.6
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index bac27d66151..399e8cec6e6 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,6 +1,8 @@
config XFS_FS
tristate "XFS filesystem support"
- select EXPORTFS if NFSD!=n
+ depends on BLOCK
+ select EXPORTFS
+ select LIBCRC32C
help
XFS is a high performance journaling filesystem which originated
on the SGI IRIX platform. It is completely multi-threaded, can
@@ -18,14 +20,10 @@ config XFS_FS
system of your root partition is compiled as a module, you'll need
to use an initial ramdisk (initrd) to boot.
-config XFS_EXPORT
- bool
- depends on XFS_FS && EXPORTFS
- default y
-
config XFS_QUOTA
bool "XFS Quota support"
depends on XFS_FS
+ select QUOTACTL
help
If you say Y here, you will be able to set limits for disk usage on
a per user and/or a per group basis under XFS. XFS considers quota
@@ -40,21 +38,10 @@ config XFS_QUOTA
with or without the generic quota support enabled (CONFIG_QUOTA) -
they are completely independent subsystems.
-config XFS_SECURITY
- bool "XFS Security Label support"
- depends on XFS_FS
- help
- Security labels support alternative access control models
- implemented by security modules like SELinux. This option
- enables an extended attribute namespace for inode security
- labels in the XFS filesystem.
-
- If you are not using a security module that requires using
- extended attributes for inode security labels, say N.
-
config XFS_POSIX_ACL
bool "XFS POSIX ACL support"
depends on XFS_FS
+ select FS_POSIX_ACL
help
POSIX Access Control Lists (ACLs) support permissions for users and
groups beyond the owner/group/world scheme.
@@ -65,18 +52,45 @@ config XFS_POSIX_ACL
If you don't know what Access Control Lists are, say N.
config XFS_RT
- bool "XFS Realtime support (EXPERIMENTAL)"
- depends on XFS_FS && EXPERIMENTAL
+ bool "XFS Realtime subvolume support"
+ depends on XFS_FS
help
If you say Y here you will be able to mount and use XFS filesystems
- which contain a realtime subvolume. The realtime subvolume is a
- separate area of disk space where only file data is stored. The
- realtime subvolume is designed to provide very deterministic
- data rates suitable for media streaming applications.
+ which contain a realtime subvolume. The realtime subvolume is a
+ separate area of disk space where only file data is stored. It was
+ originally designed to provide deterministic data rates suitable
+ for media streaming applications, but is also useful as a generic
+ mechanism for ensuring data and metadata/log I/Os are completely
+ separated. Regular file I/Os are isolated to a separate device
+ from all other requests, and this can be done quite transparently
+ to applications via the inherit-realtime directory inode flag.
- See the xfs man page in section 5 for a bit more information.
-
- This feature is unsupported at this time, is not yet fully
- functional, and may cause serious problems.
+ See the xfs man page in section 5 for additional information.
If unsure, say N.
+
+config XFS_WARN
+ bool "XFS Verbose Warnings"
+ depends on XFS_FS && !XFS_DEBUG
+ help
+ Say Y here to get an XFS build with many additional warnings.
+ It converts ASSERT checks to WARN, so will log any out-of-bounds
+ conditions that occur that would otherwise be missed. It is much
+ lighter weight than XFS_DEBUG and does not modify algorithms and will
+ not cause the kernel to panic on non-fatal errors.
+
+ However, similar to XFS_DEBUG, it is only advisable to use this if you
+ are debugging a particular problem.
+
+config XFS_DEBUG
+ bool "XFS Debugging support"
+ depends on XFS_FS
+ help
+ Say Y here to get an XFS build with many debugging features,
+ including ASSERT checks, function wrappers around macros,
+ and extra sanity-checking functions in various code paths.
+
+ Note that the resulting code will be HUGE and SLOW, and probably
+ not useful unless you are debugging a particular problem.
+
+ Say N unless you are an XFS developer, or you play one on TV.
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 49e3e7e5e3d..c21f4350666 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -1 +1,116 @@
-include $(TOPDIR)/fs/xfs/Makefile-linux-$(VERSION).$(PATCHLEVEL)
+#
+# Copyright (c) 2000-2005 Silicon Graphics, Inc.
+# All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#
+
+ccflags-y += -I$(src) # needed for trace events
+
+ccflags-$(CONFIG_XFS_DEBUG) += -g
+
+obj-$(CONFIG_XFS_FS) += xfs.o
+
+# this one should be compiled first, as the tracing macros can easily blow up
+xfs-y += xfs_trace.o
+
+# highlevel code
+xfs-y += xfs_aops.o \
+ xfs_attr_inactive.o \
+ xfs_attr_list.o \
+ xfs_bit.o \
+ xfs_bmap_util.o \
+ xfs_buf.o \
+ xfs_dir2_readdir.o \
+ xfs_discard.o \
+ xfs_error.o \
+ xfs_export.o \
+ xfs_extent_busy.o \
+ xfs_file.o \
+ xfs_filestream.o \
+ xfs_fsops.o \
+ xfs_globals.o \
+ xfs_icache.o \
+ xfs_ioctl.o \
+ xfs_iomap.o \
+ xfs_iops.o \
+ xfs_itable.o \
+ xfs_message.o \
+ xfs_mount.o \
+ xfs_mru_cache.o \
+ xfs_super.o \
+ xfs_symlink.o \
+ xfs_trans.o \
+ xfs_xattr.o \
+ kmem.o \
+ uuid.o
+
+# code shared with libxfs
+xfs-y += xfs_alloc.o \
+ xfs_alloc_btree.o \
+ xfs_attr.o \
+ xfs_attr_leaf.o \
+ xfs_attr_remote.o \
+ xfs_bmap.o \
+ xfs_bmap_btree.o \
+ xfs_btree.o \
+ xfs_da_btree.o \
+ xfs_da_format.o \
+ xfs_dir2.o \
+ xfs_dir2_block.o \
+ xfs_dir2_data.o \
+ xfs_dir2_leaf.o \
+ xfs_dir2_node.o \
+ xfs_dir2_sf.o \
+ xfs_dquot_buf.o \
+ xfs_ialloc.o \
+ xfs_ialloc_btree.o \
+ xfs_icreate_item.o \
+ xfs_inode.o \
+ xfs_inode_fork.o \
+ xfs_inode_buf.o \
+ xfs_log_recover.o \
+ xfs_log_rlimit.o \
+ xfs_sb.o \
+ xfs_symlink_remote.o \
+ xfs_trans_resv.o
+
+# low-level transaction/log code
+xfs-y += xfs_log.o \
+ xfs_log_cil.o \
+ xfs_buf_item.o \
+ xfs_extfree_item.o \
+ xfs_inode_item.o \
+ xfs_trans_ail.o \
+ xfs_trans_buf.o \
+ xfs_trans_extfree.o \
+ xfs_trans_inode.o \
+
+# optional features
+xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
+ xfs_dquot_item.o \
+ xfs_trans_dquot.o \
+ xfs_qm_syscalls.o \
+ xfs_qm_bhv.o \
+ xfs_qm.o \
+ xfs_quotaops.o
+
+# xfs_rtbitmap is shared with libxfs
+xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
+ xfs_rtbitmap.o
+
+xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
+xfs-$(CONFIG_PROC_FS) += xfs_stats.o
+xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
+xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6
deleted file mode 100644
index 97bd4743b46..00000000000
--- a/fs/xfs/Makefile-linux-2.6
+++ /dev/null
@@ -1,151 +0,0 @@
-#
-# Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
-#
-# This program is free software; you can redistribute it and/or modify it
-# under the terms of version 2 of the GNU General Public License as
-# published by the Free Software Foundation.
-#
-# This program is distributed in the hope that it would be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-#
-# Further, this software is distributed without any warranty that it is
-# free of the rightful claim of any third person regarding infringement
-# or the like. Any license provided herein, whether implied or
-# otherwise, applies only to this software file. Patent licenses, if
-# any, provided herein do not apply to combinations of this program with
-# other software, or any other product whatsoever.
-#
-# You should have received a copy of the GNU General Public License along
-# with this program; if not, write the Free Software Foundation, Inc., 59
-# Temple Place - Suite 330, Boston MA 02111-1307, USA.
-#
-# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
-# Mountain View, CA 94043, or:
-#
-# http://www.sgi.com
-#
-# For further information regarding this notice, see:
-#
-# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
-#
-
-EXTRA_CFLAGS += -Ifs/xfs -Ifs/xfs/linux-2.6 -funsigned-char
-
-XFS_LINUX := linux-2.6
-
-ifeq ($(CONFIG_XFS_DEBUG),y)
- EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG
- EXTRA_CFLAGS += -DPAGEBUF_LOCK_TRACKING
-endif
-ifeq ($(CONFIG_XFS_TRACE),y)
- EXTRA_CFLAGS += -DXFS_ALLOC_TRACE
- EXTRA_CFLAGS += -DXFS_ATTR_TRACE
- EXTRA_CFLAGS += -DXFS_BLI_TRACE
- EXTRA_CFLAGS += -DXFS_BMAP_TRACE
- EXTRA_CFLAGS += -DXFS_BMBT_TRACE
- EXTRA_CFLAGS += -DXFS_DIR_TRACE
- EXTRA_CFLAGS += -DXFS_DIR2_TRACE
- EXTRA_CFLAGS += -DXFS_DQUOT_TRACE
- EXTRA_CFLAGS += -DXFS_ILOCK_TRACE
- EXTRA_CFLAGS += -DXFS_LOG_TRACE
- EXTRA_CFLAGS += -DXFS_RW_TRACE
- EXTRA_CFLAGS += -DPAGEBUF_TRACE
- EXTRA_CFLAGS += -DXFS_VNODE_TRACE
-endif
-
-obj-$(CONFIG_XFS_FS) += xfs.o
-
-xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \
- xfs_dquot.o \
- xfs_dquot_item.o \
- xfs_trans_dquot.o \
- xfs_qm_syscalls.o \
- xfs_qm_bhv.o \
- xfs_qm.o)
-
-ifeq ($(CONFIG_XFS_QUOTA),y)
-xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o
-endif
-
-xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
-xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
-xfs-$(CONFIG_PROC_FS) += $(XFS_LINUX)/xfs_stats.o
-xfs-$(CONFIG_SYSCTL) += $(XFS_LINUX)/xfs_sysctl.o
-xfs-$(CONFIG_COMPAT) += $(XFS_LINUX)/xfs_ioctl32.o
-xfs-$(CONFIG_XFS_EXPORT) += $(XFS_LINUX)/xfs_export.o
-
-
-xfs-y += xfs_alloc.o \
- xfs_alloc_btree.o \
- xfs_attr.o \
- xfs_attr_leaf.o \
- xfs_behavior.o \
- xfs_bit.o \
- xfs_bmap.o \
- xfs_bmap_btree.o \
- xfs_btree.o \
- xfs_buf_item.o \
- xfs_da_btree.o \
- xfs_dir.o \
- xfs_dir2.o \
- xfs_dir2_block.o \
- xfs_dir2_data.o \
- xfs_dir2_leaf.o \
- xfs_dir2_node.o \
- xfs_dir2_sf.o \
- xfs_dir_leaf.o \
- xfs_error.o \
- xfs_extfree_item.o \
- xfs_fsops.o \
- xfs_ialloc.o \
- xfs_ialloc_btree.o \
- xfs_iget.o \
- xfs_inode.o \
- xfs_inode_item.o \
- xfs_iocore.o \
- xfs_iomap.o \
- xfs_itable.o \
- xfs_dfrag.o \
- xfs_log.o \
- xfs_log_recover.o \
- xfs_mount.o \
- xfs_rename.o \
- xfs_trans.o \
- xfs_trans_ail.o \
- xfs_trans_buf.o \
- xfs_trans_extfree.o \
- xfs_trans_inode.o \
- xfs_trans_item.o \
- xfs_utils.o \
- xfs_vfsops.o \
- xfs_vnodeops.o \
- xfs_rw.o \
- xfs_dmops.o \
- xfs_qmops.o
-
-xfs-$(CONFIG_XFS_TRACE) += xfs_dir2_trace.o
-
-# Objects in linux/
-xfs-y += $(addprefix $(XFS_LINUX)/, \
- kmem.o \
- xfs_aops.o \
- xfs_buf.o \
- xfs_file.o \
- xfs_fs_subr.o \
- xfs_globals.o \
- xfs_ioctl.o \
- xfs_iops.o \
- xfs_lrw.o \
- xfs_super.o \
- xfs_vfs.o \
- xfs_vnode.o)
-
-# Objects in support/
-xfs-y += $(addprefix support/, \
- debug.o \
- move.o \
- uuid.o)
-
-xfs-$(CONFIG_XFS_TRACE) += support/ktrace.o
-
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
new file mode 100644
index 00000000000..844e288b957
--- /dev/null
+++ b/fs/xfs/kmem.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include "time.h"
+#include "kmem.h"
+#include "xfs_message.h"
+
+/*
+ * Greedy allocation. May fail and may return vmalloced memory.
+ */
+void *
+kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
+{
+ void *ptr;
+ size_t kmsize = maxsize;
+
+ while (!(ptr = vzalloc(kmsize))) {
+ if ((kmsize >>= 1) <= minsize)
+ kmsize = minsize;
+ }
+ if (ptr)
+ *size = kmsize;
+ return ptr;
+}
+
+void *
+kmem_alloc(size_t size, xfs_km_flags_t flags)
+{
+ int retries = 0;
+ gfp_t lflags = kmem_flags_convert(flags);
+ void *ptr;
+
+ do {
+ ptr = kmalloc(size, lflags);
+ if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+ return ptr;
+ if (!(++retries % 100))
+ xfs_err(NULL,
+ "possible memory allocation deadlock in %s (mode:0x%x)",
+ __func__, lflags);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ } while (1);
+}
+
+void *
+kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
+{
+ unsigned noio_flag = 0;
+ void *ptr;
+ gfp_t lflags;
+
+ ptr = kmem_zalloc(size, flags | KM_MAYFAIL);
+ if (ptr)
+ return ptr;
+
+ /*
+ * __vmalloc() will allocate data pages and auxillary structures (e.g.
+ * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
+ * here. Hence we need to tell memory reclaim that we are in such a
+ * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
+ * the filesystem here and potentially deadlocking.
+ */
+ if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+ noio_flag = memalloc_noio_save();
+
+ lflags = kmem_flags_convert(flags);
+ ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+
+ if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+ memalloc_noio_restore(noio_flag);
+
+ return ptr;
+}
+
+void
+kmem_free(const void *ptr)
+{
+ if (!is_vmalloc_addr(ptr)) {
+ kfree(ptr);
+ } else {
+ vfree(ptr);
+ }
+}
+
+void *
+kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
+ xfs_km_flags_t flags)
+{
+ void *new;
+
+ new = kmem_alloc(newsize, flags);
+ if (ptr) {
+ if (new)
+ memcpy(new, ptr,
+ ((oldsize < newsize) ? oldsize : newsize));
+ kmem_free(ptr);
+ }
+ return new;
+}
+
+void *
+kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
+{
+ int retries = 0;
+ gfp_t lflags = kmem_flags_convert(flags);
+ void *ptr;
+
+ do {
+ ptr = kmem_cache_alloc(zone, lflags);
+ if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+ return ptr;
+ if (!(++retries % 100))
+ xfs_err(NULL,
+ "possible memory allocation deadlock in %s (mode:0x%x)",
+ __func__, lflags);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ } while (1);
+}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
new file mode 100644
index 00000000000..64db0e53ede
--- /dev/null
+++ b/fs/xfs/kmem.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_SUPPORT_KMEM_H__
+#define __XFS_SUPPORT_KMEM_H__
+
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+/*
+ * General memory allocation interfaces
+ */
+
+typedef unsigned __bitwise xfs_km_flags_t;
+#define KM_SLEEP ((__force xfs_km_flags_t)0x0001u)
+#define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u)
+#define KM_NOFS ((__force xfs_km_flags_t)0x0004u)
+#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u)
+#define KM_ZERO ((__force xfs_km_flags_t)0x0010u)
+
+/*
+ * We use a special process flag to avoid recursive callbacks into
+ * the filesystem during transactions. We will also issue our own
+ * warnings, so we explicitly skip any generic ones (silly of us).
+ */
+static inline gfp_t
+kmem_flags_convert(xfs_km_flags_t flags)
+{
+ gfp_t lflags;
+
+ BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO));
+
+ if (flags & KM_NOSLEEP) {
+ lflags = GFP_ATOMIC | __GFP_NOWARN;
+ } else {
+ lflags = GFP_KERNEL | __GFP_NOWARN;
+ if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+ lflags &= ~__GFP_FS;
+ }
+
+ if (flags & KM_ZERO)
+ lflags |= __GFP_ZERO;
+
+ return lflags;
+}
+
+extern void *kmem_alloc(size_t, xfs_km_flags_t);
+extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
+extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
+extern void kmem_free(const void *);
+
+
+extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
+
+static inline void *
+kmem_zalloc(size_t size, xfs_km_flags_t flags)
+{
+ return kmem_alloc(size, flags | KM_ZERO);
+}
+
+/*
+ * Zone interfaces
+ */
+
+#define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN
+#define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT
+#define KM_ZONE_SPREAD SLAB_MEM_SPREAD
+
+#define kmem_zone kmem_cache
+#define kmem_zone_t struct kmem_cache
+
+static inline kmem_zone_t *
+kmem_zone_init(int size, char *zone_name)
+{
+ return kmem_cache_create(zone_name, size, 0, 0, NULL);
+}
+
+static inline kmem_zone_t *
+kmem_zone_init_flags(int size, char *zone_name, unsigned long flags,
+ void (*construct)(void *))
+{
+ return kmem_cache_create(zone_name, size, 0, flags, construct);
+}
+
+static inline void
+kmem_zone_free(kmem_zone_t *zone, void *ptr)
+{
+ kmem_cache_free(zone, ptr);
+}
+
+static inline void
+kmem_zone_destroy(kmem_zone_t *zone)
+{
+ if (zone)
+ kmem_cache_destroy(zone);
+}
+
+extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
+
+static inline void *
+kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
+{
+ return kmem_zone_alloc(zone, flags | KM_ZERO);
+}
+
+#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
deleted file mode 100644
index aba7fcf881a..00000000000
--- a/fs/xfs/linux-2.6/kmem.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-#include <linux/swap.h>
-#include <linux/blkdev.h>
-#include "time.h"
-#include "kmem.h"
-
-#define MAX_VMALLOCS 6
-#define MAX_SLAB_SIZE 0x20000
-
-void *
-kmem_alloc(size_t size, unsigned int __nocast flags)
-{
- int retries = 0;
- gfp_t lflags = kmem_flags_convert(flags);
- void *ptr;
-
- do {
- if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS)
- ptr = kmalloc(size, lflags);
- else
- ptr = __vmalloc(size, lflags, PAGE_KERNEL);
- if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
- return ptr;
- if (!(++retries % 100))
- printk(KERN_ERR "XFS: possible memory allocation "
- "deadlock in %s (mode:0x%x)\n",
- __FUNCTION__, lflags);
- blk_congestion_wait(WRITE, HZ/50);
- } while (1);
-}
-
-void *
-kmem_zalloc(size_t size, unsigned int __nocast flags)
-{
- void *ptr;
-
- ptr = kmem_alloc(size, flags);
- if (ptr)
- memset((char *)ptr, 0, (int)size);
- return ptr;
-}
-
-void
-kmem_free(void *ptr, size_t size)
-{
- if (((unsigned long)ptr < VMALLOC_START) ||
- ((unsigned long)ptr >= VMALLOC_END)) {
- kfree(ptr);
- } else {
- vfree(ptr);
- }
-}
-
-void *
-kmem_realloc(void *ptr, size_t newsize, size_t oldsize,
- unsigned int __nocast flags)
-{
- void *new;
-
- new = kmem_alloc(newsize, flags);
- if (ptr) {
- if (new)
- memcpy(new, ptr,
- ((oldsize < newsize) ? oldsize : newsize));
- kmem_free(ptr, oldsize);
- }
- return new;
-}
-
-void *
-kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
-{
- int retries = 0;
- gfp_t lflags = kmem_flags_convert(flags);
- void *ptr;
-
- do {
- ptr = kmem_cache_alloc(zone, lflags);
- if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
- return ptr;
- if (!(++retries % 100))
- printk(KERN_ERR "XFS: possible memory allocation "
- "deadlock in %s (mode:0x%x)\n",
- __FUNCTION__, lflags);
- blk_congestion_wait(WRITE, HZ/50);
- } while (1);
-}
-
-void *
-kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags)
-{
- void *ptr;
-
- ptr = kmem_zone_alloc(zone, flags);
- if (ptr)
- memset((char *)ptr, 0, kmem_cache_size(zone));
- return ptr;
-}
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
deleted file mode 100644
index c64a29cdfff..00000000000
--- a/fs/xfs/linux-2.6/kmem.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_SUPPORT_KMEM_H__
-#define __XFS_SUPPORT_KMEM_H__
-
-#include <linux/slab.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-
-/*
- * memory management routines
- */
-#define KM_SLEEP 0x0001u
-#define KM_NOSLEEP 0x0002u
-#define KM_NOFS 0x0004u
-#define KM_MAYFAIL 0x0008u
-
-#define kmem_zone kmem_cache
-#define kmem_zone_t struct kmem_cache
-
-typedef unsigned long xfs_pflags_t;
-
-#define PFLAGS_TEST_NOIO() (current->flags & PF_NOIO)
-#define PFLAGS_TEST_FSTRANS() (current->flags & PF_FSTRANS)
-
-#define PFLAGS_SET_NOIO() do { \
- current->flags |= PF_NOIO; \
-} while (0)
-
-#define PFLAGS_CLEAR_NOIO() do { \
- current->flags &= ~PF_NOIO; \
-} while (0)
-
-/* these could be nested, so we save state */
-#define PFLAGS_SET_FSTRANS(STATEP) do { \
- *(STATEP) = current->flags; \
- current->flags |= PF_FSTRANS; \
-} while (0)
-
-#define PFLAGS_CLEAR_FSTRANS(STATEP) do { \
- *(STATEP) = current->flags; \
- current->flags &= ~PF_FSTRANS; \
-} while (0)
-
-/* Restore the PF_FSTRANS state to what was saved in STATEP */
-#define PFLAGS_RESTORE_FSTRANS(STATEP) do { \
- current->flags = ((current->flags & ~PF_FSTRANS) | \
- (*(STATEP) & PF_FSTRANS)); \
-} while (0)
-
-#define PFLAGS_DUP(OSTATEP, NSTATEP) do { \
- *(NSTATEP) = *(OSTATEP); \
-} while (0)
-
-static __inline gfp_t kmem_flags_convert(unsigned int __nocast flags)
-{
- gfp_t lflags = __GFP_NOWARN; /* we'll report problems, if need be */
-
-#ifdef DEBUG
- if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL))) {
- printk(KERN_WARNING
- "XFS: memory allocation with wrong flags (%x)\n", flags);
- BUG();
- }
-#endif
-
- if (flags & KM_NOSLEEP) {
- lflags |= GFP_ATOMIC;
- } else {
- lflags |= GFP_KERNEL;
-
- /* avoid recusive callbacks to filesystem during transactions */
- if (PFLAGS_TEST_FSTRANS() || (flags & KM_NOFS))
- lflags &= ~__GFP_FS;
- }
-
- return lflags;
-}
-
-static __inline kmem_zone_t *
-kmem_zone_init(int size, char *zone_name)
-{
- return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL);
-}
-
-static __inline void
-kmem_zone_free(kmem_zone_t *zone, void *ptr)
-{
- kmem_cache_free(zone, ptr);
-}
-
-static __inline void
-kmem_zone_destroy(kmem_zone_t *zone)
-{
- if (zone && kmem_cache_destroy(zone))
- BUG();
-}
-
-extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
-extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
-
-extern void *kmem_alloc(size_t, unsigned int __nocast);
-extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast);
-extern void *kmem_zalloc(size_t, unsigned int __nocast);
-extern void kmem_free(void *, size_t);
-
-typedef struct shrinker *kmem_shaker_t;
-typedef int (*kmem_shake_func_t)(int, gfp_t);
-
-static __inline kmem_shaker_t
-kmem_shake_register(kmem_shake_func_t sfunc)
-{
- return set_shrinker(DEFAULT_SEEKS, sfunc);
-}
-
-static __inline void
-kmem_shake_deregister(kmem_shaker_t shrinker)
-{
- remove_shrinker(shrinker);
-}
-
-static __inline int
-kmem_shake_allow(gfp_t gfp_mask)
-{
- return (gfp_mask & __GFP_WAIT);
-}
-
-#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
deleted file mode 100644
index 194a84490bd..00000000000
--- a/fs/xfs/linux-2.6/sema.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_SUPPORT_SEMA_H__
-#define __XFS_SUPPORT_SEMA_H__
-
-#include <linux/time.h>
-#include <linux/wait.h>
-#include <asm/atomic.h>
-#include <asm/semaphore.h>
-
-/*
- * sema_t structure just maps to struct semaphore in Linux kernel.
- */
-
-typedef struct semaphore sema_t;
-
-#define init_sema(sp, val, c, d) sema_init(sp, val)
-#define initsema(sp, val) sema_init(sp, val)
-#define initnsema(sp, val, name) sema_init(sp, val)
-#define psema(sp, b) down(sp)
-#define vsema(sp) up(sp)
-#define valusema(sp) (atomic_read(&(sp)->count))
-#define freesema(sema)
-
-/*
- * Map cpsema (try to get the sema) to down_trylock. We need to switch
- * the return values since cpsema returns 1 (acquired) 0 (failed) and
- * down_trylock returns the reverse 0 (acquired) 1 (failed).
- */
-
-#define cpsema(sp) (down_trylock(sp) ? 0 : 1)
-
-/*
- * Didn't do cvsema(sp). Not sure how to map this to up/down/...
- * It does a vsema if the values is < 0 other wise nothing.
- */
-
-#endif /* __XFS_SUPPORT_SEMA_H__ */
diff --git a/fs/xfs/linux-2.6/spin.h b/fs/xfs/linux-2.6/spin.h
deleted file mode 100644
index 50a6191178f..00000000000
--- a/fs/xfs/linux-2.6/spin.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_SUPPORT_SPIN_H__
-#define __XFS_SUPPORT_SPIN_H__
-
-#include <linux/sched.h> /* preempt needs this */
-#include <linux/spinlock.h>
-
-/*
- * Map lock_t from IRIX to Linux spinlocks.
- *
- * We do not make use of lock_t from interrupt context, so we do not
- * have to worry about disabling interrupts at all (unlike IRIX).
- */
-
-typedef spinlock_t lock_t;
-
-#define SPLDECL(s) unsigned long s
-#ifndef DEFINE_SPINLOCK
-#define DEFINE_SPINLOCK(s) spinlock_t s = SPIN_LOCK_UNLOCKED
-#endif
-
-#define spinlock_init(lock, name) spin_lock_init(lock)
-#define spinlock_destroy(lock)
-#define mutex_spinlock(lock) ({ spin_lock(lock); 0; })
-#define mutex_spinunlock(lock, s) do { spin_unlock(lock); (void)s; } while (0)
-#define nested_spinlock(lock) spin_lock(lock)
-#define nested_spinunlock(lock) spin_unlock(lock)
-
-#endif /* __XFS_SUPPORT_SPIN_H__ */
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 9a8ad481b00..00000000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_SUPPORT_SV_H__
-#define __XFS_SUPPORT_SV_H__
-
-#include <linux/wait.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-
-/*
- * Synchronisation variables.
- *
- * (Parameters "pri", "svf" and "rts" are not implemented)
- */
-
-typedef struct sv_s {
- wait_queue_head_t waiters;
-} sv_t;
-
-#define SV_FIFO 0x0 /* sv_t is FIFO type */
-#define SV_LIFO 0x2 /* sv_t is LIFO type */
-#define SV_PRIO 0x4 /* sv_t is PRIO type */
-#define SV_KEYED 0x6 /* sv_t is KEYED type */
-#define SV_DEFAULT SV_FIFO
-
-
-static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
- unsigned long timeout)
-{
- DECLARE_WAITQUEUE(wait, current);
-
- add_wait_queue_exclusive(&sv->waiters, &wait);
- __set_current_state(state);
- spin_unlock(lock);
-
- schedule_timeout(timeout);
-
- remove_wait_queue(&sv->waiters, &wait);
-}
-
-#define init_sv(sv,type,name,flag) \
- init_waitqueue_head(&(sv)->waiters)
-#define sv_init(sv,flag,name) \
- init_waitqueue_head(&(sv)->waiters)
-#define sv_destroy(sv) \
- /*NOTHING*/
-#define sv_wait(sv, pri, lock, s) \
- _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
-#define sv_wait_sig(sv, pri, lock, s) \
- _sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
-#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \
- _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts))
-#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \
- _sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts))
-#define sv_signal(sv) \
- wake_up(&(sv)->waiters)
-#define sv_broadcast(sv) \
- wake_up_all(&(sv)->waiters)
-
-#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
deleted file mode 100644
index 8f2beec526c..00000000000
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ /dev/null
@@ -1,1467 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_trans.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_error.h"
-#include "xfs_rw.h"
-#include "xfs_iomap.h"
-#include <linux/mpage.h>
-#include <linux/pagevec.h>
-#include <linux/writeback.h>
-
-STATIC void xfs_count_page_state(struct page *, int *, int *, int *);
-
-#if defined(XFS_RW_TRACE)
-void
-xfs_page_trace(
- int tag,
- struct inode *inode,
- struct page *page,
- int mask)
-{
- xfs_inode_t *ip;
- vnode_t *vp = LINVFS_GET_VP(inode);
- loff_t isize = i_size_read(inode);
- loff_t offset = page_offset(page);
- int delalloc = -1, unmapped = -1, unwritten = -1;
-
- if (page_has_buffers(page))
- xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
-
- ip = xfs_vtoi(vp);
- if (!ip->i_rwtrace)
- return;
-
- ktrace_enter(ip->i_rwtrace,
- (void *)((unsigned long)tag),
- (void *)ip,
- (void *)inode,
- (void *)page,
- (void *)((unsigned long)mask),
- (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
- (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
- (void *)((unsigned long)((isize >> 32) & 0xffffffff)),
- (void *)((unsigned long)(isize & 0xffffffff)),
- (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
- (void *)((unsigned long)(offset & 0xffffffff)),
- (void *)((unsigned long)delalloc),
- (void *)((unsigned long)unmapped),
- (void *)((unsigned long)unwritten),
- (void *)NULL,
- (void *)NULL);
-}
-#else
-#define xfs_page_trace(tag, inode, page, mask)
-#endif
-
-/*
- * Schedule IO completion handling on a xfsdatad if this was
- * the final hold on this ioend.
- */
-STATIC void
-xfs_finish_ioend(
- xfs_ioend_t *ioend)
-{
- if (atomic_dec_and_test(&ioend->io_remaining))
- queue_work(xfsdatad_workqueue, &ioend->io_work);
-}
-
-/*
- * We're now finished for good with this ioend structure.
- * Update the page state via the associated buffer_heads,
- * release holds on the inode and bio, and finally free
- * up memory. Do not use the ioend after this.
- */
-STATIC void
-xfs_destroy_ioend(
- xfs_ioend_t *ioend)
-{
- struct buffer_head *bh, *next;
-
- for (bh = ioend->io_buffer_head; bh; bh = next) {
- next = bh->b_private;
- bh->b_end_io(bh, ioend->io_uptodate);
- }
-
- vn_iowake(ioend->io_vnode);
- mempool_free(ioend, xfs_ioend_pool);
-}
-
-/*
- * Buffered IO write completion for delayed allocate extents.
- * TODO: Update ondisk isize now that we know the file data
- * has been flushed (i.e. the notorious "NULL file" problem).
- */
-STATIC void
-xfs_end_bio_delalloc(
- void *data)
-{
- xfs_ioend_t *ioend = data;
-
- xfs_destroy_ioend(ioend);
-}
-
-/*
- * Buffered IO write completion for regular, written extents.
- */
-STATIC void
-xfs_end_bio_written(
- void *data)
-{
- xfs_ioend_t *ioend = data;
-
- xfs_destroy_ioend(ioend);
-}
-
-/*
- * IO write completion for unwritten extents.
- *
- * Issue transactions to convert a buffer range from unwritten
- * to written extents.
- */
-STATIC void
-xfs_end_bio_unwritten(
- void *data)
-{
- xfs_ioend_t *ioend = data;
- vnode_t *vp = ioend->io_vnode;
- xfs_off_t offset = ioend->io_offset;
- size_t size = ioend->io_size;
- int error;
-
- if (ioend->io_uptodate)
- VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
- xfs_destroy_ioend(ioend);
-}
-
-/*
- * Allocate and initialise an IO completion structure.
- * We need to track unwritten extent write completion here initially.
- * We'll need to extend this for updating the ondisk inode size later
- * (vs. incore size).
- */
-STATIC xfs_ioend_t *
-xfs_alloc_ioend(
- struct inode *inode,
- unsigned int type)
-{
- xfs_ioend_t *ioend;
-
- ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
-
- /*
- * Set the count to 1 initially, which will prevent an I/O
- * completion callback from happening before we have started
- * all the I/O from calling the completion routine too early.
- */
- atomic_set(&ioend->io_remaining, 1);
- ioend->io_uptodate = 1; /* cleared if any I/O fails */
- ioend->io_list = NULL;
- ioend->io_type = type;
- ioend->io_vnode = LINVFS_GET_VP(inode);
- ioend->io_buffer_head = NULL;
- ioend->io_buffer_tail = NULL;
- atomic_inc(&ioend->io_vnode->v_iocount);
- ioend->io_offset = 0;
- ioend->io_size = 0;
-
- if (type == IOMAP_UNWRITTEN)
- INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend);
- else if (type == IOMAP_DELAY)
- INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc, ioend);
- else
- INIT_WORK(&ioend->io_work, xfs_end_bio_written, ioend);
-
- return ioend;
-}
-
-STATIC int
-xfs_map_blocks(
- struct inode *inode,
- loff_t offset,
- ssize_t count,
- xfs_iomap_t *mapp,
- int flags)
-{
- vnode_t *vp = LINVFS_GET_VP(inode);
- int error, nmaps = 1;
-
- VOP_BMAP(vp, offset, count, flags, mapp, &nmaps, error);
- if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)))
- VMODIFY(vp);
- return -error;
-}
-
-STATIC inline int
-xfs_iomap_valid(
- xfs_iomap_t *iomapp,
- loff_t offset)
-{
- return offset >= iomapp->iomap_offset &&
- offset < iomapp->iomap_offset + iomapp->iomap_bsize;
-}
-
-/*
- * BIO completion handler for buffered IO.
- */
-STATIC int
-xfs_end_bio(
- struct bio *bio,
- unsigned int bytes_done,
- int error)
-{
- xfs_ioend_t *ioend = bio->bi_private;
-
- if (bio->bi_size)
- return 1;
-
- ASSERT(ioend);
- ASSERT(atomic_read(&bio->bi_cnt) >= 1);
-
- /* Toss bio and pass work off to an xfsdatad thread */
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- ioend->io_uptodate = 0;
- bio->bi_private = NULL;
- bio->bi_end_io = NULL;
-
- bio_put(bio);
- xfs_finish_ioend(ioend);
- return 0;
-}
-
-STATIC void
-xfs_submit_ioend_bio(
- xfs_ioend_t *ioend,
- struct bio *bio)
-{
- atomic_inc(&ioend->io_remaining);
-
- bio->bi_private = ioend;
- bio->bi_end_io = xfs_end_bio;
-
- submit_bio(WRITE, bio);
- ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
- bio_put(bio);
-}
-
-STATIC struct bio *
-xfs_alloc_ioend_bio(
- struct buffer_head *bh)
-{
- struct bio *bio;
- int nvecs = bio_get_nr_vecs(bh->b_bdev);
-
- do {
- bio = bio_alloc(GFP_NOIO, nvecs);
- nvecs >>= 1;
- } while (!bio);
-
- ASSERT(bio->bi_private == NULL);
- bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio->bi_bdev = bh->b_bdev;
- bio_get(bio);
- return bio;
-}
-
-STATIC void
-xfs_start_buffer_writeback(
- struct buffer_head *bh)
-{
- ASSERT(buffer_mapped(bh));
- ASSERT(buffer_locked(bh));
- ASSERT(!buffer_delay(bh));
- ASSERT(!buffer_unwritten(bh));
-
- mark_buffer_async_write(bh);
- set_buffer_uptodate(bh);
- clear_buffer_dirty(bh);
-}
-
-STATIC void
-xfs_start_page_writeback(
- struct page *page,
- struct writeback_control *wbc,
- int clear_dirty,
- int buffers)
-{
- ASSERT(PageLocked(page));
- ASSERT(!PageWriteback(page));
- set_page_writeback(page);
- if (clear_dirty)
- clear_page_dirty(page);
- unlock_page(page);
- if (!buffers) {
- end_page_writeback(page);
- wbc->pages_skipped++; /* We didn't write this page */
- }
-}
-
-static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
-{
- return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
-}
-
-/*
- * Submit all of the bios for all of the ioends we have saved up, covering the
- * initial writepage page and also any probed pages.
- *
- * Because we may have multiple ioends spanning a page, we need to start
- * writeback on all the buffers before we submit them for I/O. If we mark the
- * buffers as we got, then we can end up with a page that only has buffers
- * marked async write and I/O complete on can occur before we mark the other
- * buffers async write.
- *
- * The end result of this is that we trip a bug in end_page_writeback() because
- * we call it twice for the one page as the code in end_buffer_async_write()
- * assumes that all buffers on the page are started at the same time.
- *
- * The fix is two passes across the ioend list - one to start writeback on the
- * bufferheads, and then the second one submit them for I/O.
- */
-STATIC void
-xfs_submit_ioend(
- xfs_ioend_t *ioend)
-{
- xfs_ioend_t *head = ioend;
- xfs_ioend_t *next;
- struct buffer_head *bh;
- struct bio *bio;
- sector_t lastblock = 0;
-
- /* Pass 1 - start writeback */
- do {
- next = ioend->io_list;
- for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
- xfs_start_buffer_writeback(bh);
- }
- } while ((ioend = next) != NULL);
-
- /* Pass 2 - submit I/O */
- ioend = head;
- do {
- next = ioend->io_list;
- bio = NULL;
-
- for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
-
- if (!bio) {
- retry:
- bio = xfs_alloc_ioend_bio(bh);
- } else if (bh->b_blocknr != lastblock + 1) {
- xfs_submit_ioend_bio(ioend, bio);
- goto retry;
- }
-
- if (bio_add_buffer(bio, bh) != bh->b_size) {
- xfs_submit_ioend_bio(ioend, bio);
- goto retry;
- }
-
- lastblock = bh->b_blocknr;
- }
- if (bio)
- xfs_submit_ioend_bio(ioend, bio);
- xfs_finish_ioend(ioend);
- } while ((ioend = next) != NULL);
-}
-
-/*
- * Cancel submission of all buffer_heads so far in this endio.
- * Toss the endio too. Only ever called for the initial page
- * in a writepage request, so only ever one page.
- */
-STATIC void
-xfs_cancel_ioend(
- xfs_ioend_t *ioend)
-{
- xfs_ioend_t *next;
- struct buffer_head *bh, *next_bh;
-
- do {
- next = ioend->io_list;
- bh = ioend->io_buffer_head;
- do {
- next_bh = bh->b_private;
- clear_buffer_async_write(bh);
- unlock_buffer(bh);
- } while ((bh = next_bh) != NULL);
-
- vn_iowake(ioend->io_vnode);
- mempool_free(ioend, xfs_ioend_pool);
- } while ((ioend = next) != NULL);
-}
-
-/*
- * Test to see if we've been building up a completion structure for
- * earlier buffers -- if so, we try to append to this ioend if we
- * can, otherwise we finish off any current ioend and start another.
- * Return true if we've finished the given ioend.
- */
-STATIC void
-xfs_add_to_ioend(
- struct inode *inode,
- struct buffer_head *bh,
- xfs_off_t offset,
- unsigned int type,
- xfs_ioend_t **result,
- int need_ioend)
-{
- xfs_ioend_t *ioend = *result;
-
- if (!ioend || need_ioend || type != ioend->io_type) {
- xfs_ioend_t *previous = *result;
-
- ioend = xfs_alloc_ioend(inode, type);
- ioend->io_offset = offset;
- ioend->io_buffer_head = bh;
- ioend->io_buffer_tail = bh;
- if (previous)
- previous->io_list = ioend;
- *result = ioend;
- } else {
- ioend->io_buffer_tail->b_private = bh;
- ioend->io_buffer_tail = bh;
- }
-
- bh->b_private = NULL;
- ioend->io_size += bh->b_size;
-}
-
-STATIC void
-xfs_map_at_offset(
- struct buffer_head *bh,
- loff_t offset,
- int block_bits,
- xfs_iomap_t *iomapp)
-{
- xfs_daddr_t bn;
- int sector_shift;
-
- ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE));
- ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY));
- ASSERT(iomapp->iomap_bn != IOMAP_DADDR_NULL);
-
- sector_shift = block_bits - BBSHIFT;
- bn = (iomapp->iomap_bn >> sector_shift) +
- ((offset - iomapp->iomap_offset) >> block_bits);
-
- ASSERT(bn || (iomapp->iomap_flags & IOMAP_REALTIME));
- ASSERT((bn << sector_shift) >= iomapp->iomap_bn);
-
- lock_buffer(bh);
- bh->b_blocknr = bn;
- bh->b_bdev = iomapp->iomap_target->bt_bdev;
- set_buffer_mapped(bh);
- clear_buffer_delay(bh);
- clear_buffer_unwritten(bh);
-}
-
-/*
- * Look for a page at index that is suitable for clustering.
- */
-STATIC unsigned int
-xfs_probe_page(
- struct page *page,
- unsigned int pg_offset,
- int mapped)
-{
- int ret = 0;
-
- if (PageWriteback(page))
- return 0;
-
- if (page->mapping && PageDirty(page)) {
- if (page_has_buffers(page)) {
- struct buffer_head *bh, *head;
-
- bh = head = page_buffers(page);
- do {
- if (!buffer_uptodate(bh))
- break;
- if (mapped != buffer_mapped(bh))
- break;
- ret += bh->b_size;
- if (ret >= pg_offset)
- break;
- } while ((bh = bh->b_this_page) != head);
- } else
- ret = mapped ? 0 : PAGE_CACHE_SIZE;
- }
-
- return ret;
-}
-
-STATIC size_t
-xfs_probe_cluster(
- struct inode *inode,
- struct page *startpage,
- struct buffer_head *bh,
- struct buffer_head *head,
- int mapped)
-{
- struct pagevec pvec;
- pgoff_t tindex, tlast, tloff;
- size_t total = 0;
- int done = 0, i;
-
- /* First sum forwards in this page */
- do {
- if (mapped != buffer_mapped(bh))
- return total;
- total += bh->b_size;
- } while ((bh = bh->b_this_page) != head);
-
- /* if we reached the end of the page, sum forwards in following pages */
- tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
- tindex = startpage->index + 1;
-
- /* Prune this back to avoid pathological behavior */
- tloff = min(tlast, startpage->index + 64);
-
- pagevec_init(&pvec, 0);
- while (!done && tindex <= tloff) {
- unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-
- if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
- break;
-
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
- size_t pg_offset, len = 0;
-
- if (tindex == tlast) {
- pg_offset =
- i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
- if (!pg_offset) {
- done = 1;
- break;
- }
- } else
- pg_offset = PAGE_CACHE_SIZE;
-
- if (page->index == tindex && !TestSetPageLocked(page)) {
- len = xfs_probe_page(page, pg_offset, mapped);
- unlock_page(page);
- }
-
- if (!len) {
- done = 1;
- break;
- }
-
- total += len;
- tindex++;
- }
-
- pagevec_release(&pvec);
- cond_resched();
- }
-
- return total;
-}
-
-/*
- * Test if a given page is suitable for writing as part of an unwritten
- * or delayed allocate extent.
- */
-STATIC int
-xfs_is_delayed_page(
- struct page *page,
- unsigned int type)
-{
- if (PageWriteback(page))
- return 0;
-
- if (page->mapping && page_has_buffers(page)) {
- struct buffer_head *bh, *head;
- int acceptable = 0;
-
- bh = head = page_buffers(page);
- do {
- if (buffer_unwritten(bh))
- acceptable = (type == IOMAP_UNWRITTEN);
- else if (buffer_delay(bh))
- acceptable = (type == IOMAP_DELAY);
- else if (buffer_mapped(bh))
- acceptable = (type == 0);
- else
- break;
- } while ((bh = bh->b_this_page) != head);
-
- if (acceptable)
- return 1;
- }
-
- return 0;
-}
-
-/*
- * Allocate & map buffers for page given the extent map. Write it out.
- * except for the original page of a writepage, this is called on
- * delalloc/unwritten pages only, for the original page it is possible
- * that the page has no mapping at all.
- */
-STATIC int
-xfs_convert_page(
- struct inode *inode,
- struct page *page,
- loff_t tindex,
- xfs_iomap_t *mp,
- xfs_ioend_t **ioendp,
- struct writeback_control *wbc,
- int startio,
- int all_bh)
-{
- struct buffer_head *bh, *head;
- xfs_off_t end_offset;
- unsigned long p_offset;
- unsigned int type;
- int bbits = inode->i_blkbits;
- int len, page_dirty;
- int count = 0, done = 0, uptodate = 1;
- xfs_off_t offset = page_offset(page);
-
- if (page->index != tindex)
- goto fail;
- if (TestSetPageLocked(page))
- goto fail;
- if (PageWriteback(page))
- goto fail_unlock_page;
- if (page->mapping != inode->i_mapping)
- goto fail_unlock_page;
- if (!xfs_is_delayed_page(page, (*ioendp)->io_type))
- goto fail_unlock_page;
-
- /*
- * page_dirty is initially a count of buffers on the page before
- * EOF and is decrememted as we move each into a cleanable state.
- *
- * Derivation:
- *
- * End offset is the highest offset that this page should represent.
- * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
- * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
- * hence give us the correct page_dirty count. On any other page,
- * it will be zero and in that case we need page_dirty to be the
- * count of buffers on the page.
- */
- end_offset = min_t(unsigned long long,
- (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
- i_size_read(inode));
-
- len = 1 << inode->i_blkbits;
- p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
- PAGE_CACHE_SIZE);
- p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
- page_dirty = p_offset / len;
-
- bh = head = page_buffers(page);
- do {
- if (offset >= end_offset)
- break;
- if (!buffer_uptodate(bh))
- uptodate = 0;
- if (!(PageUptodate(page) || buffer_uptodate(bh))) {
- done = 1;
- continue;
- }
-
- if (buffer_unwritten(bh) || buffer_delay(bh)) {
- if (buffer_unwritten(bh))
- type = IOMAP_UNWRITTEN;
- else
- type = IOMAP_DELAY;
-
- if (!xfs_iomap_valid(mp, offset)) {
- done = 1;
- continue;
- }
-
- ASSERT(!(mp->iomap_flags & IOMAP_HOLE));
- ASSERT(!(mp->iomap_flags & IOMAP_DELAY));
-
- xfs_map_at_offset(bh, offset, bbits, mp);
- if (startio) {
- xfs_add_to_ioend(inode, bh, offset,
- type, ioendp, done);
- } else {
- set_buffer_dirty(bh);
- unlock_buffer(bh);
- mark_buffer_dirty(bh);
- }
- page_dirty--;
- count++;
- } else {
- type = 0;
- if (buffer_mapped(bh) && all_bh && startio) {
- lock_buffer(bh);
- xfs_add_to_ioend(inode, bh, offset,
- type, ioendp, done);
- count++;
- page_dirty--;
- } else {
- done = 1;
- }
- }
- } while (offset += len, (bh = bh->b_this_page) != head);
-
- if (uptodate && bh == head)
- SetPageUptodate(page);
-
- if (startio) {
- if (count) {
- struct backing_dev_info *bdi;
-
- bdi = inode->i_mapping->backing_dev_info;
- wbc->nr_to_write--;
- if (bdi_write_congested(bdi)) {
- wbc->encountered_congestion = 1;
- done = 1;
- } else if (wbc->nr_to_write <= 0) {
- done = 1;
- }
- }
- xfs_start_page_writeback(page, wbc, !page_dirty, count);
- }
-
- return done;
- fail_unlock_page:
- unlock_page(page);
- fail:
- return 1;
-}
-
-/*
- * Convert & write out a cluster of pages in the same extent as defined
- * by mp and following the start page.
- */
-STATIC void
-xfs_cluster_write(
- struct inode *inode,
- pgoff_t tindex,
- xfs_iomap_t *iomapp,
- xfs_ioend_t **ioendp,
- struct writeback_control *wbc,
- int startio,
- int all_bh,
- pgoff_t tlast)
-{
- struct pagevec pvec;
- int done = 0, i;
-
- pagevec_init(&pvec, 0);
- while (!done && tindex <= tlast) {
- unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-
- if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
- break;
-
- for (i = 0; i < pagevec_count(&pvec); i++) {
- done = xfs_convert_page(inode, pvec.pages[i], tindex++,
- iomapp, ioendp, wbc, startio, all_bh);
- if (done)
- break;
- }
-
- pagevec_release(&pvec);
- cond_resched();
- }
-}
-
-/*
- * Calling this without startio set means we are being asked to make a dirty
- * page ready for freeing it's buffers. When called with startio set then
- * we are coming from writepage.
- *
- * When called with startio set it is important that we write the WHOLE
- * page if possible.
- * The bh->b_state's cannot know if any of the blocks or which block for
- * that matter are dirty due to mmap writes, and therefore bh uptodate is
- * only vaild if the page itself isn't completely uptodate. Some layers
- * may clear the page dirty flag prior to calling write page, under the
- * assumption the entire page will be written out; by not writing out the
- * whole page the page can be reused before all valid dirty data is
- * written out. Note: in the case of a page that has been dirty'd by
- * mapwrite and but partially setup by block_prepare_write the
- * bh->b_states's will not agree and only ones setup by BPW/BCW will have
- * valid state, thus the whole page must be written out thing.
- */
-
-STATIC int
-xfs_page_state_convert(
- struct inode *inode,
- struct page *page,
- struct writeback_control *wbc,
- int startio,
- int unmapped) /* also implies page uptodate */
-{
- struct buffer_head *bh, *head;
- xfs_iomap_t iomap;
- xfs_ioend_t *ioend = NULL, *iohead = NULL;
- loff_t offset;
- unsigned long p_offset = 0;
- unsigned int type;
- __uint64_t end_offset;
- pgoff_t end_index, last_index, tlast;
- ssize_t size, len;
- int flags, err, iomap_valid = 0, uptodate = 1;
- int page_dirty, count = 0, trylock_flag = 0;
- int all_bh = unmapped;
-
- /* wait for other IO threads? */
- if (startio && (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking))
- trylock_flag |= BMAPI_TRYLOCK;
-
- /* Is this page beyond the end of the file? */
- offset = i_size_read(inode);
- end_index = offset >> PAGE_CACHE_SHIFT;
- last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
- if (page->index >= end_index) {
- if ((page->index >= end_index + 1) ||
- !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
- if (startio)
- unlock_page(page);
- return 0;
- }
- }
-
- /*
- * page_dirty is initially a count of buffers on the page before
- * EOF and is decrememted as we move each into a cleanable state.
- *
- * Derivation:
- *
- * End offset is the highest offset that this page should represent.
- * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
- * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
- * hence give us the correct page_dirty count. On any other page,
- * it will be zero and in that case we need page_dirty to be the
- * count of buffers on the page.
- */
- end_offset = min_t(unsigned long long,
- (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset);
- len = 1 << inode->i_blkbits;
- p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
- PAGE_CACHE_SIZE);
- p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
- page_dirty = p_offset / len;
-
- bh = head = page_buffers(page);
- offset = page_offset(page);
- flags = -1;
- type = 0;
-
- /* TODO: cleanup count and page_dirty */
-
- do {
- if (offset >= end_offset)
- break;
- if (!buffer_uptodate(bh))
- uptodate = 0;
- if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) {
- /*
- * the iomap is actually still valid, but the ioend
- * isn't. shouldn't happen too often.
- */
- iomap_valid = 0;
- continue;
- }
-
- if (iomap_valid)
- iomap_valid = xfs_iomap_valid(&iomap, offset);
-
- /*
- * First case, map an unwritten extent and prepare for
- * extent state conversion transaction on completion.
- *
- * Second case, allocate space for a delalloc buffer.
- * We can return EAGAIN here in the release page case.
- *
- * Third case, an unmapped buffer was found, and we are
- * in a path where we need to write the whole page out.
- */
- if (buffer_unwritten(bh) || buffer_delay(bh) ||
- ((buffer_uptodate(bh) || PageUptodate(page)) &&
- !buffer_mapped(bh) && (unmapped || startio))) {
- /*
- * Make sure we don't use a read-only iomap
- */
- if (flags == BMAPI_READ)
- iomap_valid = 0;
-
- if (buffer_unwritten(bh)) {
- type = IOMAP_UNWRITTEN;
- flags = BMAPI_WRITE|BMAPI_IGNSTATE;
- } else if (buffer_delay(bh)) {
- type = IOMAP_DELAY;
- flags = BMAPI_ALLOCATE;
- if (!startio)
- flags |= trylock_flag;
- } else {
- type = IOMAP_NEW;
- flags = BMAPI_WRITE|BMAPI_MMAP;
- }
-
- if (!iomap_valid) {
- if (type == IOMAP_NEW) {
- size = xfs_probe_cluster(inode,
- page, bh, head, 0);
- } else {
- size = len;
- }
-
- err = xfs_map_blocks(inode, offset, size,
- &iomap, flags);
- if (err)
- goto error;
- iomap_valid = xfs_iomap_valid(&iomap, offset);
- }
- if (iomap_valid) {
- xfs_map_at_offset(bh, offset,
- inode->i_blkbits, &iomap);
- if (startio) {
- xfs_add_to_ioend(inode, bh, offset,
- type, &ioend,
- !iomap_valid);
- } else {
- set_buffer_dirty(bh);
- unlock_buffer(bh);
- mark_buffer_dirty(bh);
- }
- page_dirty--;
- count++;
- }
- } else if (buffer_uptodate(bh) && startio) {
- /*
- * we got here because the buffer is already mapped.
- * That means it must already have extents allocated
- * underneath it. Map the extent by reading it.
- */
- if (!iomap_valid || type != 0) {
- flags = BMAPI_READ;
- size = xfs_probe_cluster(inode, page, bh,
- head, 1);
- err = xfs_map_blocks(inode, offset, size,
- &iomap, flags);
- if (err)
- goto error;
- iomap_valid = xfs_iomap_valid(&iomap, offset);
- }
-
- type = 0;
- if (!test_and_set_bit(BH_Lock, &bh->b_state)) {
- ASSERT(buffer_mapped(bh));
- if (iomap_valid)
- all_bh = 1;
- xfs_add_to_ioend(inode, bh, offset, type,
- &ioend, !iomap_valid);
- page_dirty--;
- count++;
- } else {
- iomap_valid = 0;
- }
- } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
- (unmapped || startio)) {
- iomap_valid = 0;
- }
-
- if (!iohead)
- iohead = ioend;
-
- } while (offset += len, ((bh = bh->b_this_page) != head));
-
- if (uptodate && bh == head)
- SetPageUptodate(page);
-
- if (startio)
- xfs_start_page_writeback(page, wbc, 1, count);
-
- if (ioend && iomap_valid) {
- offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >>
- PAGE_CACHE_SHIFT;
- tlast = min_t(pgoff_t, offset, last_index);
- xfs_cluster_write(inode, page->index + 1, &iomap, &ioend,
- wbc, startio, all_bh, tlast);
- }
-
- if (iohead)
- xfs_submit_ioend(iohead);
-
- return page_dirty;
-
-error:
- if (iohead)
- xfs_cancel_ioend(iohead);
-
- /*
- * If it's delalloc and we have nowhere to put it,
- * throw it away, unless the lower layers told
- * us to try again.
- */
- if (err != -EAGAIN) {
- if (!unmapped)
- block_invalidatepage(page, 0);
- ClearPageUptodate(page);
- }
- return err;
-}
-
-STATIC int
-__linvfs_get_block(
- struct inode *inode,
- sector_t iblock,
- unsigned long blocks,
- struct buffer_head *bh_result,
- int create,
- int direct,
- bmapi_flags_t flags)
-{
- vnode_t *vp = LINVFS_GET_VP(inode);
- xfs_iomap_t iomap;
- xfs_off_t offset;
- ssize_t size;
- int retpbbm = 1;
- int error;
-
- offset = (xfs_off_t)iblock << inode->i_blkbits;
- if (blocks)
- size = (ssize_t) min_t(xfs_off_t, LONG_MAX,
- (xfs_off_t)blocks << inode->i_blkbits);
- else
- size = 1 << inode->i_blkbits;
-
- VOP_BMAP(vp, offset, size,
- create ? flags : BMAPI_READ, &iomap, &retpbbm, error);
- if (error)
- return -error;
-
- if (retpbbm == 0)
- return 0;
-
- if (iomap.iomap_bn != IOMAP_DADDR_NULL) {
- xfs_daddr_t bn;
- xfs_off_t delta;
-
- /* For unwritten extents do not report a disk address on
- * the read case (treat as if we're reading into a hole).
- */
- if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) {
- delta = offset - iomap.iomap_offset;
- delta >>= inode->i_blkbits;
-
- bn = iomap.iomap_bn >> (inode->i_blkbits - BBSHIFT);
- bn += delta;
- BUG_ON(!bn && !(iomap.iomap_flags & IOMAP_REALTIME));
- bh_result->b_blocknr = bn;
- set_buffer_mapped(bh_result);
- }
- if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
- if (direct)
- bh_result->b_private = inode;
- set_buffer_unwritten(bh_result);
- set_buffer_delay(bh_result);
- }
- }
-
- /* If this is a realtime file, data might be on a new device */
- bh_result->b_bdev = iomap.iomap_target->bt_bdev;
-
- /* If we previously allocated a block out beyond eof and
- * we are now coming back to use it then we will need to
- * flag it as new even if it has a disk address.
- */
- if (create &&
- ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
- (offset >= i_size_read(inode)) || (iomap.iomap_flags & IOMAP_NEW)))
- set_buffer_new(bh_result);
-
- if (iomap.iomap_flags & IOMAP_DELAY) {
- BUG_ON(direct);
- if (create) {
- set_buffer_uptodate(bh_result);
- set_buffer_mapped(bh_result);
- set_buffer_delay(bh_result);
- }
- }
-
- if (blocks) {
- ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0);
- offset = min_t(xfs_off_t,
- iomap.iomap_bsize - iomap.iomap_delta,
- (xfs_off_t)blocks << inode->i_blkbits);
- bh_result->b_size = (u32) min_t(xfs_off_t, UINT_MAX, offset);
- }
-
- return 0;
-}
-
-int
-linvfs_get_block(
- struct inode *inode,
- sector_t iblock,
- struct buffer_head *bh_result,
- int create)
-{
- return __linvfs_get_block(inode, iblock, 0, bh_result,
- create, 0, BMAPI_WRITE);
-}
-
-STATIC int
-linvfs_get_blocks_direct(
- struct inode *inode,
- sector_t iblock,
- unsigned long max_blocks,
- struct buffer_head *bh_result,
- int create)
-{
- return __linvfs_get_block(inode, iblock, max_blocks, bh_result,
- create, 1, BMAPI_WRITE|BMAPI_DIRECT);
-}
-
-STATIC void
-linvfs_end_io_direct(
- struct kiocb *iocb,
- loff_t offset,
- ssize_t size,
- void *private)
-{
- xfs_ioend_t *ioend = iocb->private;
-
- /*
- * Non-NULL private data means we need to issue a transaction to
- * convert a range from unwritten to written extents. This needs
- * to happen from process contect but aio+dio I/O completion
- * happens from irq context so we need to defer it to a workqueue.
- * This is not nessecary for synchronous direct I/O, but we do
- * it anyway to keep the code uniform and simpler.
- *
- * The core direct I/O code might be changed to always call the
- * completion handler in the future, in which case all this can
- * go away.
- */
- if (private && size > 0) {
- ioend->io_offset = offset;
- ioend->io_size = size;
- xfs_finish_ioend(ioend);
- } else {
- ASSERT(size >= 0);
- xfs_destroy_ioend(ioend);
- }
-
- /*
- * blockdev_direct_IO can return an error even afer the I/O
- * completion handler was called. Thus we need to protect
- * against double-freeing.
- */
- iocb->private = NULL;
-}
-
-STATIC ssize_t
-linvfs_direct_IO(
- int rw,
- struct kiocb *iocb,
- const struct iovec *iov,
- loff_t offset,
- unsigned long nr_segs)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- vnode_t *vp = LINVFS_GET_VP(inode);
- xfs_iomap_t iomap;
- int maps = 1;
- int error;
- ssize_t ret;
-
- VOP_BMAP(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps, error);
- if (error)
- return -error;
-
- iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
-
- ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
- iomap.iomap_target->bt_bdev,
- iov, offset, nr_segs,
- linvfs_get_blocks_direct,
- linvfs_end_io_direct);
-
- if (unlikely(ret <= 0 && iocb->private))
- xfs_destroy_ioend(iocb->private);
- return ret;
-}
-
-
-STATIC sector_t
-linvfs_bmap(
- struct address_space *mapping,
- sector_t block)
-{
- struct inode *inode = (struct inode *)mapping->host;
- vnode_t *vp = LINVFS_GET_VP(inode);
- int error;
-
- vn_trace_entry(vp, "linvfs_bmap", (inst_t *)__return_address);
-
- VOP_RWLOCK(vp, VRWLOCK_READ);
- VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
- VOP_RWUNLOCK(vp, VRWLOCK_READ);
- return generic_block_bmap(mapping, block, linvfs_get_block);
-}
-
-STATIC int
-linvfs_readpage(
- struct file *unused,
- struct page *page)
-{
- return mpage_readpage(page, linvfs_get_block);
-}
-
-STATIC int
-linvfs_readpages(
- struct file *unused,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned nr_pages)
-{
- return mpage_readpages(mapping, pages, nr_pages, linvfs_get_block);
-}
-
-STATIC void
-xfs_count_page_state(
- struct page *page,
- int *delalloc,
- int *unmapped,
- int *unwritten)
-{
- struct buffer_head *bh, *head;
-
- *delalloc = *unmapped = *unwritten = 0;
-
- bh = head = page_buffers(page);
- do {
- if (buffer_uptodate(bh) && !buffer_mapped(bh))
- (*unmapped) = 1;
- else if (buffer_unwritten(bh) && !buffer_delay(bh))
- clear_buffer_unwritten(bh);
- else if (buffer_unwritten(bh))
- (*unwritten) = 1;
- else if (buffer_delay(bh))
- (*delalloc) = 1;
- } while ((bh = bh->b_this_page) != head);
-}
-
-
-/*
- * writepage: Called from one of two places:
- *
- * 1. we are flushing a delalloc buffer head.
- *
- * 2. we are writing out a dirty page. Typically the page dirty
- * state is cleared before we get here. In this case is it
- * conceivable we have no buffer heads.
- *
- * For delalloc space on the page we need to allocate space and
- * flush it. For unmapped buffer heads on the page we should
- * allocate space if the page is uptodate. For any other dirty
- * buffer heads on the page we should flush them.
- *
- * If we detect that a transaction would be required to flush
- * the page, we have to check the process flags first, if we
- * are already in a transaction or disk I/O during allocations
- * is off, we need to fail the writepage and redirty the page.
- */
-
-STATIC int
-linvfs_writepage(
- struct page *page,
- struct writeback_control *wbc)
-{
- int error;
- int need_trans;
- int delalloc, unmapped, unwritten;
- struct inode *inode = page->mapping->host;
-
- xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0);
-
- /*
- * We need a transaction if:
- * 1. There are delalloc buffers on the page
- * 2. The page is uptodate and we have unmapped buffers
- * 3. The page is uptodate and we have no buffers
- * 4. There are unwritten buffers on the page
- */
-
- if (!page_has_buffers(page)) {
- unmapped = 1;
- need_trans = 1;
- } else {
- xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
- if (!PageUptodate(page))
- unmapped = 0;
- need_trans = delalloc + unmapped + unwritten;
- }
-
- /*
- * If we need a transaction and the process flags say
- * we are already in a transaction, or no IO is allowed
- * then mark the page dirty again and leave the page
- * as is.
- */
- if (PFLAGS_TEST_FSTRANS() && need_trans)
- goto out_fail;
-
- /*
- * Delay hooking up buffer heads until we have
- * made our go/no-go decision.
- */
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << inode->i_blkbits, 0);
-
- /*
- * Convert delayed allocate, unwritten or unmapped space
- * to real space and flush out to disk.
- */
- error = xfs_page_state_convert(inode, page, wbc, 1, unmapped);
- if (error == -EAGAIN)
- goto out_fail;
- if (unlikely(error < 0))
- goto out_unlock;
-
- return 0;
-
-out_fail:
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
-out_unlock:
- unlock_page(page);
- return error;
-}
-
-STATIC int
-linvfs_invalidate_page(
- struct page *page,
- unsigned long offset)
-{
- xfs_page_trace(XFS_INVALIDPAGE_ENTER,
- page->mapping->host, page, offset);
- return block_invalidatepage(page, offset);
-}
-
-/*
- * Called to move a page into cleanable state - and from there
- * to be released. Possibly the page is already clean. We always
- * have buffer heads in this call.
- *
- * Returns 0 if the page is ok to release, 1 otherwise.
- *
- * Possible scenarios are:
- *
- * 1. We are being called to release a page which has been written
- * to via regular I/O. buffer heads will be dirty and possibly
- * delalloc. If no delalloc buffer heads in this case then we
- * can just return zero.
- *
- * 2. We are called to release a page which has been written via
- * mmap, all we need to do is ensure there is no delalloc
- * state in the buffer heads, if not we can let the caller
- * free them and we should come back later via writepage.
- */
-STATIC int
-linvfs_release_page(
- struct page *page,
- gfp_t gfp_mask)
-{
- struct inode *inode = page->mapping->host;
- int dirty, delalloc, unmapped, unwritten;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 1,
- };
-
- xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, gfp_mask);
-
- xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
- if (!delalloc && !unwritten)
- goto free_buffers;
-
- if (!(gfp_mask & __GFP_FS))
- return 0;
-
- /* If we are already inside a transaction or the thread cannot
- * do I/O, we cannot release this page.
- */
- if (PFLAGS_TEST_FSTRANS())
- return 0;
-
- /*
- * Convert delalloc space to real space, do not flush the
- * data out to disk, that will be done by the caller.
- * Never need to allocate space here - we will always
- * come back to writepage in that case.
- */
- dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0);
- if (dirty == 0 && !unwritten)
- goto free_buffers;
- return 0;
-
-free_buffers:
- return try_to_free_buffers(page);
-}
-
-STATIC int
-linvfs_prepare_write(
- struct file *file,
- struct page *page,
- unsigned int from,
- unsigned int to)
-{
- return block_prepare_write(page, from, to, linvfs_get_block);
-}
-
-struct address_space_operations linvfs_aops = {
- .readpage = linvfs_readpage,
- .readpages = linvfs_readpages,
- .writepage = linvfs_writepage,
- .sync_page = block_sync_page,
- .releasepage = linvfs_release_page,
- .invalidatepage = linvfs_invalidate_page,
- .prepare_write = linvfs_prepare_write,
- .commit_write = generic_commit_write,
- .bmap = linvfs_bmap,
- .direct_IO = linvfs_direct_IO,
- .migratepage = buffer_migrate_page,
-};
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
deleted file mode 100644
index bfb4f2917bb..00000000000
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ /dev/null
@@ -1,1855 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include <linux/stddef.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/init.h>
-#include <linux/vmalloc.h>
-#include <linux/bio.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/workqueue.h>
-#include <linux/percpu.h>
-#include <linux/blkdev.h>
-#include <linux/hash.h>
-#include <linux/kthread.h>
-#include "xfs_linux.h"
-
-STATIC kmem_zone_t *xfs_buf_zone;
-STATIC kmem_shaker_t xfs_buf_shake;
-STATIC int xfsbufd(void *);
-STATIC int xfsbufd_wakeup(int, gfp_t);
-STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
-
-STATIC struct workqueue_struct *xfslogd_workqueue;
-struct workqueue_struct *xfsdatad_workqueue;
-
-#ifdef XFS_BUF_TRACE
-void
-xfs_buf_trace(
- xfs_buf_t *bp,
- char *id,
- void *data,
- void *ra)
-{
- ktrace_enter(xfs_buf_trace_buf,
- bp, id,
- (void *)(unsigned long)bp->b_flags,
- (void *)(unsigned long)bp->b_hold.counter,
- (void *)(unsigned long)bp->b_sema.count.counter,
- (void *)current,
- data, ra,
- (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff),
- (void *)(unsigned long)(bp->b_file_offset & 0xffffffff),
- (void *)(unsigned long)bp->b_buffer_length,
- NULL, NULL, NULL, NULL, NULL);
-}
-ktrace_t *xfs_buf_trace_buf;
-#define XFS_BUF_TRACE_SIZE 4096
-#define XB_TRACE(bp, id, data) \
- xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0))
-#else
-#define XB_TRACE(bp, id, data) do { } while (0)
-#endif
-
-#ifdef XFS_BUF_LOCK_TRACKING
-# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
-# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1)
-# define XB_GET_OWNER(bp) ((bp)->b_last_holder)
-#else
-# define XB_SET_OWNER(bp) do { } while (0)
-# define XB_CLEAR_OWNER(bp) do { } while (0)
-# define XB_GET_OWNER(bp) do { } while (0)
-#endif
-
-#define xb_to_gfp(flags) \
- ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
- ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)
-
-#define xb_to_km(flags) \
- (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
-
-#define xfs_buf_allocate(flags) \
- kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags))
-#define xfs_buf_deallocate(bp) \
- kmem_zone_free(xfs_buf_zone, (bp));
-
-/*
- * Page Region interfaces.
- *
- * For pages in filesystems where the blocksize is smaller than the
- * pagesize, we use the page->private field (long) to hold a bitmap
- * of uptodate regions within the page.
- *
- * Each such region is "bytes per page / bits per long" bytes long.
- *
- * NBPPR == number-of-bytes-per-page-region
- * BTOPR == bytes-to-page-region (rounded up)
- * BTOPRT == bytes-to-page-region-truncated (rounded down)
- */
-#if (BITS_PER_LONG == 32)
-#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */
-#elif (BITS_PER_LONG == 64)
-#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
-#else
-#error BITS_PER_LONG must be 32 or 64
-#endif
-#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
-#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
-#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
-
-STATIC unsigned long
-page_region_mask(
- size_t offset,
- size_t length)
-{
- unsigned long mask;
- int first, final;
-
- first = BTOPR(offset);
- final = BTOPRT(offset + length - 1);
- first = min(first, final);
-
- mask = ~0UL;
- mask <<= BITS_PER_LONG - (final - first);
- mask >>= BITS_PER_LONG - (final);
-
- ASSERT(offset + length <= PAGE_CACHE_SIZE);
- ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
-
- return mask;
-}
-
-STATIC inline void
-set_page_region(
- struct page *page,
- size_t offset,
- size_t length)
-{
- set_page_private(page,
- page_private(page) | page_region_mask(offset, length));
- if (page_private(page) == ~0UL)
- SetPageUptodate(page);
-}
-
-STATIC inline int
-test_page_region(
- struct page *page,
- size_t offset,
- size_t length)
-{
- unsigned long mask = page_region_mask(offset, length);
-
- return (mask && (page_private(page) & mask) == mask);
-}
-
-/*
- * Mapping of multi-page buffers into contiguous virtual space
- */
-
-typedef struct a_list {
- void *vm_addr;
- struct a_list *next;
-} a_list_t;
-
-STATIC a_list_t *as_free_head;
-STATIC int as_list_len;
-STATIC DEFINE_SPINLOCK(as_lock);
-
-/*
- * Try to batch vunmaps because they are costly.
- */
-STATIC void
-free_address(
- void *addr)
-{
- a_list_t *aentry;
-
- aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);
- if (likely(aentry)) {
- spin_lock(&as_lock);
- aentry->next = as_free_head;
- aentry->vm_addr = addr;
- as_free_head = aentry;
- as_list_len++;
- spin_unlock(&as_lock);
- } else {
- vunmap(addr);
- }
-}
-
-STATIC void
-purge_addresses(void)
-{
- a_list_t *aentry, *old;
-
- if (as_free_head == NULL)
- return;
-
- spin_lock(&as_lock);
- aentry = as_free_head;
- as_free_head = NULL;
- as_list_len = 0;
- spin_unlock(&as_lock);
-
- while ((old = aentry) != NULL) {
- vunmap(aentry->vm_addr);
- aentry = aentry->next;
- kfree(old);
- }
-}
-
-/*
- * Internal xfs_buf_t object manipulation
- */
-
-STATIC void
-_xfs_buf_initialize(
- xfs_buf_t *bp,
- xfs_buftarg_t *target,
- xfs_off_t range_base,
- size_t range_length,
- xfs_buf_flags_t flags)
-{
- /*
- * We don't want certain flags to appear in b_flags.
- */
- flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);
-
- memset(bp, 0, sizeof(xfs_buf_t));
- atomic_set(&bp->b_hold, 1);
- init_MUTEX_LOCKED(&bp->b_iodonesema);
- INIT_LIST_HEAD(&bp->b_list);
- INIT_LIST_HEAD(&bp->b_hash_list);
- init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
- XB_SET_OWNER(bp);
- bp->b_target = target;
- bp->b_file_offset = range_base;
- /*
- * Set buffer_length and count_desired to the same value initially.
- * I/O routines should use count_desired, which will be the same in
- * most cases but may be reset (e.g. XFS recovery).
- */
- bp->b_buffer_length = bp->b_count_desired = range_length;
- bp->b_flags = flags;
- bp->b_bn = XFS_BUF_DADDR_NULL;
- atomic_set(&bp->b_pin_count, 0);
- init_waitqueue_head(&bp->b_waiters);
-
- XFS_STATS_INC(xb_create);
- XB_TRACE(bp, "initialize", target);
-}
-
-/*
- * Allocate a page array capable of holding a specified number
- * of pages, and point the page buf at it.
- */
-STATIC int
-_xfs_buf_get_pages(
- xfs_buf_t *bp,
- int page_count,
- xfs_buf_flags_t flags)
-{
- /* Make sure that we have a page list */
- if (bp->b_pages == NULL) {
- bp->b_offset = xfs_buf_poff(bp->b_file_offset);
- bp->b_page_count = page_count;
- if (page_count <= XB_PAGES) {
- bp->b_pages = bp->b_page_array;
- } else {
- bp->b_pages = kmem_alloc(sizeof(struct page *) *
- page_count, xb_to_km(flags));
- if (bp->b_pages == NULL)
- return -ENOMEM;
- }
- memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
- }
- return 0;
-}
-
-/*
- * Frees b_pages if it was allocated.
- */
-STATIC void
-_xfs_buf_free_pages(
- xfs_buf_t *bp)
-{
- if (bp->b_pages != bp->b_page_array) {
- kmem_free(bp->b_pages,
- bp->b_page_count * sizeof(struct page *));
- }
-}
-
-/*
- * Releases the specified buffer.
- *
- * The modification state of any associated pages is left unchanged.
- * The buffer most not be on any hash - use xfs_buf_rele instead for
- * hashed and refcounted buffers
- */
-void
-xfs_buf_free(
- xfs_buf_t *bp)
-{
- XB_TRACE(bp, "free", 0);
-
- ASSERT(list_empty(&bp->b_hash_list));
-
- if (bp->b_flags & _XBF_PAGE_CACHE) {
- uint i;
-
- if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
- free_address(bp->b_addr - bp->b_offset);
-
- for (i = 0; i < bp->b_page_count; i++)
- page_cache_release(bp->b_pages[i]);
- _xfs_buf_free_pages(bp);
- } else if (bp->b_flags & _XBF_KMEM_ALLOC) {
- /*
- * XXX(hch): bp->b_count_desired might be incorrect (see
- * xfs_buf_associate_memory for details), but fortunately
- * the Linux version of kmem_free ignores the len argument..
- */
- kmem_free(bp->b_addr, bp->b_count_desired);
- _xfs_buf_free_pages(bp);
- }
-
- xfs_buf_deallocate(bp);
-}
-
-/*
- * Finds all pages for buffer in question and builds it's page list.
- */
-STATIC int
-_xfs_buf_lookup_pages(
- xfs_buf_t *bp,
- uint flags)
-{
- struct address_space *mapping = bp->b_target->bt_mapping;
- size_t blocksize = bp->b_target->bt_bsize;
- size_t size = bp->b_count_desired;
- size_t nbytes, offset;
- gfp_t gfp_mask = xb_to_gfp(flags);
- unsigned short page_count, i;
- pgoff_t first;
- xfs_off_t end;
- int error;
-
- end = bp->b_file_offset + bp->b_buffer_length;
- page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
-
- error = _xfs_buf_get_pages(bp, page_count, flags);
- if (unlikely(error))
- return error;
- bp->b_flags |= _XBF_PAGE_CACHE;
-
- offset = bp->b_offset;
- first = bp->b_file_offset >> PAGE_CACHE_SHIFT;
-
- for (i = 0; i < bp->b_page_count; i++) {
- struct page *page;
- uint retries = 0;
-
- retry:
- page = find_or_create_page(mapping, first + i, gfp_mask);
- if (unlikely(page == NULL)) {
- if (flags & XBF_READ_AHEAD) {
- bp->b_page_count = i;
- for (i = 0; i < bp->b_page_count; i++)
- unlock_page(bp->b_pages[i]);
- return -ENOMEM;
- }
-
- /*
- * This could deadlock.
- *
- * But until all the XFS lowlevel code is revamped to
- * handle buffer allocation failures we can't do much.
- */
- if (!(++retries % 100))
- printk(KERN_ERR
- "XFS: possible memory allocation "
- "deadlock in %s (mode:0x%x)\n",
- __FUNCTION__, gfp_mask);
-
- XFS_STATS_INC(xb_page_retries);
- xfsbufd_wakeup(0, gfp_mask);
- blk_congestion_wait(WRITE, HZ/50);
- goto retry;
- }
-
- XFS_STATS_INC(xb_page_found);
-
- nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
- size -= nbytes;
-
- if (!PageUptodate(page)) {
- page_count--;
- if (blocksize >= PAGE_CACHE_SIZE) {
- if (flags & XBF_READ)
- bp->b_locked = 1;
- } else if (!PagePrivate(page)) {
- if (test_page_region(page, offset, nbytes))
- page_count++;
- }
- }
-
- bp->b_pages[i] = page;
- offset = 0;
- }
-
- if (!bp->b_locked) {
- for (i = 0; i < bp->b_page_count; i++)
- unlock_page(bp->b_pages[i]);
- }
-
- if (page_count == bp->b_page_count)
- bp->b_flags |= XBF_DONE;
-
- XB_TRACE(bp, "lookup_pages", (long)page_count);
- return error;
-}
-
-/*
- * Map buffer into kernel address-space if nessecary.
- */
-STATIC int
-_xfs_buf_map_pages(
- xfs_buf_t *bp,
- uint flags)
-{
- /* A single page buffer is always mappable */
- if (bp->b_page_count == 1) {
- bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
- bp->b_flags |= XBF_MAPPED;
- } else if (flags & XBF_MAPPED) {
- if (as_list_len > 64)
- purge_addresses();
- bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
- VM_MAP, PAGE_KERNEL);
- if (unlikely(bp->b_addr == NULL))
- return -ENOMEM;
- bp->b_addr += bp->b_offset;
- bp->b_flags |= XBF_MAPPED;
- }
-
- return 0;
-}
-
-/*
- * Finding and Reading Buffers
- */
-
-/*
- * Look up, and creates if absent, a lockable buffer for
- * a given range of an inode. The buffer is returned
- * locked. If other overlapping buffers exist, they are
- * released before the new buffer is created and locked,
- * which may imply that this call will block until those buffers
- * are unlocked. No I/O is implied by this call.
- */
-xfs_buf_t *
-_xfs_buf_find(
- xfs_buftarg_t *btp, /* block device target */
- xfs_off_t ioff, /* starting offset of range */
- size_t isize, /* length of range */
- xfs_buf_flags_t flags,
- xfs_buf_t *new_bp)
-{
- xfs_off_t range_base;
- size_t range_length;
- xfs_bufhash_t *hash;
- xfs_buf_t *bp, *n;
-
- range_base = (ioff << BBSHIFT);
- range_length = (isize << BBSHIFT);
-
- /* Check for IOs smaller than the sector size / not sector aligned */
- ASSERT(!(range_length < (1 << btp->bt_sshift)));
- ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
-
- hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
-
- spin_lock(&hash->bh_lock);
-
- list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
- ASSERT(btp == bp->b_target);
- if (bp->b_file_offset == range_base &&
- bp->b_buffer_length == range_length) {
- /*
- * If we look at something, bring it to the
- * front of the list for next time.
- */
- atomic_inc(&bp->b_hold);
- list_move(&bp->b_hash_list, &hash->bh_list);
- goto found;
- }
- }
-
- /* No match found */
- if (new_bp) {
- _xfs_buf_initialize(new_bp, btp, range_base,
- range_length, flags);
- new_bp->b_hash = hash;
- list_add(&new_bp->b_hash_list, &hash->bh_list);
- } else {
- XFS_STATS_INC(xb_miss_locked);
- }
-
- spin_unlock(&hash->bh_lock);
- return new_bp;
-
-found:
- spin_unlock(&hash->bh_lock);
-
- /* Attempt to get the semaphore without sleeping,
- * if this does not work then we need to drop the
- * spinlock and do a hard attempt on the semaphore.
- */
- if (down_trylock(&bp->b_sema)) {
- if (!(flags & XBF_TRYLOCK)) {
- /* wait for buffer ownership */
- XB_TRACE(bp, "get_lock", 0);
- xfs_buf_lock(bp);
- XFS_STATS_INC(xb_get_locked_waited);
- } else {
- /* We asked for a trylock and failed, no need
- * to look at file offset and length here, we
- * know that this buffer at least overlaps our
- * buffer and is locked, therefore our buffer
- * either does not exist, or is this buffer.
- */
- xfs_buf_rele(bp);
- XFS_STATS_INC(xb_busy_locked);
- return NULL;
- }
- } else {
- /* trylock worked */
- XB_SET_OWNER(bp);
- }
-
- if (bp->b_flags & XBF_STALE) {
- ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
- bp->b_flags &= XBF_MAPPED;
- }
- XB_TRACE(bp, "got_lock", 0);
- XFS_STATS_INC(xb_get_locked);
- return bp;
-}
-
-/*
- * Assembles a buffer covering the specified range.
- * Storage in memory for all portions of the buffer will be allocated,
- * although backing storage may not be.
- */
-xfs_buf_t *
-xfs_buf_get_flags(
- xfs_buftarg_t *target,/* target for buffer */
- xfs_off_t ioff, /* starting offset of range */
- size_t isize, /* length of range */
- xfs_buf_flags_t flags)
-{
- xfs_buf_t *bp, *new_bp;
- int error = 0, i;
-
- new_bp = xfs_buf_allocate(flags);
- if (unlikely(!new_bp))
- return NULL;
-
- bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
- if (bp == new_bp) {
- error = _xfs_buf_lookup_pages(bp, flags);
- if (error)
- goto no_buffer;
- } else {
- xfs_buf_deallocate(new_bp);
- if (unlikely(bp == NULL))
- return NULL;
- }
-
- for (i = 0; i < bp->b_page_count; i++)
- mark_page_accessed(bp->b_pages[i]);
-
- if (!(bp->b_flags & XBF_MAPPED)) {
- error = _xfs_buf_map_pages(bp, flags);
- if (unlikely(error)) {
- printk(KERN_WARNING "%s: failed to map pages\n",
- __FUNCTION__);
- goto no_buffer;
- }
- }
-
- XFS_STATS_INC(xb_get);
-
- /*
- * Always fill in the block number now, the mapped cases can do
- * their own overlay of this later.
- */
- bp->b_bn = ioff;
- bp->b_count_desired = bp->b_buffer_length;
-
- XB_TRACE(bp, "get", (unsigned long)flags);
- return bp;
-
- no_buffer:
- if (flags & (XBF_LOCK | XBF_TRYLOCK))
- xfs_buf_unlock(bp);
- xfs_buf_rele(bp);
- return NULL;
-}
-
-xfs_buf_t *
-xfs_buf_read_flags(
- xfs_buftarg_t *target,
- xfs_off_t ioff,
- size_t isize,
- xfs_buf_flags_t flags)
-{
- xfs_buf_t *bp;
-
- flags |= XBF_READ;
-
- bp = xfs_buf_get_flags(target, ioff, isize, flags);
- if (bp) {
- if (!XFS_BUF_ISDONE(bp)) {
- XB_TRACE(bp, "read", (unsigned long)flags);
- XFS_STATS_INC(xb_get_read);
- xfs_buf_iostart(bp, flags);
- } else if (flags & XBF_ASYNC) {
- XB_TRACE(bp, "read_async", (unsigned long)flags);
- /*
- * Read ahead call which is already satisfied,
- * drop the buffer
- */
- goto no_buffer;
- } else {
- XB_TRACE(bp, "read_done", (unsigned long)flags);
- /* We do not want read in the flags */
- bp->b_flags &= ~XBF_READ;
- }
- }
-
- return bp;
-
- no_buffer:
- if (flags & (XBF_LOCK | XBF_TRYLOCK))
- xfs_buf_unlock(bp);
- xfs_buf_rele(bp);
- return NULL;
-}
-
-/*
- * If we are not low on memory then do the readahead in a deadlock
- * safe manner.
- */
-void
-xfs_buf_readahead(
- xfs_buftarg_t *target,
- xfs_off_t ioff,
- size_t isize,
- xfs_buf_flags_t flags)
-{
- struct backing_dev_info *bdi;
-
- bdi = target->bt_mapping->backing_dev_info;
- if (bdi_read_congested(bdi))
- return;
-
- flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
- xfs_buf_read_flags(target, ioff, isize, flags);
-}
-
-xfs_buf_t *
-xfs_buf_get_empty(
- size_t len,
- xfs_buftarg_t *target)
-{
- xfs_buf_t *bp;
-
- bp = xfs_buf_allocate(0);
- if (bp)
- _xfs_buf_initialize(bp, target, 0, len, 0);
- return bp;
-}
-
-static inline struct page *
-mem_to_page(
- void *addr)
-{
- if (((unsigned long)addr < VMALLOC_START) ||
- ((unsigned long)addr >= VMALLOC_END)) {
- return virt_to_page(addr);
- } else {
- return vmalloc_to_page(addr);
- }
-}
-
-int
-xfs_buf_associate_memory(
- xfs_buf_t *bp,
- void *mem,
- size_t len)
-{
- int rval;
- int i = 0;
- size_t ptr;
- size_t end, end_cur;
- off_t offset;
- int page_count;
-
- page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
- offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
- if (offset && (len > PAGE_CACHE_SIZE))
- page_count++;
-
- /* Free any previous set of page pointers */
- if (bp->b_pages)
- _xfs_buf_free_pages(bp);
-
- bp->b_pages = NULL;
- bp->b_addr = mem;
-
- rval = _xfs_buf_get_pages(bp, page_count, 0);
- if (rval)
- return rval;
-
- bp->b_offset = offset;
- ptr = (size_t) mem & PAGE_CACHE_MASK;
- end = PAGE_CACHE_ALIGN((size_t) mem + len);
- end_cur = end;
- /* set up first page */
- bp->b_pages[0] = mem_to_page(mem);
-
- ptr += PAGE_CACHE_SIZE;
- bp->b_page_count = ++i;
- while (ptr < end) {
- bp->b_pages[i] = mem_to_page((void *)ptr);
- bp->b_page_count = ++i;
- ptr += PAGE_CACHE_SIZE;
- }
- bp->b_locked = 0;
-
- bp->b_count_desired = bp->b_buffer_length = len;
- bp->b_flags |= XBF_MAPPED;
-
- return 0;
-}
-
-xfs_buf_t *
-xfs_buf_get_noaddr(
- size_t len,
- xfs_buftarg_t *target)
-{
- size_t malloc_len = len;
- xfs_buf_t *bp;
- void *data;
- int error;
-
- bp = xfs_buf_allocate(0);
- if (unlikely(bp == NULL))
- goto fail;
- _xfs_buf_initialize(bp, target, 0, len, 0);
-
- try_again:
- data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL);
- if (unlikely(data == NULL))
- goto fail_free_buf;
-
- /* check whether alignment matches.. */
- if ((__psunsigned_t)data !=
- ((__psunsigned_t)data & ~target->bt_smask)) {
- /* .. else double the size and try again */
- kmem_free(data, malloc_len);
- malloc_len <<= 1;
- goto try_again;
- }
-
- error = xfs_buf_associate_memory(bp, data, len);
- if (error)
- goto fail_free_mem;
- bp->b_flags |= _XBF_KMEM_ALLOC;
-
- xfs_buf_unlock(bp);
-
- XB_TRACE(bp, "no_daddr", data);
- return bp;
- fail_free_mem:
- kmem_free(data, malloc_len);
- fail_free_buf:
- xfs_buf_free(bp);
- fail:
- return NULL;
-}
-
-/*
- * Increment reference count on buffer, to hold the buffer concurrently
- * with another thread which may release (free) the buffer asynchronously.
- * Must hold the buffer already to call this function.
- */
-void
-xfs_buf_hold(
- xfs_buf_t *bp)
-{
- atomic_inc(&bp->b_hold);
- XB_TRACE(bp, "hold", 0);
-}
-
-/*
- * Releases a hold on the specified buffer. If the
- * the hold count is 1, calls xfs_buf_free.
- */
-void
-xfs_buf_rele(
- xfs_buf_t *bp)
-{
- xfs_bufhash_t *hash = bp->b_hash;
-
- XB_TRACE(bp, "rele", bp->b_relse);
-
- if (unlikely(!hash)) {
- ASSERT(!bp->b_relse);
- if (atomic_dec_and_test(&bp->b_hold))
- xfs_buf_free(bp);
- return;
- }
-
- if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
- if (bp->b_relse) {
- atomic_inc(&bp->b_hold);
- spin_unlock(&hash->bh_lock);
- (*(bp->b_relse)) (bp);
- } else if (bp->b_flags & XBF_FS_MANAGED) {
- spin_unlock(&hash->bh_lock);
- } else {
- ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
- list_del_init(&bp->b_hash_list);
- spin_unlock(&hash->bh_lock);
- xfs_buf_free(bp);
- }
- } else {
- /*
- * Catch reference count leaks
- */
- ASSERT(atomic_read(&bp->b_hold) >= 0);
- }
-}
-
-
-/*
- * Mutual exclusion on buffers. Locking model:
- *
- * Buffers associated with inodes for which buffer locking
- * is not enabled are not protected by semaphores, and are
- * assumed to be exclusively owned by the caller. There is a
- * spinlock in the buffer, used by the caller when concurrent
- * access is possible.
- */
-
-/*
- * Locks a buffer object, if it is not already locked.
- * Note that this in no way locks the underlying pages, so it is only
- * useful for synchronizing concurrent use of buffer objects, not for
- * synchronizing independent access to the underlying pages.
- */
-int
-xfs_buf_cond_lock(
- xfs_buf_t *bp)
-{
- int locked;
-
- locked = down_trylock(&bp->b_sema) == 0;
- if (locked) {
- XB_SET_OWNER(bp);
- }
- XB_TRACE(bp, "cond_lock", (long)locked);
- return locked ? 0 : -EBUSY;
-}
-
-#if defined(DEBUG) || defined(XFS_BLI_TRACE)
-int
-xfs_buf_lock_value(
- xfs_buf_t *bp)
-{
- return atomic_read(&bp->b_sema.count);
-}
-#endif
-
-/*
- * Locks a buffer object.
- * Note that this in no way locks the underlying pages, so it is only
- * useful for synchronizing concurrent use of buffer objects, not for
- * synchronizing independent access to the underlying pages.
- */
-void
-xfs_buf_lock(
- xfs_buf_t *bp)
-{
- XB_TRACE(bp, "lock", 0);
- if (atomic_read(&bp->b_io_remaining))
- blk_run_address_space(bp->b_target->bt_mapping);
- down(&bp->b_sema);
- XB_SET_OWNER(bp);
- XB_TRACE(bp, "locked", 0);
-}
-
-/*
- * Releases the lock on the buffer object.
- * If the buffer is marked delwri but is not queued, do so before we
- * unlock the buffer as we need to set flags correctly. We also need to
- * take a reference for the delwri queue because the unlocker is going to
- * drop their's and they don't know we just queued it.
- */
-void
-xfs_buf_unlock(
- xfs_buf_t *bp)
-{
- if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) {
- atomic_inc(&bp->b_hold);
- bp->b_flags |= XBF_ASYNC;
- xfs_buf_delwri_queue(bp, 0);
- }
-
- XB_CLEAR_OWNER(bp);
- up(&bp->b_sema);
- XB_TRACE(bp, "unlock", 0);
-}
-
-
-/*
- * Pinning Buffer Storage in Memory
- * Ensure that no attempt to force a buffer to disk will succeed.
- */
-void
-xfs_buf_pin(
- xfs_buf_t *bp)
-{
- atomic_inc(&bp->b_pin_count);
- XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter);
-}
-
-void
-xfs_buf_unpin(
- xfs_buf_t *bp)
-{
- if (atomic_dec_and_test(&bp->b_pin_count))
- wake_up_all(&bp->b_waiters);
- XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter);
-}
-
-int
-xfs_buf_ispin(
- xfs_buf_t *bp)
-{
- return atomic_read(&bp->b_pin_count);
-}
-
-STATIC void
-xfs_buf_wait_unpin(
- xfs_buf_t *bp)
-{
- DECLARE_WAITQUEUE (wait, current);
-
- if (atomic_read(&bp->b_pin_count) == 0)
- return;
-
- add_wait_queue(&bp->b_waiters, &wait);
- for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (atomic_read(&bp->b_pin_count) == 0)
- break;
- if (atomic_read(&bp->b_io_remaining))
- blk_run_address_space(bp->b_target->bt_mapping);
- schedule();
- }
- remove_wait_queue(&bp->b_waiters, &wait);
- set_current_state(TASK_RUNNING);
-}
-
-/*
- * Buffer Utility Routines
- */
-
-STATIC void
-xfs_buf_iodone_work(
- void *v)
-{
- xfs_buf_t *bp = (xfs_buf_t *)v;
-
- if (bp->b_iodone)
- (*(bp->b_iodone))(bp);
- else if (bp->b_flags & XBF_ASYNC)
- xfs_buf_relse(bp);
-}
-
-void
-xfs_buf_ioend(
- xfs_buf_t *bp,
- int schedule)
-{
- bp->b_flags &= ~(XBF_READ | XBF_WRITE);
- if (bp->b_error == 0)
- bp->b_flags |= XBF_DONE;
-
- XB_TRACE(bp, "iodone", bp->b_iodone);
-
- if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
- if (schedule) {
- INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work, bp);
- queue_work(xfslogd_workqueue, &bp->b_iodone_work);
- } else {
- xfs_buf_iodone_work(bp);
- }
- } else {
- up(&bp->b_iodonesema);
- }
-}
-
-void
-xfs_buf_ioerror(
- xfs_buf_t *bp,
- int error)
-{
- ASSERT(error >= 0 && error <= 0xffff);
- bp->b_error = (unsigned short)error;
- XB_TRACE(bp, "ioerror", (unsigned long)error);
-}
-
-/*
- * Initiate I/O on a buffer, based on the flags supplied.
- * The b_iodone routine in the buffer supplied will only be called
- * when all of the subsidiary I/O requests, if any, have been completed.
- */
-int
-xfs_buf_iostart(
- xfs_buf_t *bp,
- xfs_buf_flags_t flags)
-{
- int status = 0;
-
- XB_TRACE(bp, "iostart", (unsigned long)flags);
-
- if (flags & XBF_DELWRI) {
- bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC);
- bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC);
- xfs_buf_delwri_queue(bp, 1);
- return status;
- }
-
- bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
- XBF_READ_AHEAD | _XBF_RUN_QUEUES);
- bp->b_flags |= flags & (XBF_READ | XBF_WRITE | XBF_ASYNC | \
- XBF_READ_AHEAD | _XBF_RUN_QUEUES);
-
- BUG_ON(bp->b_bn == XFS_BUF_DADDR_NULL);
-
- /* For writes allow an alternate strategy routine to precede
- * the actual I/O request (which may not be issued at all in
- * a shutdown situation, for example).
- */
- status = (flags & XBF_WRITE) ?
- xfs_buf_iostrategy(bp) : xfs_buf_iorequest(bp);
-
- /* Wait for I/O if we are not an async request.
- * Note: async I/O request completion will release the buffer,
- * and that can already be done by this point. So using the
- * buffer pointer from here on, after async I/O, is invalid.
- */
- if (!status && !(flags & XBF_ASYNC))
- status = xfs_buf_iowait(bp);
-
- return status;
-}
-
-STATIC __inline__ int
-_xfs_buf_iolocked(
- xfs_buf_t *bp)
-{
- ASSERT(bp->b_flags & (XBF_READ | XBF_WRITE));
- if (bp->b_flags & XBF_READ)
- return bp->b_locked;
- return 0;
-}
-
-STATIC __inline__ void
-_xfs_buf_ioend(
- xfs_buf_t *bp,
- int schedule)
-{
- if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
- bp->b_locked = 0;
- xfs_buf_ioend(bp, schedule);
- }
-}
-
-STATIC int
-xfs_buf_bio_end_io(
- struct bio *bio,
- unsigned int bytes_done,
- int error)
-{
- xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
- unsigned int blocksize = bp->b_target->bt_bsize;
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-
- if (bio->bi_size)
- return 1;
-
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- bp->b_error = EIO;
-
- do {
- struct page *page = bvec->bv_page;
-
- if (unlikely(bp->b_error)) {
- if (bp->b_flags & XBF_READ)
- ClearPageUptodate(page);
- SetPageError(page);
- } else if (blocksize >= PAGE_CACHE_SIZE) {
- SetPageUptodate(page);
- } else if (!PagePrivate(page) &&
- (bp->b_flags & _XBF_PAGE_CACHE)) {
- set_page_region(page, bvec->bv_offset, bvec->bv_len);
- }
-
- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
-
- if (_xfs_buf_iolocked(bp)) {
- unlock_page(page);
- }
- } while (bvec >= bio->bi_io_vec);
-
- _xfs_buf_ioend(bp, 1);
- bio_put(bio);
- return 0;
-}
-
-STATIC void
-_xfs_buf_ioapply(
- xfs_buf_t *bp)
-{
- int i, rw, map_i, total_nr_pages, nr_pages;
- struct bio *bio;
- int offset = bp->b_offset;
- int size = bp->b_count_desired;
- sector_t sector = bp->b_bn;
- unsigned int blocksize = bp->b_target->bt_bsize;
- int locking = _xfs_buf_iolocked(bp);
-
- total_nr_pages = bp->b_page_count;
- map_i = 0;
-
- if (bp->b_flags & _XBF_RUN_QUEUES) {
- bp->b_flags &= ~_XBF_RUN_QUEUES;
- rw = (bp->b_flags & XBF_READ) ? READ_SYNC : WRITE_SYNC;
- } else {
- rw = (bp->b_flags & XBF_READ) ? READ : WRITE;
- }
-
- if (bp->b_flags & XBF_ORDERED) {
- ASSERT(!(bp->b_flags & XBF_READ));
- rw = WRITE_BARRIER;
- }
-
- /* Special code path for reading a sub page size buffer in --
- * we populate up the whole page, and hence the other metadata
- * in the same page. This optimization is only valid when the
- * filesystem block size is not smaller than the page size.
- */
- if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
- (bp->b_flags & XBF_READ) && locking &&
- (blocksize >= PAGE_CACHE_SIZE)) {
- bio = bio_alloc(GFP_NOIO, 1);
-
- bio->bi_bdev = bp->b_target->bt_bdev;
- bio->bi_sector = sector - (offset >> BBSHIFT);
- bio->bi_end_io = xfs_buf_bio_end_io;
- bio->bi_private = bp;
-
- bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0);
- size = 0;
-
- atomic_inc(&bp->b_io_remaining);
-
- goto submit_io;
- }
-
- /* Lock down the pages which we need to for the request */
- if (locking && (bp->b_flags & XBF_WRITE) && (bp->b_locked == 0)) {
- for (i = 0; size; i++) {
- int nbytes = PAGE_CACHE_SIZE - offset;
- struct page *page = bp->b_pages[i];
-
- if (nbytes > size)
- nbytes = size;
-
- lock_page(page);
-
- size -= nbytes;
- offset = 0;
- }
- offset = bp->b_offset;
- size = bp->b_count_desired;
- }
-
-next_chunk:
- atomic_inc(&bp->b_io_remaining);
- nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
- if (nr_pages > total_nr_pages)
- nr_pages = total_nr_pages;
-
- bio = bio_alloc(GFP_NOIO, nr_pages);
- bio->bi_bdev = bp->b_target->bt_bdev;
- bio->bi_sector = sector;
- bio->bi_end_io = xfs_buf_bio_end_io;
- bio->bi_private = bp;
-
- for (; size && nr_pages; nr_pages--, map_i++) {
- int rbytes, nbytes = PAGE_CACHE_SIZE - offset;
-
- if (nbytes > size)
- nbytes = size;
-
- rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
- if (rbytes < nbytes)
- break;
-
- offset = 0;
- sector += nbytes >> BBSHIFT;
- size -= nbytes;
- total_nr_pages--;
- }
-
-submit_io:
- if (likely(bio->bi_size)) {
- submit_bio(rw, bio);
- if (size)
- goto next_chunk;
- } else {
- bio_put(bio);
- xfs_buf_ioerror(bp, EIO);
- }
-}
-
-int
-xfs_buf_iorequest(
- xfs_buf_t *bp)
-{
- XB_TRACE(bp, "iorequest", 0);
-
- if (bp->b_flags & XBF_DELWRI) {
- xfs_buf_delwri_queue(bp, 1);
- return 0;
- }
-
- if (bp->b_flags & XBF_WRITE) {
- xfs_buf_wait_unpin(bp);
- }
-
- xfs_buf_hold(bp);
-
- /* Set the count to 1 initially, this will stop an I/O
- * completion callout which happens before we have started
- * all the I/O from calling xfs_buf_ioend too early.
- */
- atomic_set(&bp->b_io_remaining, 1);
- _xfs_buf_ioapply(bp);
- _xfs_buf_ioend(bp, 0);
-
- xfs_buf_rele(bp);
- return 0;
-}
-
-/*
- * Waits for I/O to complete on the buffer supplied.
- * It returns immediately if no I/O is pending.
- * It returns the I/O error code, if any, or 0 if there was no error.
- */
-int
-xfs_buf_iowait(
- xfs_buf_t *bp)
-{
- XB_TRACE(bp, "iowait", 0);
- if (atomic_read(&bp->b_io_remaining))
- blk_run_address_space(bp->b_target->bt_mapping);
- down(&bp->b_iodonesema);
- XB_TRACE(bp, "iowaited", (long)bp->b_error);
- return bp->b_error;
-}
-
-xfs_caddr_t
-xfs_buf_offset(
- xfs_buf_t *bp,
- size_t offset)
-{
- struct page *page;
-
- if (bp->b_flags & XBF_MAPPED)
- return XFS_BUF_PTR(bp) + offset;
-
- offset += bp->b_offset;
- page = bp->b_pages[offset >> PAGE_CACHE_SHIFT];
- return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1));
-}
-
-/*
- * Move data into or out of a buffer.
- */
-void
-xfs_buf_iomove(
- xfs_buf_t *bp, /* buffer to process */
- size_t boff, /* starting buffer offset */
- size_t bsize, /* length to copy */
- caddr_t data, /* data address */
- xfs_buf_rw_t mode) /* read/write/zero flag */
-{
- size_t bend, cpoff, csize;
- struct page *page;
-
- bend = boff + bsize;
- while (boff < bend) {
- page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
- cpoff = xfs_buf_poff(boff + bp->b_offset);
- csize = min_t(size_t,
- PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff);
-
- ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
-
- switch (mode) {
- case XBRW_ZERO:
- memset(page_address(page) + cpoff, 0, csize);
- break;
- case XBRW_READ:
- memcpy(data, page_address(page) + cpoff, csize);
- break;
- case XBRW_WRITE:
- memcpy(page_address(page) + cpoff, data, csize);
- }
-
- boff += csize;
- data += csize;
- }
-}
-
-/*
- * Handling of buffer targets (buftargs).
- */
-
-/*
- * Wait for any bufs with callbacks that have been submitted but
- * have not yet returned... walk the hash list for the target.
- */
-void
-xfs_wait_buftarg(
- xfs_buftarg_t *btp)
-{
- xfs_buf_t *bp, *n;
- xfs_bufhash_t *hash;
- uint i;
-
- for (i = 0; i < (1 << btp->bt_hashshift); i++) {
- hash = &btp->bt_hash[i];
-again:
- spin_lock(&hash->bh_lock);
- list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
- ASSERT(btp == bp->b_target);
- if (!(bp->b_flags & XBF_FS_MANAGED)) {
- spin_unlock(&hash->bh_lock);
- /*
- * Catch superblock reference count leaks
- * immediately
- */
- BUG_ON(bp->b_bn == 0);
- delay(100);
- goto again;
- }
- }
- spin_unlock(&hash->bh_lock);
- }
-}
-
-/*
- * Allocate buffer hash table for a given target.
- * For devices containing metadata (i.e. not the log/realtime devices)
- * we need to allocate a much larger hash table.
- */
-STATIC void
-xfs_alloc_bufhash(
- xfs_buftarg_t *btp,
- int external)
-{
- unsigned int i;
-
- btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */
- btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
- btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
- sizeof(xfs_bufhash_t), KM_SLEEP);
- for (i = 0; i < (1 << btp->bt_hashshift); i++) {
- spin_lock_init(&btp->bt_hash[i].bh_lock);
- INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
- }
-}
-
-STATIC void
-xfs_free_bufhash(
- xfs_buftarg_t *btp)
-{
- kmem_free(btp->bt_hash, (1<<btp->bt_hashshift) * sizeof(xfs_bufhash_t));
- btp->bt_hash = NULL;
-}
-
-/*
- * buftarg list for delwrite queue processing
- */
-STATIC LIST_HEAD(xfs_buftarg_list);
-STATIC DEFINE_SPINLOCK(xfs_buftarg_lock);
-
-STATIC void
-xfs_register_buftarg(
- xfs_buftarg_t *btp)
-{
- spin_lock(&xfs_buftarg_lock);
- list_add(&btp->bt_list, &xfs_buftarg_list);
- spin_unlock(&xfs_buftarg_lock);
-}
-
-STATIC void
-xfs_unregister_buftarg(
- xfs_buftarg_t *btp)
-{
- spin_lock(&xfs_buftarg_lock);
- list_del(&btp->bt_list);
- spin_unlock(&xfs_buftarg_lock);
-}
-
-void
-xfs_free_buftarg(
- xfs_buftarg_t *btp,
- int external)
-{
- xfs_flush_buftarg(btp, 1);
- if (external)
- xfs_blkdev_put(btp->bt_bdev);
- xfs_free_bufhash(btp);
- iput(btp->bt_mapping->host);
-
- /* Unregister the buftarg first so that we don't get a
- * wakeup finding a non-existent task
- */
- xfs_unregister_buftarg(btp);
- kthread_stop(btp->bt_task);
-
- kmem_free(btp, sizeof(*btp));
-}
-
-STATIC int
-xfs_setsize_buftarg_flags(
- xfs_buftarg_t *btp,
- unsigned int blocksize,
- unsigned int sectorsize,
- int verbose)
-{
- btp->bt_bsize = blocksize;
- btp->bt_sshift = ffs(sectorsize) - 1;
- btp->bt_smask = sectorsize - 1;
-
- if (set_blocksize(btp->bt_bdev, sectorsize)) {
- printk(KERN_WARNING
- "XFS: Cannot set_blocksize to %u on device %s\n",
- sectorsize, XFS_BUFTARG_NAME(btp));
- return EINVAL;
- }
-
- if (verbose &&
- (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
- printk(KERN_WARNING
- "XFS: %u byte sectors in use on device %s. "
- "This is suboptimal; %u or greater is ideal.\n",
- sectorsize, XFS_BUFTARG_NAME(btp),
- (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
- }
-
- return 0;
-}
-
-/*
- * When allocating the initial buffer target we have not yet
- * read in the superblock, so don't know what sized sectors
- * are being used is at this early stage. Play safe.
- */
-STATIC int
-xfs_setsize_buftarg_early(
- xfs_buftarg_t *btp,
- struct block_device *bdev)
-{
- return xfs_setsize_buftarg_flags(btp,
- PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0);
-}
-
-int
-xfs_setsize_buftarg(
- xfs_buftarg_t *btp,
- unsigned int blocksize,
- unsigned int sectorsize)
-{
- return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
-}
-
-STATIC int
-xfs_mapping_buftarg(
- xfs_buftarg_t *btp,
- struct block_device *bdev)
-{
- struct backing_dev_info *bdi;
- struct inode *inode;
- struct address_space *mapping;
- static struct address_space_operations mapping_aops = {
- .sync_page = block_sync_page,
- .migratepage = fail_migrate_page,
- };
-
- inode = new_inode(bdev->bd_inode->i_sb);
- if (!inode) {
- printk(KERN_WARNING
- "XFS: Cannot allocate mapping inode for device %s\n",
- XFS_BUFTARG_NAME(btp));
- return ENOMEM;
- }
- inode->i_mode = S_IFBLK;
- inode->i_bdev = bdev;
- inode->i_rdev = bdev->bd_dev;
- bdi = blk_get_backing_dev_info(bdev);
- if (!bdi)
- bdi = &default_backing_dev_info;
- mapping = &inode->i_data;
- mapping->a_ops = &mapping_aops;
- mapping->backing_dev_info = bdi;
- mapping_set_gfp_mask(mapping, GFP_NOFS);
- btp->bt_mapping = mapping;
- return 0;
-}
-
-STATIC int
-xfs_alloc_delwrite_queue(
- xfs_buftarg_t *btp)
-{
- int error = 0;
-
- INIT_LIST_HEAD(&btp->bt_list);
- INIT_LIST_HEAD(&btp->bt_delwrite_queue);
- spinlock_init(&btp->bt_delwrite_lock, "delwri_lock");
- btp->bt_flags = 0;
- btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd");
- if (IS_ERR(btp->bt_task)) {
- error = PTR_ERR(btp->bt_task);
- goto out_error;
- }
- xfs_register_buftarg(btp);
-out_error:
- return error;
-}
-
-xfs_buftarg_t *
-xfs_alloc_buftarg(
- struct block_device *bdev,
- int external)
-{
- xfs_buftarg_t *btp;
-
- btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
-
- btp->bt_dev = bdev->bd_dev;
- btp->bt_bdev = bdev;
- if (xfs_setsize_buftarg_early(btp, bdev))
- goto error;
- if (xfs_mapping_buftarg(btp, bdev))
- goto error;
- if (xfs_alloc_delwrite_queue(btp))
- goto error;
- xfs_alloc_bufhash(btp, external);
- return btp;
-
-error:
- kmem_free(btp, sizeof(*btp));
- return NULL;
-}
-
-
-/*
- * Delayed write buffer handling
- */
-STATIC void
-xfs_buf_delwri_queue(
- xfs_buf_t *bp,
- int unlock)
-{
- struct list_head *dwq = &bp->b_target->bt_delwrite_queue;
- spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
-
- XB_TRACE(bp, "delwri_q", (long)unlock);
- ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
-
- spin_lock(dwlk);
- /* If already in the queue, dequeue and place at tail */
- if (!list_empty(&bp->b_list)) {
- ASSERT(bp->b_flags & _XBF_DELWRI_Q);
- if (unlock)
- atomic_dec(&bp->b_hold);
- list_del(&bp->b_list);
- }
-
- bp->b_flags |= _XBF_DELWRI_Q;
- list_add_tail(&bp->b_list, dwq);
- bp->b_queuetime = jiffies;
- spin_unlock(dwlk);
-
- if (unlock)
- xfs_buf_unlock(bp);
-}
-
-void
-xfs_buf_delwri_dequeue(
- xfs_buf_t *bp)
-{
- spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
- int dequeued = 0;
-
- spin_lock(dwlk);
- if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
- ASSERT(bp->b_flags & _XBF_DELWRI_Q);
- list_del_init(&bp->b_list);
- dequeued = 1;
- }
- bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
- spin_unlock(dwlk);
-
- if (dequeued)
- xfs_buf_rele(bp);
-
- XB_TRACE(bp, "delwri_dq", (long)dequeued);
-}
-
-STATIC void
-xfs_buf_runall_queues(
- struct workqueue_struct *queue)
-{
- flush_workqueue(queue);
-}
-
-STATIC int
-xfsbufd_wakeup(
- int priority,
- gfp_t mask)
-{
- xfs_buftarg_t *btp;
-
- spin_lock(&xfs_buftarg_lock);
- list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
- if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
- continue;
- set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
- wake_up_process(btp->bt_task);
- }
- spin_unlock(&xfs_buftarg_lock);
- return 0;
-}
-
-STATIC int
-xfsbufd(
- void *data)
-{
- struct list_head tmp;
- unsigned long age;
- xfs_buftarg_t *target = (xfs_buftarg_t *)data;
- xfs_buf_t *bp, *n;
- struct list_head *dwq = &target->bt_delwrite_queue;
- spinlock_t *dwlk = &target->bt_delwrite_lock;
-
- current->flags |= PF_MEMALLOC;
-
- INIT_LIST_HEAD(&tmp);
- do {
- if (unlikely(freezing(current))) {
- set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
- refrigerator();
- } else {
- clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
- }
-
- schedule_timeout_interruptible(
- xfs_buf_timer_centisecs * msecs_to_jiffies(10));
-
- age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
- spin_lock(dwlk);
- list_for_each_entry_safe(bp, n, dwq, b_list) {
- XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp));
- ASSERT(bp->b_flags & XBF_DELWRI);
-
- if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) {
- if (!test_bit(XBT_FORCE_FLUSH,
- &target->bt_flags) &&
- time_before(jiffies,
- bp->b_queuetime + age)) {
- xfs_buf_unlock(bp);
- break;
- }
-
- bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
- bp->b_flags |= XBF_WRITE;
- list_move(&bp->b_list, &tmp);
- }
- }
- spin_unlock(dwlk);
-
- while (!list_empty(&tmp)) {
- bp = list_entry(tmp.next, xfs_buf_t, b_list);
- ASSERT(target == bp->b_target);
-
- list_del_init(&bp->b_list);
- xfs_buf_iostrategy(bp);
-
- blk_run_address_space(target->bt_mapping);
- }
-
- if (as_list_len > 0)
- purge_addresses();
-
- clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
- } while (!kthread_should_stop());
-
- return 0;
-}
-
-/*
- * Go through all incore buffers, and release buffers if they belong to
- * the given device. This is used in filesystem error handling to
- * preserve the consistency of its metadata.
- */
-int
-xfs_flush_buftarg(
- xfs_buftarg_t *target,
- int wait)
-{
- struct list_head tmp;
- xfs_buf_t *bp, *n;
- int pincount = 0;
- struct list_head *dwq = &target->bt_delwrite_queue;
- spinlock_t *dwlk = &target->bt_delwrite_lock;
-
- xfs_buf_runall_queues(xfsdatad_workqueue);
- xfs_buf_runall_queues(xfslogd_workqueue);
-
- INIT_LIST_HEAD(&tmp);
- spin_lock(dwlk);
- list_for_each_entry_safe(bp, n, dwq, b_list) {
- ASSERT(bp->b_target == target);
- ASSERT(bp->b_flags & (XBF_DELWRI | _XBF_DELWRI_Q));
- XB_TRACE(bp, "walkq2", (long)xfs_buf_ispin(bp));
- if (xfs_buf_ispin(bp)) {
- pincount++;
- continue;
- }
-
- list_move(&bp->b_list, &tmp);
- }
- spin_unlock(dwlk);
-
- /*
- * Dropped the delayed write list lock, now walk the temporary list
- */
- list_for_each_entry_safe(bp, n, &tmp, b_list) {
- xfs_buf_lock(bp);
- bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
- bp->b_flags |= XBF_WRITE;
- if (wait)
- bp->b_flags &= ~XBF_ASYNC;
- else
- list_del_init(&bp->b_list);
-
- xfs_buf_iostrategy(bp);
- }
-
- /*
- * Remaining list items must be flushed before returning
- */
- while (!list_empty(&tmp)) {
- bp = list_entry(tmp.next, xfs_buf_t, b_list);
-
- list_del_init(&bp->b_list);
- xfs_iowait(bp);
- xfs_buf_relse(bp);
- }
-
- if (wait)
- blk_run_address_space(target->bt_mapping);
-
- return pincount;
-}
-
-int __init
-xfs_buf_init(void)
-{
- int error = -ENOMEM;
-
-#ifdef XFS_BUF_TRACE
- xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP);
-#endif
-
- xfs_buf_zone = kmem_zone_init(sizeof(xfs_buf_t), "xfs_buf");
- if (!xfs_buf_zone)
- goto out_free_trace_buf;
-
- xfslogd_workqueue = create_workqueue("xfslogd");
- if (!xfslogd_workqueue)
- goto out_free_buf_zone;
-
- xfsdatad_workqueue = create_workqueue("xfsdatad");
- if (!xfsdatad_workqueue)
- goto out_destroy_xfslogd_workqueue;
-
- xfs_buf_shake = kmem_shake_register(xfsbufd_wakeup);
- if (!xfs_buf_shake)
- goto out_destroy_xfsdatad_workqueue;
-
- return 0;
-
- out_destroy_xfsdatad_workqueue:
- destroy_workqueue(xfsdatad_workqueue);
- out_destroy_xfslogd_workqueue:
- destroy_workqueue(xfslogd_workqueue);
- out_free_buf_zone:
- kmem_zone_destroy(xfs_buf_zone);
- out_free_trace_buf:
-#ifdef XFS_BUF_TRACE
- ktrace_free(xfs_buf_trace_buf);
-#endif
- return error;
-}
-
-void
-xfs_buf_terminate(void)
-{
- kmem_shake_deregister(xfs_buf_shake);
- destroy_workqueue(xfsdatad_workqueue);
- destroy_workqueue(xfslogd_workqueue);
- kmem_zone_destroy(xfs_buf_zone);
-#ifdef XFS_BUF_TRACE
- ktrace_free(xfs_buf_trace_buf);
-#endif
-}
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
deleted file mode 100644
index 4dd6592d5a4..00000000000
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ /dev/null
@@ -1,429 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_BUF_H__
-#define __XFS_BUF_H__
-
-#include <linux/config.h>
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <asm/system.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/uio.h>
-
-/*
- * Base types
- */
-
-#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
-
-#define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE)
-#define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT)
-#define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT)
-#define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK)
-
-typedef enum {
- XBRW_READ = 1, /* transfer into target memory */
- XBRW_WRITE = 2, /* transfer from target memory */
- XBRW_ZERO = 3, /* Zero target memory */
-} xfs_buf_rw_t;
-
-typedef enum {
- XBF_READ = (1 << 0), /* buffer intended for reading from device */
- XBF_WRITE = (1 << 1), /* buffer intended for writing to device */
- XBF_MAPPED = (1 << 2), /* buffer mapped (b_addr valid) */
- XBF_ASYNC = (1 << 4), /* initiator will not wait for completion */
- XBF_DONE = (1 << 5), /* all pages in the buffer uptodate */
- XBF_DELWRI = (1 << 6), /* buffer has dirty pages */
- XBF_STALE = (1 << 7), /* buffer has been staled, do not find it */
- XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */
- XBF_ORDERED = (1 << 11), /* use ordered writes */
- XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */
-
- /* flags used only as arguments to access routines */
- XBF_LOCK = (1 << 14), /* lock requested */
- XBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */
- XBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */
-
- /* flags used only internally */
- _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */
- _XBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */
- _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */
- _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */
-} xfs_buf_flags_t;
-
-typedef enum {
- XBT_FORCE_SLEEP = (0 << 1),
- XBT_FORCE_FLUSH = (1 << 1),
-} xfs_buftarg_flags_t;
-
-typedef struct xfs_bufhash {
- struct list_head bh_list;
- spinlock_t bh_lock;
-} xfs_bufhash_t;
-
-typedef struct xfs_buftarg {
- dev_t bt_dev;
- struct block_device *bt_bdev;
- struct address_space *bt_mapping;
- unsigned int bt_bsize;
- unsigned int bt_sshift;
- size_t bt_smask;
-
- /* per device buffer hash table */
- uint bt_hashmask;
- uint bt_hashshift;
- xfs_bufhash_t *bt_hash;
-
- /* per device delwri queue */
- struct task_struct *bt_task;
- struct list_head bt_list;
- struct list_head bt_delwrite_queue;
- spinlock_t bt_delwrite_lock;
- unsigned long bt_flags;
-} xfs_buftarg_t;
-
-/*
- * xfs_buf_t: Buffer structure for pagecache-based buffers
- *
- * This buffer structure is used by the pagecache buffer management routines
- * to refer to an assembly of pages forming a logical buffer.
- *
- * The buffer structure is used on a temporary basis only, and discarded when
- * released. The real data storage is recorded in the pagecache. Buffers are
- * hashed to the block device on which the file system resides.
- */
-
-struct xfs_buf;
-typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
-typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
-typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
-
-#define XB_PAGES 2
-
-typedef struct xfs_buf {
- struct semaphore b_sema; /* semaphore for lockables */
- unsigned long b_queuetime; /* time buffer was queued */
- atomic_t b_pin_count; /* pin count */
- wait_queue_head_t b_waiters; /* unpin waiters */
- struct list_head b_list;
- xfs_buf_flags_t b_flags; /* status flags */
- struct list_head b_hash_list; /* hash table list */
- xfs_bufhash_t *b_hash; /* hash table list start */
- xfs_buftarg_t *b_target; /* buffer target (device) */
- atomic_t b_hold; /* reference count */
- xfs_daddr_t b_bn; /* block number for I/O */
- xfs_off_t b_file_offset; /* offset in file */
- size_t b_buffer_length;/* size of buffer in bytes */
- size_t b_count_desired;/* desired transfer size */
- void *b_addr; /* virtual address of buffer */
- struct work_struct b_iodone_work;
- atomic_t b_io_remaining; /* #outstanding I/O requests */
- xfs_buf_iodone_t b_iodone; /* I/O completion function */
- xfs_buf_relse_t b_relse; /* releasing function */
- xfs_buf_bdstrat_t b_strat; /* pre-write function */
- struct semaphore b_iodonesema; /* Semaphore for I/O waiters */
- void *b_fspriv;
- void *b_fspriv2;
- void *b_fspriv3;
- unsigned short b_error; /* error code on I/O */
- unsigned short b_locked; /* page array is locked */
- unsigned int b_page_count; /* size of page array */
- unsigned int b_offset; /* page offset in first page */
- struct page **b_pages; /* array of page pointers */
- struct page *b_page_array[XB_PAGES]; /* inline pages */
-#ifdef XFS_BUF_LOCK_TRACKING
- int b_last_holder;
-#endif
-} xfs_buf_t;
-
-
-/* Finding and Reading Buffers */
-extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t,
- xfs_buf_flags_t, xfs_buf_t *);
-#define xfs_incore(buftarg,blkno,len,lockit) \
- _xfs_buf_find(buftarg, blkno ,len, lockit, NULL)
-
-extern xfs_buf_t *xfs_buf_get_flags(xfs_buftarg_t *, xfs_off_t, size_t,
- xfs_buf_flags_t);
-#define xfs_buf_get(target, blkno, len, flags) \
- xfs_buf_get_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED)
-
-extern xfs_buf_t *xfs_buf_read_flags(xfs_buftarg_t *, xfs_off_t, size_t,
- xfs_buf_flags_t);
-#define xfs_buf_read(target, blkno, len, flags) \
- xfs_buf_read_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED)
-
-extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
-extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *);
-extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
-extern void xfs_buf_hold(xfs_buf_t *);
-extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t,
- xfs_buf_flags_t);
-
-/* Releasing Buffers */
-extern void xfs_buf_free(xfs_buf_t *);
-extern void xfs_buf_rele(xfs_buf_t *);
-
-/* Locking and Unlocking Buffers */
-extern int xfs_buf_cond_lock(xfs_buf_t *);
-extern int xfs_buf_lock_value(xfs_buf_t *);
-extern void xfs_buf_lock(xfs_buf_t *);
-extern void xfs_buf_unlock(xfs_buf_t *);
-
-/* Buffer Read and Write Routines */
-extern void xfs_buf_ioend(xfs_buf_t *, int);
-extern void xfs_buf_ioerror(xfs_buf_t *, int);
-extern int xfs_buf_iostart(xfs_buf_t *, xfs_buf_flags_t);
-extern int xfs_buf_iorequest(xfs_buf_t *);
-extern int xfs_buf_iowait(xfs_buf_t *);
-extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t,
- xfs_buf_rw_t);
-
-static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
-{
- return bp->b_strat ? bp->b_strat(bp) : xfs_buf_iorequest(bp);
-}
-
-static inline int xfs_buf_geterror(xfs_buf_t *bp)
-{
- return bp ? bp->b_error : ENOMEM;
-}
-
-/* Buffer Utility Routines */
-extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
-
-/* Pinning Buffer Storage in Memory */
-extern void xfs_buf_pin(xfs_buf_t *);
-extern void xfs_buf_unpin(xfs_buf_t *);
-extern int xfs_buf_ispin(xfs_buf_t *);
-
-/* Delayed Write Buffer Routines */
-extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
-
-/* Buffer Daemon Setup Routines */
-extern int xfs_buf_init(void);
-extern void xfs_buf_terminate(void);
-
-#ifdef XFS_BUF_TRACE
-extern ktrace_t *xfs_buf_trace_buf;
-extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
-#else
-#define xfs_buf_trace(bp,id,ptr,ra) do { } while (0)
-#endif
-
-#define xfs_buf_target_name(target) \
- ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; })
-
-
-#define XFS_B_ASYNC XBF_ASYNC
-#define XFS_B_DELWRI XBF_DELWRI
-#define XFS_B_READ XBF_READ
-#define XFS_B_WRITE XBF_WRITE
-#define XFS_B_STALE XBF_STALE
-
-#define XFS_BUF_TRYLOCK XBF_TRYLOCK
-#define XFS_INCORE_TRYLOCK XBF_TRYLOCK
-#define XFS_BUF_LOCK XBF_LOCK
-#define XFS_BUF_MAPPED XBF_MAPPED
-
-#define BUF_BUSY XBF_DONT_BLOCK
-
-#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags)
-#define XFS_BUF_ZEROFLAGS(bp) \
- ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI))
-
-#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE)
-#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE)
-#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XFS_B_STALE)
-#define XFS_BUF_SUPER_STALE(bp) do { \
- XFS_BUF_STALE(bp); \
- xfs_buf_delwri_dequeue(bp); \
- XFS_BUF_DONE(bp); \
- } while (0)
-
-#define XFS_BUF_MANAGE XBF_FS_MANAGED
-#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED)
-
-#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
-#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp)
-#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
-
-#define XFS_BUF_ERROR(bp,no) xfs_buf_ioerror(bp,no)
-#define XFS_BUF_GETERROR(bp) xfs_buf_geterror(bp)
-#define XFS_BUF_ISERROR(bp) (xfs_buf_geterror(bp) ? 1 : 0)
-
-#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
-#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE)
-#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE)
-
-#define XFS_BUF_BUSY(bp) do { } while (0)
-#define XFS_BUF_UNBUSY(bp) do { } while (0)
-#define XFS_BUF_ISBUSY(bp) (1)
-
-#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC)
-#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC)
-#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC)
-
-#define XFS_BUF_ORDERED(bp) ((bp)->b_flags |= XBF_ORDERED)
-#define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED)
-#define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED)
-
-#define XFS_BUF_SHUT(bp) do { } while (0)
-#define XFS_BUF_UNSHUT(bp) do { } while (0)
-#define XFS_BUF_ISSHUT(bp) (0)
-
-#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp)
-#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ)
-#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ)
-#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ)
-
-#define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE)
-#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE)
-#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE)
-
-#define XFS_BUF_ISUNINITIAL(bp) (0)
-#define XFS_BUF_UNUNINITIAL(bp) (0)
-
-#define XFS_BUF_BP_ISMAPPED(bp) (1)
-
-#define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone)
-#define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func))
-#define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL)
-#define XFS_BUF_SET_BDSTRAT_FUNC(bp, func) ((bp)->b_strat = (func))
-#define XFS_BUF_CLR_BDSTRAT_FUNC(bp) ((bp)->b_strat = NULL)
-
-#define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv)
-#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val))
-#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
-#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
-#define XFS_BUF_FSPRIVATE3(bp, type) ((type)(bp)->b_fspriv3)
-#define XFS_BUF_SET_FSPRIVATE3(bp, val) ((bp)->b_fspriv3 = (void*)(val))
-#define XFS_BUF_SET_START(bp) do { } while (0)
-#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
-
-#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
-#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
-#define XFS_BUF_ADDR(bp) ((bp)->b_bn)
-#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno))
-#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset)
-#define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off))
-#define XFS_BUF_COUNT(bp) ((bp)->b_count_desired)
-#define XFS_BUF_SET_COUNT(bp, cnt) ((bp)->b_count_desired = (cnt))
-#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
-#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
-
-#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0)
-#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
-#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
-
-#define XFS_BUF_ISPINNED(bp) xfs_buf_ispin(bp)
-
-#define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp)
-#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0)
-#define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp)
-#define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp)
-#define XFS_BUF_V_IODONESEMA(bp) up(&bp->b_iodonesema);
-
-#define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target))
-#define XFS_BUF_TARGET(bp) ((bp)->b_target)
-#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target)
-
-static inline int xfs_bawrite(void *mp, xfs_buf_t *bp)
-{
- bp->b_fspriv3 = mp;
- bp->b_strat = xfs_bdstrat_cb;
- xfs_buf_delwri_dequeue(bp);
- return xfs_buf_iostart(bp, XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
-}
-
-static inline void xfs_buf_relse(xfs_buf_t *bp)
-{
- if (!bp->b_relse)
- xfs_buf_unlock(bp);
- xfs_buf_rele(bp);
-}
-
-#define xfs_bpin(bp) xfs_buf_pin(bp)
-#define xfs_bunpin(bp) xfs_buf_unpin(bp)
-
-#define xfs_buftrace(id, bp) \
- xfs_buf_trace(bp, id, NULL, (void *)__builtin_return_address(0))
-
-#define xfs_biodone(bp) xfs_buf_ioend(bp, 0)
-
-#define xfs_biomove(bp, off, len, data, rw) \
- xfs_buf_iomove((bp), (off), (len), (data), \
- ((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ)
-
-#define xfs_biozero(bp, off, len) \
- xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
-
-
-static inline int XFS_bwrite(xfs_buf_t *bp)
-{
- int iowait = (bp->b_flags & XBF_ASYNC) == 0;
- int error = 0;
-
- if (!iowait)
- bp->b_flags |= _XBF_RUN_QUEUES;
-
- xfs_buf_delwri_dequeue(bp);
- xfs_buf_iostrategy(bp);
- if (iowait) {
- error = xfs_buf_iowait(bp);
- xfs_buf_relse(bp);
- }
- return error;
-}
-
-#define XFS_bdwrite(bp) xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC)
-
-static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp)
-{
- bp->b_strat = xfs_bdstrat_cb;
- bp->b_fspriv3 = mp;
- return xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
-}
-
-#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
-
-#define xfs_iowait(bp) xfs_buf_iowait(bp)
-
-#define xfs_baread(target, rablkno, ralen) \
- xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK)
-
-
-/*
- * Handling of buftargs.
- */
-extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int);
-extern void xfs_free_buftarg(xfs_buftarg_t *, int);
-extern void xfs_wait_buftarg(xfs_buftarg_t *);
-extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
-extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
-
-#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
-#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
-
-#define xfs_binval(buftarg) xfs_flush_buftarg(buftarg, 1)
-#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1)
-
-#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
deleted file mode 100644
index e7f3da61c6c..00000000000
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_CRED_H__
-#define __XFS_CRED_H__
-
-#include <linux/capability.h>
-
-/*
- * Credentials
- */
-typedef struct cred {
- /* EMPTY */
-} cred_t;
-
-extern struct cred *sys_cred;
-
-/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */
-static __inline int capable_cred(cred_t *cr, int cid)
-{
- return (cr == sys_cred) ? 1 : capable(cid);
-}
-
-#endif /* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
deleted file mode 100644
index 80eb249f2fa..00000000000
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright (c) 2004-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_types.h"
-#include "xfs_dmapi.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_dir.h"
-#include "xfs_mount.h"
-#include "xfs_export.h"
-
-/*
- * XFS encodes and decodes the fileid portion of NFS filehandles
- * itself instead of letting the generic NFS code do it. This
- * allows filesystems with 64 bit inode numbers to be exported.
- *
- * Note that a side effect is that xfs_vget() won't be passed a
- * zero inode/generation pair under normal circumstances. As
- * however a malicious client could send us such data, the check
- * remains in that code.
- */
-
-STATIC struct dentry *
-linvfs_decode_fh(
- struct super_block *sb,
- __u32 *fh,
- int fh_len,
- int fileid_type,
- int (*acceptable)(
- void *context,
- struct dentry *de),
- void *context)
-{
- xfs_fid2_t ifid;
- xfs_fid2_t pfid;
- void *parent = NULL;
- int is64 = 0;
- __u32 *p = fh;
-
-#if XFS_BIG_INUMS
- is64 = (fileid_type & XFS_FILEID_TYPE_64FLAG);
- fileid_type &= ~XFS_FILEID_TYPE_64FLAG;
-#endif
-
- /*
- * Note that we only accept fileids which are long enough
- * rather than allow the parent generation number to default
- * to zero. XFS considers zero a valid generation number not
- * an invalid/wildcard value. There's little point printk'ing
- * a warning here as we don't have the client information
- * which would make such a warning useful.
- */
- if (fileid_type > 2 ||
- fh_len < xfs_fileid_length((fileid_type == 2), is64))
- return NULL;
-
- p = xfs_fileid_decode_fid2(p, &ifid, is64);
-
- if (fileid_type == 2) {
- p = xfs_fileid_decode_fid2(p, &pfid, is64);
- parent = &pfid;
- }
-
- fh = (__u32 *)&ifid;
- return find_exported_dentry(sb, fh, parent, acceptable, context);
-}
-
-
-STATIC int
-linvfs_encode_fh(
- struct dentry *dentry,
- __u32 *fh,
- int *max_len,
- int connectable)
-{
- struct inode *inode = dentry->d_inode;
- int type = 1;
- __u32 *p = fh;
- int len;
- int is64 = 0;
-#if XFS_BIG_INUMS
- vfs_t *vfs = LINVFS_GET_VFS(inode->i_sb);
-
- if (!(vfs->vfs_flag & VFS_32BITINODES)) {
- /* filesystem may contain 64bit inode numbers */
- is64 = XFS_FILEID_TYPE_64FLAG;
- }
-#endif
-
- /* Directories don't need their parent encoded, they have ".." */
- if (S_ISDIR(inode->i_mode))
- connectable = 0;
-
- /*
- * Only encode if there is enough space given. In practice
- * this means we can't export a filesystem with 64bit inodes
- * over NFSv2 with the subtree_check export option; the other
- * seven combinations work. The real answer is "don't use v2".
- */
- len = xfs_fileid_length(connectable, is64);
- if (*max_len < len)
- return 255;
- *max_len = len;
-
- p = xfs_fileid_encode_inode(p, inode, is64);
- if (connectable) {
- spin_lock(&dentry->d_lock);
- p = xfs_fileid_encode_inode(p, dentry->d_parent->d_inode, is64);
- spin_unlock(&dentry->d_lock);
- type = 2;
- }
- BUG_ON((p - fh) != len);
- return type | is64;
-}
-
-STATIC struct dentry *
-linvfs_get_dentry(
- struct super_block *sb,
- void *data)
-{
- vnode_t *vp;
- struct inode *inode;
- struct dentry *result;
- vfs_t *vfsp = LINVFS_GET_VFS(sb);
- int error;
-
- VFS_VGET(vfsp, &vp, (fid_t *)data, error);
- if (error || vp == NULL)
- return ERR_PTR(-ESTALE) ;
-
- inode = LINVFS_GET_IP(vp);
- result = d_alloc_anon(inode);
- if (!result) {
- iput(inode);
- return ERR_PTR(-ENOMEM);
- }
- return result;
-}
-
-STATIC struct dentry *
-linvfs_get_parent(
- struct dentry *child)
-{
- int error;
- vnode_t *vp, *cvp;
- struct dentry *parent;
- struct dentry dotdot;
-
- dotdot.d_name.name = "..";
- dotdot.d_name.len = 2;
- dotdot.d_inode = NULL;
-
- cvp = NULL;
- vp = LINVFS_GET_VP(child->d_inode);
- VOP_LOOKUP(vp, &dotdot, &cvp, 0, NULL, NULL, error);
- if (unlikely(error))
- return ERR_PTR(-error);
-
- parent = d_alloc_anon(LINVFS_GET_IP(cvp));
- if (unlikely(!parent)) {
- VN_RELE(cvp);
- return ERR_PTR(-ENOMEM);
- }
- return parent;
-}
-
-struct export_operations linvfs_export_ops = {
- .decode_fh = linvfs_decode_fh,
- .encode_fh = linvfs_encode_fh,
- .get_parent = linvfs_get_parent,
- .get_dentry = linvfs_get_dentry,
-};
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
deleted file mode 100644
index ced4404339c..00000000000
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ /dev/null
@@ -1,595 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_trans.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_error.h"
-#include "xfs_rw.h"
-#include "xfs_ioctl32.h"
-
-#include <linux/dcache.h>
-#include <linux/smp_lock.h>
-
-static struct vm_operations_struct linvfs_file_vm_ops;
-#ifdef CONFIG_XFS_DMAPI
-static struct vm_operations_struct linvfs_dmapi_file_vm_ops;
-#endif
-
-STATIC inline ssize_t
-__linvfs_read(
- struct kiocb *iocb,
- char __user *buf,
- int ioflags,
- size_t count,
- loff_t pos)
-{
- struct iovec iov = {buf, count};
- struct file *file = iocb->ki_filp;
- vnode_t *vp = LINVFS_GET_VP(file->f_dentry->d_inode);
- ssize_t rval;
-
- BUG_ON(iocb->ki_pos != pos);
-
- if (unlikely(file->f_flags & O_DIRECT))
- ioflags |= IO_ISDIRECT;
- VOP_READ(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, rval);
- return rval;
-}
-
-
-STATIC ssize_t
-linvfs_aio_read(
- struct kiocb *iocb,
- char __user *buf,
- size_t count,
- loff_t pos)
-{
- return __linvfs_read(iocb, buf, IO_ISAIO, count, pos);
-}
-
-STATIC ssize_t
-linvfs_aio_read_invis(
- struct kiocb *iocb,
- char __user *buf,
- size_t count,
- loff_t pos)
-{
- return __linvfs_read(iocb, buf, IO_ISAIO|IO_INVIS, count, pos);
-}
-
-
-STATIC inline ssize_t
-__linvfs_write(
- struct kiocb *iocb,
- const char __user *buf,
- int ioflags,
- size_t count,
- loff_t pos)
-{
- struct iovec iov = {(void __user *)buf, count};
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- vnode_t *vp = LINVFS_GET_VP(inode);
- ssize_t rval;
-
- BUG_ON(iocb->ki_pos != pos);
- if (unlikely(file->f_flags & O_DIRECT))
- ioflags |= IO_ISDIRECT;
-
- VOP_WRITE(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, rval);
- return rval;
-}
-
-
-STATIC ssize_t
-linvfs_aio_write(
- struct kiocb *iocb,
- const char __user *buf,
- size_t count,
- loff_t pos)
-{
- return __linvfs_write(iocb, buf, IO_ISAIO, count, pos);
-}
-
-STATIC ssize_t
-linvfs_aio_write_invis(
- struct kiocb *iocb,
- const char __user *buf,
- size_t count,
- loff_t pos)
-{
- return __linvfs_write(iocb, buf, IO_ISAIO|IO_INVIS, count, pos);
-}
-
-
-STATIC inline ssize_t
-__linvfs_readv(
- struct file *file,
- const struct iovec *iov,
- int ioflags,
- unsigned long nr_segs,
- loff_t *ppos)
-{
- struct inode *inode = file->f_mapping->host;
- vnode_t *vp = LINVFS_GET_VP(inode);
- struct kiocb kiocb;
- ssize_t rval;
-
- init_sync_kiocb(&kiocb, file);
- kiocb.ki_pos = *ppos;
-
- if (unlikely(file->f_flags & O_DIRECT))
- ioflags |= IO_ISDIRECT;
- VOP_READ(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, rval);
-
- *ppos = kiocb.ki_pos;
- return rval;
-}
-
-STATIC ssize_t
-linvfs_readv(
- struct file *file,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t *ppos)
-{
- return __linvfs_readv(file, iov, 0, nr_segs, ppos);
-}
-
-STATIC ssize_t
-linvfs_readv_invis(
- struct file *file,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t *ppos)
-{
- return __linvfs_readv(file, iov, IO_INVIS, nr_segs, ppos);
-}
-
-
-STATIC inline ssize_t
-__linvfs_writev(
- struct file *file,
- const struct iovec *iov,
- int ioflags,
- unsigned long nr_segs,
- loff_t *ppos)
-{
- struct inode *inode = file->f_mapping->host;
- vnode_t *vp = LINVFS_GET_VP(inode);
- struct kiocb kiocb;
- ssize_t rval;
-
- init_sync_kiocb(&kiocb, file);
- kiocb.ki_pos = *ppos;
- if (unlikely(file->f_flags & O_DIRECT))
- ioflags |= IO_ISDIRECT;
-
- VOP_WRITE(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, rval);
-
- *ppos = kiocb.ki_pos;
- return rval;
-}
-
-
-STATIC ssize_t
-linvfs_writev(
- struct file *file,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t *ppos)
-{
- return __linvfs_writev(file, iov, 0, nr_segs, ppos);
-}
-
-STATIC ssize_t
-linvfs_writev_invis(
- struct file *file,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t *ppos)
-{
- return __linvfs_writev(file, iov, IO_INVIS, nr_segs, ppos);
-}
-
-STATIC ssize_t
-linvfs_sendfile(
- struct file *filp,
- loff_t *ppos,
- size_t count,
- read_actor_t actor,
- void *target)
-{
- vnode_t *vp = LINVFS_GET_VP(filp->f_dentry->d_inode);
- ssize_t rval;
-
- VOP_SENDFILE(vp, filp, ppos, 0, count, actor, target, NULL, rval);
- return rval;
-}
-
-
-STATIC int
-linvfs_open(
- struct inode *inode,
- struct file *filp)
-{
- vnode_t *vp = LINVFS_GET_VP(inode);
- int error;
-
- if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
- return -EFBIG;
-
- ASSERT(vp);
- VOP_OPEN(vp, NULL, error);
- return -error;
-}
-
-
-STATIC int
-linvfs_release(
- struct inode *inode,
- struct file *filp)
-{
- vnode_t *vp = LINVFS_GET_VP(inode);
- int error = 0;
-
- if (vp)
- VOP_RELEASE(vp, error);
- return -error;
-}
-
-
-STATIC int
-linvfs_fsync(
- struct file *filp,
- struct dentry *dentry,
- int datasync)
-{
- struct inode *inode = dentry->d_inode;
- vnode_t *vp = LINVFS_GET_VP(inode);
- int error;
- int flags = FSYNC_WAIT;
-
- if (datasync)
- flags |= FSYNC_DATA;
-
- ASSERT(vp);
- VOP_FSYNC(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1, error);
- return -error;
-}
-
-/*
- * linvfs_readdir maps to VOP_READDIR().
- * We need to build a uio, cred, ...
- */
-
-#define nextdp(dp) ((struct xfs_dirent *)((char *)(dp) + (dp)->d_reclen))
-
-#ifdef CONFIG_XFS_DMAPI
-
-STATIC struct page *
-linvfs_filemap_nopage(
- struct vm_area_struct *area,
- unsigned long address,
- int *type)
-{
- struct inode *inode = area->vm_file->f_dentry->d_inode;
- vnode_t *vp = LINVFS_GET_VP(inode);
- xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp);
- int error;
-
- ASSERT_ALWAYS(vp->v_vfsp->vfs_flag & VFS_DMI);
-
- error = XFS_SEND_MMAP(mp, area, 0);
- if (error)
- return NULL;
-
- return filemap_nopage(area, address, type);
-}
-
-#endif /* CONFIG_XFS_DMAPI */
-
-
-STATIC int
-linvfs_readdir(
- struct file *filp,
- void *dirent,
- filldir_t filldir)
-{
- int error = 0;
- vnode_t *vp;
- uio_t uio;
- iovec_t iov;
- int eof = 0;
- caddr_t read_buf;
- int namelen, size = 0;
- size_t rlen = PAGE_CACHE_SIZE;
- xfs_off_t start_offset, curr_offset;
- xfs_dirent_t *dbp = NULL;
-
- vp = LINVFS_GET_VP(filp->f_dentry->d_inode);
- ASSERT(vp);
-
- /* Try fairly hard to get memory */
- do {
- if ((read_buf = (caddr_t)kmalloc(rlen, GFP_KERNEL)))
- break;
- rlen >>= 1;
- } while (rlen >= 1024);
-
- if (read_buf == NULL)
- return -ENOMEM;
-
- uio.uio_iov = &iov;
- uio.uio_segflg = UIO_SYSSPACE;
- curr_offset = filp->f_pos;
- if (filp->f_pos != 0x7fffffff)
- uio.uio_offset = filp->f_pos;
- else
- uio.uio_offset = 0xffffffff;
-
- while (!eof) {
- uio.uio_resid = iov.iov_len = rlen;
- iov.iov_base = read_buf;
- uio.uio_iovcnt = 1;
-
- start_offset = uio.uio_offset;
-
- VOP_READDIR(vp, &uio, NULL, &eof, error);
- if ((uio.uio_offset == start_offset) || error) {
- size = 0;
- break;
- }
-
- size = rlen - uio.uio_resid;
- dbp = (xfs_dirent_t *)read_buf;
- while (size > 0) {
- namelen = strlen(dbp->d_name);
-
- if (filldir(dirent, dbp->d_name, namelen,
- (loff_t) curr_offset & 0x7fffffff,
- (ino_t) dbp->d_ino,
- DT_UNKNOWN)) {
- goto done;
- }
- size -= dbp->d_reclen;
- curr_offset = (loff_t)dbp->d_off /* & 0x7fffffff */;
- dbp = nextdp(dbp);
- }
- }
-done:
- if (!error) {
- if (size == 0)
- filp->f_pos = uio.uio_offset & 0x7fffffff;
- else if (dbp)
- filp->f_pos = curr_offset;
- }
-
- kfree(read_buf);
- return -error;
-}
-
-
-STATIC int
-linvfs_file_mmap(
- struct file *filp,
- struct vm_area_struct *vma)
-{
- struct inode *ip = filp->f_dentry->d_inode;
- vnode_t *vp = LINVFS_GET_VP(ip);
- vattr_t va = { .va_mask = XFS_AT_UPDATIME };
- int error;
-
- vma->vm_ops = &linvfs_file_vm_ops;
-
-#ifdef CONFIG_XFS_DMAPI
- if (vp->v_vfsp->vfs_flag & VFS_DMI) {
- vma->vm_ops = &linvfs_dmapi_file_vm_ops;
- }
-#endif /* CONFIG_XFS_DMAPI */
-
- VOP_SETATTR(vp, &va, XFS_AT_UPDATIME, NULL, error);
- if (!error)
- vn_revalidate(vp); /* update Linux inode flags */
- return 0;
-}
-
-
-STATIC long
-linvfs_ioctl(
- struct file *filp,
- unsigned int cmd,
- unsigned long arg)
-{
- int error;
- struct inode *inode = filp->f_dentry->d_inode;
- vnode_t *vp = LINVFS_GET_VP(inode);
-
- VOP_IOCTL(vp, inode, filp, 0, cmd, (void __user *)arg, error);
- VMODIFY(vp);
-
- /* NOTE: some of the ioctl's return positive #'s as a
- * byte count indicating success, such as
- * readlink_by_handle. So we don't "sign flip"
- * like most other routines. This means true
- * errors need to be returned as a negative value.
- */
- return error;
-}
-
-STATIC long
-linvfs_ioctl_invis(
- struct file *filp,
- unsigned int cmd,
- unsigned long arg)
-{
- int error;
- struct inode *inode = filp->f_dentry->d_inode;
- vnode_t *vp = LINVFS_GET_VP(inode);
-
- ASSERT(vp);
- VOP_IOCTL(vp, inode, filp, IO_INVIS, cmd, (void __user *)arg, error);
- VMODIFY(vp);
-
- /* NOTE: some of the ioctl's return positive #'s as a
- * byte count indicating success, such as
- * readlink_by_handle. So we don't "sign flip"
- * like most other routines. This means true
- * errors need to be returned as a negative value.
- */
- return error;
-}
-
-#ifdef CONFIG_XFS_DMAPI
-#ifdef HAVE_VMOP_MPROTECT
-STATIC int
-linvfs_mprotect(
- struct vm_area_struct *vma,
- unsigned int newflags)
-{
- vnode_t *vp = LINVFS_GET_VP(vma->vm_file->f_dentry->d_inode);
- int error = 0;
-
- if (vp->v_vfsp->vfs_flag & VFS_DMI) {
- if ((vma->vm_flags & VM_MAYSHARE) &&
- (newflags & VM_WRITE) && !(vma->vm_flags & VM_WRITE)) {
- xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp);
-
- error = XFS_SEND_MMAP(mp, vma, VM_WRITE);
- }
- }
- return error;
-}
-#endif /* HAVE_VMOP_MPROTECT */
-#endif /* CONFIG_XFS_DMAPI */
-
-#ifdef HAVE_FOP_OPEN_EXEC
-/* If the user is attempting to execute a file that is offline then
- * we have to trigger a DMAPI READ event before the file is marked as busy
- * otherwise the invisible I/O will not be able to write to the file to bring
- * it back online.
- */
-STATIC int
-linvfs_open_exec(
- struct inode *inode)
-{
- vnode_t *vp = LINVFS_GET_VP(inode);
- xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp);
- int error = 0;
- xfs_inode_t *ip;
-
- if (vp->v_vfsp->vfs_flag & VFS_DMI) {
- ip = xfs_vtoi(vp);
- if (!ip) {
- error = -EINVAL;
- goto open_exec_out;
- }
- if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)) {
- error = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp,
- 0, 0, 0, NULL);
- }
- }
-open_exec_out:
- return error;
-}
-#endif /* HAVE_FOP_OPEN_EXEC */
-
-struct file_operations linvfs_file_operations = {
- .llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .readv = linvfs_readv,
- .writev = linvfs_writev,
- .aio_read = linvfs_aio_read,
- .aio_write = linvfs_aio_write,
- .sendfile = linvfs_sendfile,
- .unlocked_ioctl = linvfs_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = linvfs_compat_ioctl,
-#endif
- .mmap = linvfs_file_mmap,
- .open = linvfs_open,
- .release = linvfs_release,
- .fsync = linvfs_fsync,
-#ifdef HAVE_FOP_OPEN_EXEC
- .open_exec = linvfs_open_exec,
-#endif
-};
-
-struct file_operations linvfs_invis_file_operations = {
- .llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .readv = linvfs_readv_invis,
- .writev = linvfs_writev_invis,
- .aio_read = linvfs_aio_read_invis,
- .aio_write = linvfs_aio_write_invis,
- .sendfile = linvfs_sendfile,
- .unlocked_ioctl = linvfs_ioctl_invis,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = linvfs_compat_invis_ioctl,
-#endif
- .mmap = linvfs_file_mmap,
- .open = linvfs_open,
- .release = linvfs_release,
- .fsync = linvfs_fsync,
-};
-
-
-struct file_operations linvfs_dir_operations = {
- .read = generic_read_dir,
- .readdir = linvfs_readdir,
- .unlocked_ioctl = linvfs_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = linvfs_compat_ioctl,
-#endif
- .fsync = linvfs_fsync,
-};
-
-static struct vm_operations_struct linvfs_file_vm_ops = {
- .nopage = filemap_nopage,
- .populate = filemap_populate,
-};
-
-#ifdef CONFIG_XFS_DMAPI
-static struct vm_operations_struct linvfs_dmapi_file_vm_ops = {
- .nopage = linvfs_filemap_nopage,
- .populate = filemap_populate,
-#ifdef HAVE_VMOP_MPROTECT
- .mprotect = linvfs_mprotect,
-#endif
-};
-#endif /* CONFIG_XFS_DMAPI */
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
deleted file mode 100644
index 4fa4b1a5187..00000000000
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "xfs.h"
-
-/*
- * Stub for no-op vnode operations that return error status.
- */
-int
-fs_noerr(void)
-{
- return 0;
-}
-
-/*
- * Operation unsupported under this file system.
- */
-int
-fs_nosys(void)
-{
- return ENOSYS;
-}
-
-/*
- * Stub for inactive, strategy, and read/write lock/unlock. Does nothing.
- */
-/* ARGSUSED */
-void
-fs_noval(void)
-{
-}
-
-/*
- * vnode pcache layer for vnode_tosspages.
- * 'last' parameter unused but left in for IRIX compatibility
- */
-void
-fs_tosspages(
- bhv_desc_t *bdp,
- xfs_off_t first,
- xfs_off_t last,
- int fiopt)
-{
- vnode_t *vp = BHV_TO_VNODE(bdp);
- struct inode *ip = LINVFS_GET_IP(vp);
-
- if (VN_CACHED(vp))
- truncate_inode_pages(ip->i_mapping, first);
-}
-
-
-/*
- * vnode pcache layer for vnode_flushinval_pages.
- * 'last' parameter unused but left in for IRIX compatibility
- */
-void
-fs_flushinval_pages(
- bhv_desc_t *bdp,
- xfs_off_t first,
- xfs_off_t last,
- int fiopt)
-{
- vnode_t *vp = BHV_TO_VNODE(bdp);
- struct inode *ip = LINVFS_GET_IP(vp);
-
- if (VN_CACHED(vp)) {
- filemap_write_and_wait(ip->i_mapping);
-
- truncate_inode_pages(ip->i_mapping, first);
- }
-}
-
-/*
- * vnode pcache layer for vnode_flush_pages.
- * 'last' parameter unused but left in for IRIX compatibility
- */
-int
-fs_flush_pages(
- bhv_desc_t *bdp,
- xfs_off_t first,
- xfs_off_t last,
- uint64_t flags,
- int fiopt)
-{
- vnode_t *vp = BHV_TO_VNODE(bdp);
- struct inode *ip = LINVFS_GET_IP(vp);
-
- if (VN_CACHED(vp)) {
- filemap_fdatawrite(ip->i_mapping);
- if (flags & XFS_B_ASYNC)
- return 0;
- filemap_fdatawait(ip->i_mapping);
- }
-
- return 0;
-}
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.h b/fs/xfs/linux-2.6/xfs_fs_subr.h
deleted file mode 100644
index aee9ccdd18f..00000000000
--- a/fs/xfs/linux-2.6/xfs_fs_subr.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_FS_SUBR_H__
-#define __XFS_FS_SUBR_H__
-
-struct cred;
-extern int fs_noerr(void);
-extern int fs_nosys(void);
-extern void fs_noval(void);
-extern void fs_tosspages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
-extern void fs_flushinval_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
-extern int fs_flush_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, uint64_t, int);
-
-#endif /* __XFS_FS_SUBR_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
deleted file mode 100644
index e1a22bfcf86..00000000000
--- a/fs/xfs/linux-2.6/xfs_globals.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_GLOBALS_H__
-#define __XFS_GLOBALS_H__
-
-extern uint64_t xfs_panic_mask; /* set to cause more panics */
-extern unsigned long xfs_physmem;
-extern struct cred *sys_cred;
-
-#endif /* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
deleted file mode 100644
index 4db47790415..00000000000
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ /dev/null
@@ -1,1334 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_rtalloc.h"
-#include "xfs_itable.h"
-#include "xfs_error.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_bmap.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_dfrag.h"
-#include "xfs_fsops.h"
-
-#include <linux/capability.h>
-#include <linux/dcache.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/pagemap.h>
-
-/*
- * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
- * a file or fs handle.
- *
- * XFS_IOC_PATH_TO_FSHANDLE
- * returns fs handle for a mount point or path within that mount point
- * XFS_IOC_FD_TO_HANDLE
- * returns full handle for a FD opened in user space
- * XFS_IOC_PATH_TO_HANDLE
- * returns full handle for a path
- */
-STATIC int
-xfs_find_handle(
- unsigned int cmd,
- void __user *arg)
-{
- int hsize;
- xfs_handle_t handle;
- xfs_fsop_handlereq_t hreq;
- struct inode *inode;
- struct vnode *vp;
-
- if (copy_from_user(&hreq, arg, sizeof(hreq)))
- return -XFS_ERROR(EFAULT);
-
- memset((char *)&handle, 0, sizeof(handle));
-
- switch (cmd) {
- case XFS_IOC_PATH_TO_FSHANDLE:
- case XFS_IOC_PATH_TO_HANDLE: {
- struct nameidata nd;
- int error;
-
- error = user_path_walk_link((const char __user *)hreq.path, &nd);
- if (error)
- return error;
-
- ASSERT(nd.dentry);
- ASSERT(nd.dentry->d_inode);
- inode = igrab(nd.dentry->d_inode);
- path_release(&nd);
- break;
- }
-
- case XFS_IOC_FD_TO_HANDLE: {
- struct file *file;
-
- file = fget(hreq.fd);
- if (!file)
- return -EBADF;
-
- ASSERT(file->f_dentry);
- ASSERT(file->f_dentry->d_inode);
- inode = igrab(file->f_dentry->d_inode);
- fput(file);
- break;
- }
-
- default:
- ASSERT(0);
- return -XFS_ERROR(EINVAL);
- }
-
- if (inode->i_sb->s_magic != XFS_SB_MAGIC) {
- /* we're not in XFS anymore, Toto */
- iput(inode);
- return -XFS_ERROR(EINVAL);
- }
-
- switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
- case S_IFDIR:
- case S_IFLNK:
- break;
- default:
- iput(inode);
- return -XFS_ERROR(EBADF);
- }
-
- /* we need the vnode */
- vp = LINVFS_GET_VP(inode);
-
- /* now we can grab the fsid */
- memcpy(&handle.ha_fsid, vp->v_vfsp->vfs_altfsid, sizeof(xfs_fsid_t));
- hsize = sizeof(xfs_fsid_t);
-
- if (cmd != XFS_IOC_PATH_TO_FSHANDLE) {
- xfs_inode_t *ip;
- int lock_mode;
-
- /* need to get access to the xfs_inode to read the generation */
- ip = xfs_vtoi(vp);
- ASSERT(ip);
- lock_mode = xfs_ilock_map_shared(ip);
-
- /* fill in fid section of handle from inode */
- handle.ha_fid.xfs_fid_len = sizeof(xfs_fid_t) -
- sizeof(handle.ha_fid.xfs_fid_len);
- handle.ha_fid.xfs_fid_pad = 0;
- handle.ha_fid.xfs_fid_gen = ip->i_d.di_gen;
- handle.ha_fid.xfs_fid_ino = ip->i_ino;
-
- xfs_iunlock_map_shared(ip, lock_mode);
-
- hsize = XFS_HSIZE(handle);
- }
-
- /* now copy our handle into the user buffer & write out the size */
- if (copy_to_user(hreq.ohandle, &handle, hsize) ||
- copy_to_user(hreq.ohandlen, &hsize, sizeof(__s32))) {
- iput(inode);
- return -XFS_ERROR(EFAULT);
- }
-
- iput(inode);
- return 0;
-}
-
-
-/*
- * Convert userspace handle data into vnode (and inode).
- * We [ab]use the fact that all the fsop_handlereq ioctl calls
- * have a data structure argument whose first component is always
- * a xfs_fsop_handlereq_t, so we can cast to and from this type.
- * This allows us to optimise the copy_from_user calls and gives
- * a handy, shared routine.
- *
- * If no error, caller must always VN_RELE the returned vp.
- */
-STATIC int
-xfs_vget_fsop_handlereq(
- xfs_mount_t *mp,
- struct inode *parinode, /* parent inode pointer */
- xfs_fsop_handlereq_t *hreq,
- vnode_t **vp,
- struct inode **inode)
-{
- void __user *hanp;
- size_t hlen;
- xfs_fid_t *xfid;
- xfs_handle_t *handlep;
- xfs_handle_t handle;
- xfs_inode_t *ip;
- struct inode *inodep;
- vnode_t *vpp;
- xfs_ino_t ino;
- __u32 igen;
- int error;
-
- /*
- * Only allow handle opens under a directory.
- */
- if (!S_ISDIR(parinode->i_mode))
- return XFS_ERROR(ENOTDIR);
-
- hanp = hreq->ihandle;
- hlen = hreq->ihandlen;
- handlep = &handle;
-
- if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
- return XFS_ERROR(EINVAL);
- if (copy_from_user(handlep, hanp, hlen))
- return XFS_ERROR(EFAULT);
- if (hlen < sizeof(*handlep))
- memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen);
- if (hlen > sizeof(handlep->ha_fsid)) {
- if (handlep->ha_fid.xfs_fid_len !=
- (hlen - sizeof(handlep->ha_fsid)
- - sizeof(handlep->ha_fid.xfs_fid_len))
- || handlep->ha_fid.xfs_fid_pad)
- return XFS_ERROR(EINVAL);
- }
-
- /*
- * Crack the handle, obtain the inode # & generation #
- */
- xfid = (struct xfs_fid *)&handlep->ha_fid;
- if (xfid->xfs_fid_len == sizeof(*xfid) - sizeof(xfid->xfs_fid_len)) {
- ino = xfid->xfs_fid_ino;
- igen = xfid->xfs_fid_gen;
- } else {
- return XFS_ERROR(EINVAL);
- }
-
- /*
- * Get the XFS inode, building a vnode to go with it.
- */
- error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
- if (error)
- return error;
- if (ip == NULL)
- return XFS_ERROR(EIO);
- if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) {
- xfs_iput_new(ip, XFS_ILOCK_SHARED);
- return XFS_ERROR(ENOENT);
- }
-
- vpp = XFS_ITOV(ip);
- inodep = LINVFS_GET_IP(vpp);
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- *vp = vpp;
- *inode = inodep;
- return 0;
-}
-
-STATIC int
-xfs_open_by_handle(
- xfs_mount_t *mp,
- void __user *arg,
- struct file *parfilp,
- struct inode *parinode)
-{
- int error;
- int new_fd;
- int permflag;
- struct file *filp;
- struct inode *inode;
- struct dentry *dentry;
- vnode_t *vp;
- xfs_fsop_handlereq_t hreq;
-
- if (!capable(CAP_SYS_ADMIN))
- return -XFS_ERROR(EPERM);
- if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
- return -XFS_ERROR(EFAULT);
-
- error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &vp, &inode);
- if (error)
- return -error;
-
- /* Restrict xfs_open_by_handle to directories & regular files. */
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
- iput(inode);
- return -XFS_ERROR(EINVAL);
- }
-
-#if BITS_PER_LONG != 32
- hreq.oflags |= O_LARGEFILE;
-#endif
- /* Put open permission in namei format. */
- permflag = hreq.oflags;
- if ((permflag+1) & O_ACCMODE)
- permflag++;
- if (permflag & O_TRUNC)
- permflag |= 2;
-
- if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
- (permflag & FMODE_WRITE) && IS_APPEND(inode)) {
- iput(inode);
- return -XFS_ERROR(EPERM);
- }
-
- if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
- iput(inode);
- return -XFS_ERROR(EACCES);
- }
-
- /* Can't write directories. */
- if ( S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
- iput(inode);
- return -XFS_ERROR(EISDIR);
- }
-
- if ((new_fd = get_unused_fd()) < 0) {
- iput(inode);
- return new_fd;
- }
-
- dentry = d_alloc_anon(inode);
- if (dentry == NULL) {
- iput(inode);
- put_unused_fd(new_fd);
- return -XFS_ERROR(ENOMEM);
- }
-
- /* Ensure umount returns EBUSY on umounts while this file is open. */
- mntget(parfilp->f_vfsmnt);
-
- /* Create file pointer. */
- filp = dentry_open(dentry, parfilp->f_vfsmnt, hreq.oflags);
- if (IS_ERR(filp)) {
- put_unused_fd(new_fd);
- return -XFS_ERROR(-PTR_ERR(filp));
- }
- if (inode->i_mode & S_IFREG)
- filp->f_op = &linvfs_invis_file_operations;
-
- fd_install(new_fd, filp);
- return new_fd;
-}
-
-STATIC int
-xfs_readlink_by_handle(
- xfs_mount_t *mp,
- void __user *arg,
- struct file *parfilp,
- struct inode *parinode)
-{
- int error;
- struct iovec aiov;
- struct uio auio;
- struct inode *inode;
- xfs_fsop_handlereq_t hreq;
- vnode_t *vp;
- __u32 olen;
-
- if (!capable(CAP_SYS_ADMIN))
- return -XFS_ERROR(EPERM);
- if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
- return -XFS_ERROR(EFAULT);
-
- error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &vp, &inode);
- if (error)
- return -error;
-
- /* Restrict this handle operation to symlinks only. */
- if (!S_ISLNK(inode->i_mode)) {
- VN_RELE(vp);
- return -XFS_ERROR(EINVAL);
- }
-
- if (copy_from_user(&olen, hreq.ohandlen, sizeof(__u32))) {
- VN_RELE(vp);
- return -XFS_ERROR(EFAULT);
- }
- aiov.iov_len = olen;
- aiov.iov_base = hreq.ohandle;
-
- auio.uio_iov = &aiov;
- auio.uio_iovcnt = 1;
- auio.uio_offset = 0;
- auio.uio_segflg = UIO_USERSPACE;
- auio.uio_resid = olen;
-
- VOP_READLINK(vp, &auio, IO_INVIS, NULL, error);
-
- VN_RELE(vp);
- return (olen - auio.uio_resid);
-}
-
-STATIC int
-xfs_fssetdm_by_handle(
- xfs_mount_t *mp,
- void __user *arg,
- struct file *parfilp,
- struct inode *parinode)
-{
- int error;
- struct fsdmidata fsd;
- xfs_fsop_setdm_handlereq_t dmhreq;
- struct inode *inode;
- bhv_desc_t *bdp;
- vnode_t *vp;
-
- if (!capable(CAP_MKNOD))
- return -XFS_ERROR(EPERM);
- if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
- return -XFS_ERROR(EFAULT);
-
- error = xfs_vget_fsop_handlereq(mp, parinode, &dmhreq.hreq, &vp, &inode);
- if (error)
- return -error;
-
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) {
- VN_RELE(vp);
- return -XFS_ERROR(EPERM);
- }
-
- if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) {
- VN_RELE(vp);
- return -XFS_ERROR(EFAULT);
- }
-
- bdp = bhv_base_unlocked(VN_BHV_HEAD(vp));
- error = xfs_set_dmattrs(bdp, fsd.fsd_dmevmask, fsd.fsd_dmstate, NULL);
-
- VN_RELE(vp);
- if (error)
- return -error;
- return 0;
-}
-
-STATIC int
-xfs_attrlist_by_handle(
- xfs_mount_t *mp,
- void __user *arg,
- struct file *parfilp,
- struct inode *parinode)
-{
- int error;
- attrlist_cursor_kern_t *cursor;
- xfs_fsop_attrlist_handlereq_t al_hreq;
- struct inode *inode;
- vnode_t *vp;
- char *kbuf;
-
- if (!capable(CAP_SYS_ADMIN))
- return -XFS_ERROR(EPERM);
- if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
- return -XFS_ERROR(EFAULT);
- if (al_hreq.buflen > XATTR_LIST_MAX)
- return -XFS_ERROR(EINVAL);
-
- error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq,
- &vp, &inode);
- if (error)
- goto out;
-
- kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
- if (!kbuf)
- goto out_vn_rele;
-
- cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
- VOP_ATTR_LIST(vp, kbuf, al_hreq.buflen, al_hreq.flags,
- cursor, NULL, error);
- if (error)
- goto out_kfree;
-
- if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen))
- error = -EFAULT;
-
- out_kfree:
- kfree(kbuf);
- out_vn_rele:
- VN_RELE(vp);
- out:
- return -error;
-}
-
-STATIC int
-xfs_attrmulti_attr_get(
- struct vnode *vp,
- char *name,
- char __user *ubuf,
- __uint32_t *len,
- __uint32_t flags)
-{
- char *kbuf;
- int error = EFAULT;
-
- if (*len > XATTR_SIZE_MAX)
- return EINVAL;
- kbuf = kmalloc(*len, GFP_KERNEL);
- if (!kbuf)
- return ENOMEM;
-
- VOP_ATTR_GET(vp, name, kbuf, len, flags, NULL, error);
- if (error)
- goto out_kfree;
-
- if (copy_to_user(ubuf, kbuf, *len))
- error = EFAULT;
-
- out_kfree:
- kfree(kbuf);
- return error;
-}
-
-STATIC int
-xfs_attrmulti_attr_set(
- struct vnode *vp,
- char *name,
- const char __user *ubuf,
- __uint32_t len,
- __uint32_t flags)
-{
- char *kbuf;
- int error = EFAULT;
-
- if (IS_RDONLY(&vp->v_inode))
- return -EROFS;
- if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode))
- return EPERM;
- if (len > XATTR_SIZE_MAX)
- return EINVAL;
-
- kbuf = kmalloc(len, GFP_KERNEL);
- if (!kbuf)
- return ENOMEM;
-
- if (copy_from_user(kbuf, ubuf, len))
- goto out_kfree;
-
- VOP_ATTR_SET(vp, name, kbuf, len, flags, NULL, error);
-
- out_kfree:
- kfree(kbuf);
- return error;
-}
-
-STATIC int
-xfs_attrmulti_attr_remove(
- struct vnode *vp,
- char *name,
- __uint32_t flags)
-{
- int error;
-
-
- if (IS_RDONLY(&vp->v_inode))
- return -EROFS;
- if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode))
- return EPERM;
-
- VOP_ATTR_REMOVE(vp, name, flags, NULL, error);
- return error;
-}
-
-STATIC int
-xfs_attrmulti_by_handle(
- xfs_mount_t *mp,
- void __user *arg,
- struct file *parfilp,
- struct inode *parinode)
-{
- int error;
- xfs_attr_multiop_t *ops;
- xfs_fsop_attrmulti_handlereq_t am_hreq;
- struct inode *inode;
- vnode_t *vp;
- unsigned int i, size;
- char *attr_name;
-
- if (!capable(CAP_SYS_ADMIN))
- return -XFS_ERROR(EPERM);
- if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
- return -XFS_ERROR(EFAULT);
-
- error = xfs_vget_fsop_handlereq(mp, parinode, &am_hreq.hreq, &vp, &inode);
- if (error)
- goto out;
-
- error = E2BIG;
- size = am_hreq.opcount * sizeof(attr_multiop_t);
- if (!size || size > 16 * PAGE_SIZE)
- goto out_vn_rele;
-
- error = ENOMEM;
- ops = kmalloc(size, GFP_KERNEL);
- if (!ops)
- goto out_vn_rele;
-
- error = EFAULT;
- if (copy_from_user(ops, am_hreq.ops, size))
- goto out_kfree_ops;
-
- attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
- if (!attr_name)
- goto out_kfree_ops;
-
-
- error = 0;
- for (i = 0; i < am_hreq.opcount; i++) {
- ops[i].am_error = strncpy_from_user(attr_name,
- ops[i].am_attrname, MAXNAMELEN);
- if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
- error = -ERANGE;
- if (ops[i].am_error < 0)
- break;
-
- switch (ops[i].am_opcode) {
- case ATTR_OP_GET:
- ops[i].am_error = xfs_attrmulti_attr_get(vp,
- attr_name, ops[i].am_attrvalue,
- &ops[i].am_length, ops[i].am_flags);
- break;
- case ATTR_OP_SET:
- ops[i].am_error = xfs_attrmulti_attr_set(vp,
- attr_name, ops[i].am_attrvalue,
- ops[i].am_length, ops[i].am_flags);
- break;
- case ATTR_OP_REMOVE:
- ops[i].am_error = xfs_attrmulti_attr_remove(vp,
- attr_name, ops[i].am_flags);
- break;
- default:
- ops[i].am_error = EINVAL;
- }
- }
-
- if (copy_to_user(am_hreq.ops, ops, size))
- error = XFS_ERROR(EFAULT);
-
- kfree(attr_name);
- out_kfree_ops:
- kfree(ops);
- out_vn_rele:
- VN_RELE(vp);
- out:
- return -error;
-}
-
-/* prototypes for a few of the stack-hungry cases that have
- * their own functions. Functions are defined after their use
- * so gcc doesn't get fancy and inline them with -03 */
-
-STATIC int
-xfs_ioc_space(
- bhv_desc_t *bdp,
- vnode_t *vp,
- struct file *filp,
- int flags,
- unsigned int cmd,
- void __user *arg);
-
-STATIC int
-xfs_ioc_bulkstat(
- xfs_mount_t *mp,
- unsigned int cmd,
- void __user *arg);
-
-STATIC int
-xfs_ioc_fsgeometry_v1(
- xfs_mount_t *mp,
- void __user *arg);
-
-STATIC int
-xfs_ioc_fsgeometry(
- xfs_mount_t *mp,
- void __user *arg);
-
-STATIC int
-xfs_ioc_xattr(
- vnode_t *vp,
- xfs_inode_t *ip,
- struct file *filp,
- unsigned int cmd,
- void __user *arg);
-
-STATIC int
-xfs_ioc_getbmap(
- bhv_desc_t *bdp,
- struct file *filp,
- int flags,
- unsigned int cmd,
- void __user *arg);
-
-STATIC int
-xfs_ioc_getbmapx(
- bhv_desc_t *bdp,
- void __user *arg);
-
-int
-xfs_ioctl(
- bhv_desc_t *bdp,
- struct inode *inode,
- struct file *filp,
- int ioflags,
- unsigned int cmd,
- void __user *arg)
-{
- int error;
- vnode_t *vp;
- xfs_inode_t *ip;
- xfs_mount_t *mp;
-
- vp = LINVFS_GET_VP(inode);
-
- vn_trace_entry(vp, "xfs_ioctl", (inst_t *)__return_address);
-
- ip = XFS_BHVTOI(bdp);
- mp = ip->i_mount;
-
- switch (cmd) {
-
- case XFS_IOC_ALLOCSP:
- case XFS_IOC_FREESP:
- case XFS_IOC_RESVSP:
- case XFS_IOC_UNRESVSP:
- case XFS_IOC_ALLOCSP64:
- case XFS_IOC_FREESP64:
- case XFS_IOC_RESVSP64:
- case XFS_IOC_UNRESVSP64:
- /*
- * Only allow the sys admin to reserve space unless
- * unwritten extents are enabled.
- */
- if (!XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- return xfs_ioc_space(bdp, vp, filp, ioflags, cmd, arg);
-
- case XFS_IOC_DIOINFO: {
- struct dioattr da;
- xfs_buftarg_t *target =
- (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
-
- da.d_mem = da.d_miniosz = 1 << target->bt_sshift;
- da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
-
- if (copy_to_user(arg, &da, sizeof(da)))
- return -XFS_ERROR(EFAULT);
- return 0;
- }
-
- case XFS_IOC_FSBULKSTAT_SINGLE:
- case XFS_IOC_FSBULKSTAT:
- case XFS_IOC_FSINUMBERS:
- return xfs_ioc_bulkstat(mp, cmd, arg);
-
- case XFS_IOC_FSGEOMETRY_V1:
- return xfs_ioc_fsgeometry_v1(mp, arg);
-
- case XFS_IOC_FSGEOMETRY:
- return xfs_ioc_fsgeometry(mp, arg);
-
- case XFS_IOC_GETVERSION:
- case XFS_IOC_GETXFLAGS:
- case XFS_IOC_SETXFLAGS:
- case XFS_IOC_FSGETXATTR:
- case XFS_IOC_FSSETXATTR:
- case XFS_IOC_FSGETXATTRA:
- return xfs_ioc_xattr(vp, ip, filp, cmd, arg);
-
- case XFS_IOC_FSSETDM: {
- struct fsdmidata dmi;
-
- if (copy_from_user(&dmi, arg, sizeof(dmi)))
- return -XFS_ERROR(EFAULT);
-
- error = xfs_set_dmattrs(bdp, dmi.fsd_dmevmask, dmi.fsd_dmstate,
- NULL);
- return -error;
- }
-
- case XFS_IOC_GETBMAP:
- case XFS_IOC_GETBMAPA:
- return xfs_ioc_getbmap(bdp, filp, ioflags, cmd, arg);
-
- case XFS_IOC_GETBMAPX:
- return xfs_ioc_getbmapx(bdp, arg);
-
- case XFS_IOC_FD_TO_HANDLE:
- case XFS_IOC_PATH_TO_HANDLE:
- case XFS_IOC_PATH_TO_FSHANDLE:
- return xfs_find_handle(cmd, arg);
-
- case XFS_IOC_OPEN_BY_HANDLE:
- return xfs_open_by_handle(mp, arg, filp, inode);
-
- case XFS_IOC_FSSETDM_BY_HANDLE:
- return xfs_fssetdm_by_handle(mp, arg, filp, inode);
-
- case XFS_IOC_READLINK_BY_HANDLE:
- return xfs_readlink_by_handle(mp, arg, filp, inode);
-
- case XFS_IOC_ATTRLIST_BY_HANDLE:
- return xfs_attrlist_by_handle(mp, arg, filp, inode);
-
- case XFS_IOC_ATTRMULTI_BY_HANDLE:
- return xfs_attrmulti_by_handle(mp, arg, filp, inode);
-
- case XFS_IOC_SWAPEXT: {
- error = xfs_swapext((struct xfs_swapext __user *)arg);
- return -error;
- }
-
- case XFS_IOC_FSCOUNTS: {
- xfs_fsop_counts_t out;
-
- error = xfs_fs_counts(mp, &out);
- if (error)
- return -error;
-
- if (copy_to_user(arg, &out, sizeof(out)))
- return -XFS_ERROR(EFAULT);
- return 0;
- }
-
- case XFS_IOC_SET_RESBLKS: {
- xfs_fsop_resblks_t inout;
- __uint64_t in;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (copy_from_user(&inout, arg, sizeof(inout)))
- return -XFS_ERROR(EFAULT);
-
- /* input parameter is passed in resblks field of structure */
- in = inout.resblks;
- error = xfs_reserve_blocks(mp, &in, &inout);
- if (error)
- return -error;
-
- if (copy_to_user(arg, &inout, sizeof(inout)))
- return -XFS_ERROR(EFAULT);
- return 0;
- }
-
- case XFS_IOC_GET_RESBLKS: {
- xfs_fsop_resblks_t out;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- error = xfs_reserve_blocks(mp, NULL, &out);
- if (error)
- return -error;
-
- if (copy_to_user(arg, &out, sizeof(out)))
- return -XFS_ERROR(EFAULT);
-
- return 0;
- }
-
- case XFS_IOC_FSGROWFSDATA: {
- xfs_growfs_data_t in;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (copy_from_user(&in, arg, sizeof(in)))
- return -XFS_ERROR(EFAULT);
-
- error = xfs_growfs_data(mp, &in);
- return -error;
- }
-
- case XFS_IOC_FSGROWFSLOG: {
- xfs_growfs_log_t in;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (copy_from_user(&in, arg, sizeof(in)))
- return -XFS_ERROR(EFAULT);
-
- error = xfs_growfs_log(mp, &in);
- return -error;
- }
-
- case XFS_IOC_FSGROWFSRT: {
- xfs_growfs_rt_t in;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (copy_from_user(&in, arg, sizeof(in)))
- return -XFS_ERROR(EFAULT);
-
- error = xfs_growfs_rt(mp, &in);
- return -error;
- }
-
- case XFS_IOC_FREEZE:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (inode->i_sb->s_frozen == SB_UNFROZEN)
- freeze_bdev(inode->i_sb->s_bdev);
- return 0;
-
- case XFS_IOC_THAW:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- if (inode->i_sb->s_frozen != SB_UNFROZEN)
- thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
- return 0;
-
- case XFS_IOC_GOINGDOWN: {
- __uint32_t in;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (get_user(in, (__uint32_t __user *)arg))
- return -XFS_ERROR(EFAULT);
-
- error = xfs_fs_goingdown(mp, in);
- return -error;
- }
-
- case XFS_IOC_ERROR_INJECTION: {
- xfs_error_injection_t in;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (copy_from_user(&in, arg, sizeof(in)))
- return -XFS_ERROR(EFAULT);
-
- error = xfs_errortag_add(in.errtag, mp);
- return -error;
- }
-
- case XFS_IOC_ERROR_CLEARALL:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- error = xfs_errortag_clearall(mp);
- return -error;
-
- default:
- return -ENOTTY;
- }
-}
-
-STATIC int
-xfs_ioc_space(
- bhv_desc_t *bdp,
- vnode_t *vp,
- struct file *filp,
- int ioflags,
- unsigned int cmd,
- void __user *arg)
-{
- xfs_flock64_t bf;
- int attr_flags = 0;
- int error;
-
- if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
- return -XFS_ERROR(EPERM);
-
- if (!(filp->f_mode & FMODE_WRITE))
- return -XFS_ERROR(EBADF);
-
- if (!VN_ISREG(vp))
- return -XFS_ERROR(EINVAL);
-
- if (copy_from_user(&bf, arg, sizeof(bf)))
- return -XFS_ERROR(EFAULT);
-
- if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
- attr_flags |= ATTR_NONBLOCK;
- if (ioflags & IO_INVIS)
- attr_flags |= ATTR_DMI;
-
- error = xfs_change_file_space(bdp, cmd, &bf, filp->f_pos,
- NULL, attr_flags);
- return -error;
-}
-
-STATIC int
-xfs_ioc_bulkstat(
- xfs_mount_t *mp,
- unsigned int cmd,
- void __user *arg)
-{
- xfs_fsop_bulkreq_t bulkreq;
- int count; /* # of records returned */
- xfs_ino_t inlast; /* last inode number */
- int done;
- int error;
-
- /* done = 1 if there are more stats to get and if bulkstat */
- /* should be called again (unused here, but used in dmapi) */
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -XFS_ERROR(EIO);
-
- if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t)))
- return -XFS_ERROR(EFAULT);
-
- if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
- return -XFS_ERROR(EFAULT);
-
- if ((count = bulkreq.icount) <= 0)
- return -XFS_ERROR(EINVAL);
-
- if (cmd == XFS_IOC_FSINUMBERS)
- error = xfs_inumbers(mp, &inlast, &count,
- bulkreq.ubuffer);
- else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
- error = xfs_bulkstat_single(mp, &inlast,
- bulkreq.ubuffer, &done);
- else { /* XFS_IOC_FSBULKSTAT */
- if (count == 1 && inlast != 0) {
- inlast++;
- error = xfs_bulkstat_single(mp, &inlast,
- bulkreq.ubuffer, &done);
- } else {
- error = xfs_bulkstat(mp, &inlast, &count,
- (bulkstat_one_pf)xfs_bulkstat_one, NULL,
- sizeof(xfs_bstat_t), bulkreq.ubuffer,
- BULKSTAT_FG_QUICK, &done);
- }
- }
-
- if (error)
- return -error;
-
- if (bulkreq.ocount != NULL) {
- if (copy_to_user(bulkreq.lastip, &inlast,
- sizeof(xfs_ino_t)))
- return -XFS_ERROR(EFAULT);
-
- if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
- return -XFS_ERROR(EFAULT);
- }
-
- return 0;
-}
-
-STATIC int
-xfs_ioc_fsgeometry_v1(
- xfs_mount_t *mp,
- void __user *arg)
-{
- xfs_fsop_geom_v1_t fsgeo;
- int error;
-
- error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3);
- if (error)
- return -error;
-
- if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
- return -XFS_ERROR(EFAULT);
- return 0;
-}
-
-STATIC int
-xfs_ioc_fsgeometry(
- xfs_mount_t *mp,
- void __user *arg)
-{
- xfs_fsop_geom_t fsgeo;
- int error;
-
- error = xfs_fs_geometry(mp, &fsgeo, 4);
- if (error)
- return -error;
-
- if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
- return -XFS_ERROR(EFAULT);
- return 0;
-}
-
-/*
- * Linux extended inode flags interface.
- */
-#define LINUX_XFLAG_SYNC 0x00000008 /* Synchronous updates */
-#define LINUX_XFLAG_IMMUTABLE 0x00000010 /* Immutable file */
-#define LINUX_XFLAG_APPEND 0x00000020 /* writes to file may only append */
-#define LINUX_XFLAG_NODUMP 0x00000040 /* do not dump file */
-#define LINUX_XFLAG_NOATIME 0x00000080 /* do not update atime */
-
-STATIC unsigned int
-xfs_merge_ioc_xflags(
- unsigned int flags,
- unsigned int start)
-{
- unsigned int xflags = start;
-
- if (flags & LINUX_XFLAG_IMMUTABLE)
- xflags |= XFS_XFLAG_IMMUTABLE;
- else
- xflags &= ~XFS_XFLAG_IMMUTABLE;
- if (flags & LINUX_XFLAG_APPEND)
- xflags |= XFS_XFLAG_APPEND;
- else
- xflags &= ~XFS_XFLAG_APPEND;
- if (flags & LINUX_XFLAG_SYNC)
- xflags |= XFS_XFLAG_SYNC;
- else
- xflags &= ~XFS_XFLAG_SYNC;
- if (flags & LINUX_XFLAG_NOATIME)
- xflags |= XFS_XFLAG_NOATIME;
- else
- xflags &= ~XFS_XFLAG_NOATIME;
- if (flags & LINUX_XFLAG_NODUMP)
- xflags |= XFS_XFLAG_NODUMP;
- else
- xflags &= ~XFS_XFLAG_NODUMP;
-
- return xflags;
-}
-
-STATIC unsigned int
-xfs_di2lxflags(
- __uint16_t di_flags)
-{
- unsigned int flags = 0;
-
- if (di_flags & XFS_DIFLAG_IMMUTABLE)
- flags |= LINUX_XFLAG_IMMUTABLE;
- if (di_flags & XFS_DIFLAG_APPEND)
- flags |= LINUX_XFLAG_APPEND;
- if (di_flags & XFS_DIFLAG_SYNC)
- flags |= LINUX_XFLAG_SYNC;
- if (di_flags & XFS_DIFLAG_NOATIME)
- flags |= LINUX_XFLAG_NOATIME;
- if (di_flags & XFS_DIFLAG_NODUMP)
- flags |= LINUX_XFLAG_NODUMP;
- return flags;
-}
-
-STATIC int
-xfs_ioc_xattr(
- vnode_t *vp,
- xfs_inode_t *ip,
- struct file *filp,
- unsigned int cmd,
- void __user *arg)
-{
- struct fsxattr fa;
- vattr_t va;
- int error;
- int attr_flags;
- unsigned int flags;
-
- switch (cmd) {
- case XFS_IOC_FSGETXATTR: {
- va.va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | \
- XFS_AT_NEXTENTS | XFS_AT_PROJID;
- VOP_GETATTR(vp, &va, 0, NULL, error);
- if (error)
- return -error;
-
- fa.fsx_xflags = va.va_xflags;
- fa.fsx_extsize = va.va_extsize;
- fa.fsx_nextents = va.va_nextents;
- fa.fsx_projid = va.va_projid;
-
- if (copy_to_user(arg, &fa, sizeof(fa)))
- return -XFS_ERROR(EFAULT);
- return 0;
- }
-
- case XFS_IOC_FSSETXATTR: {
- if (copy_from_user(&fa, arg, sizeof(fa)))
- return -XFS_ERROR(EFAULT);
-
- attr_flags = 0;
- if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
- attr_flags |= ATTR_NONBLOCK;
-
- va.va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID;
- va.va_xflags = fa.fsx_xflags;
- va.va_extsize = fa.fsx_extsize;
- va.va_projid = fa.fsx_projid;
-
- VOP_SETATTR(vp, &va, attr_flags, NULL, error);
- if (!error)
- vn_revalidate(vp); /* update Linux inode flags */
- return -error;
- }
-
- case XFS_IOC_FSGETXATTRA: {
- va.va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | \
- XFS_AT_ANEXTENTS | XFS_AT_PROJID;
- VOP_GETATTR(vp, &va, 0, NULL, error);
- if (error)
- return -error;
-
- fa.fsx_xflags = va.va_xflags;
- fa.fsx_extsize = va.va_extsize;
- fa.fsx_nextents = va.va_anextents;
- fa.fsx_projid = va.va_projid;
-
- if (copy_to_user(arg, &fa, sizeof(fa)))
- return -XFS_ERROR(EFAULT);
- return 0;
- }
-
- case XFS_IOC_GETXFLAGS: {
- flags = xfs_di2lxflags(ip->i_d.di_flags);
- if (copy_to_user(arg, &flags, sizeof(flags)))
- return -XFS_ERROR(EFAULT);
- return 0;
- }
-
- case XFS_IOC_SETXFLAGS: {
- if (copy_from_user(&flags, arg, sizeof(flags)))
- return -XFS_ERROR(EFAULT);
-
- if (flags & ~(LINUX_XFLAG_IMMUTABLE | LINUX_XFLAG_APPEND | \
- LINUX_XFLAG_NOATIME | LINUX_XFLAG_NODUMP | \
- LINUX_XFLAG_SYNC))
- return -XFS_ERROR(EOPNOTSUPP);
-
- attr_flags = 0;
- if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
- attr_flags |= ATTR_NONBLOCK;
-
- va.va_mask = XFS_AT_XFLAGS;
- va.va_xflags = xfs_merge_ioc_xflags(flags,
- xfs_ip2xflags(ip));
-
- VOP_SETATTR(vp, &va, attr_flags, NULL, error);
- if (!error)
- vn_revalidate(vp); /* update Linux inode flags */
- return -error;
- }
-
- case XFS_IOC_GETVERSION: {
- flags = LINVFS_GET_IP(vp)->i_generation;
- if (copy_to_user(arg, &flags, sizeof(flags)))
- return -XFS_ERROR(EFAULT);
- return 0;
- }
-
- default:
- return -ENOTTY;
- }
-}
-
-STATIC int
-xfs_ioc_getbmap(
- bhv_desc_t *bdp,
- struct file *filp,
- int ioflags,
- unsigned int cmd,
- void __user *arg)
-{
- struct getbmap bm;
- int iflags;
- int error;
-
- if (copy_from_user(&bm, arg, sizeof(bm)))
- return -XFS_ERROR(EFAULT);
-
- if (bm.bmv_count < 2)
- return -XFS_ERROR(EINVAL);
-
- iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
- if (ioflags & IO_INVIS)
- iflags |= BMV_IF_NO_DMAPI_READ;
-
- error = xfs_getbmap(bdp, &bm, (struct getbmap __user *)arg+1, iflags);
- if (error)
- return -error;
-
- if (copy_to_user(arg, &bm, sizeof(bm)))
- return -XFS_ERROR(EFAULT);
- return 0;
-}
-
-STATIC int
-xfs_ioc_getbmapx(
- bhv_desc_t *bdp,
- void __user *arg)
-{
- struct getbmapx bmx;
- struct getbmap bm;
- int iflags;
- int error;
-
- if (copy_from_user(&bmx, arg, sizeof(bmx)))
- return -XFS_ERROR(EFAULT);
-
- if (bmx.bmv_count < 2)
- return -XFS_ERROR(EINVAL);
-
- /*
- * Map input getbmapx structure to a getbmap
- * structure for xfs_getbmap.
- */
- GETBMAP_CONVERT(bmx, bm);
-
- iflags = bmx.bmv_iflags;
-
- if (iflags & (~BMV_IF_VALID))
- return -XFS_ERROR(EINVAL);
-
- iflags |= BMV_IF_EXTENDED;
-
- error = xfs_getbmap(bdp, &bm, (struct getbmapx __user *)arg+1, iflags);
- if (error)
- return -error;
-
- GETBMAP_CONVERT(bm, bmx);
-
- if (copy_to_user(arg, &bmx, sizeof(bmx)))
- return -XFS_ERROR(EFAULT);
-
- return 0;
-}
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
deleted file mode 100644
index a7c9ba1a9f7..00000000000
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Copyright (c) 2004-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include <linux/config.h>
-#include <linux/compat.h>
-#include <linux/init.h>
-#include <linux/ioctl.h>
-#include <linux/syscalls.h>
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <asm/uaccess.h>
-#include "xfs.h"
-#include "xfs_types.h"
-#include "xfs_fs.h"
-#include "xfs_vfs.h"
-#include "xfs_vnode.h"
-#include "xfs_dfrag.h"
-
-#define _NATIVE_IOC(cmd, type) \
- _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
-
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
-#define BROKEN_X86_ALIGNMENT
-/* on ia32 l_start is on a 32-bit boundary */
-typedef struct xfs_flock64_32 {
- __s16 l_type;
- __s16 l_whence;
- __s64 l_start __attribute__((packed));
- /* len == 0 means until end of file */
- __s64 l_len __attribute__((packed));
- __s32 l_sysid;
- __u32 l_pid;
- __s32 l_pad[4]; /* reserve area */
-} xfs_flock64_32_t;
-
-#define XFS_IOC_ALLOCSP_32 _IOW ('X', 10, struct xfs_flock64_32)
-#define XFS_IOC_FREESP_32 _IOW ('X', 11, struct xfs_flock64_32)
-#define XFS_IOC_ALLOCSP64_32 _IOW ('X', 36, struct xfs_flock64_32)
-#define XFS_IOC_FREESP64_32 _IOW ('X', 37, struct xfs_flock64_32)
-#define XFS_IOC_RESVSP_32 _IOW ('X', 40, struct xfs_flock64_32)
-#define XFS_IOC_UNRESVSP_32 _IOW ('X', 41, struct xfs_flock64_32)
-#define XFS_IOC_RESVSP64_32 _IOW ('X', 42, struct xfs_flock64_32)
-#define XFS_IOC_UNRESVSP64_32 _IOW ('X', 43, struct xfs_flock64_32)
-
-/* just account for different alignment */
-STATIC unsigned long
-xfs_ioctl32_flock(
- unsigned long arg)
-{
- xfs_flock64_32_t __user *p32 = (void __user *)arg;
- xfs_flock64_t __user *p = compat_alloc_user_space(sizeof(*p));
-
- if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) ||
- copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) ||
- copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) ||
- copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) ||
- copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) ||
- copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) ||
- copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32)))
- return -EFAULT;
-
- return (unsigned long)p;
-}
-
-#else
-
-typedef struct xfs_fsop_bulkreq32 {
- compat_uptr_t lastip; /* last inode # pointer */
- __s32 icount; /* count of entries in buffer */
- compat_uptr_t ubuffer; /* user buffer for inode desc. */
- __s32 ocount; /* output count pointer */
-} xfs_fsop_bulkreq32_t;
-
-STATIC unsigned long
-xfs_ioctl32_bulkstat(
- unsigned long arg)
-{
- xfs_fsop_bulkreq32_t __user *p32 = (void __user *)arg;
- xfs_fsop_bulkreq_t __user *p = compat_alloc_user_space(sizeof(*p));
- u32 addr;
-
- if (get_user(addr, &p32->lastip) ||
- put_user(compat_ptr(addr), &p->lastip) ||
- copy_in_user(&p->icount, &p32->icount, sizeof(s32)) ||
- get_user(addr, &p32->ubuffer) ||
- put_user(compat_ptr(addr), &p->ubuffer) ||
- get_user(addr, &p32->ocount) ||
- put_user(compat_ptr(addr), &p->ocount))
- return -EFAULT;
-
- return (unsigned long)p;
-}
-#endif
-
-STATIC long
-__linvfs_compat_ioctl(int mode, struct file *f, unsigned cmd, unsigned long arg)
-{
- int error;
- struct inode *inode = f->f_dentry->d_inode;
- vnode_t *vp = LINVFS_GET_VP(inode);
-
- switch (cmd) {
- case XFS_IOC_DIOINFO:
- case XFS_IOC_FSGEOMETRY_V1:
- case XFS_IOC_FSGEOMETRY:
- case XFS_IOC_GETVERSION:
- case XFS_IOC_GETXFLAGS:
- case XFS_IOC_SETXFLAGS:
- case XFS_IOC_FSGETXATTR:
- case XFS_IOC_FSSETXATTR:
- case XFS_IOC_FSGETXATTRA:
- case XFS_IOC_FSSETDM:
- case XFS_IOC_GETBMAP:
- case XFS_IOC_GETBMAPA:
- case XFS_IOC_GETBMAPX:
-/* not handled
- case XFS_IOC_FD_TO_HANDLE:
- case XFS_IOC_PATH_TO_HANDLE:
- case XFS_IOC_PATH_TO_FSHANDLE:
- case XFS_IOC_OPEN_BY_HANDLE:
- case XFS_IOC_FSSETDM_BY_HANDLE:
- case XFS_IOC_READLINK_BY_HANDLE:
- case XFS_IOC_ATTRLIST_BY_HANDLE:
- case XFS_IOC_ATTRMULTI_BY_HANDLE:
-*/
- case XFS_IOC_FSCOUNTS:
- case XFS_IOC_SET_RESBLKS:
- case XFS_IOC_GET_RESBLKS:
- case XFS_IOC_FSGROWFSDATA:
- case XFS_IOC_FSGROWFSLOG:
- case XFS_IOC_FSGROWFSRT:
- case XFS_IOC_FREEZE:
- case XFS_IOC_THAW:
- case XFS_IOC_GOINGDOWN:
- case XFS_IOC_ERROR_INJECTION:
- case XFS_IOC_ERROR_CLEARALL:
- break;
-
-#ifdef BROKEN_X86_ALIGNMENT
- /* xfs_flock_t has wrong u32 vs u64 alignment */
- case XFS_IOC_ALLOCSP_32:
- case XFS_IOC_FREESP_32:
- case XFS_IOC_ALLOCSP64_32:
- case XFS_IOC_FREESP64_32:
- case XFS_IOC_RESVSP_32:
- case XFS_IOC_UNRESVSP_32:
- case XFS_IOC_RESVSP64_32:
- case XFS_IOC_UNRESVSP64_32:
- arg = xfs_ioctl32_flock(arg);
- cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
- break;
-
-#else /* These are handled fine if no alignment issues */
- case XFS_IOC_ALLOCSP:
- case XFS_IOC_FREESP:
- case XFS_IOC_RESVSP:
- case XFS_IOC_UNRESVSP:
- case XFS_IOC_ALLOCSP64:
- case XFS_IOC_FREESP64:
- case XFS_IOC_RESVSP64:
- case XFS_IOC_UNRESVSP64:
- break;
-
- /* xfs_bstat_t still has wrong u32 vs u64 alignment */
- case XFS_IOC_SWAPEXT:
- break;
-
- case XFS_IOC_FSBULKSTAT_SINGLE:
- case XFS_IOC_FSBULKSTAT:
- case XFS_IOC_FSINUMBERS:
- arg = xfs_ioctl32_bulkstat(arg);
- break;
-#endif
- default:
- return -ENOIOCTLCMD;
- }
-
- VOP_IOCTL(vp, inode, f, mode, cmd, (void __user *)arg, error);
- VMODIFY(vp);
-
- return error;
-}
-
-long
-linvfs_compat_ioctl(
- struct file *f,
- unsigned cmd,
- unsigned long arg)
-{
- return __linvfs_compat_ioctl(0, f, cmd, arg);
-}
-
-long
-linvfs_compat_invis_ioctl(
- struct file *f,
- unsigned cmd,
- unsigned long arg)
-{
- return __linvfs_compat_ioctl(IO_INVIS, f, cmd, arg);
-}
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
deleted file mode 100644
index d7f6f2d8ac8..00000000000
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ /dev/null
@@ -1,860 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-
-#include <linux/capability.h>
-#include <linux/xattr.h>
-#include <linux/namei.h>
-#include <linux/security.h>
-
-/*
- * Get a XFS inode from a given vnode.
- */
-xfs_inode_t *
-xfs_vtoi(
- struct vnode *vp)
-{
- bhv_desc_t *bdp;
-
- bdp = bhv_lookup_range(VN_BHV_HEAD(vp),
- VNODE_POSITION_XFS, VNODE_POSITION_XFS);
- if (unlikely(bdp == NULL))
- return NULL;
- return XFS_BHVTOI(bdp);
-}
-
-/*
- * Bring the atime in the XFS inode uptodate.
- * Used before logging the inode to disk or when the Linux inode goes away.
- */
-void
-xfs_synchronize_atime(
- xfs_inode_t *ip)
-{
- vnode_t *vp;
-
- vp = XFS_ITOV_NULL(ip);
- if (vp) {
- struct inode *inode = &vp->v_inode;
- ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
- ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
- }
-}
-
-/*
- * Change the requested timestamp in the given inode.
- * We don't lock across timestamp updates, and we don't log them but
- * we do record the fact that there is dirty information in core.
- *
- * NOTE -- callers MUST combine XFS_ICHGTIME_MOD or XFS_ICHGTIME_CHG
- * with XFS_ICHGTIME_ACC to be sure that access time
- * update will take. Calling first with XFS_ICHGTIME_ACC
- * and then XFS_ICHGTIME_MOD may fail to modify the access
- * timestamp if the filesystem is mounted noacctm.
- */
-void
-xfs_ichgtime(
- xfs_inode_t *ip,
- int flags)
-{
- struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip));
- timespec_t tv;
-
- nanotime(&tv);
- if (flags & XFS_ICHGTIME_MOD) {
- inode->i_mtime = tv;
- ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
- ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
- }
- if (flags & XFS_ICHGTIME_ACC) {
- inode->i_atime = tv;
- ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec;
- ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec;
- }
- if (flags & XFS_ICHGTIME_CHG) {
- inode->i_ctime = tv;
- ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
- ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec;
- }
-
- /*
- * We update the i_update_core field _after_ changing
- * the timestamps in order to coordinate properly with
- * xfs_iflush() so that we don't lose timestamp updates.
- * This keeps us from having to hold the inode lock
- * while doing this. We use the SYNCHRONIZE macro to
- * ensure that the compiler does not reorder the update
- * of i_update_core above the timestamp updates above.
- */
- SYNCHRONIZE();
- ip->i_update_core = 1;
- if (!(inode->i_state & I_LOCK))
- mark_inode_dirty_sync(inode);
-}
-
-/*
- * Variant on the above which avoids querying the system clock
- * in situations where we know the Linux inode timestamps have
- * just been updated (and so we can update our inode cheaply).
- */
-void
-xfs_ichgtime_fast(
- xfs_inode_t *ip,
- struct inode *inode,
- int flags)
-{
- timespec_t *tvp;
-
- /*
- * Atime updates for read() & friends are handled lazily now, and
- * explicit updates must go through xfs_ichgtime()
- */
- ASSERT((flags & XFS_ICHGTIME_ACC) == 0);
-
- /*
- * We're not supposed to change timestamps in readonly-mounted
- * filesystems. Throw it away if anyone asks us.
- */
- if (unlikely(IS_RDONLY(inode)))
- return;
-
- if (flags & XFS_ICHGTIME_MOD) {
- tvp = &inode->i_mtime;
- ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec;
- ip->i_d.di_mtime.t_nsec = (__int32_t)tvp->tv_nsec;
- }
- if (flags & XFS_ICHGTIME_CHG) {
- tvp = &inode->i_ctime;
- ip->i_d.di_ctime.t_sec = (__int32_t)tvp->tv_sec;
- ip->i_d.di_ctime.t_nsec = (__int32_t)tvp->tv_nsec;
- }
-
- /*
- * We update the i_update_core field _after_ changing
- * the timestamps in order to coordinate properly with
- * xfs_iflush() so that we don't lose timestamp updates.
- * This keeps us from having to hold the inode lock
- * while doing this. We use the SYNCHRONIZE macro to
- * ensure that the compiler does not reorder the update
- * of i_update_core above the timestamp updates above.
- */
- SYNCHRONIZE();
- ip->i_update_core = 1;
- if (!(inode->i_state & I_LOCK))
- mark_inode_dirty_sync(inode);
-}
-
-
-/*
- * Pull the link count and size up from the xfs inode to the linux inode
- */
-STATIC void
-validate_fields(
- struct inode *ip)
-{
- vnode_t *vp = LINVFS_GET_VP(ip);
- vattr_t va;
- int error;
-
- va.va_mask = XFS_AT_NLINK|XFS_AT_SIZE|XFS_AT_NBLOCKS;
- VOP_GETATTR(vp, &va, ATTR_LAZY, NULL, error);
- if (likely(!error)) {
- ip->i_nlink = va.va_nlink;
- ip->i_blocks = va.va_nblocks;
-
- /* we're under i_mutex so i_size can't change under us */
- if (i_size_read(ip) != va.va_size)
- i_size_write(ip, va.va_size);
- }
-}
-
-/*
- * Hook in SELinux. This is not quite correct yet, what we really need
- * here (as we do for default ACLs) is a mechanism by which creation of
- * these attrs can be journalled at inode creation time (along with the
- * inode, of course, such that log replay can't cause these to be lost).
- */
-STATIC int
-linvfs_init_security(
- struct vnode *vp,
- struct inode *dir)
-{
- struct inode *ip = LINVFS_GET_IP(vp);
- size_t length;
- void *value;
- char *name;
- int error;
-
- error = security_inode_init_security(ip, dir, &name, &value, &length);
- if (error) {
- if (error == -EOPNOTSUPP)
- return 0;
- return -error;
- }
-
- VOP_ATTR_SET(vp, name, value, length, ATTR_SECURE, NULL, error);
- if (!error)
- VMODIFY(vp);
-
- kfree(name);
- kfree(value);
- return error;
-}
-
-/*
- * Determine whether a process has a valid fs_struct (kernel daemons
- * like knfsd don't have an fs_struct).
- *
- * XXX(hch): nfsd is broken, better fix it instead.
- */
-STATIC inline int
-has_fs_struct(struct task_struct *task)
-{
- return (task->fs != init_task.fs);
-}
-
-STATIC inline void
-cleanup_inode(
- vnode_t *dvp,
- vnode_t *vp,
- struct dentry *dentry,
- int mode)
-{
- struct dentry teardown = {};
- int err2;
-
- /* Oh, the horror.
- * If we can't add the ACL or we fail in
- * linvfs_init_security we must back out.
- * ENOSPC can hit here, among other things.
- */
- teardown.d_inode = LINVFS_GET_IP(vp);
- teardown.d_name = dentry->d_name;
-
- if (S_ISDIR(mode))
- VOP_RMDIR(dvp, &teardown, NULL, err2);
- else
- VOP_REMOVE(dvp, &teardown, NULL, err2);
- VN_RELE(vp);
-}
-
-STATIC int
-linvfs_mknod(
- struct inode *dir,
- struct dentry *dentry,
- int mode,
- dev_t rdev)
-{
- struct inode *ip;
- vattr_t va;
- vnode_t *vp = NULL, *dvp = LINVFS_GET_VP(dir);
- xfs_acl_t *default_acl = NULL;
- attrexists_t test_default_acl = _ACL_DEFAULT_EXISTS;
- int error;
-
- /*
- * Irix uses Missed'em'V split, but doesn't want to see
- * the upper 5 bits of (14bit) major.
- */
- if (!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)
- return -EINVAL;
-
- if (test_default_acl && test_default_acl(dvp)) {
- if (!_ACL_ALLOC(default_acl))
- return -ENOMEM;
- if (!_ACL_GET_DEFAULT(dvp, default_acl)) {
- _ACL_FREE(default_acl);
- default_acl = NULL;
- }
- }
-
- if (IS_POSIXACL(dir) && !default_acl && has_fs_struct(current))
- mode &= ~current->fs->umask;
-
- memset(&va, 0, sizeof(va));
- va.va_mask = XFS_AT_TYPE|XFS_AT_MODE;
- va.va_mode = mode;
-
- switch (mode & S_IFMT) {
- case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK:
- va.va_rdev = sysv_encode_dev(rdev);
- va.va_mask |= XFS_AT_RDEV;
- /*FALLTHROUGH*/
- case S_IFREG:
- VOP_CREATE(dvp, dentry, &va, &vp, NULL, error);
- break;
- case S_IFDIR:
- VOP_MKDIR(dvp, dentry, &va, &vp, NULL, error);
- break;
- default:
- error = EINVAL;
- break;
- }
-
- if (!error)
- {
- error = linvfs_init_security(vp, dir);
- if (error)
- cleanup_inode(dvp, vp, dentry, mode);
- }
-
- if (default_acl) {
- if (!error) {
- error = _ACL_INHERIT(vp, &va, default_acl);
- if (!error)
- VMODIFY(vp);
- else
- cleanup_inode(dvp, vp, dentry, mode);
- }
- _ACL_FREE(default_acl);
- }
-
- if (!error) {
- ASSERT(vp);
- ip = LINVFS_GET_IP(vp);
-
- if (S_ISCHR(mode) || S_ISBLK(mode))
- ip->i_rdev = rdev;
- else if (S_ISDIR(mode))
- validate_fields(ip);
- d_instantiate(dentry, ip);
- validate_fields(dir);
- }
- return -error;
-}
-
-STATIC int
-linvfs_create(
- struct inode *dir,
- struct dentry *dentry,
- int mode,
- struct nameidata *nd)
-{
- return linvfs_mknod(dir, dentry, mode, 0);
-}
-
-STATIC int
-linvfs_mkdir(
- struct inode *dir,
- struct dentry *dentry,
- int mode)
-{
- return linvfs_mknod(dir, dentry, mode|S_IFDIR, 0);
-}
-
-STATIC struct dentry *
-linvfs_lookup(
- struct inode *dir,
- struct dentry *dentry,
- struct nameidata *nd)
-{
- struct vnode *vp = LINVFS_GET_VP(dir), *cvp;
- int error;
-
- if (dentry->d_name.len >= MAXNAMELEN)
- return ERR_PTR(-ENAMETOOLONG);
-
- VOP_LOOKUP(vp, dentry, &cvp, 0, NULL, NULL, error);
- if (error) {
- if (unlikely(error != ENOENT))
- return ERR_PTR(-error);
- d_add(dentry, NULL);
- return NULL;
- }
-
- return d_splice_alias(LINVFS_GET_IP(cvp), dentry);
-}
-
-STATIC int
-linvfs_link(
- struct dentry *old_dentry,
- struct inode *dir,
- struct dentry *dentry)
-{
- struct inode *ip; /* inode of guy being linked to */
- vnode_t *tdvp; /* target directory for new name/link */
- vnode_t *vp; /* vp of name being linked */
- int error;
-
- ip = old_dentry->d_inode; /* inode being linked to */
- if (S_ISDIR(ip->i_mode))
- return -EPERM;
-
- tdvp = LINVFS_GET_VP(dir);
- vp = LINVFS_GET_VP(ip);
-
- VOP_LINK(tdvp, vp, dentry, NULL, error);
- if (!error) {
- VMODIFY(tdvp);
- VN_HOLD(vp);
- validate_fields(ip);
- d_instantiate(dentry, ip);
- }
- return -error;
-}
-
-STATIC int
-linvfs_unlink(
- struct inode *dir,
- struct dentry *dentry)
-{
- struct inode *inode;
- vnode_t *dvp; /* directory containing name to remove */
- int error;
-
- inode = dentry->d_inode;
- dvp = LINVFS_GET_VP(dir);
-
- VOP_REMOVE(dvp, dentry, NULL, error);
- if (!error) {
- validate_fields(dir); /* For size only */
- validate_fields(inode);
- }
-
- return -error;
-}
-
-STATIC int
-linvfs_symlink(
- struct inode *dir,
- struct dentry *dentry,
- const char *symname)
-{
- struct inode *ip;
- vattr_t va;
- vnode_t *dvp; /* directory containing name of symlink */
- vnode_t *cvp; /* used to lookup symlink to put in dentry */
- int error;
-
- dvp = LINVFS_GET_VP(dir);
- cvp = NULL;
-
- memset(&va, 0, sizeof(va));
- va.va_mode = S_IFLNK |
- (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
- va.va_mask = XFS_AT_TYPE|XFS_AT_MODE;
-
- error = 0;
- VOP_SYMLINK(dvp, dentry, &va, (char *)symname, &cvp, NULL, error);
- if (likely(!error && cvp)) {
- error = linvfs_init_security(cvp, dir);
- if (likely(!error)) {
- ip = LINVFS_GET_IP(cvp);
- d_instantiate(dentry, ip);
- validate_fields(dir);
- validate_fields(ip);
- }
- }
- return -error;
-}
-
-STATIC int
-linvfs_rmdir(
- struct inode *dir,
- struct dentry *dentry)
-{
- struct inode *inode = dentry->d_inode;
- vnode_t *dvp = LINVFS_GET_VP(dir);
- int error;
-
- VOP_RMDIR(dvp, dentry, NULL, error);
- if (!error) {
- validate_fields(inode);
- validate_fields(dir);
- }
- return -error;
-}
-
-STATIC int
-linvfs_rename(
- struct inode *odir,
- struct dentry *odentry,
- struct inode *ndir,
- struct dentry *ndentry)
-{
- struct inode *new_inode = ndentry->d_inode;
- vnode_t *fvp; /* from directory */
- vnode_t *tvp; /* target directory */
- int error;
-
- fvp = LINVFS_GET_VP(odir);
- tvp = LINVFS_GET_VP(ndir);
-
- VOP_RENAME(fvp, odentry, tvp, ndentry, NULL, error);
- if (error)
- return -error;
-
- if (new_inode)
- validate_fields(new_inode);
-
- validate_fields(odir);
- if (ndir != odir)
- validate_fields(ndir);
- return 0;
-}
-
-/*
- * careful here - this function can get called recursively, so
- * we need to be very careful about how much stack we use.
- * uio is kmalloced for this reason...
- */
-STATIC void *
-linvfs_follow_link(
- struct dentry *dentry,
- struct nameidata *nd)
-{
- vnode_t *vp;
- uio_t *uio;
- iovec_t iov;
- int error;
- char *link;
-
- ASSERT(dentry);
- ASSERT(nd);
-
- link = (char *)kmalloc(MAXPATHLEN+1, GFP_KERNEL);
- if (!link) {
- nd_set_link(nd, ERR_PTR(-ENOMEM));
- return NULL;
- }
-
- uio = (uio_t *)kmalloc(sizeof(uio_t), GFP_KERNEL);
- if (!uio) {
- kfree(link);
- nd_set_link(nd, ERR_PTR(-ENOMEM));
- return NULL;
- }
-
- vp = LINVFS_GET_VP(dentry->d_inode);
-
- iov.iov_base = link;
- iov.iov_len = MAXPATHLEN;
-
- uio->uio_iov = &iov;
- uio->uio_offset = 0;
- uio->uio_segflg = UIO_SYSSPACE;
- uio->uio_resid = MAXPATHLEN;
- uio->uio_iovcnt = 1;
-
- VOP_READLINK(vp, uio, 0, NULL, error);
- if (error) {
- kfree(link);
- link = ERR_PTR(-error);
- } else {
- link[MAXPATHLEN - uio->uio_resid] = '\0';
- }
- kfree(uio);
-
- nd_set_link(nd, link);
- return NULL;
-}
-
-STATIC void
-linvfs_put_link(
- struct dentry *dentry,
- struct nameidata *nd,
- void *p)
-{
- char *s = nd_get_link(nd);
-
- if (!IS_ERR(s))
- kfree(s);
-}
-
-#ifdef CONFIG_XFS_POSIX_ACL
-STATIC int
-linvfs_permission(
- struct inode *inode,
- int mode,
- struct nameidata *nd)
-{
- vnode_t *vp = LINVFS_GET_VP(inode);
- int error;
-
- mode <<= 6; /* convert from linux to vnode access bits */
- VOP_ACCESS(vp, mode, NULL, error);
- return -error;
-}
-#else
-#define linvfs_permission NULL
-#endif
-
-STATIC int
-linvfs_getattr(
- struct vfsmount *mnt,
- struct dentry *dentry,
- struct kstat *stat)
-{
- struct inode *inode = dentry->d_inode;
- vnode_t *vp = LINVFS_GET_VP(inode);
- int error = 0;
-
- if (unlikely(vp->v_flag & VMODIFIED))
- error = vn_revalidate(vp);
- if (!error)
- generic_fillattr(inode, stat);
- return 0;
-}
-
-STATIC int
-linvfs_setattr(
- struct dentry *dentry,
- struct iattr *attr)
-{
- struct inode *inode = dentry->d_inode;
- unsigned int ia_valid = attr->ia_valid;
- vnode_t *vp = LINVFS_GET_VP(inode);
- vattr_t vattr;
- int flags = 0;
- int error;
-
- memset(&vattr, 0, sizeof(vattr_t));
- if (ia_valid & ATTR_UID) {
- vattr.va_mask |= XFS_AT_UID;
- vattr.va_uid = attr->ia_uid;
- }
- if (ia_valid & ATTR_GID) {
- vattr.va_mask |= XFS_AT_GID;
- vattr.va_gid = attr->ia_gid;
- }
- if (ia_valid & ATTR_SIZE) {
- vattr.va_mask |= XFS_AT_SIZE;
- vattr.va_size = attr->ia_size;
- }
- if (ia_valid & ATTR_ATIME) {
- vattr.va_mask |= XFS_AT_ATIME;
- vattr.va_atime = attr->ia_atime;
- if (ia_valid & ATTR_ATIME_SET)
- inode->i_atime = attr->ia_atime;
- }
- if (ia_valid & ATTR_MTIME) {
- vattr.va_mask |= XFS_AT_MTIME;
- vattr.va_mtime = attr->ia_mtime;
- }
- if (ia_valid & ATTR_CTIME) {
- vattr.va_mask |= XFS_AT_CTIME;
- vattr.va_ctime = attr->ia_ctime;
- }
- if (ia_valid & ATTR_MODE) {
- vattr.va_mask |= XFS_AT_MODE;
- vattr.va_mode = attr->ia_mode;
- if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
- inode->i_mode &= ~S_ISGID;
- }
-
- if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET))
- flags |= ATTR_UTIME;
-#ifdef ATTR_NO_BLOCK
- if ((ia_valid & ATTR_NO_BLOCK))
- flags |= ATTR_NONBLOCK;
-#endif
-
- VOP_SETATTR(vp, &vattr, flags, NULL, error);
- if (error)
- return -error;
- vn_revalidate(vp);
- return error;
-}
-
-STATIC void
-linvfs_truncate(
- struct inode *inode)
-{
- block_truncate_page(inode->i_mapping, inode->i_size, linvfs_get_block);
-}
-
-STATIC int
-linvfs_setxattr(
- struct dentry *dentry,
- const char *name,
- const void *data,
- size_t size,
- int flags)
-{
- vnode_t *vp = LINVFS_GET_VP(dentry->d_inode);
- char *attr = (char *)name;
- attrnames_t *namesp;
- int xflags = 0;
- int error;
-
- namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT);
- if (!namesp)
- return -EOPNOTSUPP;
- attr += namesp->attr_namelen;
- error = namesp->attr_capable(vp, NULL);
- if (error)
- return error;
-
- /* Convert Linux syscall to XFS internal ATTR flags */
- if (flags & XATTR_CREATE)
- xflags |= ATTR_CREATE;
- if (flags & XATTR_REPLACE)
- xflags |= ATTR_REPLACE;
- xflags |= namesp->attr_flag;
- return namesp->attr_set(vp, attr, (void *)data, size, xflags);
-}
-
-STATIC ssize_t
-linvfs_getxattr(
- struct dentry *dentry,
- const char *name,
- void *data,
- size_t size)
-{
- vnode_t *vp = LINVFS_GET_VP(dentry->d_inode);
- char *attr = (char *)name;
- attrnames_t *namesp;
- int xflags = 0;
- ssize_t error;
-
- namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT);
- if (!namesp)
- return -EOPNOTSUPP;
- attr += namesp->attr_namelen;
- error = namesp->attr_capable(vp, NULL);
- if (error)
- return error;
-
- /* Convert Linux syscall to XFS internal ATTR flags */
- if (!size) {
- xflags |= ATTR_KERNOVAL;
- data = NULL;
- }
- xflags |= namesp->attr_flag;
- return namesp->attr_get(vp, attr, (void *)data, size, xflags);
-}
-
-STATIC ssize_t
-linvfs_listxattr(
- struct dentry *dentry,
- char *data,
- size_t size)
-{
- vnode_t *vp = LINVFS_GET_VP(dentry->d_inode);
- int error, xflags = ATTR_KERNAMELS;
- ssize_t result;
-
- if (!size)
- xflags |= ATTR_KERNOVAL;
- xflags |= capable(CAP_SYS_ADMIN) ? ATTR_KERNFULLS : ATTR_KERNORMALS;
-
- error = attr_generic_list(vp, data, size, xflags, &result);
- if (error < 0)
- return error;
- return result;
-}
-
-STATIC int
-linvfs_removexattr(
- struct dentry *dentry,
- const char *name)
-{
- vnode_t *vp = LINVFS_GET_VP(dentry->d_inode);
- char *attr = (char *)name;
- attrnames_t *namesp;
- int xflags = 0;
- int error;
-
- namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT);
- if (!namesp)
- return -EOPNOTSUPP;
- attr += namesp->attr_namelen;
- error = namesp->attr_capable(vp, NULL);
- if (error)
- return error;
- xflags |= namesp->attr_flag;
- return namesp->attr_remove(vp, attr, xflags);
-}
-
-
-struct inode_operations linvfs_file_inode_operations = {
- .permission = linvfs_permission,
- .truncate = linvfs_truncate,
- .getattr = linvfs_getattr,
- .setattr = linvfs_setattr,
- .setxattr = linvfs_setxattr,
- .getxattr = linvfs_getxattr,
- .listxattr = linvfs_listxattr,
- .removexattr = linvfs_removexattr,
-};
-
-struct inode_operations linvfs_dir_inode_operations = {
- .create = linvfs_create,
- .lookup = linvfs_lookup,
- .link = linvfs_link,
- .unlink = linvfs_unlink,
- .symlink = linvfs_symlink,
- .mkdir = linvfs_mkdir,
- .rmdir = linvfs_rmdir,
- .mknod = linvfs_mknod,
- .rename = linvfs_rename,
- .permission = linvfs_permission,
- .getattr = linvfs_getattr,
- .setattr = linvfs_setattr,
- .setxattr = linvfs_setxattr,
- .getxattr = linvfs_getxattr,
- .listxattr = linvfs_listxattr,
- .removexattr = linvfs_removexattr,
-};
-
-struct inode_operations linvfs_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = linvfs_follow_link,
- .put_link = linvfs_put_link,
- .permission = linvfs_permission,
- .getattr = linvfs_getattr,
- .setattr = linvfs_setattr,
- .setxattr = linvfs_setxattr,
- .getxattr = linvfs_getxattr,
- .listxattr = linvfs_listxattr,
- .removexattr = linvfs_removexattr,
-};
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
deleted file mode 100644
index e0ab45fbfeb..00000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ /dev/null
@@ -1,1021 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_inode_item.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_iomap.h"
-
-#include <linux/capability.h>
-#include <linux/writeback.h>
-
-
-#if defined(XFS_RW_TRACE)
-void
-xfs_rw_enter_trace(
- int tag,
- xfs_iocore_t *io,
- void *data,
- size_t segs,
- loff_t offset,
- int ioflags)
-{
- xfs_inode_t *ip = XFS_IO_INODE(io);
-
- if (ip->i_rwtrace == NULL)
- return;
- ktrace_enter(ip->i_rwtrace,
- (void *)(unsigned long)tag,
- (void *)ip,
- (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
- (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
- (void *)data,
- (void *)((unsigned long)segs),
- (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
- (void *)((unsigned long)(offset & 0xffffffff)),
- (void *)((unsigned long)ioflags),
- (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)),
- (void *)((unsigned long)(io->io_new_size & 0xffffffff)),
- (void *)NULL,
- (void *)NULL,
- (void *)NULL,
- (void *)NULL,
- (void *)NULL);
-}
-
-void
-xfs_inval_cached_trace(
- xfs_iocore_t *io,
- xfs_off_t offset,
- xfs_off_t len,
- xfs_off_t first,
- xfs_off_t last)
-{
- xfs_inode_t *ip = XFS_IO_INODE(io);
-
- if (ip->i_rwtrace == NULL)
- return;
- ktrace_enter(ip->i_rwtrace,
- (void *)(__psint_t)XFS_INVAL_CACHED,
- (void *)ip,
- (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
- (void *)((unsigned long)(offset & 0xffffffff)),
- (void *)((unsigned long)((len >> 32) & 0xffffffff)),
- (void *)((unsigned long)(len & 0xffffffff)),
- (void *)((unsigned long)((first >> 32) & 0xffffffff)),
- (void *)((unsigned long)(first & 0xffffffff)),
- (void *)((unsigned long)((last >> 32) & 0xffffffff)),
- (void *)((unsigned long)(last & 0xffffffff)),
- (void *)NULL,
- (void *)NULL,
- (void *)NULL,
- (void *)NULL,
- (void *)NULL,
- (void *)NULL);
-}
-#endif
-
-/*
- * xfs_iozero
- *
- * xfs_iozero clears the specified range of buffer supplied,
- * and marks all the affected blocks as valid and modified. If
- * an affected block is not allocated, it will be allocated. If
- * an affected block is not completely overwritten, and is not
- * valid before the operation, it will be read from disk before
- * being partially zeroed.
- */
-STATIC int
-xfs_iozero(
- struct inode *ip, /* inode */
- loff_t pos, /* offset in file */
- size_t count, /* size of data to zero */
- loff_t end_size) /* max file size to set */
-{
- unsigned bytes;
- struct page *page;
- struct address_space *mapping;
- char *kaddr;
- int status;
-
- mapping = ip->i_mapping;
- do {
- unsigned long index, offset;
-
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- index = pos >> PAGE_CACHE_SHIFT;
- bytes = PAGE_CACHE_SIZE - offset;
- if (bytes > count)
- bytes = count;
-
- status = -ENOMEM;
- page = grab_cache_page(mapping, index);
- if (!page)
- break;
-
- kaddr = kmap(page);
- status = mapping->a_ops->prepare_write(NULL, page, offset,
- offset + bytes);
- if (status) {
- goto unlock;
- }
-
- memset((void *) (kaddr + offset), 0, bytes);
- flush_dcache_page(page);
- status = mapping->a_ops->commit_write(NULL, page, offset,
- offset + bytes);
- if (!status) {
- pos += bytes;
- count -= bytes;
- if (pos > i_size_read(ip))
- i_size_write(ip, pos < end_size ? pos : end_size);
- }
-
-unlock:
- kunmap(page);
- unlock_page(page);
- page_cache_release(page);
- if (status)
- break;
- } while (count);
-
- return (-status);
-}
-
-ssize_t /* bytes read, or (-) error */
-xfs_read(
- bhv_desc_t *bdp,
- struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned int segs,
- loff_t *offset,
- int ioflags,
- cred_t *credp)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- size_t size = 0;
- ssize_t ret;
- xfs_fsize_t n;
- xfs_inode_t *ip;
- xfs_mount_t *mp;
- vnode_t *vp;
- unsigned long seg;
-
- ip = XFS_BHVTOI(bdp);
- vp = BHV_TO_VNODE(bdp);
- mp = ip->i_mount;
-
- XFS_STATS_INC(xs_read_calls);
-
- /* START copy & waste from filemap.c */
- for (seg = 0; seg < segs; seg++) {
- const struct iovec *iv = &iovp[seg];
-
- /*
- * If any segment has a negative length, or the cumulative
- * length ever wraps negative then return -EINVAL.
- */
- size += iv->iov_len;
- if (unlikely((ssize_t)(size|iv->iov_len) < 0))
- return XFS_ERROR(-EINVAL);
- }
- /* END copy & waste from filemap.c */
-
- if (unlikely(ioflags & IO_ISDIRECT)) {
- xfs_buftarg_t *target =
- (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
- if ((*offset & target->bt_smask) ||
- (size & target->bt_smask)) {
- if (*offset == ip->i_d.di_size) {
- return (0);
- }
- return -XFS_ERROR(EINVAL);
- }
- }
-
- n = XFS_MAXIOFFSET(mp) - *offset;
- if ((n <= 0) || (size == 0))
- return 0;
-
- if (n < size)
- size = n;
-
- if (XFS_FORCED_SHUTDOWN(mp)) {
- return -EIO;
- }
-
- if (unlikely(ioflags & IO_ISDIRECT))
- mutex_lock(&inode->i_mutex);
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
-
- if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
- !(ioflags & IO_INVIS)) {
- vrwlock_t locktype = VRWLOCK_READ;
- int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
-
- ret = -XFS_SEND_DATA(mp, DM_EVENT_READ,
- BHV_TO_VNODE(bdp), *offset, size,
- dmflags, &locktype);
- if (ret) {
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- goto unlock_isem;
- }
- }
-
- xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
- (void *)iovp, segs, *offset, ioflags);
- ret = __generic_file_aio_read(iocb, iovp, segs, offset);
- if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
- ret = wait_on_sync_kiocb(iocb);
- if (ret > 0)
- XFS_STATS_ADD(xs_read_bytes, ret);
-
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
-unlock_isem:
- if (unlikely(ioflags & IO_ISDIRECT))
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
-
-ssize_t
-xfs_sendfile(
- bhv_desc_t *bdp,
- struct file *filp,
- loff_t *offset,
- int ioflags,
- size_t count,
- read_actor_t actor,
- void *target,
- cred_t *credp)
-{
- ssize_t ret;
- xfs_fsize_t n;
- xfs_inode_t *ip;
- xfs_mount_t *mp;
- vnode_t *vp;
-
- ip = XFS_BHVTOI(bdp);
- vp = BHV_TO_VNODE(bdp);
- mp = ip->i_mount;
-
- XFS_STATS_INC(xs_read_calls);
-
- n = XFS_MAXIOFFSET(mp) - *offset;
- if ((n <= 0) || (count == 0))
- return 0;
-
- if (n < count)
- count = n;
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return -EIO;
-
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
-
- if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
- (!(ioflags & IO_INVIS))) {
- vrwlock_t locktype = VRWLOCK_READ;
- int error;
-
- error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), *offset, count,
- FILP_DELAY_FLAG(filp), &locktype);
- if (error) {
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- return -error;
- }
- }
- xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore,
- (void *)(unsigned long)target, count, *offset, ioflags);
- ret = generic_file_sendfile(filp, offset, count, actor, target);
-
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
- if (ret > 0)
- XFS_STATS_ADD(xs_read_bytes, ret);
-
- return ret;
-}
-
-/*
- * This routine is called to handle zeroing any space in the last
- * block of the file that is beyond the EOF. We do this since the
- * size is being increased without writing anything to that block
- * and we don't want anyone to read the garbage on the disk.
- */
-STATIC int /* error (positive) */
-xfs_zero_last_block(
- struct inode *ip,
- xfs_iocore_t *io,
- xfs_fsize_t isize,
- xfs_fsize_t end_size)
-{
- xfs_fileoff_t last_fsb;
- xfs_mount_t *mp;
- int nimaps;
- int zero_offset;
- int zero_len;
- int error = 0;
- xfs_bmbt_irec_t imap;
- loff_t loff;
-
- ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
-
- mp = io->io_mount;
-
- zero_offset = XFS_B_FSB_OFFSET(mp, isize);
- if (zero_offset == 0) {
- /*
- * There are no extra bytes in the last block on disk to
- * zero, so return.
- */
- return 0;
- }
-
- last_fsb = XFS_B_TO_FSBT(mp, isize);
- nimaps = 1;
- error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap,
- &nimaps, NULL);
- if (error) {
- return error;
- }
- ASSERT(nimaps > 0);
- /*
- * If the block underlying isize is just a hole, then there
- * is nothing to zero.
- */
- if (imap.br_startblock == HOLESTARTBLOCK) {
- return 0;
- }
- /*
- * Zero the part of the last block beyond the EOF, and write it
- * out sync. We need to drop the ilock while we do this so we
- * don't deadlock when the buffer cache calls back to us.
- */
- XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD);
- loff = XFS_FSB_TO_B(mp, last_fsb);
-
- zero_len = mp->m_sb.sb_blocksize - zero_offset;
-
- error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size);
-
- XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
- ASSERT(error >= 0);
- return error;
-}
-
-/*
- * Zero any on disk space between the current EOF and the new,
- * larger EOF. This handles the normal case of zeroing the remainder
- * of the last block in the file and the unusual case of zeroing blocks
- * out beyond the size of the file. This second case only happens
- * with fixed size extents and when the system crashes before the inode
- * size was updated but after blocks were allocated. If fill is set,
- * then any holes in the range are filled and zeroed. If not, the holes
- * are left alone as holes.
- */
-
-int /* error (positive) */
-xfs_zero_eof(
- vnode_t *vp,
- xfs_iocore_t *io,
- xfs_off_t offset, /* starting I/O offset */
- xfs_fsize_t isize, /* current inode size */
- xfs_fsize_t end_size) /* terminal inode size */
-{
- struct inode *ip = LINVFS_GET_IP(vp);
- xfs_fileoff_t start_zero_fsb;
- xfs_fileoff_t end_zero_fsb;
- xfs_fileoff_t zero_count_fsb;
- xfs_fileoff_t last_fsb;
- xfs_extlen_t buf_len_fsb;
- xfs_mount_t *mp;
- int nimaps;
- int error = 0;
- xfs_bmbt_irec_t imap;
-
- ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
- ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
- ASSERT(offset > isize);
-
- mp = io->io_mount;
-
- /*
- * First handle zeroing the block on which isize resides.
- * We only zero a part of that block so it is handled specially.
- */
- error = xfs_zero_last_block(ip, io, isize, end_size);
- if (error) {
- ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
- ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
- return error;
- }
-
- /*
- * Calculate the range between the new size and the old
- * where blocks needing to be zeroed may exist. To get the
- * block where the last byte in the file currently resides,
- * we need to subtract one from the size and truncate back
- * to a block boundary. We subtract 1 in case the size is
- * exactly on a block boundary.
- */
- last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
- start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
- end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
- ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
- if (last_fsb == end_zero_fsb) {
- /*
- * The size was only incremented on its last block.
- * We took care of that above, so just return.
- */
- return 0;
- }
-
- ASSERT(start_zero_fsb <= end_zero_fsb);
- while (start_zero_fsb <= end_zero_fsb) {
- nimaps = 1;
- zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
- error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb,
- 0, NULL, 0, &imap, &nimaps, NULL);
- if (error) {
- ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
- ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
- return error;
- }
- ASSERT(nimaps > 0);
-
- if (imap.br_state == XFS_EXT_UNWRITTEN ||
- imap.br_startblock == HOLESTARTBLOCK) {
- /*
- * This loop handles initializing pages that were
- * partially initialized by the code below this
- * loop. It basically zeroes the part of the page
- * that sits on a hole and sets the page as P_HOLE
- * and calls remapf if it is a mapped file.
- */
- start_zero_fsb = imap.br_startoff + imap.br_blockcount;
- ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
- continue;
- }
-
- /*
- * There are blocks in the range requested.
- * Zero them a single write at a time. We actually
- * don't zero the entire range returned if it is
- * too big and simply loop around to get the rest.
- * That is not the most efficient thing to do, but it
- * is simple and this path should not be exercised often.
- */
- buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount,
- mp->m_writeio_blocks << 8);
- /*
- * Drop the inode lock while we're doing the I/O.
- * We'll still have the iolock to protect us.
- */
- XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
-
- error = xfs_iozero(ip,
- XFS_FSB_TO_B(mp, start_zero_fsb),
- XFS_FSB_TO_B(mp, buf_len_fsb),
- end_size);
-
- if (error) {
- goto out_lock;
- }
-
- start_zero_fsb = imap.br_startoff + buf_len_fsb;
- ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-
- XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
- }
-
- return 0;
-
-out_lock:
-
- XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
- ASSERT(error >= 0);
- return error;
-}
-
-ssize_t /* bytes written, or (-) error */
-xfs_write(
- bhv_desc_t *bdp,
- struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned int nsegs,
- loff_t *offset,
- int ioflags,
- cred_t *credp)
-{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- unsigned long segs = nsegs;
- xfs_inode_t *xip;
- xfs_mount_t *mp;
- ssize_t ret = 0, error = 0;
- xfs_fsize_t isize, new_size;
- xfs_iocore_t *io;
- vnode_t *vp;
- unsigned long seg;
- int iolock;
- int eventsent = 0;
- vrwlock_t locktype;
- size_t ocount = 0, count;
- loff_t pos;
- int need_isem = 1, need_flush = 0;
-
- XFS_STATS_INC(xs_write_calls);
-
- vp = BHV_TO_VNODE(bdp);
- xip = XFS_BHVTOI(bdp);
-
- for (seg = 0; seg < segs; seg++) {
- const struct iovec *iv = &iovp[seg];
-
- /*
- * If any segment has a negative length, or the cumulative
- * length ever wraps negative then return -EINVAL.
- */
- ocount += iv->iov_len;
- if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
- return -EINVAL;
- if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
- continue;
- if (seg == 0)
- return -EFAULT;
- segs = seg;
- ocount -= iv->iov_len; /* This segment is no good */
- break;
- }
-
- count = ocount;
- pos = *offset;
-
- if (count == 0)
- return 0;
-
- io = &xip->i_iocore;
- mp = io->io_mount;
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
- fs_check_frozen(vp->v_vfsp, SB_FREEZE_WRITE);
-
- if (ioflags & IO_ISDIRECT) {
- xfs_buftarg_t *target =
- (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
-
- if ((pos & target->bt_smask) || (count & target->bt_smask))
- return XFS_ERROR(-EINVAL);
-
- if (!VN_CACHED(vp) && pos < i_size_read(inode))
- need_isem = 0;
-
- if (VN_CACHED(vp))
- need_flush = 1;
- }
-
-relock:
- if (need_isem) {
- iolock = XFS_IOLOCK_EXCL;
- locktype = VRWLOCK_WRITE;
-
- mutex_lock(&inode->i_mutex);
- } else {
- iolock = XFS_IOLOCK_SHARED;
- locktype = VRWLOCK_WRITE_DIRECT;
- }
-
- xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
-
- isize = i_size_read(inode);
-
- if (file->f_flags & O_APPEND)
- *offset = isize;
-
-start:
- error = -generic_write_checks(file, &pos, &count,
- S_ISBLK(inode->i_mode));
- if (error) {
- xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
- goto out_unlock_isem;
- }
-
- new_size = pos + count;
- if (new_size > isize)
- io->io_new_size = new_size;
-
- if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
- !(ioflags & IO_INVIS) && !eventsent)) {
- loff_t savedsize = pos;
- int dmflags = FILP_DELAY_FLAG(file);
-
- if (need_isem)
- dmflags |= DM_FLAGS_IMUX;
-
- xfs_iunlock(xip, XFS_ILOCK_EXCL);
- error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
- pos, count,
- dmflags, &locktype);
- if (error) {
- xfs_iunlock(xip, iolock);
- goto out_unlock_isem;
- }
- xfs_ilock(xip, XFS_ILOCK_EXCL);
- eventsent = 1;
-
- /*
- * The iolock was dropped and reaquired in XFS_SEND_DATA
- * so we have to recheck the size when appending.
- * We will only "goto start;" once, since having sent the
- * event prevents another call to XFS_SEND_DATA, which is
- * what allows the size to change in the first place.
- */
- if ((file->f_flags & O_APPEND) && savedsize != isize) {
- pos = isize = xip->i_d.di_size;
- goto start;
- }
- }
-
- if (likely(!(ioflags & IO_INVIS))) {
- file_update_time(file);
- xfs_ichgtime_fast(xip, inode,
- XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- }
-
- /*
- * If the offset is beyond the size of the file, we have a couple
- * of things to do. First, if there is already space allocated
- * we need to either create holes or zero the disk or ...
- *
- * If there is a page where the previous size lands, we need
- * to zero it out up to the new size.
- */
-
- if (pos > isize) {
- error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos,
- isize, pos + count);
- if (error) {
- xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
- goto out_unlock_isem;
- }
- }
- xfs_iunlock(xip, XFS_ILOCK_EXCL);
-
- /*
- * If we're writing the file then make sure to clear the
- * setuid and setgid bits if the process is not being run
- * by root. This keeps people from modifying setuid and
- * setgid binaries.
- */
-
- if (((xip->i_d.di_mode & S_ISUID) ||
- ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
- (S_ISGID | S_IXGRP))) &&
- !capable(CAP_FSETID)) {
- error = xfs_write_clear_setuid(xip);
- if (likely(!error))
- error = -remove_suid(file->f_dentry);
- if (unlikely(error)) {
- xfs_iunlock(xip, iolock);
- goto out_unlock_isem;
- }
- }
-
-retry:
- /* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
-
- if ((ioflags & IO_ISDIRECT)) {
- if (need_flush) {
- xfs_inval_cached_trace(io, pos, -1,
- ctooff(offtoct(pos)), -1);
- VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(pos)),
- -1, FI_REMAPF_LOCKED);
- }
-
- if (need_isem) {
- /* demote the lock now the cached pages are gone */
- XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
- mutex_unlock(&inode->i_mutex);
-
- iolock = XFS_IOLOCK_SHARED;
- locktype = VRWLOCK_WRITE_DIRECT;
- need_isem = 0;
- }
-
- xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs,
- *offset, ioflags);
- ret = generic_file_direct_write(iocb, iovp,
- &segs, pos, offset, count, ocount);
-
- /*
- * direct-io write to a hole: fall through to buffered I/O
- * for completing the rest of the request.
- */
- if (ret >= 0 && ret != count) {
- XFS_STATS_ADD(xs_write_bytes, ret);
-
- pos += ret;
- count -= ret;
-
- need_isem = 1;
- ioflags &= ~IO_ISDIRECT;
- xfs_iunlock(xip, iolock);
- goto relock;
- }
- } else {
- xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs,
- *offset, ioflags);
- ret = generic_file_buffered_write(iocb, iovp, segs,
- pos, offset, count, ret);
- }
-
- current->backing_dev_info = NULL;
-
- if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
- ret = wait_on_sync_kiocb(iocb);
-
- if ((ret == -ENOSPC) &&
- DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
- !(ioflags & IO_INVIS)) {
-
- xfs_rwunlock(bdp, locktype);
- if (need_isem)
- mutex_unlock(&inode->i_mutex);
- error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
- DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
- 0, 0, 0); /* Delay flag intentionally unused */
- if (error)
- goto out_nounlocks;
- if (need_isem)
- mutex_lock(&inode->i_mutex);
- xfs_rwlock(bdp, locktype);
- pos = xip->i_d.di_size;
- ret = 0;
- goto retry;
- }
-
- isize = i_size_read(inode);
- if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
- *offset = isize;
-
- if (*offset > xip->i_d.di_size) {
- xfs_ilock(xip, XFS_ILOCK_EXCL);
- if (*offset > xip->i_d.di_size) {
- xip->i_d.di_size = *offset;
- i_size_write(inode, *offset);
- xip->i_update_core = 1;
- xip->i_update_size = 1;
- }
- xfs_iunlock(xip, XFS_ILOCK_EXCL);
- }
-
- error = -ret;
- if (ret <= 0)
- goto out_unlock_internal;
-
- XFS_STATS_ADD(xs_write_bytes, ret);
-
- /* Handle various SYNC-type writes */
- if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
- /*
- * If we're treating this as O_DSYNC and we have not updated the
- * size, force the log.
- */
- if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
- !(xip->i_update_size)) {
- xfs_inode_log_item_t *iip = xip->i_itemp;
-
- /*
- * If an allocation transaction occurred
- * without extending the size, then we have to force
- * the log up the proper point to ensure that the
- * allocation is permanent. We can't count on
- * the fact that buffered writes lock out direct I/O
- * writes - the direct I/O write could have extended
- * the size nontransactionally, then finished before
- * we started. xfs_write_file will think that the file
- * didn't grow but the update isn't safe unless the
- * size change is logged.
- *
- * Force the log if we've committed a transaction
- * against the inode or if someone else has and
- * the commit record hasn't gone to disk (e.g.
- * the inode is pinned). This guarantees that
- * all changes affecting the inode are permanent
- * when we return.
- */
- if (iip && iip->ili_last_lsn) {
- xfs_log_force(mp, iip->ili_last_lsn,
- XFS_LOG_FORCE | XFS_LOG_SYNC);
- } else if (xfs_ipincount(xip) > 0) {
- xfs_log_force(mp, (xfs_lsn_t)0,
- XFS_LOG_FORCE | XFS_LOG_SYNC);
- }
-
- } else {
- xfs_trans_t *tp;
-
- /*
- * O_SYNC or O_DSYNC _with_ a size update are handled
- * the same way.
- *
- * If the write was synchronous then we need to make
- * sure that the inode modification time is permanent.
- * We'll have updated the timestamp above, so here
- * we use a synchronous transaction to log the inode.
- * It's not fast, but it's necessary.
- *
- * If this a dsync write and the size got changed
- * non-transactionally, then we need to ensure that
- * the size change gets logged in a synchronous
- * transaction.
- */
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
- if ((error = xfs_trans_reserve(tp, 0,
- XFS_SWRITE_LOG_RES(mp),
- 0, 0, 0))) {
- /* Transaction reserve failed */
- xfs_trans_cancel(tp, 0);
- } else {
- /* Transaction reserve successful */
- xfs_ilock(xip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, xip);
- xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
- xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0, NULL);
- xfs_iunlock(xip, XFS_ILOCK_EXCL);
- }
- if (error)
- goto out_unlock_internal;
- }
-
- xfs_rwunlock(bdp, locktype);
- if (need_isem)
- mutex_unlock(&inode->i_mutex);
-
- error = sync_page_range(inode, mapping, pos, ret);
- if (!error)
- error = ret;
- return error;
- }
-
- out_unlock_internal:
- xfs_rwunlock(bdp, locktype);
- out_unlock_isem:
- if (need_isem)
- mutex_unlock(&inode->i_mutex);
- out_nounlocks:
- return -error;
-}
-
-/*
- * All xfs metadata buffers except log state machine buffers
- * get this attached as their b_bdstrat callback function.
- * This is so that we can catch a buffer
- * after prematurely unpinning it to forcibly shutdown the filesystem.
- */
-int
-xfs_bdstrat_cb(struct xfs_buf *bp)
-{
- xfs_mount_t *mp;
-
- mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
- if (!XFS_FORCED_SHUTDOWN(mp)) {
- xfs_buf_iorequest(bp);
- return 0;
- } else {
- xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
- /*
- * Metadata write that didn't get logged but
- * written delayed anyway. These aren't associated
- * with a transaction, and can be ignored.
- */
- if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
- (XFS_BUF_ISREAD(bp)) == 0)
- return (xfs_bioerror_relse(bp));
- else
- return (xfs_bioerror(bp));
- }
-}
-
-
-int
-xfs_bmap(bhv_desc_t *bdp,
- xfs_off_t offset,
- ssize_t count,
- int flags,
- xfs_iomap_t *iomapp,
- int *niomaps)
-{
- xfs_inode_t *ip = XFS_BHVTOI(bdp);
- xfs_iocore_t *io = &ip->i_iocore;
-
- ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
- ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
- ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
-
- return xfs_iomap(io, offset, count, flags, iomapp, niomaps);
-}
-
-/*
- * Wrapper around bdstrat so that we can stop data
- * from going to disk in case we are shutting down the filesystem.
- * Typically user data goes thru this path; one of the exceptions
- * is the superblock.
- */
-int
-xfsbdstrat(
- struct xfs_mount *mp,
- struct xfs_buf *bp)
-{
- ASSERT(mp);
- if (!XFS_FORCED_SHUTDOWN(mp)) {
- /* Grio redirection would go here
- * if (XFS_BUF_IS_GRIO(bp)) {
- */
-
- xfs_buf_iorequest(bp);
- return 0;
- }
-
- xfs_buftrace("XFSBDSTRAT IOERROR", bp);
- return (xfs_bioerror_relse(bp));
-}
-
-/*
- * If the underlying (data/log/rt) device is readonly, there are some
- * operations that cannot proceed.
- */
-int
-xfs_dev_is_read_only(
- xfs_mount_t *mp,
- char *message)
-{
- if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
- xfs_readonly_buftarg(mp->m_logdev_targp) ||
- (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
- cmn_err(CE_NOTE,
- "XFS: %s required on read-only device.", message);
- cmn_err(CE_NOTE,
- "XFS: write access unavailable, cannot proceed.");
- return EROFS;
- }
- return 0;
-}
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
deleted file mode 100644
index 38864a88d42..00000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_LRW_H__
-#define __XFS_LRW_H__
-
-struct vnode;
-struct bhv_desc;
-struct xfs_mount;
-struct xfs_iocore;
-struct xfs_inode;
-struct xfs_bmbt_irec;
-struct xfs_buf;
-struct xfs_iomap;
-
-#if defined(XFS_RW_TRACE)
-/*
- * Defines for the trace mechanisms in xfs_lrw.c.
- */
-#define XFS_RW_KTRACE_SIZE 128
-
-#define XFS_READ_ENTER 1
-#define XFS_WRITE_ENTER 2
-#define XFS_IOMAP_READ_ENTER 3
-#define XFS_IOMAP_WRITE_ENTER 4
-#define XFS_IOMAP_READ_MAP 5
-#define XFS_IOMAP_WRITE_MAP 6
-#define XFS_IOMAP_WRITE_NOSPACE 7
-#define XFS_ITRUNC_START 8
-#define XFS_ITRUNC_FINISH1 9
-#define XFS_ITRUNC_FINISH2 10
-#define XFS_CTRUNC1 11
-#define XFS_CTRUNC2 12
-#define XFS_CTRUNC3 13
-#define XFS_CTRUNC4 14
-#define XFS_CTRUNC5 15
-#define XFS_CTRUNC6 16
-#define XFS_BUNMAPI 17
-#define XFS_INVAL_CACHED 18
-#define XFS_DIORD_ENTER 19
-#define XFS_DIOWR_ENTER 20
-#define XFS_SENDFILE_ENTER 21
-#define XFS_WRITEPAGE_ENTER 22
-#define XFS_RELEASEPAGE_ENTER 23
-#define XFS_INVALIDPAGE_ENTER 24
-#define XFS_IOMAP_ALLOC_ENTER 25
-#define XFS_IOMAP_ALLOC_MAP 26
-#define XFS_IOMAP_UNWRITTEN 27
-extern void xfs_rw_enter_trace(int, struct xfs_iocore *,
- void *, size_t, loff_t, int);
-extern void xfs_inval_cached_trace(struct xfs_iocore *,
- xfs_off_t, xfs_off_t, xfs_off_t, xfs_off_t);
-#else
-#define xfs_rw_enter_trace(tag, io, data, size, offset, ioflags)
-#define xfs_inval_cached_trace(io, offset, len, first, last)
-#endif
-
-/*
- * Maximum count of bmaps used by read and write paths.
- */
-#define XFS_MAX_RW_NBMAPS 4
-
-extern int xfs_bmap(struct bhv_desc *, xfs_off_t, ssize_t, int,
- struct xfs_iomap *, int *);
-extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
-extern int xfs_bdstrat_cb(struct xfs_buf *);
-
-extern int xfs_zero_eof(struct vnode *, struct xfs_iocore *, xfs_off_t,
- xfs_fsize_t, xfs_fsize_t);
-extern ssize_t xfs_read(struct bhv_desc *, struct kiocb *,
- const struct iovec *, unsigned int,
- loff_t *, int, struct cred *);
-extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *,
- const struct iovec *, unsigned int,
- loff_t *, int, struct cred *);
-extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *,
- loff_t *, int, size_t, read_actor_t,
- void *, struct cred *);
-
-extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
-
-#endif /* __XFS_LRW_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
deleted file mode 100644
index 8955720a2c6..00000000000
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include <linux/proc_fs.h>
-
-DEFINE_PER_CPU(struct xfsstats, xfsstats);
-
-STATIC int
-xfs_read_xfsstats(
- char *buffer,
- char **start,
- off_t offset,
- int count,
- int *eof,
- void *data)
-{
- int c, i, j, len, val;
- __uint64_t xs_xstrat_bytes = 0;
- __uint64_t xs_write_bytes = 0;
- __uint64_t xs_read_bytes = 0;
-
- static const struct xstats_entry {
- char *desc;
- int endpoint;
- } xstats[] = {
- { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC },
- { "abt", XFSSTAT_END_ALLOC_BTREE },
- { "blk_map", XFSSTAT_END_BLOCK_MAPPING },
- { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE },
- { "dir", XFSSTAT_END_DIRECTORY_OPS },
- { "trans", XFSSTAT_END_TRANSACTIONS },
- { "ig", XFSSTAT_END_INODE_OPS },
- { "log", XFSSTAT_END_LOG_OPS },
- { "push_ail", XFSSTAT_END_TAIL_PUSHING },
- { "xstrat", XFSSTAT_END_WRITE_CONVERT },
- { "rw", XFSSTAT_END_READ_WRITE_OPS },
- { "attr", XFSSTAT_END_ATTRIBUTE_OPS },
- { "icluster", XFSSTAT_END_INODE_CLUSTER },
- { "vnodes", XFSSTAT_END_VNODE_OPS },
- { "buf", XFSSTAT_END_BUF },
- };
-
- /* Loop over all stats groups */
- for (i=j=len = 0; i < sizeof(xstats)/sizeof(struct xstats_entry); i++) {
- len += sprintf(buffer + len, xstats[i].desc);
- /* inner loop does each group */
- while (j < xstats[i].endpoint) {
- val = 0;
- /* sum over all cpus */
- for (c = 0; c < NR_CPUS; c++) {
- if (!cpu_possible(c)) continue;
- val += *(((__u32*)&per_cpu(xfsstats, c) + j));
- }
- len += sprintf(buffer + len, " %u", val);
- j++;
- }
- buffer[len++] = '\n';
- }
- /* extra precision counters */
- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i)) continue;
- xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
- xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
- xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
- }
-
- len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n",
- xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
- len += sprintf(buffer + len, "debug %u\n",
-#if defined(DEBUG)
- 1);
-#else
- 0);
-#endif
-
- if (offset >= len) {
- *start = buffer;
- *eof = 1;
- return 0;
- }
- *start = buffer + offset;
- if ((len -= offset) > count)
- return count;
- *eof = 1;
-
- return len;
-}
-
-void
-xfs_init_procfs(void)
-{
- if (!proc_mkdir("fs/xfs", NULL))
- return;
- create_proc_read_entry("fs/xfs/stat", 0, NULL, xfs_read_xfsstats, NULL);
-}
-
-void
-xfs_cleanup_procfs(void)
-{
- remove_proc_entry("fs/xfs/stat", NULL);
- remove_proc_entry("fs/xfs", NULL);
-}
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
deleted file mode 100644
index f22e426d9e4..00000000000
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ /dev/null
@@ -1,1014 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_clnt.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_version.h"
-
-#include <linux/namei.h>
-#include <linux/init.h>
-#include <linux/mount.h>
-#include <linux/mempool.h>
-#include <linux/writeback.h>
-#include <linux/kthread.h>
-
-STATIC struct quotactl_ops linvfs_qops;
-STATIC struct super_operations linvfs_sops;
-STATIC kmem_zone_t *xfs_vnode_zone;
-STATIC kmem_zone_t *xfs_ioend_zone;
-mempool_t *xfs_ioend_pool;
-
-STATIC struct xfs_mount_args *
-xfs_args_allocate(
- struct super_block *sb)
-{
- struct xfs_mount_args *args;
-
- args = kmem_zalloc(sizeof(struct xfs_mount_args), KM_SLEEP);
- args->logbufs = args->logbufsize = -1;
- strncpy(args->fsname, sb->s_id, MAXNAMELEN);
-
- /* Copy the already-parsed mount(2) flags we're interested in */
- if (sb->s_flags & MS_NOATIME)
- args->flags |= XFSMNT_NOATIME;
- if (sb->s_flags & MS_DIRSYNC)
- args->flags |= XFSMNT_DIRSYNC;
- if (sb->s_flags & MS_SYNCHRONOUS)
- args->flags |= XFSMNT_WSYNC;
-
- /* Default to 32 bit inodes on Linux all the time */
- args->flags |= XFSMNT_32BITINODES;
-
- return args;
-}
-
-__uint64_t
-xfs_max_file_offset(
- unsigned int blockshift)
-{
- unsigned int pagefactor = 1;
- unsigned int bitshift = BITS_PER_LONG - 1;
-
- /* Figure out maximum filesize, on Linux this can depend on
- * the filesystem blocksize (on 32 bit platforms).
- * __block_prepare_write does this in an [unsigned] long...
- * page->index << (PAGE_CACHE_SHIFT - bbits)
- * So, for page sized blocks (4K on 32 bit platforms),
- * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
- * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
- * but for smaller blocksizes it is less (bbits = log2 bsize).
- * Note1: get_block_t takes a long (implicit cast from above)
- * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
- * can optionally convert the [unsigned] long from above into
- * an [unsigned] long long.
- */
-
-#if BITS_PER_LONG == 32
-# if defined(CONFIG_LBD)
- ASSERT(sizeof(sector_t) == 8);
- pagefactor = PAGE_CACHE_SIZE;
- bitshift = BITS_PER_LONG;
-# else
- pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
-# endif
-#endif
-
- return (((__uint64_t)pagefactor) << bitshift) - 1;
-}
-
-STATIC __inline__ void
-xfs_set_inodeops(
- struct inode *inode)
-{
- switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
- inode->i_op = &linvfs_file_inode_operations;
- inode->i_fop = &linvfs_file_operations;
- inode->i_mapping->a_ops = &linvfs_aops;
- break;
- case S_IFDIR:
- inode->i_op = &linvfs_dir_inode_operations;
- inode->i_fop = &linvfs_dir_operations;
- break;
- case S_IFLNK:
- inode->i_op = &linvfs_symlink_inode_operations;
- if (inode->i_blocks)
- inode->i_mapping->a_ops = &linvfs_aops;
- break;
- default:
- inode->i_op = &linvfs_file_inode_operations;
- init_special_inode(inode, inode->i_mode, inode->i_rdev);
- break;
- }
-}
-
-STATIC __inline__ void
-xfs_revalidate_inode(
- xfs_mount_t *mp,
- vnode_t *vp,
- xfs_inode_t *ip)
-{
- struct inode *inode = LINVFS_GET_IP(vp);
-
- inode->i_mode = ip->i_d.di_mode;
- inode->i_nlink = ip->i_d.di_nlink;
- inode->i_uid = ip->i_d.di_uid;
- inode->i_gid = ip->i_d.di_gid;
-
- switch (inode->i_mode & S_IFMT) {
- case S_IFBLK:
- case S_IFCHR:
- inode->i_rdev =
- MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
- sysv_minor(ip->i_df.if_u2.if_rdev));
- break;
- default:
- inode->i_rdev = 0;
- break;
- }
-
- inode->i_blksize = xfs_preferred_iosize(mp);
- inode->i_generation = ip->i_d.di_gen;
- i_size_write(inode, ip->i_d.di_size);
- inode->i_blocks =
- XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
- inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec;
- inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
- inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
- inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
- inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
- inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
- if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
- inode->i_flags |= S_IMMUTABLE;
- else
- inode->i_flags &= ~S_IMMUTABLE;
- if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
- inode->i_flags |= S_APPEND;
- else
- inode->i_flags &= ~S_APPEND;
- if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
- inode->i_flags |= S_SYNC;
- else
- inode->i_flags &= ~S_SYNC;
- if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
- inode->i_flags |= S_NOATIME;
- else
- inode->i_flags &= ~S_NOATIME;
- vp->v_flag &= ~VMODIFIED;
-}
-
-void
-xfs_initialize_vnode(
- bhv_desc_t *bdp,
- vnode_t *vp,
- bhv_desc_t *inode_bhv,
- int unlock)
-{
- xfs_inode_t *ip = XFS_BHVTOI(inode_bhv);
- struct inode *inode = LINVFS_GET_IP(vp);
-
- if (!inode_bhv->bd_vobj) {
- vp->v_vfsp = bhvtovfs(bdp);
- bhv_desc_init(inode_bhv, ip, vp, &xfs_vnodeops);
- bhv_insert(VN_BHV_HEAD(vp), inode_bhv);
- }
-
- /*
- * We need to set the ops vectors, and unlock the inode, but if
- * we have been called during the new inode create process, it is
- * too early to fill in the Linux inode. We will get called a
- * second time once the inode is properly set up, and then we can
- * finish our work.
- */
- if (ip->i_d.di_mode != 0 && unlock && (inode->i_state & I_NEW)) {
- xfs_revalidate_inode(XFS_BHVTOM(bdp), vp, ip);
- xfs_set_inodeops(inode);
-
- ip->i_flags &= ~XFS_INEW;
- barrier();
-
- unlock_new_inode(inode);
- }
-}
-
-int
-xfs_blkdev_get(
- xfs_mount_t *mp,
- const char *name,
- struct block_device **bdevp)
-{
- int error = 0;
-
- *bdevp = open_bdev_excl(name, 0, mp);
- if (IS_ERR(*bdevp)) {
- error = PTR_ERR(*bdevp);
- printk("XFS: Invalid device [%s], error=%d\n", name, error);
- }
-
- return -error;
-}
-
-void
-xfs_blkdev_put(
- struct block_device *bdev)
-{
- if (bdev)
- close_bdev_excl(bdev);
-}
-
-/*
- * Try to write out the superblock using barriers.
- */
-STATIC int
-xfs_barrier_test(
- xfs_mount_t *mp)
-{
- xfs_buf_t *sbp = xfs_getsb(mp, 0);
- int error;
-
- XFS_BUF_UNDONE(sbp);
- XFS_BUF_UNREAD(sbp);
- XFS_BUF_UNDELAYWRITE(sbp);
- XFS_BUF_WRITE(sbp);
- XFS_BUF_UNASYNC(sbp);
- XFS_BUF_ORDERED(sbp);
-
- xfsbdstrat(mp, sbp);
- error = xfs_iowait(sbp);
-
- /*
- * Clear all the flags we set and possible error state in the
- * buffer. We only did the write to try out whether barriers
- * worked and shouldn't leave any traces in the superblock
- * buffer.
- */
- XFS_BUF_DONE(sbp);
- XFS_BUF_ERROR(sbp, 0);
- XFS_BUF_UNORDERED(sbp);
-
- xfs_buf_relse(sbp);
- return error;
-}
-
-void
-xfs_mountfs_check_barriers(xfs_mount_t *mp)
-{
- int error;
-
- if (mp->m_logdev_targp != mp->m_ddev_targp) {
- xfs_fs_cmn_err(CE_NOTE, mp,
- "Disabling barriers, not supported with external log device");
- mp->m_flags &= ~XFS_MOUNT_BARRIER;
- return;
- }
-
- if (mp->m_ddev_targp->bt_bdev->bd_disk->queue->ordered ==
- QUEUE_ORDERED_NONE) {
- xfs_fs_cmn_err(CE_NOTE, mp,
- "Disabling barriers, not supported by the underlying device");
- mp->m_flags &= ~XFS_MOUNT_BARRIER;
- return;
- }
-
- error = xfs_barrier_test(mp);
- if (error) {
- xfs_fs_cmn_err(CE_NOTE, mp,
- "Disabling barriers, trial barrier write failed");
- mp->m_flags &= ~XFS_MOUNT_BARRIER;
- return;
- }
-}
-
-void
-xfs_blkdev_issue_flush(
- xfs_buftarg_t *buftarg)
-{
- blkdev_issue_flush(buftarg->bt_bdev, NULL);
-}
-
-STATIC struct inode *
-linvfs_alloc_inode(
- struct super_block *sb)
-{
- vnode_t *vp;
-
- vp = kmem_cache_alloc(xfs_vnode_zone, kmem_flags_convert(KM_SLEEP));
- if (!vp)
- return NULL;
- return LINVFS_GET_IP(vp);
-}
-
-STATIC void
-linvfs_destroy_inode(
- struct inode *inode)
-{
- kmem_zone_free(xfs_vnode_zone, LINVFS_GET_VP(inode));
-}
-
-STATIC void
-linvfs_inode_init_once(
- void *data,
- kmem_cache_t *cachep,
- unsigned long flags)
-{
- vnode_t *vp = (vnode_t *)data;
-
- if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
- SLAB_CTOR_CONSTRUCTOR)
- inode_init_once(LINVFS_GET_IP(vp));
-}
-
-STATIC int
-linvfs_init_zones(void)
-{
- xfs_vnode_zone = kmem_cache_create("xfs_vnode",
- sizeof(vnode_t), 0, SLAB_RECLAIM_ACCOUNT,
- linvfs_inode_init_once, NULL);
- if (!xfs_vnode_zone)
- goto out;
-
- xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
- if (!xfs_ioend_zone)
- goto out_destroy_vnode_zone;
-
- xfs_ioend_pool = mempool_create(4 * MAX_BUF_PER_PAGE,
- mempool_alloc_slab, mempool_free_slab,
- xfs_ioend_zone);
- if (!xfs_ioend_pool)
- goto out_free_ioend_zone;
-
- return 0;
-
-
- out_free_ioend_zone:
- kmem_zone_destroy(xfs_ioend_zone);
- out_destroy_vnode_zone:
- kmem_zone_destroy(xfs_vnode_zone);
- out:
- return -ENOMEM;
-}
-
-STATIC void
-linvfs_destroy_zones(void)
-{
- mempool_destroy(xfs_ioend_pool);
- kmem_zone_destroy(xfs_vnode_zone);
- kmem_zone_destroy(xfs_ioend_zone);
-}
-
-/*
- * Attempt to flush the inode, this will actually fail
- * if the inode is pinned, but we dirty the inode again
- * at the point when it is unpinned after a log write,
- * since this is when the inode itself becomes flushable.
- */
-STATIC int
-linvfs_write_inode(
- struct inode *inode,
- int sync)
-{
- vnode_t *vp = LINVFS_GET_VP(inode);
- int error = 0, flags = FLUSH_INODE;
-
- if (vp) {
- vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
- if (sync)
- flags |= FLUSH_SYNC;
- VOP_IFLUSH(vp, flags, error);
- if (error == EAGAIN) {
- if (sync)
- VOP_IFLUSH(vp, flags | FLUSH_LOG, error);
- else
- error = 0;
- }
- }
-
- return -error;
-}
-
-STATIC void
-linvfs_clear_inode(
- struct inode *inode)
-{
- vnode_t *vp = LINVFS_GET_VP(inode);
- int error, cache;
-
- vn_trace_entry(vp, "clear_inode", (inst_t *)__return_address);
-
- XFS_STATS_INC(vn_rele);
- XFS_STATS_INC(vn_remove);
- XFS_STATS_INC(vn_reclaim);
- XFS_STATS_DEC(vn_active);
-
- /*
- * This can happen because xfs_iget_core calls xfs_idestroy if we
- * find an inode with di_mode == 0 but without IGET_CREATE set.
- */
- if (vp->v_fbhv)
- VOP_INACTIVE(vp, NULL, cache);
-
- VN_LOCK(vp);
- vp->v_flag &= ~VMODIFIED;
- VN_UNLOCK(vp, 0);
-
- if (vp->v_fbhv) {
- VOP_RECLAIM(vp, error);
- if (error)
- panic("vn_purge: cannot reclaim");
- }
-
- ASSERT(vp->v_fbhv == NULL);
-
-#ifdef XFS_VNODE_TRACE
- ktrace_free(vp->v_trace);
-#endif
-}
-
-/*
- * Enqueue a work item to be picked up by the vfs xfssyncd thread.
- * Doing this has two advantages:
- * - It saves on stack space, which is tight in certain situations
- * - It can be used (with care) as a mechanism to avoid deadlocks.
- * Flushing while allocating in a full filesystem requires both.
- */
-STATIC void
-xfs_syncd_queue_work(
- struct vfs *vfs,
- void *data,
- void (*syncer)(vfs_t *, void *))
-{
- vfs_sync_work_t *work;
-
- work = kmem_alloc(sizeof(struct vfs_sync_work), KM_SLEEP);
- INIT_LIST_HEAD(&work->w_list);
- work->w_syncer = syncer;
- work->w_data = data;
- work->w_vfs = vfs;
- spin_lock(&vfs->vfs_sync_lock);
- list_add_tail(&work->w_list, &vfs->vfs_sync_list);
- spin_unlock(&vfs->vfs_sync_lock);
- wake_up_process(vfs->vfs_sync_task);
-}
-
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations. At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room...
- */
-STATIC void
-xfs_flush_inode_work(
- vfs_t *vfs,
- void *inode)
-{
- filemap_flush(((struct inode *)inode)->i_mapping);
- iput((struct inode *)inode);
-}
-
-void
-xfs_flush_inode(
- xfs_inode_t *ip)
-{
- struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip));
- struct vfs *vfs = XFS_MTOVFS(ip->i_mount);
-
- igrab(inode);
- xfs_syncd_queue_work(vfs, inode, xfs_flush_inode_work);
- delay(msecs_to_jiffies(500));
-}
-
-/*
- * This is the "bigger hammer" version of xfs_flush_inode_work...
- * (IOW, "If at first you don't succeed, use a Bigger Hammer").
- */
-STATIC void
-xfs_flush_device_work(
- vfs_t *vfs,
- void *inode)
-{
- sync_blockdev(vfs->vfs_super->s_bdev);
- iput((struct inode *)inode);
-}
-
-void
-xfs_flush_device(
- xfs_inode_t *ip)
-{
- struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip));
- struct vfs *vfs = XFS_MTOVFS(ip->i_mount);
-
- igrab(inode);
- xfs_syncd_queue_work(vfs, inode, xfs_flush_device_work);
- delay(msecs_to_jiffies(500));
- xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
-}
-
-#define SYNCD_FLAGS (SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR)
-STATIC void
-vfs_sync_worker(
- vfs_t *vfsp,
- void *unused)
-{
- int error;
-
- if (!(vfsp->vfs_flag & VFS_RDONLY))
- VFS_SYNC(vfsp, SYNCD_FLAGS, NULL, error);
- vfsp->vfs_sync_seq++;
- wmb();
- wake_up(&vfsp->vfs_wait_single_sync_task);
-}
-
-STATIC int
-xfssyncd(
- void *arg)
-{
- long timeleft;
- vfs_t *vfsp = (vfs_t *) arg;
- struct vfs_sync_work *work, *n;
- LIST_HEAD (tmp);
-
- timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
- for (;;) {
- timeleft = schedule_timeout_interruptible(timeleft);
- /* swsusp */
- try_to_freeze();
- if (kthread_should_stop() && list_empty(&vfsp->vfs_sync_list))
- break;
-
- spin_lock(&vfsp->vfs_sync_lock);
- /*
- * We can get woken by laptop mode, to do a sync -
- * that's the (only!) case where the list would be
- * empty with time remaining.
- */
- if (!timeleft || list_empty(&vfsp->vfs_sync_list)) {
- if (!timeleft)
- timeleft = xfs_syncd_centisecs *
- msecs_to_jiffies(10);
- INIT_LIST_HEAD(&vfsp->vfs_sync_work.w_list);
- list_add_tail(&vfsp->vfs_sync_work.w_list,
- &vfsp->vfs_sync_list);
- }
- list_for_each_entry_safe(work, n, &vfsp->vfs_sync_list, w_list)
- list_move(&work->w_list, &tmp);
- spin_unlock(&vfsp->vfs_sync_lock);
-
- list_for_each_entry_safe(work, n, &tmp, w_list) {
- (*work->w_syncer)(vfsp, work->w_data);
- list_del(&work->w_list);
- if (work == &vfsp->vfs_sync_work)
- continue;
- kmem_free(work, sizeof(struct vfs_sync_work));
- }
- }
-
- return 0;
-}
-
-STATIC int
-linvfs_start_syncd(
- vfs_t *vfsp)
-{
- vfsp->vfs_sync_work.w_syncer = vfs_sync_worker;
- vfsp->vfs_sync_work.w_vfs = vfsp;
- vfsp->vfs_sync_task = kthread_run(xfssyncd, vfsp, "xfssyncd");
- if (IS_ERR(vfsp->vfs_sync_task))
- return -PTR_ERR(vfsp->vfs_sync_task);
- return 0;
-}
-
-STATIC void
-linvfs_stop_syncd(
- vfs_t *vfsp)
-{
- kthread_stop(vfsp->vfs_sync_task);
-}
-
-STATIC void
-linvfs_put_super(
- struct super_block *sb)
-{
- vfs_t *vfsp = LINVFS_GET_VFS(sb);
- int error;
-
- linvfs_stop_syncd(vfsp);
- VFS_SYNC(vfsp, SYNC_ATTR|SYNC_DELWRI, NULL, error);
- if (!error)
- VFS_UNMOUNT(vfsp, 0, NULL, error);
- if (error) {
- printk("XFS unmount got error %d\n", error);
- printk("%s: vfsp/0x%p left dangling!\n", __FUNCTION__, vfsp);
- return;
- }
-
- vfs_deallocate(vfsp);
-}
-
-STATIC void
-linvfs_write_super(
- struct super_block *sb)
-{
- vfs_t *vfsp = LINVFS_GET_VFS(sb);
- int error;
-
- if (sb->s_flags & MS_RDONLY) {
- sb->s_dirt = 0; /* paranoia */
- return;
- }
- /* Push the log and superblock a little */
- VFS_SYNC(vfsp, SYNC_FSDATA, NULL, error);
- sb->s_dirt = 0;
-}
-
-STATIC int
-linvfs_sync_super(
- struct super_block *sb,
- int wait)
-{
- vfs_t *vfsp = LINVFS_GET_VFS(sb);
- int error;
- int flags = SYNC_FSDATA;
-
- if (unlikely(sb->s_frozen == SB_FREEZE_WRITE))
- flags = SYNC_QUIESCE;
- else
- flags = SYNC_FSDATA | (wait ? SYNC_WAIT : 0);
-
- VFS_SYNC(vfsp, flags, NULL, error);
- sb->s_dirt = 0;
-
- if (unlikely(laptop_mode)) {
- int prev_sync_seq = vfsp->vfs_sync_seq;
-
- /*
- * The disk must be active because we're syncing.
- * We schedule xfssyncd now (now that the disk is
- * active) instead of later (when it might not be).
- */
- wake_up_process(vfsp->vfs_sync_task);
- /*
- * We have to wait for the sync iteration to complete.
- * If we don't, the disk activity caused by the sync
- * will come after the sync is completed, and that
- * triggers another sync from laptop mode.
- */
- wait_event(vfsp->vfs_wait_single_sync_task,
- vfsp->vfs_sync_seq != prev_sync_seq);
- }
-
- return -error;
-}
-
-STATIC int
-linvfs_statfs(
- struct super_block *sb,
- struct kstatfs *statp)
-{
- vfs_t *vfsp = LINVFS_GET_VFS(sb);
- int error;
-
- VFS_STATVFS(vfsp, statp, NULL, error);
- return -error;
-}
-
-STATIC int
-linvfs_remount(
- struct super_block *sb,
- int *flags,
- char *options)
-{
- vfs_t *vfsp = LINVFS_GET_VFS(sb);
- struct xfs_mount_args *args = xfs_args_allocate(sb);
- int error;
-
- VFS_PARSEARGS(vfsp, options, args, 1, error);
- if (!error)
- VFS_MNTUPDATE(vfsp, flags, args, error);
- kmem_free(args, sizeof(*args));
- return -error;
-}
-
-STATIC void
-linvfs_freeze_fs(
- struct super_block *sb)
-{
- VFS_FREEZE(LINVFS_GET_VFS(sb));
-}
-
-STATIC int
-linvfs_show_options(
- struct seq_file *m,
- struct vfsmount *mnt)
-{
- struct vfs *vfsp = LINVFS_GET_VFS(mnt->mnt_sb);
- int error;
-
- VFS_SHOWARGS(vfsp, m, error);
- return error;
-}
-
-STATIC int
-linvfs_quotasync(
- struct super_block *sb,
- int type)
-{
- struct vfs *vfsp = LINVFS_GET_VFS(sb);
- int error;
-
- VFS_QUOTACTL(vfsp, Q_XQUOTASYNC, 0, (caddr_t)NULL, error);
- return -error;
-}
-
-STATIC int
-linvfs_getxstate(
- struct super_block *sb,
- struct fs_quota_stat *fqs)
-{
- struct vfs *vfsp = LINVFS_GET_VFS(sb);
- int error;
-
- VFS_QUOTACTL(vfsp, Q_XGETQSTAT, 0, (caddr_t)fqs, error);
- return -error;
-}
-
-STATIC int
-linvfs_setxstate(
- struct super_block *sb,
- unsigned int flags,
- int op)
-{
- struct vfs *vfsp = LINVFS_GET_VFS(sb);
- int error;
-
- VFS_QUOTACTL(vfsp, op, 0, (caddr_t)&flags, error);
- return -error;
-}
-
-STATIC int
-linvfs_getxquota(
- struct super_block *sb,
- int type,
- qid_t id,
- struct fs_disk_quota *fdq)
-{
- struct vfs *vfsp = LINVFS_GET_VFS(sb);
- int error, getmode;
-
- getmode = (type == USRQUOTA) ? Q_XGETQUOTA :
- ((type == GRPQUOTA) ? Q_XGETGQUOTA : Q_XGETPQUOTA);
- VFS_QUOTACTL(vfsp, getmode, id, (caddr_t)fdq, error);
- return -error;
-}
-
-STATIC int
-linvfs_setxquota(
- struct super_block *sb,
- int type,
- qid_t id,
- struct fs_disk_quota *fdq)
-{
- struct vfs *vfsp = LINVFS_GET_VFS(sb);
- int error, setmode;
-
- setmode = (type == USRQUOTA) ? Q_XSETQLIM :
- ((type == GRPQUOTA) ? Q_XSETGQLIM : Q_XSETPQLIM);
- VFS_QUOTACTL(vfsp, setmode, id, (caddr_t)fdq, error);
- return -error;
-}
-
-STATIC int
-linvfs_fill_super(
- struct super_block *sb,
- void *data,
- int silent)
-{
- vnode_t *rootvp;
- struct vfs *vfsp = vfs_allocate();
- struct xfs_mount_args *args = xfs_args_allocate(sb);
- struct kstatfs statvfs;
- int error, error2;
-
- vfsp->vfs_super = sb;
- LINVFS_SET_VFS(sb, vfsp);
- if (sb->s_flags & MS_RDONLY)
- vfsp->vfs_flag |= VFS_RDONLY;
- bhv_insert_all_vfsops(vfsp);
-
- VFS_PARSEARGS(vfsp, (char *)data, args, 0, error);
- if (error) {
- bhv_remove_all_vfsops(vfsp, 1);
- goto fail_vfsop;
- }
-
- sb_min_blocksize(sb, BBSIZE);
-#ifdef CONFIG_XFS_EXPORT
- sb->s_export_op = &linvfs_export_ops;
-#endif
- sb->s_qcop = &linvfs_qops;
- sb->s_op = &linvfs_sops;
-
- VFS_MOUNT(vfsp, args, NULL, error);
- if (error) {
- bhv_remove_all_vfsops(vfsp, 1);
- goto fail_vfsop;
- }
-
- VFS_STATVFS(vfsp, &statvfs, NULL, error);
- if (error)
- goto fail_unmount;
-
- sb->s_dirt = 1;
- sb->s_magic = statvfs.f_type;
- sb->s_blocksize = statvfs.f_bsize;
- sb->s_blocksize_bits = ffs(statvfs.f_bsize) - 1;
- sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
- sb->s_time_gran = 1;
- set_posix_acl_flag(sb);
-
- VFS_ROOT(vfsp, &rootvp, error);
- if (error)
- goto fail_unmount;
-
- sb->s_root = d_alloc_root(LINVFS_GET_IP(rootvp));
- if (!sb->s_root) {
- error = ENOMEM;
- goto fail_vnrele;
- }
- if (is_bad_inode(sb->s_root->d_inode)) {
- error = EINVAL;
- goto fail_vnrele;
- }
- if ((error = linvfs_start_syncd(vfsp)))
- goto fail_vnrele;
- vn_trace_exit(rootvp, __FUNCTION__, (inst_t *)__return_address);
-
- kmem_free(args, sizeof(*args));
- return 0;
-
-fail_vnrele:
- if (sb->s_root) {
- dput(sb->s_root);
- sb->s_root = NULL;
- } else {
- VN_RELE(rootvp);
- }
-
-fail_unmount:
- VFS_UNMOUNT(vfsp, 0, NULL, error2);
-
-fail_vfsop:
- vfs_deallocate(vfsp);
- kmem_free(args, sizeof(*args));
- return -error;
-}
-
-STATIC struct super_block *
-linvfs_get_sb(
- struct file_system_type *fs_type,
- int flags,
- const char *dev_name,
- void *data)
-{
- return get_sb_bdev(fs_type, flags, dev_name, data, linvfs_fill_super);
-}
-
-STATIC struct super_operations linvfs_sops = {
- .alloc_inode = linvfs_alloc_inode,
- .destroy_inode = linvfs_destroy_inode,
- .write_inode = linvfs_write_inode,
- .clear_inode = linvfs_clear_inode,
- .put_super = linvfs_put_super,
- .write_super = linvfs_write_super,
- .sync_fs = linvfs_sync_super,
- .write_super_lockfs = linvfs_freeze_fs,
- .statfs = linvfs_statfs,
- .remount_fs = linvfs_remount,
- .show_options = linvfs_show_options,
-};
-
-STATIC struct quotactl_ops linvfs_qops = {
- .quota_sync = linvfs_quotasync,
- .get_xstate = linvfs_getxstate,
- .set_xstate = linvfs_setxstate,
- .get_xquota = linvfs_getxquota,
- .set_xquota = linvfs_setxquota,
-};
-
-STATIC struct file_system_type xfs_fs_type = {
- .owner = THIS_MODULE,
- .name = "xfs",
- .get_sb = linvfs_get_sb,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
-};
-
-
-STATIC int __init
-init_xfs_fs( void )
-{
- int error;
- struct sysinfo si;
- static char message[] __initdata = KERN_INFO \
- XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n";
-
- printk(message);
-
- si_meminfo(&si);
- xfs_physmem = si.totalram;
-
- ktrace_init(64);
-
- error = linvfs_init_zones();
- if (error < 0)
- goto undo_zones;
-
- error = xfs_buf_init();
- if (error < 0)
- goto undo_buffers;
-
- vn_init();
- xfs_init();
- uuid_init();
- vfs_initquota();
-
- error = register_filesystem(&xfs_fs_type);
- if (error)
- goto undo_register;
- XFS_DM_INIT(&xfs_fs_type);
- return 0;
-
-undo_register:
- xfs_buf_terminate();
-
-undo_buffers:
- linvfs_destroy_zones();
-
-undo_zones:
- return error;
-}
-
-STATIC void __exit
-exit_xfs_fs( void )
-{
- vfs_exitquota();
- XFS_DM_EXIT(&xfs_fs_type);
- unregister_filesystem(&xfs_fs_type);
- xfs_cleanup();
- xfs_buf_terminate();
- linvfs_destroy_zones();
- ktrace_uninit();
-}
-
-module_init(init_xfs_fs);
-module_exit(exit_xfs_fs);
-
-MODULE_AUTHOR("Silicon Graphics, Inc.");
-MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled");
-MODULE_LICENSE("GPL");
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
deleted file mode 100644
index a0256497242..00000000000
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2001-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-
-static struct ctl_table_header *xfs_table_header;
-
-#ifdef CONFIG_PROC_FS
-STATIC int
-xfs_stats_clear_proc_handler(
- ctl_table *ctl,
- int write,
- struct file *filp,
- void __user *buffer,
- size_t *lenp,
- loff_t *ppos)
-{
- int c, ret, *valp = ctl->data;
- __uint32_t vn_active;
-
- ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos);
-
- if (!ret && write && *valp) {
- printk("XFS Clearing xfsstats\n");
- for (c = 0; c < NR_CPUS; c++) {
- if (!cpu_possible(c)) continue;
- preempt_disable();
- /* save vn_active, it's a universal truth! */
- vn_active = per_cpu(xfsstats, c).vn_active;
- memset(&per_cpu(xfsstats, c), 0,
- sizeof(struct xfsstats));
- per_cpu(xfsstats, c).vn_active = vn_active;
- preempt_enable();
- }
- xfs_stats_clear = 0;
- }
-
- return ret;
-}
-#endif /* CONFIG_PROC_FS */
-
-STATIC ctl_table xfs_table[] = {
- {XFS_RESTRICT_CHOWN, "restrict_chown", &xfs_params.restrict_chown.val,
- sizeof(int), 0644, NULL, &proc_dointvec_minmax,
- &sysctl_intvec, NULL,
- &xfs_params.restrict_chown.min, &xfs_params.restrict_chown.max},
-
- {XFS_SGID_INHERIT, "irix_sgid_inherit", &xfs_params.sgid_inherit.val,
- sizeof(int), 0644, NULL, &proc_dointvec_minmax,
- &sysctl_intvec, NULL,
- &xfs_params.sgid_inherit.min, &xfs_params.sgid_inherit.max},
-
- {XFS_SYMLINK_MODE, "irix_symlink_mode", &xfs_params.symlink_mode.val,
- sizeof(int), 0644, NULL, &proc_dointvec_minmax,
- &sysctl_intvec, NULL,
- &xfs_params.symlink_mode.min, &xfs_params.symlink_mode.max},
-
- {XFS_PANIC_MASK, "panic_mask", &xfs_params.panic_mask.val,
- sizeof(int), 0644, NULL, &proc_dointvec_minmax,
- &sysctl_intvec, NULL,
- &xfs_params.panic_mask.min, &xfs_params.panic_mask.max},
-
- {XFS_ERRLEVEL, "error_level", &xfs_params.error_level.val,
- sizeof(int), 0644, NULL, &proc_dointvec_minmax,
- &sysctl_intvec, NULL,
- &xfs_params.error_level.min, &xfs_params.error_level.max},
-
- {XFS_SYNCD_TIMER, "xfssyncd_centisecs", &xfs_params.syncd_timer.val,
- sizeof(int), 0644, NULL, &proc_dointvec_minmax,
- &sysctl_intvec, NULL,
- &xfs_params.syncd_timer.min, &xfs_params.syncd_timer.max},
-
- {XFS_INHERIT_SYNC, "inherit_sync", &xfs_params.inherit_sync.val,
- sizeof(int), 0644, NULL, &proc_dointvec_minmax,
- &sysctl_intvec, NULL,
- &xfs_params.inherit_sync.min, &xfs_params.inherit_sync.max},
-
- {XFS_INHERIT_NODUMP, "inherit_nodump", &xfs_params.inherit_nodump.val,
- sizeof(int), 0644, NULL, &proc_dointvec_minmax,
- &sysctl_intvec, NULL,
- &xfs_params.inherit_nodump.min, &xfs_params.inherit_nodump.max},
-
- {XFS_INHERIT_NOATIME, "inherit_noatime", &xfs_params.inherit_noatim.val,
- sizeof(int), 0644, NULL, &proc_dointvec_minmax,
- &sysctl_intvec, NULL,
- &xfs_params.inherit_noatim.min, &xfs_params.inherit_noatim.max},
-
- {XFS_BUF_TIMER, "xfsbufd_centisecs", &xfs_params.xfs_buf_timer.val,
- sizeof(int), 0644, NULL, &proc_dointvec_minmax,
- &sysctl_intvec, NULL,
- &xfs_params.xfs_buf_timer.min, &xfs_params.xfs_buf_timer.max},
-
- {XFS_BUF_AGE, "age_buffer_centisecs", &xfs_params.xfs_buf_age.val,
- sizeof(int), 0644, NULL, &proc_dointvec_minmax,
- &sysctl_intvec, NULL,
- &xfs_params.xfs_buf_age.min, &xfs_params.xfs_buf_age.max},
-
- {XFS_INHERIT_NOSYM, "inherit_nosymlinks", &xfs_params.inherit_nosym.val,
- sizeof(int), 0644, NULL, &proc_dointvec_minmax,
- &sysctl_intvec, NULL,
- &xfs_params.inherit_nosym.min, &xfs_params.inherit_nosym.max},
-
- {XFS_ROTORSTEP, "rotorstep", &xfs_params.rotorstep.val,
- sizeof(int), 0644, NULL, &proc_dointvec_minmax,
- &sysctl_intvec, NULL,
- &xfs_params.rotorstep.min, &xfs_params.rotorstep.max},
-
- /* please keep this the last entry */
-#ifdef CONFIG_PROC_FS
- {XFS_STATS_CLEAR, "stats_clear", &xfs_params.stats_clear.val,
- sizeof(int), 0644, NULL, &xfs_stats_clear_proc_handler,
- &sysctl_intvec, NULL,
- &xfs_params.stats_clear.min, &xfs_params.stats_clear.max},
-#endif /* CONFIG_PROC_FS */
-
- {0}
-};
-
-STATIC ctl_table xfs_dir_table[] = {
- {FS_XFS, "xfs", NULL, 0, 0555, xfs_table},
- {0}
-};
-
-STATIC ctl_table xfs_root_table[] = {
- {CTL_FS, "fs", NULL, 0, 0555, xfs_dir_table},
- {0}
-};
-
-void
-xfs_sysctl_register(void)
-{
- xfs_table_header = register_sysctl_table(xfs_root_table, 1);
-}
-
-void
-xfs_sysctl_unregister(void)
-{
- if (xfs_table_header)
- unregister_sysctl_table(xfs_table_header);
-}
diff --git a/fs/xfs/linux-2.6/xfs_vfs.c b/fs/xfs/linux-2.6/xfs_vfs.c
deleted file mode 100644
index c855d62e534..00000000000
--- a/fs/xfs/linux-2.6/xfs_vfs.c
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_inum.h"
-#include "xfs_log.h"
-#include "xfs_clnt.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_imap.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_quota.h"
-
-int
-vfs_mount(
- struct bhv_desc *bdp,
- struct xfs_mount_args *args,
- struct cred *cr)
-{
- struct bhv_desc *next = bdp;
-
- ASSERT(next);
- while (! (bhvtovfsops(next))->vfs_mount)
- next = BHV_NEXT(next);
- return ((*bhvtovfsops(next)->vfs_mount)(next, args, cr));
-}
-
-int
-vfs_parseargs(
- struct bhv_desc *bdp,
- char *s,
- struct xfs_mount_args *args,
- int f)
-{
- struct bhv_desc *next = bdp;
-
- ASSERT(next);
- while (! (bhvtovfsops(next))->vfs_parseargs)
- next = BHV_NEXT(next);
- return ((*bhvtovfsops(next)->vfs_parseargs)(next, s, args, f));
-}
-
-int
-vfs_showargs(
- struct bhv_desc *bdp,
- struct seq_file *m)
-{
- struct bhv_desc *next = bdp;
-
- ASSERT(next);
- while (! (bhvtovfsops(next))->vfs_showargs)
- next = BHV_NEXT(next);
- return ((*bhvtovfsops(next)->vfs_showargs)(next, m));
-}
-
-int
-vfs_unmount(
- struct bhv_desc *bdp,
- int fl,
- struct cred *cr)
-{
- struct bhv_desc *next = bdp;
-
- ASSERT(next);
- while (! (bhvtovfsops(next))->vfs_unmount)
- next = BHV_NEXT(next);
- return ((*bhvtovfsops(next)->vfs_unmount)(next, fl, cr));
-}
-
-int
-vfs_mntupdate(
- struct bhv_desc *bdp,
- int *fl,
- struct xfs_mount_args *args)
-{
- struct bhv_desc *next = bdp;
-
- ASSERT(next);
- while (! (bhvtovfsops(next))->vfs_mntupdate)
- next = BHV_NEXT(next);
- return ((*bhvtovfsops(next)->vfs_mntupdate)(next, fl, args));
-}
-
-int
-vfs_root(
- struct bhv_desc *bdp,
- struct vnode **vpp)
-{
- struct bhv_desc *next = bdp;
-
- ASSERT(next);
- while (! (bhvtovfsops(next))->vfs_root)
- next = BHV_NEXT(next);
- return ((*bhvtovfsops(next)->vfs_root)(next, vpp));
-}
-
-int
-vfs_statvfs(
- struct bhv_desc *bdp,
- xfs_statfs_t *sp,
- struct vnode *vp)
-{
- struct bhv_desc *next = bdp;
-
- ASSERT(next);
- while (! (bhvtovfsops(next))->vfs_statvfs)
- next = BHV_NEXT(next);
- return ((*bhvtovfsops(next)->vfs_statvfs)(next, sp, vp));
-}
-
-int
-vfs_sync(
- struct bhv_desc *bdp,
- int fl,
- struct cred *cr)
-{
- struct bhv_desc *next = bdp;
-
- ASSERT(next);
- while (! (bhvtovfsops(next))->vfs_sync)
- next = BHV_NEXT(next);
- return ((*bhvtovfsops(next)->vfs_sync)(next, fl, cr));
-}
-
-int
-vfs_vget(
- struct bhv_desc *bdp,
- struct vnode **vpp,
- struct fid *fidp)
-{
- struct bhv_desc *next = bdp;
-
- ASSERT(next);
- while (! (bhvtovfsops(next))->vfs_vget)
- next = BHV_NEXT(next);
- return ((*bhvtovfsops(next)->vfs_vget)(next, vpp, fidp));
-}
-
-int
-vfs_dmapiops(
- struct bhv_desc *bdp,
- caddr_t addr)
-{
- struct bhv_desc *next = bdp;
-
- ASSERT(next);
- while (! (bhvtovfsops(next))->vfs_dmapiops)
- next = BHV_NEXT(next);
- return ((*bhvtovfsops(next)->vfs_dmapiops)(next, addr));
-}
-
-int
-vfs_quotactl(
- struct bhv_desc *bdp,
- int cmd,
- int id,
- caddr_t addr)
-{
- struct bhv_desc *next = bdp;
-
- ASSERT(next);
- while (! (bhvtovfsops(next))->vfs_quotactl)
- next = BHV_NEXT(next);
- return ((*bhvtovfsops(next)->vfs_quotactl)(next, cmd, id, addr));
-}
-
-void
-vfs_init_vnode(
- struct bhv_desc *bdp,
- struct vnode *vp,
- struct bhv_desc *bp,
- int unlock)
-{
- struct bhv_desc *next = bdp;
-
- ASSERT(next);
- while (! (bhvtovfsops(next))->vfs_init_vnode)
- next = BHV_NEXT(next);
- ((*bhvtovfsops(next)->vfs_init_vnode)(next, vp, bp, unlock));
-}
-
-void
-vfs_force_shutdown(
- struct bhv_desc *bdp,
- int fl,
- char *file,
- int line)
-{
- struct bhv_desc *next = bdp;
-
- ASSERT(next);
- while (! (bhvtovfsops(next))->vfs_force_shutdown)
- next = BHV_NEXT(next);
- ((*bhvtovfsops(next)->vfs_force_shutdown)(next, fl, file, line));
-}
-
-void
-vfs_freeze(
- struct bhv_desc *bdp)
-{
- struct bhv_desc *next = bdp;
-
- ASSERT(next);
- while (! (bhvtovfsops(next))->vfs_freeze)
- next = BHV_NEXT(next);
- ((*bhvtovfsops(next)->vfs_freeze)(next));
-}
-
-vfs_t *
-vfs_allocate( void )
-{
- struct vfs *vfsp;
-
- vfsp = kmem_zalloc(sizeof(vfs_t), KM_SLEEP);
- bhv_head_init(VFS_BHVHEAD(vfsp), "vfs");
- INIT_LIST_HEAD(&vfsp->vfs_sync_list);
- spin_lock_init(&vfsp->vfs_sync_lock);
- init_waitqueue_head(&vfsp->vfs_wait_single_sync_task);
- return vfsp;
-}
-
-void
-vfs_deallocate(
- struct vfs *vfsp)
-{
- bhv_head_destroy(VFS_BHVHEAD(vfsp));
- kmem_free(vfsp, sizeof(vfs_t));
-}
-
-void
-vfs_insertops(
- struct vfs *vfsp,
- struct bhv_vfsops *vfsops)
-{
- struct bhv_desc *bdp;
-
- bdp = kmem_alloc(sizeof(struct bhv_desc), KM_SLEEP);
- bhv_desc_init(bdp, NULL, vfsp, vfsops);
- bhv_insert(&vfsp->vfs_bh, bdp);
-}
-
-void
-vfs_insertbhv(
- struct vfs *vfsp,
- struct bhv_desc *bdp,
- struct vfsops *vfsops,
- void *mount)
-{
- bhv_desc_init(bdp, mount, vfsp, vfsops);
- bhv_insert_initial(&vfsp->vfs_bh, bdp);
-}
-
-void
-bhv_remove_vfsops(
- struct vfs *vfsp,
- int pos)
-{
- struct bhv_desc *bhv;
-
- bhv = bhv_lookup_range(&vfsp->vfs_bh, pos, pos);
- if (!bhv)
- return;
- bhv_remove(&vfsp->vfs_bh, bhv);
- kmem_free(bhv, sizeof(*bhv));
-}
-
-void
-bhv_remove_all_vfsops(
- struct vfs *vfsp,
- int freebase)
-{
- struct xfs_mount *mp;
-
- bhv_remove_vfsops(vfsp, VFS_POSITION_QM);
- bhv_remove_vfsops(vfsp, VFS_POSITION_DM);
- if (!freebase)
- return;
- mp = XFS_BHVTOM(bhv_lookup(VFS_BHVHEAD(vfsp), &xfs_vfsops));
- VFS_REMOVEBHV(vfsp, &mp->m_bhv);
- xfs_mount_free(mp, 0);
-}
-
-void
-bhv_insert_all_vfsops(
- struct vfs *vfsp)
-{
- struct xfs_mount *mp;
-
- mp = xfs_mount_init();
- vfs_insertbhv(vfsp, &mp->m_bhv, &xfs_vfsops, mp);
- vfs_insertdmapi(vfsp);
- vfs_insertquota(vfsp);
-}
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
deleted file mode 100644
index 57caf9eddee..00000000000
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_VFS_H__
-#define __XFS_VFS_H__
-
-#include <linux/vfs.h>
-#include "xfs_fs.h"
-
-struct fid;
-struct vfs;
-struct cred;
-struct vnode;
-struct kstatfs;
-struct seq_file;
-struct super_block;
-struct xfs_mount_args;
-
-typedef struct kstatfs xfs_statfs_t;
-
-typedef struct vfs_sync_work {
- struct list_head w_list;
- struct vfs *w_vfs;
- void *w_data; /* syncer routine argument */
- void (*w_syncer)(struct vfs *, void *);
-} vfs_sync_work_t;
-
-typedef struct vfs {
- u_int vfs_flag; /* flags */
- xfs_fsid_t vfs_fsid; /* file system ID */
- xfs_fsid_t *vfs_altfsid; /* An ID fixed for life of FS */
- bhv_head_t vfs_bh; /* head of vfs behavior chain */
- struct super_block *vfs_super; /* generic superblock pointer */
- struct task_struct *vfs_sync_task; /* generalised sync thread */
- vfs_sync_work_t vfs_sync_work; /* work item for VFS_SYNC */
- struct list_head vfs_sync_list; /* sync thread work item list */
- spinlock_t vfs_sync_lock; /* work item list lock */
- int vfs_sync_seq; /* sync thread generation no. */
- wait_queue_head_t vfs_wait_single_sync_task;
-} vfs_t;
-
-#define vfs_fbhv vfs_bh.bh_first /* 1st on vfs behavior chain */
-
-#define bhvtovfs(bdp) ( (struct vfs *)BHV_VOBJ(bdp) )
-#define bhvtovfsops(bdp) ( (struct vfsops *)BHV_OPS(bdp) )
-#define VFS_BHVHEAD(vfs) ( &(vfs)->vfs_bh )
-#define VFS_REMOVEBHV(vfs, bdp) ( bhv_remove(VFS_BHVHEAD(vfs), bdp) )
-
-#define VFS_POSITION_BASE BHV_POSITION_BASE /* chain bottom */
-#define VFS_POSITION_TOP BHV_POSITION_TOP /* chain top */
-#define VFS_POSITION_INVALID BHV_POSITION_INVALID /* invalid pos. num */
-
-typedef enum {
- VFS_BHV_UNKNOWN, /* not specified */
- VFS_BHV_XFS, /* xfs */
- VFS_BHV_DM, /* data migration */
- VFS_BHV_QM, /* quota manager */
- VFS_BHV_IO, /* IO path */
- VFS_BHV_END /* housekeeping end-of-range */
-} vfs_bhv_t;
-
-#define VFS_POSITION_XFS (BHV_POSITION_BASE)
-#define VFS_POSITION_DM (VFS_POSITION_BASE+10)
-#define VFS_POSITION_QM (VFS_POSITION_BASE+20)
-#define VFS_POSITION_IO (VFS_POSITION_BASE+30)
-
-#define VFS_RDONLY 0x0001 /* read-only vfs */
-#define VFS_GRPID 0x0002 /* group-ID assigned from directory */
-#define VFS_DMI 0x0004 /* filesystem has the DMI enabled */
-#define VFS_32BITINODES 0x0008 /* do not use inums above 32 bits */
-#define VFS_END 0x0008 /* max flag */
-
-#define SYNC_ATTR 0x0001 /* sync attributes */
-#define SYNC_CLOSE 0x0002 /* close file system down */
-#define SYNC_DELWRI 0x0004 /* look at delayed writes */
-#define SYNC_WAIT 0x0008 /* wait for i/o to complete */
-#define SYNC_BDFLUSH 0x0010 /* BDFLUSH is calling -- don't block */
-#define SYNC_FSDATA 0x0020 /* flush fs data (e.g. superblocks) */
-#define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */
-#define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */
-#define SYNC_QUIESCE 0x0100 /* quiesce fileystem for a snapshot */
-
-typedef int (*vfs_mount_t)(bhv_desc_t *,
- struct xfs_mount_args *, struct cred *);
-typedef int (*vfs_parseargs_t)(bhv_desc_t *, char *,
- struct xfs_mount_args *, int);
-typedef int (*vfs_showargs_t)(bhv_desc_t *, struct seq_file *);
-typedef int (*vfs_unmount_t)(bhv_desc_t *, int, struct cred *);
-typedef int (*vfs_mntupdate_t)(bhv_desc_t *, int *,
- struct xfs_mount_args *);
-typedef int (*vfs_root_t)(bhv_desc_t *, struct vnode **);
-typedef int (*vfs_statvfs_t)(bhv_desc_t *, xfs_statfs_t *, struct vnode *);
-typedef int (*vfs_sync_t)(bhv_desc_t *, int, struct cred *);
-typedef int (*vfs_vget_t)(bhv_desc_t *, struct vnode **, struct fid *);
-typedef int (*vfs_dmapiops_t)(bhv_desc_t *, caddr_t);
-typedef int (*vfs_quotactl_t)(bhv_desc_t *, int, int, caddr_t);
-typedef void (*vfs_init_vnode_t)(bhv_desc_t *,
- struct vnode *, bhv_desc_t *, int);
-typedef void (*vfs_force_shutdown_t)(bhv_desc_t *, int, char *, int);
-typedef void (*vfs_freeze_t)(bhv_desc_t *);
-
-typedef struct vfsops {
- bhv_position_t vf_position; /* behavior chain position */
- vfs_mount_t vfs_mount; /* mount file system */
- vfs_parseargs_t vfs_parseargs; /* parse mount options */
- vfs_showargs_t vfs_showargs; /* unparse mount options */
- vfs_unmount_t vfs_unmount; /* unmount file system */
- vfs_mntupdate_t vfs_mntupdate; /* update file system options */
- vfs_root_t vfs_root; /* get root vnode */
- vfs_statvfs_t vfs_statvfs; /* file system statistics */
- vfs_sync_t vfs_sync; /* flush files */
- vfs_vget_t vfs_vget; /* get vnode from fid */
- vfs_dmapiops_t vfs_dmapiops; /* data migration */
- vfs_quotactl_t vfs_quotactl; /* disk quota */
- vfs_init_vnode_t vfs_init_vnode; /* initialize a new vnode */
- vfs_force_shutdown_t vfs_force_shutdown; /* crash and burn */
- vfs_freeze_t vfs_freeze; /* freeze fs for snapshot */
-} vfsops_t;
-
-/*
- * VFS's. Operates on vfs structure pointers (starts at bhv head).
- */
-#define VHEAD(v) ((v)->vfs_fbhv)
-#define VFS_MOUNT(v, ma,cr, rv) ((rv) = vfs_mount(VHEAD(v), ma,cr))
-#define VFS_PARSEARGS(v, o,ma,f, rv) ((rv) = vfs_parseargs(VHEAD(v), o,ma,f))
-#define VFS_SHOWARGS(v, m, rv) ((rv) = vfs_showargs(VHEAD(v), m))
-#define VFS_UNMOUNT(v, f, cr, rv) ((rv) = vfs_unmount(VHEAD(v), f,cr))
-#define VFS_MNTUPDATE(v, fl, args, rv) ((rv) = vfs_mntupdate(VHEAD(v), fl, args))
-#define VFS_ROOT(v, vpp, rv) ((rv) = vfs_root(VHEAD(v), vpp))
-#define VFS_STATVFS(v, sp,vp, rv) ((rv) = vfs_statvfs(VHEAD(v), sp,vp))
-#define VFS_SYNC(v, flag,cr, rv) ((rv) = vfs_sync(VHEAD(v), flag,cr))
-#define VFS_VGET(v, vpp,fidp, rv) ((rv) = vfs_vget(VHEAD(v), vpp,fidp))
-#define VFS_DMAPIOPS(v, p, rv) ((rv) = vfs_dmapiops(VHEAD(v), p))
-#define VFS_QUOTACTL(v, c,id,p, rv) ((rv) = vfs_quotactl(VHEAD(v), c,id,p))
-#define VFS_INIT_VNODE(v, vp,b,ul) ( vfs_init_vnode(VHEAD(v), vp,b,ul) )
-#define VFS_FORCE_SHUTDOWN(v, fl,f,l) ( vfs_force_shutdown(VHEAD(v), fl,f,l) )
-#define VFS_FREEZE(v) ( vfs_freeze(VHEAD(v)) )
-
-/*
- * PVFS's. Operates on behavior descriptor pointers.
- */
-#define PVFS_MOUNT(b, ma,cr, rv) ((rv) = vfs_mount(b, ma,cr))
-#define PVFS_PARSEARGS(b, o,ma,f, rv) ((rv) = vfs_parseargs(b, o,ma,f))
-#define PVFS_SHOWARGS(b, m, rv) ((rv) = vfs_showargs(b, m))
-#define PVFS_UNMOUNT(b, f,cr, rv) ((rv) = vfs_unmount(b, f,cr))
-#define PVFS_MNTUPDATE(b, fl, args, rv) ((rv) = vfs_mntupdate(b, fl, args))
-#define PVFS_ROOT(b, vpp, rv) ((rv) = vfs_root(b, vpp))
-#define PVFS_STATVFS(b, sp,vp, rv) ((rv) = vfs_statvfs(b, sp,vp))
-#define PVFS_SYNC(b, flag,cr, rv) ((rv) = vfs_sync(b, flag,cr))
-#define PVFS_VGET(b, vpp,fidp, rv) ((rv) = vfs_vget(b, vpp,fidp))
-#define PVFS_DMAPIOPS(b, p, rv) ((rv) = vfs_dmapiops(b, p))
-#define PVFS_QUOTACTL(b, c,id,p, rv) ((rv) = vfs_quotactl(b, c,id,p))
-#define PVFS_INIT_VNODE(b, vp,b2,ul) ( vfs_init_vnode(b, vp,b2,ul) )
-#define PVFS_FORCE_SHUTDOWN(b, fl,f,l) ( vfs_force_shutdown(b, fl,f,l) )
-#define PVFS_FREEZE(b) ( vfs_freeze(b) )
-
-extern int vfs_mount(bhv_desc_t *, struct xfs_mount_args *, struct cred *);
-extern int vfs_parseargs(bhv_desc_t *, char *, struct xfs_mount_args *, int);
-extern int vfs_showargs(bhv_desc_t *, struct seq_file *);
-extern int vfs_unmount(bhv_desc_t *, int, struct cred *);
-extern int vfs_mntupdate(bhv_desc_t *, int *, struct xfs_mount_args *);
-extern int vfs_root(bhv_desc_t *, struct vnode **);
-extern int vfs_statvfs(bhv_desc_t *, xfs_statfs_t *, struct vnode *);
-extern int vfs_sync(bhv_desc_t *, int, struct cred *);
-extern int vfs_vget(bhv_desc_t *, struct vnode **, struct fid *);
-extern int vfs_dmapiops(bhv_desc_t *, caddr_t);
-extern int vfs_quotactl(bhv_desc_t *, int, int, caddr_t);
-extern void vfs_init_vnode(bhv_desc_t *, struct vnode *, bhv_desc_t *, int);
-extern void vfs_force_shutdown(bhv_desc_t *, int, char *, int);
-extern void vfs_freeze(bhv_desc_t *);
-
-typedef struct bhv_vfsops {
- struct vfsops bhv_common;
- void * bhv_custom;
-} bhv_vfsops_t;
-
-#define vfs_bhv_lookup(v, id) ( bhv_lookup_range(&(v)->vfs_bh, (id), (id)) )
-#define vfs_bhv_custom(b) ( ((bhv_vfsops_t *)BHV_OPS(b))->bhv_custom )
-#define vfs_bhv_set_custom(b,o) ( (b)->bhv_custom = (void *)(o))
-#define vfs_bhv_clr_custom(b) ( (b)->bhv_custom = NULL )
-
-extern vfs_t *vfs_allocate(void);
-extern void vfs_deallocate(vfs_t *);
-extern void vfs_insertops(vfs_t *, bhv_vfsops_t *);
-extern void vfs_insertbhv(vfs_t *, bhv_desc_t *, vfsops_t *, void *);
-
-extern void bhv_insert_all_vfsops(struct vfs *);
-extern void bhv_remove_all_vfsops(struct vfs *, int);
-extern void bhv_remove_vfsops(struct vfs *, int);
-
-#define fs_frozen(vfsp) ((vfsp)->vfs_super->s_frozen)
-#define fs_check_frozen(vfsp, level) \
- vfs_check_frozen(vfsp->vfs_super, level);
-
-#endif /* __XFS_VFS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
deleted file mode 100644
index 260dd8415dd..00000000000
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-
-uint64_t vn_generation; /* vnode generation number */
-DEFINE_SPINLOCK(vnumber_lock);
-
-/*
- * Dedicated vnode inactive/reclaim sync semaphores.
- * Prime number of hash buckets since address is used as the key.
- */
-#define NVSYNC 37
-#define vptosync(v) (&vsync[((unsigned long)v) % NVSYNC])
-STATIC wait_queue_head_t vsync[NVSYNC];
-
-void
-vn_init(void)
-{
- int i;
-
- for (i = 0; i < NVSYNC; i++)
- init_waitqueue_head(&vsync[i]);
-}
-
-void
-vn_iowait(
- struct vnode *vp)
-{
- wait_queue_head_t *wq = vptosync(vp);
-
- wait_event(*wq, (atomic_read(&vp->v_iocount) == 0));
-}
-
-void
-vn_iowake(
- struct vnode *vp)
-{
- if (atomic_dec_and_test(&vp->v_iocount))
- wake_up(vptosync(vp));
-}
-
-struct vnode *
-vn_initialize(
- struct inode *inode)
-{
- struct vnode *vp = LINVFS_GET_VP(inode);
-
- XFS_STATS_INC(vn_active);
- XFS_STATS_INC(vn_alloc);
-
- vp->v_flag = VMODIFIED;
- spinlock_init(&vp->v_lock, "v_lock");
-
- spin_lock(&vnumber_lock);
- if (!++vn_generation) /* v_number shouldn't be zero */
- vn_generation++;
- vp->v_number = vn_generation;
- spin_unlock(&vnumber_lock);
-
- ASSERT(VN_CACHED(vp) == 0);
-
- /* Initialize the first behavior and the behavior chain head. */
- vn_bhv_head_init(VN_BHV_HEAD(vp), "vnode");
-
- atomic_set(&vp->v_iocount, 0);
-
-#ifdef XFS_VNODE_TRACE
- vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, KM_SLEEP);
-#endif /* XFS_VNODE_TRACE */
-
- vn_trace_exit(vp, "vn_initialize", (inst_t *)__return_address);
- return vp;
-}
-
-/*
- * Revalidate the Linux inode from the vattr.
- * Note: i_size _not_ updated; we must hold the inode
- * semaphore when doing that - callers responsibility.
- */
-void
-vn_revalidate_core(
- struct vnode *vp,
- vattr_t *vap)
-{
- struct inode *inode = LINVFS_GET_IP(vp);
-
- inode->i_mode = vap->va_mode;
- inode->i_nlink = vap->va_nlink;
- inode->i_uid = vap->va_uid;
- inode->i_gid = vap->va_gid;
- inode->i_blocks = vap->va_nblocks;
- inode->i_mtime = vap->va_mtime;
- inode->i_ctime = vap->va_ctime;
- inode->i_blksize = vap->va_blocksize;
- if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
- inode->i_flags |= S_IMMUTABLE;
- else
- inode->i_flags &= ~S_IMMUTABLE;
- if (vap->va_xflags & XFS_XFLAG_APPEND)
- inode->i_flags |= S_APPEND;
- else
- inode->i_flags &= ~S_APPEND;
- if (vap->va_xflags & XFS_XFLAG_SYNC)
- inode->i_flags |= S_SYNC;
- else
- inode->i_flags &= ~S_SYNC;
- if (vap->va_xflags & XFS_XFLAG_NOATIME)
- inode->i_flags |= S_NOATIME;
- else
- inode->i_flags &= ~S_NOATIME;
-}
-
-/*
- * Revalidate the Linux inode from the vnode.
- */
-int
-vn_revalidate(
- struct vnode *vp)
-{
- vattr_t va;
- int error;
-
- vn_trace_entry(vp, "vn_revalidate", (inst_t *)__return_address);
- ASSERT(vp->v_fbhv != NULL);
-
- va.va_mask = XFS_AT_STAT|XFS_AT_XFLAGS;
- VOP_GETATTR(vp, &va, 0, NULL, error);
- if (!error) {
- vn_revalidate_core(vp, &va);
- VUNMODIFY(vp);
- }
- return -error;
-}
-
-/*
- * Add a reference to a referenced vnode.
- */
-struct vnode *
-vn_hold(
- struct vnode *vp)
-{
- struct inode *inode;
-
- XFS_STATS_INC(vn_hold);
-
- VN_LOCK(vp);
- inode = igrab(LINVFS_GET_IP(vp));
- ASSERT(inode);
- VN_UNLOCK(vp, 0);
-
- return vp;
-}
-
-#ifdef XFS_VNODE_TRACE
-
-#define KTRACE_ENTER(vp, vk, s, line, ra) \
- ktrace_enter( (vp)->v_trace, \
-/* 0 */ (void *)(__psint_t)(vk), \
-/* 1 */ (void *)(s), \
-/* 2 */ (void *)(__psint_t) line, \
-/* 3 */ (void *)(__psint_t)(vn_count(vp)), \
-/* 4 */ (void *)(ra), \
-/* 5 */ (void *)(__psunsigned_t)(vp)->v_flag, \
-/* 6 */ (void *)(__psint_t)current_cpu(), \
-/* 7 */ (void *)(__psint_t)current_pid(), \
-/* 8 */ (void *)__return_address, \
-/* 9 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL)
-
-/*
- * Vnode tracing code.
- */
-void
-vn_trace_entry(vnode_t *vp, const char *func, inst_t *ra)
-{
- KTRACE_ENTER(vp, VNODE_KTRACE_ENTRY, func, 0, ra);
-}
-
-void
-vn_trace_exit(vnode_t *vp, const char *func, inst_t *ra)
-{
- KTRACE_ENTER(vp, VNODE_KTRACE_EXIT, func, 0, ra);
-}
-
-void
-vn_trace_hold(vnode_t *vp, char *file, int line, inst_t *ra)
-{
- KTRACE_ENTER(vp, VNODE_KTRACE_HOLD, file, line, ra);
-}
-
-void
-vn_trace_ref(vnode_t *vp, char *file, int line, inst_t *ra)
-{
- KTRACE_ENTER(vp, VNODE_KTRACE_REF, file, line, ra);
-}
-
-void
-vn_trace_rele(vnode_t *vp, char *file, int line, inst_t *ra)
-{
- KTRACE_ENTER(vp, VNODE_KTRACE_RELE, file, line, ra);
-}
-#endif /* XFS_VNODE_TRACE */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
deleted file mode 100644
index 0fe2419461d..00000000000
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ /dev/null
@@ -1,644 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Portions Copyright (c) 1989, 1993
- * The Regents of the University of California. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-#ifndef __XFS_VNODE_H__
-#define __XFS_VNODE_H__
-
-struct uio;
-struct file;
-struct vattr;
-struct xfs_iomap;
-struct attrlist_cursor_kern;
-
-
-typedef xfs_ino_t vnumber_t;
-typedef struct dentry vname_t;
-typedef bhv_head_t vn_bhv_head_t;
-
-/*
- * MP locking protocols:
- * v_flag, v_vfsp VN_LOCK/VN_UNLOCK
- */
-typedef struct vnode {
- __u32 v_flag; /* vnode flags (see below) */
- struct vfs *v_vfsp; /* ptr to containing VFS */
- vnumber_t v_number; /* in-core vnode number */
- vn_bhv_head_t v_bh; /* behavior head */
- spinlock_t v_lock; /* VN_LOCK/VN_UNLOCK */
- atomic_t v_iocount; /* outstanding I/O count */
-#ifdef XFS_VNODE_TRACE
- struct ktrace *v_trace; /* trace header structure */
-#endif
- struct inode v_inode; /* Linux inode */
- /* inode MUST be last */
-} vnode_t;
-
-#define VN_ISLNK(vp) S_ISLNK((vp)->v_inode.i_mode)
-#define VN_ISREG(vp) S_ISREG((vp)->v_inode.i_mode)
-#define VN_ISDIR(vp) S_ISDIR((vp)->v_inode.i_mode)
-#define VN_ISCHR(vp) S_ISCHR((vp)->v_inode.i_mode)
-#define VN_ISBLK(vp) S_ISBLK((vp)->v_inode.i_mode)
-
-#define v_fbhv v_bh.bh_first /* first behavior */
-#define v_fops v_bh.bh_first->bd_ops /* first behavior ops */
-
-#define VNODE_POSITION_BASE BHV_POSITION_BASE /* chain bottom */
-#define VNODE_POSITION_TOP BHV_POSITION_TOP /* chain top */
-#define VNODE_POSITION_INVALID BHV_POSITION_INVALID /* invalid pos. num */
-
-typedef enum {
- VN_BHV_UNKNOWN, /* not specified */
- VN_BHV_XFS, /* xfs */
- VN_BHV_DM, /* data migration */
- VN_BHV_QM, /* quota manager */
- VN_BHV_IO, /* IO path */
- VN_BHV_END /* housekeeping end-of-range */
-} vn_bhv_t;
-
-#define VNODE_POSITION_XFS (VNODE_POSITION_BASE)
-#define VNODE_POSITION_DM (VNODE_POSITION_BASE+10)
-#define VNODE_POSITION_QM (VNODE_POSITION_BASE+20)
-#define VNODE_POSITION_IO (VNODE_POSITION_BASE+30)
-
-/*
- * Macros for dealing with the behavior descriptor inside of the vnode.
- */
-#define BHV_TO_VNODE(bdp) ((vnode_t *)BHV_VOBJ(bdp))
-#define BHV_TO_VNODE_NULL(bdp) ((vnode_t *)BHV_VOBJNULL(bdp))
-
-#define VN_BHV_HEAD(vp) ((bhv_head_t *)(&((vp)->v_bh)))
-#define vn_bhv_head_init(bhp,name) bhv_head_init(bhp,name)
-#define vn_bhv_remove(bhp,bdp) bhv_remove(bhp,bdp)
-#define vn_bhv_lookup(bhp,ops) bhv_lookup(bhp,ops)
-#define vn_bhv_lookup_unlocked(bhp,ops) bhv_lookup_unlocked(bhp,ops)
-
-/*
- * Vnode to Linux inode mapping.
- */
-#define LINVFS_GET_VP(inode) ((vnode_t *)list_entry(inode, vnode_t, v_inode))
-#define LINVFS_GET_IP(vp) (&(vp)->v_inode)
-
-/*
- * Vnode flags.
- */
-#define VMODIFIED 0x8 /* XFS inode state possibly differs */
- /* to the Linux inode state. */
-
-/*
- * Values for the VOP_RWLOCK and VOP_RWUNLOCK flags parameter.
- */
-typedef enum vrwlock {
- VRWLOCK_NONE,
- VRWLOCK_READ,
- VRWLOCK_WRITE,
- VRWLOCK_WRITE_DIRECT,
- VRWLOCK_TRY_READ,
- VRWLOCK_TRY_WRITE
-} vrwlock_t;
-
-/*
- * Return values for VOP_INACTIVE. A return value of
- * VN_INACTIVE_NOCACHE implies that the file system behavior
- * has disassociated its state and bhv_desc_t from the vnode.
- */
-#define VN_INACTIVE_CACHE 0
-#define VN_INACTIVE_NOCACHE 1
-
-/*
- * Values for the cmd code given to VOP_VNODE_CHANGE.
- */
-typedef enum vchange {
- VCHANGE_FLAGS_FRLOCKS = 0,
- VCHANGE_FLAGS_ENF_LOCKING = 1,
- VCHANGE_FLAGS_TRUNCATED = 2,
- VCHANGE_FLAGS_PAGE_DIRTY = 3,
- VCHANGE_FLAGS_IOEXCL_COUNT = 4
-} vchange_t;
-
-
-typedef int (*vop_open_t)(bhv_desc_t *, struct cred *);
-typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct kiocb *,
- const struct iovec *, unsigned int,
- loff_t *, int, struct cred *);
-typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *,
- const struct iovec *, unsigned int,
- loff_t *, int, struct cred *);
-typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *,
- loff_t *, int, size_t, read_actor_t,
- void *, struct cred *);
-typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *,
- int, unsigned int, void __user *);
-typedef int (*vop_getattr_t)(bhv_desc_t *, struct vattr *, int,
- struct cred *);
-typedef int (*vop_setattr_t)(bhv_desc_t *, struct vattr *, int,
- struct cred *);
-typedef int (*vop_access_t)(bhv_desc_t *, int, struct cred *);
-typedef int (*vop_lookup_t)(bhv_desc_t *, vname_t *, vnode_t **,
- int, vnode_t *, struct cred *);
-typedef int (*vop_create_t)(bhv_desc_t *, vname_t *, struct vattr *,
- vnode_t **, struct cred *);
-typedef int (*vop_remove_t)(bhv_desc_t *, vname_t *, struct cred *);
-typedef int (*vop_link_t)(bhv_desc_t *, vnode_t *, vname_t *,
- struct cred *);
-typedef int (*vop_rename_t)(bhv_desc_t *, vname_t *, vnode_t *, vname_t *,
- struct cred *);
-typedef int (*vop_mkdir_t)(bhv_desc_t *, vname_t *, struct vattr *,
- vnode_t **, struct cred *);
-typedef int (*vop_rmdir_t)(bhv_desc_t *, vname_t *, struct cred *);
-typedef int (*vop_readdir_t)(bhv_desc_t *, struct uio *, struct cred *,
- int *);
-typedef int (*vop_symlink_t)(bhv_desc_t *, vname_t *, struct vattr *,
- char *, vnode_t **, struct cred *);
-typedef int (*vop_readlink_t)(bhv_desc_t *, struct uio *, int,
- struct cred *);
-typedef int (*vop_fsync_t)(bhv_desc_t *, int, struct cred *,
- xfs_off_t, xfs_off_t);
-typedef int (*vop_inactive_t)(bhv_desc_t *, struct cred *);
-typedef int (*vop_fid2_t)(bhv_desc_t *, struct fid *);
-typedef int (*vop_release_t)(bhv_desc_t *);
-typedef int (*vop_rwlock_t)(bhv_desc_t *, vrwlock_t);
-typedef void (*vop_rwunlock_t)(bhv_desc_t *, vrwlock_t);
-typedef int (*vop_bmap_t)(bhv_desc_t *, xfs_off_t, ssize_t, int,
- struct xfs_iomap *, int *);
-typedef int (*vop_reclaim_t)(bhv_desc_t *);
-typedef int (*vop_attr_get_t)(bhv_desc_t *, const char *, char *, int *,
- int, struct cred *);
-typedef int (*vop_attr_set_t)(bhv_desc_t *, const char *, char *, int,
- int, struct cred *);
-typedef int (*vop_attr_remove_t)(bhv_desc_t *, const char *,
- int, struct cred *);
-typedef int (*vop_attr_list_t)(bhv_desc_t *, char *, int, int,
- struct attrlist_cursor_kern *, struct cred *);
-typedef void (*vop_link_removed_t)(bhv_desc_t *, vnode_t *, int);
-typedef void (*vop_vnode_change_t)(bhv_desc_t *, vchange_t, __psint_t);
-typedef void (*vop_ptossvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
-typedef void (*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
-typedef int (*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t,
- uint64_t, int);
-typedef int (*vop_iflush_t)(bhv_desc_t *, int);
-
-
-typedef struct vnodeops {
- bhv_position_t vn_position; /* position within behavior chain */
- vop_open_t vop_open;
- vop_read_t vop_read;
- vop_write_t vop_write;
- vop_sendfile_t vop_sendfile;
- vop_ioctl_t vop_ioctl;
- vop_getattr_t vop_getattr;
- vop_setattr_t vop_setattr;
- vop_access_t vop_access;
- vop_lookup_t vop_lookup;
- vop_create_t vop_create;
- vop_remove_t vop_remove;
- vop_link_t vop_link;
- vop_rename_t vop_rename;
- vop_mkdir_t vop_mkdir;
- vop_rmdir_t vop_rmdir;
- vop_readdir_t vop_readdir;
- vop_symlink_t vop_symlink;
- vop_readlink_t vop_readlink;
- vop_fsync_t vop_fsync;
- vop_inactive_t vop_inactive;
- vop_fid2_t vop_fid2;
- vop_rwlock_t vop_rwlock;
- vop_rwunlock_t vop_rwunlock;
- vop_bmap_t vop_bmap;
- vop_reclaim_t vop_reclaim;
- vop_attr_get_t vop_attr_get;
- vop_attr_set_t vop_attr_set;
- vop_attr_remove_t vop_attr_remove;
- vop_attr_list_t vop_attr_list;
- vop_link_removed_t vop_link_removed;
- vop_vnode_change_t vop_vnode_change;
- vop_ptossvp_t vop_tosspages;
- vop_pflushinvalvp_t vop_flushinval_pages;
- vop_pflushvp_t vop_flush_pages;
- vop_release_t vop_release;
- vop_iflush_t vop_iflush;
-} vnodeops_t;
-
-/*
- * VOP's.
- */
-#define _VOP_(op, vp) (*((vnodeops_t *)(vp)->v_fops)->op)
-
-#define VOP_READ(vp,file,iov,segs,offset,ioflags,cr,rv) \
- rv = _VOP_(vop_read, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr)
-#define VOP_WRITE(vp,file,iov,segs,offset,ioflags,cr,rv) \
- rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr)
-#define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv) \
- rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr)
-#define VOP_BMAP(vp,of,sz,rw,b,n,rv) \
- rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n)
-#define VOP_OPEN(vp, cr, rv) \
- rv = _VOP_(vop_open, vp)((vp)->v_fbhv, cr)
-#define VOP_GETATTR(vp, vap, f, cr, rv) \
- rv = _VOP_(vop_getattr, vp)((vp)->v_fbhv, vap, f, cr)
-#define VOP_SETATTR(vp, vap, f, cr, rv) \
- rv = _VOP_(vop_setattr, vp)((vp)->v_fbhv, vap, f, cr)
-#define VOP_ACCESS(vp, mode, cr, rv) \
- rv = _VOP_(vop_access, vp)((vp)->v_fbhv, mode, cr)
-#define VOP_LOOKUP(vp,d,vpp,f,rdir,cr,rv) \
- rv = _VOP_(vop_lookup, vp)((vp)->v_fbhv,d,vpp,f,rdir,cr)
-#define VOP_CREATE(dvp,d,vap,vpp,cr,rv) \
- rv = _VOP_(vop_create, dvp)((dvp)->v_fbhv,d,vap,vpp,cr)
-#define VOP_REMOVE(dvp,d,cr,rv) \
- rv = _VOP_(vop_remove, dvp)((dvp)->v_fbhv,d,cr)
-#define VOP_LINK(tdvp,fvp,d,cr,rv) \
- rv = _VOP_(vop_link, tdvp)((tdvp)->v_fbhv,fvp,d,cr)
-#define VOP_RENAME(fvp,fnm,tdvp,tnm,cr,rv) \
- rv = _VOP_(vop_rename, fvp)((fvp)->v_fbhv,fnm,tdvp,tnm,cr)
-#define VOP_MKDIR(dp,d,vap,vpp,cr,rv) \
- rv = _VOP_(vop_mkdir, dp)((dp)->v_fbhv,d,vap,vpp,cr)
-#define VOP_RMDIR(dp,d,cr,rv) \
- rv = _VOP_(vop_rmdir, dp)((dp)->v_fbhv,d,cr)
-#define VOP_READDIR(vp,uiop,cr,eofp,rv) \
- rv = _VOP_(vop_readdir, vp)((vp)->v_fbhv,uiop,cr,eofp)
-#define VOP_SYMLINK(dvp,d,vap,tnm,vpp,cr,rv) \
- rv = _VOP_(vop_symlink, dvp) ((dvp)->v_fbhv,d,vap,tnm,vpp,cr)
-#define VOP_READLINK(vp,uiop,fl,cr,rv) \
- rv = _VOP_(vop_readlink, vp)((vp)->v_fbhv,uiop,fl,cr)
-#define VOP_FSYNC(vp,f,cr,b,e,rv) \
- rv = _VOP_(vop_fsync, vp)((vp)->v_fbhv,f,cr,b,e)
-#define VOP_INACTIVE(vp, cr, rv) \
- rv = _VOP_(vop_inactive, vp)((vp)->v_fbhv, cr)
-#define VOP_RELEASE(vp, rv) \
- rv = _VOP_(vop_release, vp)((vp)->v_fbhv)
-#define VOP_FID2(vp, fidp, rv) \
- rv = _VOP_(vop_fid2, vp)((vp)->v_fbhv, fidp)
-#define VOP_RWLOCK(vp,i) \
- (void)_VOP_(vop_rwlock, vp)((vp)->v_fbhv, i)
-#define VOP_RWLOCK_TRY(vp,i) \
- _VOP_(vop_rwlock, vp)((vp)->v_fbhv, i)
-#define VOP_RWUNLOCK(vp,i) \
- (void)_VOP_(vop_rwunlock, vp)((vp)->v_fbhv, i)
-#define VOP_FRLOCK(vp,c,fl,flags,offset,fr,rv) \
- rv = _VOP_(vop_frlock, vp)((vp)->v_fbhv,c,fl,flags,offset,fr)
-#define VOP_RECLAIM(vp, rv) \
- rv = _VOP_(vop_reclaim, vp)((vp)->v_fbhv)
-#define VOP_ATTR_GET(vp, name, val, vallenp, fl, cred, rv) \
- rv = _VOP_(vop_attr_get, vp)((vp)->v_fbhv,name,val,vallenp,fl,cred)
-#define VOP_ATTR_SET(vp, name, val, vallen, fl, cred, rv) \
- rv = _VOP_(vop_attr_set, vp)((vp)->v_fbhv,name,val,vallen,fl,cred)
-#define VOP_ATTR_REMOVE(vp, name, flags, cred, rv) \
- rv = _VOP_(vop_attr_remove, vp)((vp)->v_fbhv,name,flags,cred)
-#define VOP_ATTR_LIST(vp, buf, buflen, fl, cursor, cred, rv) \
- rv = _VOP_(vop_attr_list, vp)((vp)->v_fbhv,buf,buflen,fl,cursor,cred)
-#define VOP_LINK_REMOVED(vp, dvp, linkzero) \
- (void)_VOP_(vop_link_removed, vp)((vp)->v_fbhv, dvp, linkzero)
-#define VOP_VNODE_CHANGE(vp, cmd, val) \
- (void)_VOP_(vop_vnode_change, vp)((vp)->v_fbhv,cmd,val)
-/*
- * These are page cache functions that now go thru VOPs.
- * 'last' parameter is unused and left in for IRIX compatibility
- */
-#define VOP_TOSS_PAGES(vp, first, last, fiopt) \
- _VOP_(vop_tosspages, vp)((vp)->v_fbhv,first, last, fiopt)
-/*
- * 'last' parameter is unused and left in for IRIX compatibility
- */
-#define VOP_FLUSHINVAL_PAGES(vp, first, last, fiopt) \
- _VOP_(vop_flushinval_pages, vp)((vp)->v_fbhv,first,last,fiopt)
-/*
- * 'last' parameter is unused and left in for IRIX compatibility
- */
-#define VOP_FLUSH_PAGES(vp, first, last, flags, fiopt, rv) \
- rv = _VOP_(vop_flush_pages, vp)((vp)->v_fbhv,first,last,flags,fiopt)
-#define VOP_IOCTL(vp, inode, filp, fl, cmd, arg, rv) \
- rv = _VOP_(vop_ioctl, vp)((vp)->v_fbhv,inode,filp,fl,cmd,arg)
-#define VOP_IFLUSH(vp, flags, rv) \
- rv = _VOP_(vop_iflush, vp)((vp)->v_fbhv, flags)
-
-/*
- * Flags for read/write calls - same values as IRIX
- */
-#define IO_ISAIO 0x00001 /* don't wait for completion */
-#define IO_ISDIRECT 0x00004 /* bypass page cache */
-#define IO_INVIS 0x00020 /* don't update inode timestamps */
-
-/*
- * Flags for VOP_IFLUSH call
- */
-#define FLUSH_SYNC 1 /* wait for flush to complete */
-#define FLUSH_INODE 2 /* flush the inode itself */
-#define FLUSH_LOG 4 /* force the last log entry for
- * this inode out to disk */
-
-/*
- * Flush/Invalidate options for VOP_TOSS_PAGES, VOP_FLUSHINVAL_PAGES and
- * VOP_FLUSH_PAGES.
- */
-#define FI_NONE 0 /* none */
-#define FI_REMAPF 1 /* Do a remapf prior to the operation */
-#define FI_REMAPF_LOCKED 2 /* Do a remapf prior to the operation.
- Prevent VM access to the pages until
- the operation completes. */
-
-/*
- * Vnode attributes. va_mask indicates those attributes the caller
- * wants to set or extract.
- */
-typedef struct vattr {
- int va_mask; /* bit-mask of attributes present */
- mode_t va_mode; /* file access mode and type */
- xfs_nlink_t va_nlink; /* number of references to file */
- uid_t va_uid; /* owner user id */
- gid_t va_gid; /* owner group id */
- xfs_ino_t va_nodeid; /* file id */
- xfs_off_t va_size; /* file size in bytes */
- u_long va_blocksize; /* blocksize preferred for i/o */
- struct timespec va_atime; /* time of last access */
- struct timespec va_mtime; /* time of last modification */
- struct timespec va_ctime; /* time file changed */
- u_int va_gen; /* generation number of file */
- xfs_dev_t va_rdev; /* device the special file represents */
- __int64_t va_nblocks; /* number of blocks allocated */
- u_long va_xflags; /* random extended file flags */
- u_long va_extsize; /* file extent size */
- u_long va_nextents; /* number of extents in file */
- u_long va_anextents; /* number of attr extents in file */
- prid_t va_projid; /* project id */
-} vattr_t;
-
-/*
- * setattr or getattr attributes
- */
-#define XFS_AT_TYPE 0x00000001
-#define XFS_AT_MODE 0x00000002
-#define XFS_AT_UID 0x00000004
-#define XFS_AT_GID 0x00000008
-#define XFS_AT_FSID 0x00000010
-#define XFS_AT_NODEID 0x00000020
-#define XFS_AT_NLINK 0x00000040
-#define XFS_AT_SIZE 0x00000080
-#define XFS_AT_ATIME 0x00000100
-#define XFS_AT_MTIME 0x00000200
-#define XFS_AT_CTIME 0x00000400
-#define XFS_AT_RDEV 0x00000800
-#define XFS_AT_BLKSIZE 0x00001000
-#define XFS_AT_NBLOCKS 0x00002000
-#define XFS_AT_VCODE 0x00004000
-#define XFS_AT_MAC 0x00008000
-#define XFS_AT_UPDATIME 0x00010000
-#define XFS_AT_UPDMTIME 0x00020000
-#define XFS_AT_UPDCTIME 0x00040000
-#define XFS_AT_ACL 0x00080000
-#define XFS_AT_CAP 0x00100000
-#define XFS_AT_INF 0x00200000
-#define XFS_AT_XFLAGS 0x00400000
-#define XFS_AT_EXTSIZE 0x00800000
-#define XFS_AT_NEXTENTS 0x01000000
-#define XFS_AT_ANEXTENTS 0x02000000
-#define XFS_AT_PROJID 0x04000000
-#define XFS_AT_SIZE_NOPERM 0x08000000
-#define XFS_AT_GENCOUNT 0x10000000
-
-#define XFS_AT_ALL (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\
- XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\
- XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\
- XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|XFS_AT_MAC|\
- XFS_AT_ACL|XFS_AT_CAP|XFS_AT_INF|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|\
- XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_PROJID|XFS_AT_GENCOUNT)
-
-#define XFS_AT_STAT (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\
- XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\
- XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\
- XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_PROJID)
-
-#define XFS_AT_TIMES (XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME)
-
-#define XFS_AT_UPDTIMES (XFS_AT_UPDATIME|XFS_AT_UPDMTIME|XFS_AT_UPDCTIME)
-
-#define XFS_AT_NOSET (XFS_AT_NLINK|XFS_AT_RDEV|XFS_AT_FSID|XFS_AT_NODEID|\
- XFS_AT_TYPE|XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|\
- XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_GENCOUNT)
-
-/*
- * Modes.
- */
-#define VSUID S_ISUID /* set user id on execution */
-#define VSGID S_ISGID /* set group id on execution */
-#define VSVTX S_ISVTX /* save swapped text even after use */
-#define VREAD S_IRUSR /* read, write, execute permissions */
-#define VWRITE S_IWUSR
-#define VEXEC S_IXUSR
-
-#define MODEMASK S_IALLUGO /* mode bits plus permission bits */
-
-/*
- * Check whether mandatory file locking is enabled.
- */
-#define MANDLOCK(vp, mode) \
- (VN_ISREG(vp) && ((mode) & (VSGID|(VEXEC>>3))) == VSGID)
-
-extern void vn_init(void);
-extern vnode_t *vn_initialize(struct inode *);
-
-/*
- * vnode_map structures _must_ match vn_epoch and vnode structure sizes.
- */
-typedef struct vnode_map {
- vfs_t *v_vfsp;
- vnumber_t v_number; /* in-core vnode number */
- xfs_ino_t v_ino; /* inode # */
-} vmap_t;
-
-#define VMAP(vp, vmap) {(vmap).v_vfsp = (vp)->v_vfsp, \
- (vmap).v_number = (vp)->v_number, \
- (vmap).v_ino = (vp)->v_inode.i_ino; }
-
-extern int vn_revalidate(struct vnode *);
-extern void vn_revalidate_core(struct vnode *, vattr_t *);
-
-extern void vn_iowait(struct vnode *vp);
-extern void vn_iowake(struct vnode *vp);
-
-static inline int vn_count(struct vnode *vp)
-{
- return atomic_read(&LINVFS_GET_IP(vp)->i_count);
-}
-
-/*
- * Vnode reference counting functions (and macros for compatibility).
- */
-extern vnode_t *vn_hold(struct vnode *);
-
-#if defined(XFS_VNODE_TRACE)
-#define VN_HOLD(vp) \
- ((void)vn_hold(vp), \
- vn_trace_hold(vp, __FILE__, __LINE__, (inst_t *)__return_address))
-#define VN_RELE(vp) \
- (vn_trace_rele(vp, __FILE__, __LINE__, (inst_t *)__return_address), \
- iput(LINVFS_GET_IP(vp)))
-#else
-#define VN_HOLD(vp) ((void)vn_hold(vp))
-#define VN_RELE(vp) (iput(LINVFS_GET_IP(vp)))
-#endif
-
-static inline struct vnode *vn_grab(struct vnode *vp)
-{
- struct inode *inode = igrab(LINVFS_GET_IP(vp));
- return inode ? LINVFS_GET_VP(inode) : NULL;
-}
-
-/*
- * Vname handling macros.
- */
-#define VNAME(dentry) ((char *) (dentry)->d_name.name)
-#define VNAMELEN(dentry) ((dentry)->d_name.len)
-#define VNAME_TO_VNODE(dentry) (LINVFS_GET_VP((dentry)->d_inode))
-
-/*
- * Vnode spinlock manipulation.
- */
-#define VN_LOCK(vp) mutex_spinlock(&(vp)->v_lock)
-#define VN_UNLOCK(vp, s) mutex_spinunlock(&(vp)->v_lock, s)
-#define VN_FLAGSET(vp,b) vn_flagset(vp,b)
-#define VN_FLAGCLR(vp,b) vn_flagclr(vp,b)
-
-static __inline__ void vn_flagset(struct vnode *vp, uint flag)
-{
- spin_lock(&vp->v_lock);
- vp->v_flag |= flag;
- spin_unlock(&vp->v_lock);
-}
-
-static __inline__ void vn_flagclr(struct vnode *vp, uint flag)
-{
- spin_lock(&vp->v_lock);
- vp->v_flag &= ~flag;
- spin_unlock(&vp->v_lock);
-}
-
-/*
- * Dealing with bad inodes
- */
-static inline void vn_mark_bad(struct vnode *vp)
-{
- make_bad_inode(LINVFS_GET_IP(vp));
-}
-
-static inline int VN_BAD(struct vnode *vp)
-{
- return is_bad_inode(LINVFS_GET_IP(vp));
-}
-
-/*
- * Extracting atime values in various formats
- */
-static inline void vn_atime_to_bstime(struct vnode *vp, xfs_bstime_t *bs_atime)
-{
- bs_atime->tv_sec = vp->v_inode.i_atime.tv_sec;
- bs_atime->tv_nsec = vp->v_inode.i_atime.tv_nsec;
-}
-
-static inline void vn_atime_to_timespec(struct vnode *vp, struct timespec *ts)
-{
- *ts = vp->v_inode.i_atime;
-}
-
-static inline void vn_atime_to_time_t(struct vnode *vp, time_t *tt)
-{
- *tt = vp->v_inode.i_atime.tv_sec;
-}
-
-/*
- * Some useful predicates.
- */
-#define VN_MAPPED(vp) mapping_mapped(LINVFS_GET_IP(vp)->i_mapping)
-#define VN_CACHED(vp) (LINVFS_GET_IP(vp)->i_mapping->nrpages)
-#define VN_DIRTY(vp) mapping_tagged(LINVFS_GET_IP(vp)->i_mapping, \
- PAGECACHE_TAG_DIRTY)
-#define VMODIFY(vp) VN_FLAGSET(vp, VMODIFIED)
-#define VUNMODIFY(vp) VN_FLAGCLR(vp, VMODIFIED)
-
-/*
- * Flags to VOP_SETATTR/VOP_GETATTR.
- */
-#define ATTR_UTIME 0x01 /* non-default utime(2) request */
-#define ATTR_DMI 0x08 /* invocation from a DMI function */
-#define ATTR_LAZY 0x80 /* set/get attributes lazily */
-#define ATTR_NONBLOCK 0x100 /* return EAGAIN if operation would block */
-#define ATTR_NOLOCK 0x200 /* Don't grab any conflicting locks */
-#define ATTR_NOSIZETOK 0x400 /* Don't get the SIZE token */
-
-/*
- * Flags to VOP_FSYNC and VOP_RECLAIM.
- */
-#define FSYNC_NOWAIT 0 /* asynchronous flush */
-#define FSYNC_WAIT 0x1 /* synchronous fsync or forced reclaim */
-#define FSYNC_INVAL 0x2 /* flush and invalidate cached data */
-#define FSYNC_DATA 0x4 /* synchronous fsync of data only */
-
-/*
- * Tracking vnode activity.
- */
-#if defined(XFS_VNODE_TRACE)
-
-#define VNODE_TRACE_SIZE 16 /* number of trace entries */
-#define VNODE_KTRACE_ENTRY 1
-#define VNODE_KTRACE_EXIT 2
-#define VNODE_KTRACE_HOLD 3
-#define VNODE_KTRACE_REF 4
-#define VNODE_KTRACE_RELE 5
-
-extern void vn_trace_entry(struct vnode *, const char *, inst_t *);
-extern void vn_trace_exit(struct vnode *, const char *, inst_t *);
-extern void vn_trace_hold(struct vnode *, char *, int, inst_t *);
-extern void vn_trace_ref(struct vnode *, char *, int, inst_t *);
-extern void vn_trace_rele(struct vnode *, char *, int, inst_t *);
-
-#define VN_TRACE(vp) \
- vn_trace_ref(vp, __FILE__, __LINE__, (inst_t *)__return_address)
-#else
-#define vn_trace_entry(a,b,c)
-#define vn_trace_exit(a,b,c)
-#define vn_trace_hold(a,b,c,d)
-#define vn_trace_ref(a,b,c,d)
-#define vn_trace_rele(a,b,c,d)
-#define VN_TRACE(vp)
-#endif
-
-#endif /* __XFS_VNODE_H__ */
diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/mrlock.h
index 16b44c3c236..e3c92d19e54 100644
--- a/fs/xfs/linux-2.6/mrlock.h
+++ b/fs/xfs/mrlock.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -20,29 +20,35 @@
#include <linux/rwsem.h>
-enum { MR_NONE, MR_ACCESS, MR_UPDATE };
-
typedef struct {
struct rw_semaphore mr_lock;
+#if defined(DEBUG) || defined(XFS_WARN)
int mr_writer;
+#endif
} mrlock_t;
+#if defined(DEBUG) || defined(XFS_WARN)
+#define mrinit(mrp, name) \
+ do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
+#else
#define mrinit(mrp, name) \
- ( (mrp)->mr_writer = 0, init_rwsem(&(mrp)->mr_lock) )
+ do { init_rwsem(&(mrp)->mr_lock); } while (0)
+#endif
+
#define mrlock_init(mrp, t,n,s) mrinit(mrp, n)
#define mrfree(mrp) do { } while (0)
-#define mraccess(mrp) mraccessf(mrp, 0)
-#define mrupdate(mrp) mrupdatef(mrp, 0)
-static inline void mraccessf(mrlock_t *mrp, int flags)
+static inline void mraccess_nested(mrlock_t *mrp, int subclass)
{
- down_read(&mrp->mr_lock);
+ down_read_nested(&mrp->mr_lock, subclass);
}
-static inline void mrupdatef(mrlock_t *mrp, int flags)
+static inline void mrupdate_nested(mrlock_t *mrp, int subclass)
{
- down_write(&mrp->mr_lock);
+ down_write_nested(&mrp->mr_lock, subclass);
+#if defined(DEBUG) || defined(XFS_WARN)
mrp->mr_writer = 1;
+#endif
}
static inline int mrtryaccess(mrlock_t *mrp)
@@ -54,39 +60,31 @@ static inline int mrtryupdate(mrlock_t *mrp)
{
if (!down_write_trylock(&mrp->mr_lock))
return 0;
+#if defined(DEBUG) || defined(XFS_WARN)
mrp->mr_writer = 1;
+#endif
return 1;
}
-static inline void mrunlock(mrlock_t *mrp)
+static inline void mrunlock_excl(mrlock_t *mrp)
{
- if (mrp->mr_writer) {
- mrp->mr_writer = 0;
- up_write(&mrp->mr_lock);
- } else {
- up_read(&mrp->mr_lock);
- }
+#if defined(DEBUG) || defined(XFS_WARN)
+ mrp->mr_writer = 0;
+#endif
+ up_write(&mrp->mr_lock);
}
-static inline void mrdemote(mrlock_t *mrp)
+static inline void mrunlock_shared(mrlock_t *mrp)
{
- mrp->mr_writer = 0;
- downgrade_write(&mrp->mr_lock);
+ up_read(&mrp->mr_lock);
}
-#ifdef DEBUG
-/*
- * Debug-only routine, without some platform-specific asm code, we can
- * now only answer requests regarding whether we hold the lock for write
- * (reader state is outside our visibility, we only track writer state).
- * Note: means !ismrlocked would give false positivies, so don't do that.
- */
-static inline int ismrlocked(mrlock_t *mrp, int type)
+static inline void mrdemote(mrlock_t *mrp)
{
- if (mrp && type == MR_UPDATE)
- return mrp->mr_writer;
- return 1;
-}
+#if defined(DEBUG) || defined(XFS_WARN)
+ mrp->mr_writer = 0;
#endif
+ downgrade_write(&mrp->mr_lock);
+}
#endif /* __XFS_SUPPORT_MRLOCK_H__ */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
deleted file mode 100644
index 772ac48329e..00000000000
--- a/fs/xfs/quota/xfs_dquot.c
+++ /dev/null
@@ -1,1601 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_trans_space.h"
-#include "xfs_trans_priv.h"
-#include "xfs_qm.h"
-
-
-/*
- LOCK ORDER
-
- inode lock (ilock)
- dquot hash-chain lock (hashlock)
- xqm dquot freelist lock (freelistlock
- mount's dquot list lock (mplistlock)
- user dquot lock - lock ordering among dquots is based on the uid or gid
- group dquot lock - similar to udquots. Between the two dquots, the udquot
- has to be locked first.
- pin lock - the dquot lock must be held to take this lock.
- flush lock - ditto.
-*/
-
-STATIC void xfs_qm_dqflush_done(xfs_buf_t *, xfs_dq_logitem_t *);
-
-#ifdef DEBUG
-xfs_buftarg_t *xfs_dqerror_target;
-int xfs_do_dqerror;
-int xfs_dqreq_num;
-int xfs_dqerror_mod = 33;
-#endif
-
-/*
- * Allocate and initialize a dquot. We don't always allocate fresh memory;
- * we try to reclaim a free dquot if the number of incore dquots are above
- * a threshold.
- * The only field inside the core that gets initialized at this point
- * is the d_id field. The idea is to fill in the entire q_core
- * when we read in the on disk dquot.
- */
-STATIC xfs_dquot_t *
-xfs_qm_dqinit(
- xfs_mount_t *mp,
- xfs_dqid_t id,
- uint type)
-{
- xfs_dquot_t *dqp;
- boolean_t brandnewdquot;
-
- brandnewdquot = xfs_qm_dqalloc_incore(&dqp);
- dqp->dq_flags = type;
- dqp->q_core.d_id = cpu_to_be32(id);
- dqp->q_mount = mp;
-
- /*
- * No need to re-initialize these if this is a reclaimed dquot.
- */
- if (brandnewdquot) {
- dqp->dq_flnext = dqp->dq_flprev = dqp;
- mutex_init(&dqp->q_qlock);
- initnsema(&dqp->q_flock, 1, "fdq");
- sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq");
-
-#ifdef XFS_DQUOT_TRACE
- dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_SLEEP);
- xfs_dqtrace_entry(dqp, "DQINIT");
-#endif
- } else {
- /*
- * Only the q_core portion was zeroed in dqreclaim_one().
- * So, we need to reset others.
- */
- dqp->q_nrefs = 0;
- dqp->q_blkno = 0;
- dqp->MPL_NEXT = dqp->HL_NEXT = NULL;
- dqp->HL_PREVP = dqp->MPL_PREVP = NULL;
- dqp->q_bufoffset = 0;
- dqp->q_fileoffset = 0;
- dqp->q_transp = NULL;
- dqp->q_gdquot = NULL;
- dqp->q_res_bcount = 0;
- dqp->q_res_icount = 0;
- dqp->q_res_rtbcount = 0;
- dqp->q_pincount = 0;
- dqp->q_hash = NULL;
- ASSERT(dqp->dq_flnext == dqp->dq_flprev);
-
-#ifdef XFS_DQUOT_TRACE
- ASSERT(dqp->q_trace);
- xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT");
-#endif
- }
-
- /*
- * log item gets initialized later
- */
- return (dqp);
-}
-
-/*
- * This is called to free all the memory associated with a dquot
- */
-void
-xfs_qm_dqdestroy(
- xfs_dquot_t *dqp)
-{
- ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp));
-
- mutex_destroy(&dqp->q_qlock);
- freesema(&dqp->q_flock);
- sv_destroy(&dqp->q_pinwait);
-
-#ifdef XFS_DQUOT_TRACE
- if (dqp->q_trace)
- ktrace_free(dqp->q_trace);
- dqp->q_trace = NULL;
-#endif
- kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
- atomic_dec(&xfs_Gqm->qm_totaldquots);
-}
-
-/*
- * This is what a 'fresh' dquot inside a dquot chunk looks like on disk.
- */
-STATIC void
-xfs_qm_dqinit_core(
- xfs_dqid_t id,
- uint type,
- xfs_dqblk_t *d)
-{
- /*
- * Caller has zero'd the entire dquot 'chunk' already.
- */
- d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
- d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
- d->dd_diskdq.d_id = cpu_to_be32(id);
- d->dd_diskdq.d_flags = type;
-}
-
-
-#ifdef XFS_DQUOT_TRACE
-/*
- * Dquot tracing for debugging.
- */
-/* ARGSUSED */
-void
-__xfs_dqtrace_entry(
- xfs_dquot_t *dqp,
- char *func,
- void *retaddr,
- xfs_inode_t *ip)
-{
- xfs_dquot_t *udqp = NULL;
- xfs_ino_t ino = 0;
-
- ASSERT(dqp->q_trace);
- if (ip) {
- ino = ip->i_ino;
- udqp = ip->i_udquot;
- }
- ktrace_enter(dqp->q_trace,
- (void *)(__psint_t)DQUOT_KTRACE_ENTRY,
- (void *)func,
- (void *)(__psint_t)dqp->q_nrefs,
- (void *)(__psint_t)dqp->dq_flags,
- (void *)(__psint_t)dqp->q_res_bcount,
- (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_bcount),
- (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_icount),
- (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_blk_hardlimit),
- (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_blk_softlimit),
- (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_ino_hardlimit),
- (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_ino_softlimit),
- (void *)(__psint_t)be32_to_cpu(dqp->q_core.d_id),
- (void *)(__psint_t)current_pid(),
- (void *)(__psint_t)ino,
- (void *)(__psint_t)retaddr,
- (void *)(__psint_t)udqp);
- return;
-}
-#endif
-
-
-/*
- * If default limits are in force, push them into the dquot now.
- * We overwrite the dquot limits only if they are zero and this
- * is not the root dquot.
- */
-void
-xfs_qm_adjust_dqlimits(
- xfs_mount_t *mp,
- xfs_disk_dquot_t *d)
-{
- xfs_quotainfo_t *q = mp->m_quotainfo;
-
- ASSERT(d->d_id);
-
- if (q->qi_bsoftlimit && !d->d_blk_softlimit)
- d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit);
- if (q->qi_bhardlimit && !d->d_blk_hardlimit)
- d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit);
- if (q->qi_isoftlimit && !d->d_ino_softlimit)
- d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit);
- if (q->qi_ihardlimit && !d->d_ino_hardlimit)
- d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit);
- if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit)
- d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit);
- if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit)
- d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit);
-}
-
-/*
- * Check the limits and timers of a dquot and start or reset timers
- * if necessary.
- * This gets called even when quota enforcement is OFF, which makes our
- * life a little less complicated. (We just don't reject any quota
- * reservations in that case, when enforcement is off).
- * We also return 0 as the values of the timers in Q_GETQUOTA calls, when
- * enforcement's off.
- * In contrast, warnings are a little different in that they don't
- * 'automatically' get started when limits get exceeded. They do
- * get reset to zero, however, when we find the count to be under
- * the soft limit (they are only ever set non-zero via userspace).
- */
-void
-xfs_qm_adjust_dqtimers(
- xfs_mount_t *mp,
- xfs_disk_dquot_t *d)
-{
- ASSERT(d->d_id);
-
-#ifdef QUOTADEBUG
- if (d->d_blk_hardlimit)
- ASSERT(be64_to_cpu(d->d_blk_softlimit) <=
- be64_to_cpu(d->d_blk_hardlimit));
- if (d->d_ino_hardlimit)
- ASSERT(be64_to_cpu(d->d_ino_softlimit) <=
- be64_to_cpu(d->d_ino_hardlimit));
- if (d->d_rtb_hardlimit)
- ASSERT(be64_to_cpu(d->d_rtb_softlimit) <=
- be64_to_cpu(d->d_rtb_hardlimit));
-#endif
- if (!d->d_btimer) {
- if ((d->d_blk_softlimit &&
- (be64_to_cpu(d->d_bcount) >=
- be64_to_cpu(d->d_blk_softlimit))) ||
- (d->d_blk_hardlimit &&
- (be64_to_cpu(d->d_bcount) >=
- be64_to_cpu(d->d_blk_hardlimit)))) {
- d->d_btimer = cpu_to_be32(get_seconds() +
- XFS_QI_BTIMELIMIT(mp));
- } else {
- d->d_bwarns = 0;
- }
- } else {
- if ((!d->d_blk_softlimit ||
- (be64_to_cpu(d->d_bcount) <
- be64_to_cpu(d->d_blk_softlimit))) &&
- (!d->d_blk_hardlimit ||
- (be64_to_cpu(d->d_bcount) <
- be64_to_cpu(d->d_blk_hardlimit)))) {
- d->d_btimer = 0;
- }
- }
-
- if (!d->d_itimer) {
- if ((d->d_ino_softlimit &&
- (be64_to_cpu(d->d_icount) >=
- be64_to_cpu(d->d_ino_softlimit))) ||
- (d->d_ino_hardlimit &&
- (be64_to_cpu(d->d_icount) >=
- be64_to_cpu(d->d_ino_hardlimit)))) {
- d->d_itimer = cpu_to_be32(get_seconds() +
- XFS_QI_ITIMELIMIT(mp));
- } else {
- d->d_iwarns = 0;
- }
- } else {
- if ((!d->d_ino_softlimit ||
- (be64_to_cpu(d->d_icount) <
- be64_to_cpu(d->d_ino_softlimit))) &&
- (!d->d_ino_hardlimit ||
- (be64_to_cpu(d->d_icount) <
- be64_to_cpu(d->d_ino_hardlimit)))) {
- d->d_itimer = 0;
- }
- }
-
- if (!d->d_rtbtimer) {
- if ((d->d_rtb_softlimit &&
- (be64_to_cpu(d->d_rtbcount) >=
- be64_to_cpu(d->d_rtb_softlimit))) ||
- (d->d_rtb_hardlimit &&
- (be64_to_cpu(d->d_rtbcount) >=
- be64_to_cpu(d->d_rtb_hardlimit)))) {
- d->d_rtbtimer = cpu_to_be32(get_seconds() +
- XFS_QI_RTBTIMELIMIT(mp));
- } else {
- d->d_rtbwarns = 0;
- }
- } else {
- if ((!d->d_rtb_softlimit ||
- (be64_to_cpu(d->d_rtbcount) <
- be64_to_cpu(d->d_rtb_softlimit))) &&
- (!d->d_rtb_hardlimit ||
- (be64_to_cpu(d->d_rtbcount) <
- be64_to_cpu(d->d_rtb_hardlimit)))) {
- d->d_rtbtimer = 0;
- }
- }
-}
-
-/*
- * initialize a buffer full of dquots and log the whole thing
- */
-STATIC void
-xfs_qm_init_dquot_blk(
- xfs_trans_t *tp,
- xfs_mount_t *mp,
- xfs_dqid_t id,
- uint type,
- xfs_buf_t *bp)
-{
- xfs_dqblk_t *d;
- int curid, i;
-
- ASSERT(tp);
- ASSERT(XFS_BUF_ISBUSY(bp));
- ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
-
- d = (xfs_dqblk_t *)XFS_BUF_PTR(bp);
-
- /*
- * ID of the first dquot in the block - id's are zero based.
- */
- curid = id - (id % XFS_QM_DQPERBLK(mp));
- ASSERT(curid >= 0);
- memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)));
- for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++)
- xfs_qm_dqinit_core(curid, type, d);
- xfs_trans_dquot_buf(tp, bp,
- (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF :
- ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF :
- XFS_BLI_GDQUOT_BUF)));
- xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1);
-}
-
-
-
-/*
- * Allocate a block and fill it with dquots.
- * This is called when the bmapi finds a hole.
- */
-STATIC int
-xfs_qm_dqalloc(
- xfs_trans_t **tpp,
- xfs_mount_t *mp,
- xfs_dquot_t *dqp,
- xfs_inode_t *quotip,
- xfs_fileoff_t offset_fsb,
- xfs_buf_t **O_bpp)
-{
- xfs_fsblock_t firstblock;
- xfs_bmap_free_t flist;
- xfs_bmbt_irec_t map;
- int nmaps, error, committed;
- xfs_buf_t *bp;
- xfs_trans_t *tp = *tpp;
-
- ASSERT(tp != NULL);
- xfs_dqtrace_entry(dqp, "DQALLOC");
-
- /*
- * Initialize the bmap freelist prior to calling bmapi code.
- */
- XFS_BMAP_INIT(&flist, &firstblock);
- xfs_ilock(quotip, XFS_ILOCK_EXCL);
- /*
- * Return if this type of quotas is turned off while we didn't
- * have an inode lock
- */
- if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
- xfs_iunlock(quotip, XFS_ILOCK_EXCL);
- return (ESRCH);
- }
-
- /*
- * xfs_trans_commit normally decrements the vnode ref count
- * when it unlocks the inode. Since we want to keep the quota
- * inode around, we bump the vnode ref count now.
- */
- VN_HOLD(XFS_ITOV(quotip));
-
- xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
- nmaps = 1;
- if ((error = xfs_bmapi(tp, quotip,
- offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB,
- XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
- &firstblock,
- XFS_QM_DQALLOC_SPACE_RES(mp),
- &map, &nmaps, &flist))) {
- goto error0;
- }
- ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
- ASSERT(nmaps == 1);
- ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
- (map.br_startblock != HOLESTARTBLOCK));
-
- /*
- * Keep track of the blkno to save a lookup later
- */
- dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
-
- /* now we can just get the buffer (there's nothing to read yet) */
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
- dqp->q_blkno,
- XFS_QI_DQCHUNKLEN(mp),
- 0);
- if (!bp || (error = XFS_BUF_GETERROR(bp)))
- goto error1;
- /*
- * Make a chunk of dquots out of this buffer and log
- * the entire thing.
- */
- xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id),
- dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
-
- /*
- * xfs_bmap_finish() may commit the current transaction and
- * start a second transaction if the freelist is not empty.
- *
- * Since we still want to modify this buffer, we need to
- * ensure that the buffer is not released on commit of
- * the first transaction and ensure the buffer is added to the
- * second transaction.
- *
- * If there is only one transaction then don't stop the buffer
- * from being released when it commits later on.
- */
-
- xfs_trans_bhold(tp, bp);
-
- if ((error = xfs_bmap_finish(tpp, &flist, firstblock, &committed))) {
- goto error1;
- }
-
- if (committed) {
- tp = *tpp;
- xfs_trans_bjoin(tp, bp);
- } else {
- xfs_trans_bhold_release(tp, bp);
- }
-
- *O_bpp = bp;
- return 0;
-
- error1:
- xfs_bmap_cancel(&flist);
- error0:
- xfs_iunlock(quotip, XFS_ILOCK_EXCL);
-
- return (error);
-}
-
-/*
- * Maps a dquot to the buffer containing its on-disk version.
- * This returns a ptr to the buffer containing the on-disk dquot
- * in the bpp param, and a ptr to the on-disk dquot within that buffer
- */
-STATIC int
-xfs_qm_dqtobp(
- xfs_trans_t **tpp,
- xfs_dquot_t *dqp,
- xfs_disk_dquot_t **O_ddpp,
- xfs_buf_t **O_bpp,
- uint flags)
-{
- xfs_bmbt_irec_t map;
- int nmaps, error;
- xfs_buf_t *bp;
- xfs_inode_t *quotip;
- xfs_mount_t *mp;
- xfs_disk_dquot_t *ddq;
- xfs_dqid_t id;
- boolean_t newdquot;
- xfs_trans_t *tp = (tpp ? *tpp : NULL);
-
- mp = dqp->q_mount;
- id = be32_to_cpu(dqp->q_core.d_id);
- nmaps = 1;
- newdquot = B_FALSE;
-
- /*
- * If we don't know where the dquot lives, find out.
- */
- if (dqp->q_blkno == (xfs_daddr_t) 0) {
- /* We use the id as an index */
- dqp->q_fileoffset = (xfs_fileoff_t)id / XFS_QM_DQPERBLK(mp);
- nmaps = 1;
- quotip = XFS_DQ_TO_QIP(dqp);
- xfs_ilock(quotip, XFS_ILOCK_SHARED);
- /*
- * Return if this type of quotas is turned off while we didn't
- * have an inode lock
- */
- if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
- xfs_iunlock(quotip, XFS_ILOCK_SHARED);
- return (ESRCH);
- }
- /*
- * Find the block map; no allocations yet
- */
- error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
- XFS_DQUOT_CLUSTER_SIZE_FSB,
- XFS_BMAPI_METADATA,
- NULL, 0, &map, &nmaps, NULL);
-
- xfs_iunlock(quotip, XFS_ILOCK_SHARED);
- if (error)
- return (error);
- ASSERT(nmaps == 1);
- ASSERT(map.br_blockcount == 1);
-
- /*
- * offset of dquot in the (fixed sized) dquot chunk.
- */
- dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) *
- sizeof(xfs_dqblk_t);
- if (map.br_startblock == HOLESTARTBLOCK) {
- /*
- * We don't allocate unless we're asked to
- */
- if (!(flags & XFS_QMOPT_DQALLOC))
- return (ENOENT);
-
- ASSERT(tp);
- if ((error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
- dqp->q_fileoffset, &bp)))
- return (error);
- tp = *tpp;
- newdquot = B_TRUE;
- } else {
- /*
- * store the blkno etc so that we don't have to do the
- * mapping all the time
- */
- dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
- }
- }
- ASSERT(dqp->q_blkno != DELAYSTARTBLOCK);
- ASSERT(dqp->q_blkno != HOLESTARTBLOCK);
-
- /*
- * Read in the buffer, unless we've just done the allocation
- * (in which case we already have the buf).
- */
- if (! newdquot) {
- xfs_dqtrace_entry(dqp, "DQTOBP READBUF");
- if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- dqp->q_blkno,
- XFS_QI_DQCHUNKLEN(mp),
- 0, &bp))) {
- return (error);
- }
- if (error || !bp)
- return XFS_ERROR(error);
- }
- ASSERT(XFS_BUF_ISBUSY(bp));
- ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
-
- /*
- * calculate the location of the dquot inside the buffer.
- */
- ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset);
-
- /*
- * A simple sanity check in case we got a corrupted dquot...
- */
- if (xfs_qm_dqcheck(ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
- flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
- "dqtobp")) {
- if (!(flags & XFS_QMOPT_DQREPAIR)) {
- xfs_trans_brelse(tp, bp);
- return XFS_ERROR(EIO);
- }
- XFS_BUF_BUSY(bp); /* We dirtied this */
- }
-
- *O_bpp = bp;
- *O_ddpp = ddq;
-
- return (0);
-}
-
-
-/*
- * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
- * and release the buffer immediately.
- *
- */
-/* ARGSUSED */
-STATIC int
-xfs_qm_dqread(
- xfs_trans_t **tpp,
- xfs_dqid_t id,
- xfs_dquot_t *dqp, /* dquot to get filled in */
- uint flags)
-{
- xfs_disk_dquot_t *ddqp;
- xfs_buf_t *bp;
- int error;
- xfs_trans_t *tp;
-
- ASSERT(tpp);
-
- /*
- * get a pointer to the on-disk dquot and the buffer containing it
- * dqp already knows its own type (GROUP/USER).
- */
- xfs_dqtrace_entry(dqp, "DQREAD");
- if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) {
- return (error);
- }
- tp = *tpp;
-
- /* copy everything from disk dquot to the incore dquot */
- memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
- ASSERT(be32_to_cpu(dqp->q_core.d_id) == id);
- xfs_qm_dquot_logitem_init(dqp);
-
- /*
- * Reservation counters are defined as reservation plus current usage
- * to avoid having to add everytime.
- */
- dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
- dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
- dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
-
- /* Mark the buf so that this will stay incore a little longer */
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF);
-
- /*
- * We got the buffer with a xfs_trans_read_buf() (in dqtobp())
- * So we need to release with xfs_trans_brelse().
- * The strategy here is identical to that of inodes; we lock
- * the dquot in xfs_qm_dqget() before making it accessible to
- * others. This is because dquots, like inodes, need a good level of
- * concurrency, and we don't want to take locks on the entire buffers
- * for dquot accesses.
- * Note also that the dquot buffer may even be dirty at this point, if
- * this particular dquot was repaired. We still aren't afraid to
- * brelse it because we have the changes incore.
- */
- ASSERT(XFS_BUF_ISBUSY(bp));
- ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
- xfs_trans_brelse(tp, bp);
-
- return (error);
-}
-
-
-/*
- * allocate an incore dquot from the kernel heap,
- * and fill its core with quota information kept on disk.
- * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk
- * if it wasn't already allocated.
- */
-STATIC int
-xfs_qm_idtodq(
- xfs_mount_t *mp,
- xfs_dqid_t id, /* gid or uid, depending on type */
- uint type, /* UDQUOT or GDQUOT */
- uint flags, /* DQALLOC, DQREPAIR */
- xfs_dquot_t **O_dqpp)/* OUT : incore dquot, not locked */
-{
- xfs_dquot_t *dqp;
- int error;
- xfs_trans_t *tp;
- int cancelflags=0;
-
- dqp = xfs_qm_dqinit(mp, id, type);
- tp = NULL;
- if (flags & XFS_QMOPT_DQALLOC) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
- if ((error = xfs_trans_reserve(tp,
- XFS_QM_DQALLOC_SPACE_RES(mp),
- XFS_WRITE_LOG_RES(mp) +
- BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 +
- 128,
- 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_WRITE_LOG_COUNT))) {
- cancelflags = 0;
- goto error0;
- }
- cancelflags = XFS_TRANS_RELEASE_LOG_RES;
- }
-
- /*
- * Read it from disk; xfs_dqread() takes care of
- * all the necessary initialization of dquot's fields (locks, etc)
- */
- if ((error = xfs_qm_dqread(&tp, id, dqp, flags))) {
- /*
- * This can happen if quotas got turned off (ESRCH),
- * or if the dquot didn't exist on disk and we ask to
- * allocate (ENOENT).
- */
- xfs_dqtrace_entry(dqp, "DQREAD FAIL");
- cancelflags |= XFS_TRANS_ABORT;
- goto error0;
- }
- if (tp) {
- if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
- NULL)))
- goto error1;
- }
-
- *O_dqpp = dqp;
- return (0);
-
- error0:
- ASSERT(error);
- if (tp)
- xfs_trans_cancel(tp, cancelflags);
- error1:
- xfs_qm_dqdestroy(dqp);
- *O_dqpp = NULL;
- return (error);
-}
-
-/*
- * Lookup a dquot in the incore dquot hashtable. We keep two separate
- * hashtables for user and group dquots; and, these are global tables
- * inside the XQM, not per-filesystem tables.
- * The hash chain must be locked by caller, and it is left locked
- * on return. Returning dquot is locked.
- */
-STATIC int
-xfs_qm_dqlookup(
- xfs_mount_t *mp,
- xfs_dqid_t id,
- xfs_dqhash_t *qh,
- xfs_dquot_t **O_dqpp)
-{
- xfs_dquot_t *dqp;
- uint flist_locked;
- xfs_dquot_t *d;
-
- ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
-
- flist_locked = B_FALSE;
-
- /*
- * Traverse the hashchain looking for a match
- */
- for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) {
- /*
- * We already have the hashlock. We don't need the
- * dqlock to look at the id field of the dquot, since the
- * id can't be modified without the hashlock anyway.
- */
- if (be32_to_cpu(dqp->q_core.d_id) == id && dqp->q_mount == mp) {
- xfs_dqtrace_entry(dqp, "DQFOUND BY LOOKUP");
- /*
- * All in core dquots must be on the dqlist of mp
- */
- ASSERT(dqp->MPL_PREVP != NULL);
-
- xfs_dqlock(dqp);
- if (dqp->q_nrefs == 0) {
- ASSERT (XFS_DQ_IS_ON_FREELIST(dqp));
- if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
- xfs_dqtrace_entry(dqp, "DQLOOKUP: WANT");
-
- /*
- * We may have raced with dqreclaim_one()
- * (and lost). So, flag that we don't
- * want the dquot to be reclaimed.
- */
- dqp->dq_flags |= XFS_DQ_WANT;
- xfs_dqunlock(dqp);
- xfs_qm_freelist_lock(xfs_Gqm);
- xfs_dqlock(dqp);
- dqp->dq_flags &= ~(XFS_DQ_WANT);
- }
- flist_locked = B_TRUE;
- }
-
- /*
- * id couldn't have changed; we had the hashlock all
- * along
- */
- ASSERT(be32_to_cpu(dqp->q_core.d_id) == id);
-
- if (flist_locked) {
- if (dqp->q_nrefs != 0) {
- xfs_qm_freelist_unlock(xfs_Gqm);
- flist_locked = B_FALSE;
- } else {
- /*
- * take it off the freelist
- */
- xfs_dqtrace_entry(dqp,
- "DQLOOKUP: TAKEOFF FL");
- XQM_FREELIST_REMOVE(dqp);
- /* xfs_qm_freelist_print(&(xfs_Gqm->
- qm_dqfreelist),
- "after removal"); */
- }
- }
-
- /*
- * grab a reference
- */
- XFS_DQHOLD(dqp);
-
- if (flist_locked)
- xfs_qm_freelist_unlock(xfs_Gqm);
- /*
- * move the dquot to the front of the hashchain
- */
- ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
- if (dqp->HL_PREVP != &qh->qh_next) {
- xfs_dqtrace_entry(dqp,
- "DQLOOKUP: HASH MOVETOFRONT");
- if ((d = dqp->HL_NEXT))
- d->HL_PREVP = dqp->HL_PREVP;
- *(dqp->HL_PREVP) = d;
- d = qh->qh_next;
- d->HL_PREVP = &dqp->HL_NEXT;
- dqp->HL_NEXT = d;
- dqp->HL_PREVP = &qh->qh_next;
- qh->qh_next = dqp;
- }
- xfs_dqtrace_entry(dqp, "LOOKUP END");
- *O_dqpp = dqp;
- ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
- return (0);
- }
- }
-
- *O_dqpp = NULL;
- ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
- return (1);
-}
-
-/*
- * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
- * a locked dquot, doing an allocation (if requested) as needed.
- * When both an inode and an id are given, the inode's id takes precedence.
- * That is, if the id changes while we don't hold the ilock inside this
- * function, the new dquot is returned, not necessarily the one requested
- * in the id argument.
- */
-int
-xfs_qm_dqget(
- xfs_mount_t *mp,
- xfs_inode_t *ip, /* locked inode (optional) */
- xfs_dqid_t id, /* uid/projid/gid depending on type */
- uint type, /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */
- uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
- xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */
-{
- xfs_dquot_t *dqp;
- xfs_dqhash_t *h;
- uint version;
- int error;
-
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
- if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
- (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) ||
- (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
- return (ESRCH);
- }
- h = XFS_DQ_HASH(mp, id, type);
-
-#ifdef DEBUG
- if (xfs_do_dqerror) {
- if ((xfs_dqerror_target == mp->m_ddev_targp) &&
- (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
- cmn_err(CE_DEBUG, "Returning error in dqget");
- return (EIO);
- }
- }
-#endif
-
- again:
-
-#ifdef DEBUG
- ASSERT(type == XFS_DQ_USER ||
- type == XFS_DQ_PROJ ||
- type == XFS_DQ_GROUP);
- if (ip) {
- ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
- if (type == XFS_DQ_USER)
- ASSERT(ip->i_udquot == NULL);
- else
- ASSERT(ip->i_gdquot == NULL);
- }
-#endif
- XFS_DQ_HASH_LOCK(h);
-
- /*
- * Look in the cache (hashtable).
- * The chain is kept locked during lookup.
- */
- if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) {
- XQM_STATS_INC(xqmstats.xs_qm_dqcachehits);
- /*
- * The dquot was found, moved to the front of the chain,
- * taken off the freelist if it was on it, and locked
- * at this point. Just unlock the hashchain and return.
- */
- ASSERT(*O_dqpp);
- ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
- XFS_DQ_HASH_UNLOCK(h);
- xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)");
- return (0); /* success */
- }
- XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses);
-
- /*
- * Dquot cache miss. We don't want to keep the inode lock across
- * a (potential) disk read. Also we don't want to deal with the lock
- * ordering between quotainode and this inode. OTOH, dropping the inode
- * lock here means dealing with a chown that can happen before
- * we re-acquire the lock.
- */
- if (ip)
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- /*
- * Save the hashchain version stamp, and unlock the chain, so that
- * we don't keep the lock across a disk read
- */
- version = h->qh_version;
- XFS_DQ_HASH_UNLOCK(h);
-
- /*
- * Allocate the dquot on the kernel heap, and read the ondisk
- * portion off the disk. Also, do all the necessary initialization
- * This can return ENOENT if dquot didn't exist on disk and we didn't
- * ask it to allocate; ESRCH if quotas got turned off suddenly.
- */
- if ((error = xfs_qm_idtodq(mp, id, type,
- flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR|
- XFS_QMOPT_DOWARN),
- &dqp))) {
- if (ip)
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- return (error);
- }
-
- /*
- * See if this is mount code calling to look at the overall quota limits
- * which are stored in the id == 0 user or group's dquot.
- * Since we may not have done a quotacheck by this point, just return
- * the dquot without attaching it to any hashtables, lists, etc, or even
- * taking a reference.
- * The caller must dqdestroy this once done.
- */
- if (flags & XFS_QMOPT_DQSUSER) {
- ASSERT(id == 0);
- ASSERT(! ip);
- goto dqret;
- }
-
- /*
- * Dquot lock comes after hashlock in the lock ordering
- */
- if (ip) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (! XFS_IS_DQTYPE_ON(mp, type)) {
- /* inode stays locked on return */
- xfs_qm_dqdestroy(dqp);
- return XFS_ERROR(ESRCH);
- }
- /*
- * A dquot could be attached to this inode by now, since
- * we had dropped the ilock.
- */
- if (type == XFS_DQ_USER) {
- if (ip->i_udquot) {
- xfs_qm_dqdestroy(dqp);
- dqp = ip->i_udquot;
- xfs_dqlock(dqp);
- goto dqret;
- }
- } else {
- if (ip->i_gdquot) {
- xfs_qm_dqdestroy(dqp);
- dqp = ip->i_gdquot;
- xfs_dqlock(dqp);
- goto dqret;
- }
- }
- }
-
- /*
- * Hashlock comes after ilock in lock order
- */
- XFS_DQ_HASH_LOCK(h);
- if (version != h->qh_version) {
- xfs_dquot_t *tmpdqp;
- /*
- * Now, see if somebody else put the dquot in the
- * hashtable before us. This can happen because we didn't
- * keep the hashchain lock. We don't have to worry about
- * lock order between the two dquots here since dqp isn't
- * on any findable lists yet.
- */
- if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) {
- /*
- * Duplicate found. Just throw away the new dquot
- * and start over.
- */
- xfs_qm_dqput(tmpdqp);
- XFS_DQ_HASH_UNLOCK(h);
- xfs_qm_dqdestroy(dqp);
- XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
- goto again;
- }
- }
-
- /*
- * Put the dquot at the beginning of the hash-chain and mp's list
- * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
- */
- ASSERT(XFS_DQ_IS_HASH_LOCKED(h));
- dqp->q_hash = h;
- XQM_HASHLIST_INSERT(h, dqp);
-
- /*
- * Attach this dquot to this filesystem's list of all dquots,
- * kept inside the mount structure in m_quotainfo field
- */
- xfs_qm_mplist_lock(mp);
-
- /*
- * We return a locked dquot to the caller, with a reference taken
- */
- xfs_dqlock(dqp);
- dqp->q_nrefs = 1;
-
- XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp);
-
- xfs_qm_mplist_unlock(mp);
- XFS_DQ_HASH_UNLOCK(h);
- dqret:
- ASSERT((ip == NULL) || XFS_ISLOCKED_INODE_EXCL(ip));
- xfs_dqtrace_entry(dqp, "DQGET DONE");
- *O_dqpp = dqp;
- return (0);
-}
-
-
-/*
- * Release a reference to the dquot (decrement ref-count)
- * and unlock it. If there is a group quota attached to this
- * dquot, carefully release that too without tripping over
- * deadlocks'n'stuff.
- */
-void
-xfs_qm_dqput(
- xfs_dquot_t *dqp)
-{
- xfs_dquot_t *gdqp;
-
- ASSERT(dqp->q_nrefs > 0);
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
- xfs_dqtrace_entry(dqp, "DQPUT");
-
- if (dqp->q_nrefs != 1) {
- dqp->q_nrefs--;
- xfs_dqunlock(dqp);
- return;
- }
-
- /*
- * drop the dqlock and acquire the freelist and dqlock
- * in the right order; but try to get it out-of-order first
- */
- if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
- xfs_dqtrace_entry(dqp, "DQPUT: FLLOCK-WAIT");
- xfs_dqunlock(dqp);
- xfs_qm_freelist_lock(xfs_Gqm);
- xfs_dqlock(dqp);
- }
-
- while (1) {
- gdqp = NULL;
-
- /* We can't depend on nrefs being == 1 here */
- if (--dqp->q_nrefs == 0) {
- xfs_dqtrace_entry(dqp, "DQPUT: ON FREELIST");
- /*
- * insert at end of the freelist.
- */
- XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp);
-
- /*
- * If we just added a udquot to the freelist, then
- * we want to release the gdquot reference that
- * it (probably) has. Otherwise it'll keep the
- * gdquot from getting reclaimed.
- */
- if ((gdqp = dqp->q_gdquot)) {
- /*
- * Avoid a recursive dqput call
- */
- xfs_dqlock(gdqp);
- dqp->q_gdquot = NULL;
- }
-
- /* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist),
- "@@@@@++ Free list (after append) @@@@@+");
- */
- }
- xfs_dqunlock(dqp);
-
- /*
- * If we had a group quota inside the user quota as a hint,
- * release it now.
- */
- if (! gdqp)
- break;
- dqp = gdqp;
- }
- xfs_qm_freelist_unlock(xfs_Gqm);
-}
-
-/*
- * Release a dquot. Flush it if dirty, then dqput() it.
- * dquot must not be locked.
- */
-void
-xfs_qm_dqrele(
- xfs_dquot_t *dqp)
-{
- ASSERT(dqp);
- xfs_dqtrace_entry(dqp, "DQRELE");
-
- xfs_dqlock(dqp);
- /*
- * We don't care to flush it if the dquot is dirty here.
- * That will create stutters that we want to avoid.
- * Instead we do a delayed write when we try to reclaim
- * a dirty dquot. Also xfs_sync will take part of the burden...
- */
- xfs_qm_dqput(dqp);
-}
-
-
-/*
- * Write a modified dquot to disk.
- * The dquot must be locked and the flush lock too taken by caller.
- * The flush lock will not be unlocked until the dquot reaches the disk,
- * but the dquot is free to be unlocked and modified by the caller
- * in the interim. Dquot is still locked on return. This behavior is
- * identical to that of inodes.
- */
-int
-xfs_qm_dqflush(
- xfs_dquot_t *dqp,
- uint flags)
-{
- xfs_mount_t *mp;
- xfs_buf_t *bp;
- xfs_disk_dquot_t *ddqp;
- int error;
- SPLDECL(s);
-
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
- ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
- xfs_dqtrace_entry(dqp, "DQFLUSH");
-
- /*
- * If not dirty, nada.
- */
- if (!XFS_DQ_IS_DIRTY(dqp)) {
- xfs_dqfunlock(dqp);
- return (0);
- }
-
- /*
- * Cant flush a pinned dquot. Wait for it.
- */
- xfs_qm_dqunpin_wait(dqp);
-
- /*
- * This may have been unpinned because the filesystem is shutting
- * down forcibly. If that's the case we must not write this dquot
- * to disk, because the log record didn't make it to disk!
- */
- if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) {
- dqp->dq_flags &= ~(XFS_DQ_DIRTY);
- xfs_dqfunlock(dqp);
- return XFS_ERROR(EIO);
- }
-
- /*
- * Get the buffer containing the on-disk dquot
- * We don't need a transaction envelope because we know that the
- * the ondisk-dquot has already been allocated for.
- */
- if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) {
- xfs_dqtrace_entry(dqp, "DQTOBP FAIL");
- ASSERT(error != ENOENT);
- /*
- * Quotas could have gotten turned off (ESRCH)
- */
- xfs_dqfunlock(dqp);
- return (error);
- }
-
- if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id),
- 0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
- xfs_force_shutdown(dqp->q_mount, XFS_CORRUPT_INCORE);
- return XFS_ERROR(EIO);
- }
-
- /* This is the only portion of data that needs to persist */
- memcpy(ddqp, &(dqp->q_core), sizeof(xfs_disk_dquot_t));
-
- /*
- * Clear the dirty field and remember the flush lsn for later use.
- */
- dqp->dq_flags &= ~(XFS_DQ_DIRTY);
- mp = dqp->q_mount;
-
- /* lsn is 64 bits */
- AIL_LOCK(mp, s);
- dqp->q_logitem.qli_flush_lsn = dqp->q_logitem.qli_item.li_lsn;
- AIL_UNLOCK(mp, s);
-
- /*
- * Attach an iodone routine so that we can remove this dquot from the
- * AIL and release the flush lock once the dquot is synced to disk.
- */
- xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t *, xfs_log_item_t *))
- xfs_qm_dqflush_done, &(dqp->q_logitem.qli_item));
- /*
- * If the buffer is pinned then push on the log so we won't
- * get stuck waiting in the write for too long.
- */
- if (XFS_BUF_ISPINNED(bp)) {
- xfs_dqtrace_entry(dqp, "DQFLUSH LOG FORCE");
- xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
- }
-
- if (flags & XFS_QMOPT_DELWRI) {
- xfs_bdwrite(mp, bp);
- } else if (flags & XFS_QMOPT_ASYNC) {
- xfs_bawrite(mp, bp);
- } else {
- error = xfs_bwrite(mp, bp);
- }
- xfs_dqtrace_entry(dqp, "DQFLUSH END");
- /*
- * dqp is still locked, but caller is free to unlock it now.
- */
- return (error);
-
-}
-
-/*
- * This is the dquot flushing I/O completion routine. It is called
- * from interrupt level when the buffer containing the dquot is
- * flushed to disk. It is responsible for removing the dquot logitem
- * from the AIL if it has not been re-logged, and unlocking the dquot's
- * flush lock. This behavior is very similar to that of inodes..
- */
-/*ARGSUSED*/
-STATIC void
-xfs_qm_dqflush_done(
- xfs_buf_t *bp,
- xfs_dq_logitem_t *qip)
-{
- xfs_dquot_t *dqp;
- SPLDECL(s);
-
- dqp = qip->qli_dquot;
-
- /*
- * We only want to pull the item from the AIL if its
- * location in the log has not changed since we started the flush.
- * Thus, we only bother if the dquot's lsn has
- * not changed. First we check the lsn outside the lock
- * since it's cheaper, and then we recheck while
- * holding the lock before removing the dquot from the AIL.
- */
- if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
- qip->qli_item.li_lsn == qip->qli_flush_lsn) {
-
- AIL_LOCK(dqp->q_mount, s);
- /*
- * xfs_trans_delete_ail() drops the AIL lock.
- */
- if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
- xfs_trans_delete_ail(dqp->q_mount,
- (xfs_log_item_t*)qip, s);
- else
- AIL_UNLOCK(dqp->q_mount, s);
- }
-
- /*
- * Release the dq's flush lock since we're done with it.
- */
- xfs_dqfunlock(dqp);
-}
-
-
-int
-xfs_qm_dqflock_nowait(
- xfs_dquot_t *dqp)
-{
- int locked;
-
- locked = cpsema(&((dqp)->q_flock));
-
- /* XXX ifdef these out */
- if (locked)
- (dqp)->dq_flags |= XFS_DQ_FLOCKED;
- return (locked);
-}
-
-
-int
-xfs_qm_dqlock_nowait(
- xfs_dquot_t *dqp)
-{
- return (mutex_trylock(&((dqp)->q_qlock)));
-}
-
-void
-xfs_dqlock(
- xfs_dquot_t *dqp)
-{
- mutex_lock(&(dqp->q_qlock));
-}
-
-void
-xfs_dqunlock(
- xfs_dquot_t *dqp)
-{
- mutex_unlock(&(dqp->q_qlock));
- if (dqp->q_logitem.qli_dquot == dqp) {
- /* Once was dqp->q_mount, but might just have been cleared */
- xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_mountp,
- (xfs_log_item_t*)&(dqp->q_logitem));
- }
-}
-
-
-void
-xfs_dqunlock_nonotify(
- xfs_dquot_t *dqp)
-{
- mutex_unlock(&(dqp->q_qlock));
-}
-
-void
-xfs_dqlock2(
- xfs_dquot_t *d1,
- xfs_dquot_t *d2)
-{
- if (d1 && d2) {
- ASSERT(d1 != d2);
- if (be32_to_cpu(d1->q_core.d_id) >
- be32_to_cpu(d2->q_core.d_id)) {
- xfs_dqlock(d2);
- xfs_dqlock(d1);
- } else {
- xfs_dqlock(d1);
- xfs_dqlock(d2);
- }
- } else {
- if (d1) {
- xfs_dqlock(d1);
- } else if (d2) {
- xfs_dqlock(d2);
- }
- }
-}
-
-
-/*
- * Take a dquot out of the mount's dqlist as well as the hashlist.
- * This is called via unmount as well as quotaoff, and the purge
- * will always succeed unless there are soft (temp) references
- * outstanding.
- *
- * This returns 0 if it was purged, 1 if it wasn't. It's not an error code
- * that we're returning! XXXsup - not cool.
- */
-/* ARGSUSED */
-int
-xfs_qm_dqpurge(
- xfs_dquot_t *dqp,
- uint flags)
-{
- xfs_dqhash_t *thishash;
- xfs_mount_t *mp;
-
- mp = dqp->q_mount;
-
- ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
- ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash));
-
- xfs_dqlock(dqp);
- /*
- * We really can't afford to purge a dquot that is
- * referenced, because these are hard refs.
- * It shouldn't happen in general because we went thru _all_ inodes in
- * dqrele_all_inodes before calling this and didn't let the mountlock go.
- * However it is possible that we have dquots with temporary
- * references that are not attached to an inode. e.g. see xfs_setattr().
- */
- if (dqp->q_nrefs != 0) {
- xfs_dqunlock(dqp);
- XFS_DQ_HASH_UNLOCK(dqp->q_hash);
- return (1);
- }
-
- ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
-
- /*
- * If we're turning off quotas, we have to make sure that, for
- * example, we don't delete quota disk blocks while dquots are
- * in the process of getting written to those disk blocks.
- * This dquot might well be on AIL, and we can't leave it there
- * if we're turning off quotas. Basically, we need this flush
- * lock, and are willing to block on it.
- */
- if (! xfs_qm_dqflock_nowait(dqp)) {
- /*
- * Block on the flush lock after nudging dquot buffer,
- * if it is incore.
- */
- xfs_qm_dqflock_pushbuf_wait(dqp);
- }
-
- /*
- * XXXIf we're turning this type of quotas off, we don't care
- * about the dirty metadata sitting in this dquot. OTOH, if
- * we're unmounting, we do care, so we flush it and wait.
- */
- if (XFS_DQ_IS_DIRTY(dqp)) {
- xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY");
- /* dqflush unlocks dqflock */
- /*
- * Given that dqpurge is a very rare occurrence, it is OK
- * that we're holding the hashlist and mplist locks
- * across the disk write. But, ... XXXsup
- *
- * We don't care about getting disk errors here. We need
- * to purge this dquot anyway, so we go ahead regardless.
- */
- (void) xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
- xfs_dqflock(dqp);
- }
- ASSERT(dqp->q_pincount == 0);
- ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
- !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
-
- thishash = dqp->q_hash;
- XQM_HASHLIST_REMOVE(thishash, dqp);
- XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp);
- /*
- * XXX Move this to the front of the freelist, if we can get the
- * freelist lock.
- */
- ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
-
- dqp->q_mount = NULL;
- dqp->q_hash = NULL;
- dqp->dq_flags = XFS_DQ_INACTIVE;
- memset(&dqp->q_core, 0, sizeof(dqp->q_core));
- xfs_dqfunlock(dqp);
- xfs_dqunlock(dqp);
- XFS_DQ_HASH_UNLOCK(thishash);
- return (0);
-}
-
-
-#ifdef QUOTADEBUG
-void
-xfs_qm_dqprint(xfs_dquot_t *dqp)
-{
- cmn_err(CE_DEBUG, "-----------KERNEL DQUOT----------------");
- cmn_err(CE_DEBUG, "---- dquotID = %d",
- (int)be32_to_cpu(dqp->q_core.d_id));
- cmn_err(CE_DEBUG, "---- type = %s", DQFLAGTO_TYPESTR(dqp));
- cmn_err(CE_DEBUG, "---- fs = 0x%p", dqp->q_mount);
- cmn_err(CE_DEBUG, "---- blkno = 0x%x", (int) dqp->q_blkno);
- cmn_err(CE_DEBUG, "---- boffset = 0x%x", (int) dqp->q_bufoffset);
- cmn_err(CE_DEBUG, "---- blkhlimit = %Lu (0x%x)",
- be64_to_cpu(dqp->q_core.d_blk_hardlimit),
- (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit));
- cmn_err(CE_DEBUG, "---- blkslimit = %Lu (0x%x)",
- be64_to_cpu(dqp->q_core.d_blk_softlimit),
- (int)be64_to_cpu(dqp->q_core.d_blk_softlimit));
- cmn_err(CE_DEBUG, "---- inohlimit = %Lu (0x%x)",
- be64_to_cpu(dqp->q_core.d_ino_hardlimit),
- (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit));
- cmn_err(CE_DEBUG, "---- inoslimit = %Lu (0x%x)",
- be64_to_cpu(dqp->q_core.d_ino_softlimit),
- (int)be64_to_cpu(dqp->q_core.d_ino_softlimit));
- cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)",
- be64_to_cpu(dqp->q_core.d_bcount),
- (int)be64_to_cpu(dqp->q_core.d_bcount));
- cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)",
- be64_to_cpu(dqp->q_core.d_icount),
- (int)be64_to_cpu(dqp->q_core.d_icount));
- cmn_err(CE_DEBUG, "---- btimer = %d",
- (int)be32_to_cpu(dqp->q_core.d_btimer));
- cmn_err(CE_DEBUG, "---- itimer = %d",
- (int)be32_to_cpu(dqp->q_core.d_itimer));
- cmn_err(CE_DEBUG, "---------------------------");
-}
-#endif
-
-/*
- * Give the buffer a little push if it is incore and
- * wait on the flush lock.
- */
-void
-xfs_qm_dqflock_pushbuf_wait(
- xfs_dquot_t *dqp)
-{
- xfs_buf_t *bp;
-
- /*
- * Check to see if the dquot has been flushed delayed
- * write. If so, grab its buffer and send it
- * out immediately. We'll be able to acquire
- * the flush lock when the I/O completes.
- */
- bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno,
- XFS_QI_DQCHUNKLEN(dqp->q_mount),
- XFS_INCORE_TRYLOCK);
- if (bp != NULL) {
- if (XFS_BUF_ISDELAYWRITE(bp)) {
- if (XFS_BUF_ISPINNED(bp)) {
- xfs_log_force(dqp->q_mount,
- (xfs_lsn_t)0,
- XFS_LOG_FORCE);
- }
- xfs_bawrite(dqp->q_mount, bp);
- } else {
- xfs_buf_relse(bp);
- }
- }
- xfs_dqflock(dqp);
-}
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
deleted file mode 100644
index c0c629663a5..00000000000
--- a/fs/xfs/quota/xfs_dquot.h
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_DQUOT_H__
-#define __XFS_DQUOT_H__
-
-/*
- * Dquots are structures that hold quota information about a user or a group,
- * much like inodes are for files. In fact, dquots share many characteristics
- * with inodes. However, dquots can also be a centralized resource, relative
- * to a collection of inodes. In this respect, dquots share some characteristics
- * of the superblock.
- * XFS dquots exploit both those in its algorithms. They make every attempt
- * to not be a bottleneck when quotas are on and have minimal impact, if any,
- * when quotas are off.
- */
-
-/*
- * The hash chain headers (hash buckets)
- */
-typedef struct xfs_dqhash {
- struct xfs_dquot *qh_next;
- mutex_t qh_lock;
- uint qh_version; /* ever increasing version */
- uint qh_nelems; /* number of dquots on the list */
-} xfs_dqhash_t;
-
-typedef struct xfs_dqlink {
- struct xfs_dquot *ql_next; /* forward link */
- struct xfs_dquot **ql_prevp; /* pointer to prev ql_next */
-} xfs_dqlink_t;
-
-struct xfs_mount;
-struct xfs_trans;
-
-/*
- * This is the marker which is designed to occupy the first few
- * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers
- * must come first.
- * This serves as the marker ("sentinel") when we have to restart list
- * iterations because of locking considerations.
- */
-typedef struct xfs_dqmarker {
- struct xfs_dquot*dqm_flnext; /* link to freelist: must be first */
- struct xfs_dquot*dqm_flprev;
- xfs_dqlink_t dqm_mplist; /* link to mount's list of dquots */
- xfs_dqlink_t dqm_hashlist; /* link to the hash chain */
- uint dqm_flags; /* various flags (XFS_DQ_*) */
-} xfs_dqmarker_t;
-
-/*
- * The incore dquot structure
- */
-typedef struct xfs_dquot {
- xfs_dqmarker_t q_lists; /* list ptrs, q_flags (marker) */
- xfs_dqhash_t *q_hash; /* the hashchain header */
- struct xfs_mount*q_mount; /* filesystem this relates to */
- struct xfs_trans*q_transp; /* trans this belongs to currently */
- uint q_nrefs; /* # active refs from inodes */
- xfs_daddr_t q_blkno; /* blkno of dquot buffer */
- int q_bufoffset; /* off of dq in buffer (# dquots) */
- xfs_fileoff_t q_fileoffset; /* offset in quotas file */
-
- struct xfs_dquot*q_gdquot; /* group dquot, hint only */
- xfs_disk_dquot_t q_core; /* actual usage & quotas */
- xfs_dq_logitem_t q_logitem; /* dquot log item */
- xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
- xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */
- xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
- mutex_t q_qlock; /* quota lock */
- sema_t q_flock; /* flush lock */
- uint q_pincount; /* pin count for this dquot */
- sv_t q_pinwait; /* sync var for pinning */
-#ifdef XFS_DQUOT_TRACE
- struct ktrace *q_trace; /* trace header structure */
-#endif
-} xfs_dquot_t;
-
-
-#define dq_flnext q_lists.dqm_flnext
-#define dq_flprev q_lists.dqm_flprev
-#define dq_mplist q_lists.dqm_mplist
-#define dq_hashlist q_lists.dqm_hashlist
-#define dq_flags q_lists.dqm_flags
-
-#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++)
-
-#ifdef DEBUG
-static inline int
-XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
-{
- if (mutex_trylock(&dqp->q_qlock)) {
- mutex_unlock(&dqp->q_qlock);
- return 0;
- }
- return 1;
-}
-#endif
-
-
-/*
- * The following three routines simply manage the q_flock
- * semaphore embedded in the dquot. This semaphore synchronizes
- * processes attempting to flush the in-core dquot back to disk.
- */
-#define xfs_dqflock(dqp) { psema(&((dqp)->q_flock), PINOD | PRECALC);\
- (dqp)->dq_flags |= XFS_DQ_FLOCKED; }
-#define xfs_dqfunlock(dqp) { ASSERT(valusema(&((dqp)->q_flock)) <= 0); \
- vsema(&((dqp)->q_flock)); \
- (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); }
-
-#define XFS_DQ_PINLOCK(dqp) mutex_spinlock( \
- &(XFS_DQ_TO_QINF(dqp)->qi_pinlock))
-#define XFS_DQ_PINUNLOCK(dqp, s) mutex_spinunlock( \
- &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s)
-
-#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (valusema(&((dqp)->q_flock)) <= 0)
-#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp))
-#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
-#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
-#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
-#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP)
-#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo)
-#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? \
- XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
- XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
-
-#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \
- (XFS_IS_UQUOTA_ON((d)->q_mount)) : \
- (XFS_IS_OQUOTA_ON((d)->q_mount))))
-
-#ifdef XFS_DQUOT_TRACE
-/*
- * Dquot Tracing stuff.
- */
-#define DQUOT_TRACE_SIZE 64
-#define DQUOT_KTRACE_ENTRY 1
-
-extern void __xfs_dqtrace_entry(xfs_dquot_t *dqp, char *func,
- void *, xfs_inode_t *);
-#define xfs_dqtrace_entry_ino(a,b,ip) \
- __xfs_dqtrace_entry((a), (b), (void*)__return_address, (ip))
-#define xfs_dqtrace_entry(a,b) \
- __xfs_dqtrace_entry((a), (b), (void*)__return_address, NULL)
-#else
-#define xfs_dqtrace_entry(a,b)
-#define xfs_dqtrace_entry_ino(a,b,ip)
-#endif
-
-#ifdef QUOTADEBUG
-extern void xfs_qm_dqprint(xfs_dquot_t *);
-#else
-#define xfs_qm_dqprint(a)
-#endif
-
-extern void xfs_qm_dqdestroy(xfs_dquot_t *);
-extern int xfs_qm_dqflush(xfs_dquot_t *, uint);
-extern int xfs_qm_dqpurge(xfs_dquot_t *, uint);
-extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
-extern int xfs_qm_dqlock_nowait(xfs_dquot_t *);
-extern int xfs_qm_dqflock_nowait(xfs_dquot_t *);
-extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp);
-extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
- xfs_disk_dquot_t *);
-extern void xfs_qm_adjust_dqlimits(xfs_mount_t *,
- xfs_disk_dquot_t *);
-extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
- xfs_dqid_t, uint, uint, xfs_dquot_t **);
-extern void xfs_qm_dqput(xfs_dquot_t *);
-extern void xfs_qm_dqrele(xfs_dquot_t *);
-extern void xfs_dqlock(xfs_dquot_t *);
-extern void xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *);
-extern void xfs_dqunlock(xfs_dquot_t *);
-extern void xfs_dqunlock_nonotify(xfs_dquot_t *);
-
-#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
deleted file mode 100644
index 2ec6b441849..00000000000
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ /dev/null
@@ -1,699 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_trans_priv.h"
-#include "xfs_qm.h"
-
-/*
- * returns the number of iovecs needed to log the given dquot item.
- */
-/* ARGSUSED */
-STATIC uint
-xfs_qm_dquot_logitem_size(
- xfs_dq_logitem_t *logitem)
-{
- /*
- * we need only two iovecs, one for the format, one for the real thing
- */
- return (2);
-}
-
-/*
- * fills in the vector of log iovecs for the given dquot log item.
- */
-STATIC void
-xfs_qm_dquot_logitem_format(
- xfs_dq_logitem_t *logitem,
- xfs_log_iovec_t *logvec)
-{
- ASSERT(logitem);
- ASSERT(logitem->qli_dquot);
-
- logvec->i_addr = (xfs_caddr_t)&logitem->qli_format;
- logvec->i_len = sizeof(xfs_dq_logformat_t);
- logvec++;
- logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core;
- logvec->i_len = sizeof(xfs_disk_dquot_t);
-
- ASSERT(2 == logitem->qli_item.li_desc->lid_size);
- logitem->qli_format.qlf_size = 2;
-
-}
-
-/*
- * Increment the pin count of the given dquot.
- * This value is protected by pinlock spinlock in the xQM structure.
- */
-STATIC void
-xfs_qm_dquot_logitem_pin(
- xfs_dq_logitem_t *logitem)
-{
- unsigned long s;
- xfs_dquot_t *dqp;
-
- dqp = logitem->qli_dquot;
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
- s = XFS_DQ_PINLOCK(dqp);
- dqp->q_pincount++;
- XFS_DQ_PINUNLOCK(dqp, s);
-}
-
-/*
- * Decrement the pin count of the given dquot, and wake up
- * anyone in xfs_dqwait_unpin() if the count goes to 0. The
- * dquot must have been previously pinned with a call to xfs_dqpin().
- */
-/* ARGSUSED */
-STATIC void
-xfs_qm_dquot_logitem_unpin(
- xfs_dq_logitem_t *logitem,
- int stale)
-{
- unsigned long s;
- xfs_dquot_t *dqp;
-
- dqp = logitem->qli_dquot;
- ASSERT(dqp->q_pincount > 0);
- s = XFS_DQ_PINLOCK(dqp);
- dqp->q_pincount--;
- if (dqp->q_pincount == 0) {
- sv_broadcast(&dqp->q_pinwait);
- }
- XFS_DQ_PINUNLOCK(dqp, s);
-}
-
-/* ARGSUSED */
-STATIC void
-xfs_qm_dquot_logitem_unpin_remove(
- xfs_dq_logitem_t *logitem,
- xfs_trans_t *tp)
-{
- xfs_qm_dquot_logitem_unpin(logitem, 0);
-}
-
-/*
- * Given the logitem, this writes the corresponding dquot entry to disk
- * asynchronously. This is called with the dquot entry securely locked;
- * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot
- * at the end.
- */
-STATIC void
-xfs_qm_dquot_logitem_push(
- xfs_dq_logitem_t *logitem)
-{
- xfs_dquot_t *dqp;
-
- dqp = logitem->qli_dquot;
-
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
- ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
-
- /*
- * Since we were able to lock the dquot's flush lock and
- * we found it on the AIL, the dquot must be dirty. This
- * is because the dquot is removed from the AIL while still
- * holding the flush lock in xfs_dqflush_done(). Thus, if
- * we found it in the AIL and were able to obtain the flush
- * lock without sleeping, then there must not have been
- * anyone in the process of flushing the dquot.
- */
- xfs_qm_dqflush(dqp, XFS_B_DELWRI);
- xfs_dqunlock(dqp);
-}
-
-/*ARGSUSED*/
-STATIC xfs_lsn_t
-xfs_qm_dquot_logitem_committed(
- xfs_dq_logitem_t *l,
- xfs_lsn_t lsn)
-{
- /*
- * We always re-log the entire dquot when it becomes dirty,
- * so, the latest copy _is_ the only one that matters.
- */
- return (lsn);
-}
-
-
-/*
- * This is called to wait for the given dquot to be unpinned.
- * Most of these pin/unpin routines are plagiarized from inode code.
- */
-void
-xfs_qm_dqunpin_wait(
- xfs_dquot_t *dqp)
-{
- SPLDECL(s);
-
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
- if (dqp->q_pincount == 0) {
- return;
- }
-
- /*
- * Give the log a push so we don't wait here too long.
- */
- xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE);
- s = XFS_DQ_PINLOCK(dqp);
- if (dqp->q_pincount == 0) {
- XFS_DQ_PINUNLOCK(dqp, s);
- return;
- }
- sv_wait(&(dqp->q_pinwait), PINOD,
- &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s);
-}
-
-/*
- * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that
- * the dquot is locked by us, but the flush lock isn't. So, here we are
- * going to see if the relevant dquot buffer is incore, waiting on DELWRI.
- * If so, we want to push it out to help us take this item off the AIL as soon
- * as possible.
- *
- * We must not be holding the AIL_LOCK at this point. Calling incore() to
- * search the buffercache can be a time consuming thing, and AIL_LOCK is a
- * spinlock.
- */
-STATIC void
-xfs_qm_dquot_logitem_pushbuf(
- xfs_dq_logitem_t *qip)
-{
- xfs_dquot_t *dqp;
- xfs_mount_t *mp;
- xfs_buf_t *bp;
- uint dopush;
-
- dqp = qip->qli_dquot;
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
-
- /*
- * The qli_pushbuf_flag keeps others from
- * trying to duplicate our effort.
- */
- ASSERT(qip->qli_pushbuf_flag != 0);
- ASSERT(qip->qli_push_owner == current_pid());
-
- /*
- * If flushlock isn't locked anymore, chances are that the
- * inode flush completed and the inode was taken off the AIL.
- * So, just get out.
- */
- if ((valusema(&(dqp->q_flock)) > 0) ||
- ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
- qip->qli_pushbuf_flag = 0;
- xfs_dqunlock(dqp);
- return;
- }
- mp = dqp->q_mount;
- bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
- XFS_QI_DQCHUNKLEN(mp),
- XFS_INCORE_TRYLOCK);
- if (bp != NULL) {
- if (XFS_BUF_ISDELAYWRITE(bp)) {
- dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
- (valusema(&(dqp->q_flock)) <= 0));
- qip->qli_pushbuf_flag = 0;
- xfs_dqunlock(dqp);
-
- if (XFS_BUF_ISPINNED(bp)) {
- xfs_log_force(mp, (xfs_lsn_t)0,
- XFS_LOG_FORCE);
- }
- if (dopush) {
-#ifdef XFSRACEDEBUG
- delay_for_intr();
- delay(300);
-#endif
- xfs_bawrite(mp, bp);
- } else {
- xfs_buf_relse(bp);
- }
- } else {
- qip->qli_pushbuf_flag = 0;
- xfs_dqunlock(dqp);
- xfs_buf_relse(bp);
- }
- return;
- }
-
- qip->qli_pushbuf_flag = 0;
- xfs_dqunlock(dqp);
-}
-
-/*
- * This is called to attempt to lock the dquot associated with this
- * dquot log item. Don't sleep on the dquot lock or the flush lock.
- * If the flush lock is already held, indicating that the dquot has
- * been or is in the process of being flushed, then see if we can
- * find the dquot's buffer in the buffer cache without sleeping. If
- * we can and it is marked delayed write, then we want to send it out.
- * We delay doing so until the push routine, though, to avoid sleeping
- * in any device strategy routines.
- */
-STATIC uint
-xfs_qm_dquot_logitem_trylock(
- xfs_dq_logitem_t *qip)
-{
- xfs_dquot_t *dqp;
- uint retval;
-
- dqp = qip->qli_dquot;
- if (dqp->q_pincount > 0)
- return (XFS_ITEM_PINNED);
-
- if (! xfs_qm_dqlock_nowait(dqp))
- return (XFS_ITEM_LOCKED);
-
- retval = XFS_ITEM_SUCCESS;
- if (! xfs_qm_dqflock_nowait(dqp)) {
- /*
- * The dquot is already being flushed. It may have been
- * flushed delayed write, however, and we don't want to
- * get stuck waiting for that to complete. So, we want to check
- * to see if we can lock the dquot's buffer without sleeping.
- * If we can and it is marked for delayed write, then we
- * hold it and send it out from the push routine. We don't
- * want to do that now since we might sleep in the device
- * strategy routine. We also don't want to grab the buffer lock
- * here because we'd like not to call into the buffer cache
- * while holding the AIL_LOCK.
- * Make sure to only return PUSHBUF if we set pushbuf_flag
- * ourselves. If someone else is doing it then we don't
- * want to go to the push routine and duplicate their efforts.
- */
- if (qip->qli_pushbuf_flag == 0) {
- qip->qli_pushbuf_flag = 1;
- ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno);
-#ifdef DEBUG
- qip->qli_push_owner = current_pid();
-#endif
- /*
- * The dquot is left locked.
- */
- retval = XFS_ITEM_PUSHBUF;
- } else {
- retval = XFS_ITEM_FLUSHING;
- xfs_dqunlock_nonotify(dqp);
- }
- }
-
- ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL);
- return (retval);
-}
-
-
-/*
- * Unlock the dquot associated with the log item.
- * Clear the fields of the dquot and dquot log item that
- * are specific to the current transaction. If the
- * hold flags is set, do not unlock the dquot.
- */
-STATIC void
-xfs_qm_dquot_logitem_unlock(
- xfs_dq_logitem_t *ql)
-{
- xfs_dquot_t *dqp;
-
- ASSERT(ql != NULL);
- dqp = ql->qli_dquot;
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
-
- /*
- * Clear the transaction pointer in the dquot
- */
- dqp->q_transp = NULL;
-
- /*
- * dquots are never 'held' from getting unlocked at the end of
- * a transaction. Their locking and unlocking is hidden inside the
- * transaction layer, within trans_commit. Hence, no LI_HOLD flag
- * for the logitem.
- */
- xfs_dqunlock(dqp);
-}
-
-
-/*
- * The transaction with the dquot locked has aborted. The dquot
- * must not be dirty within the transaction. We simply unlock just
- * as if the transaction had been cancelled.
- */
-STATIC void
-xfs_qm_dquot_logitem_abort(
- xfs_dq_logitem_t *ql)
-{
- xfs_qm_dquot_logitem_unlock(ql);
-}
-
-/*
- * this needs to stamp an lsn into the dquot, I think.
- * rpc's that look at user dquot's would then have to
- * push on the dependency recorded in the dquot
- */
-/* ARGSUSED */
-STATIC void
-xfs_qm_dquot_logitem_committing(
- xfs_dq_logitem_t *l,
- xfs_lsn_t lsn)
-{
- return;
-}
-
-
-/*
- * This is the ops vector for dquots
- */
-STATIC struct xfs_item_ops xfs_dquot_item_ops = {
- .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size,
- .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
- xfs_qm_dquot_logitem_format,
- .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin,
- .iop_unpin = (void(*)(xfs_log_item_t*, int))
- xfs_qm_dquot_logitem_unpin,
- .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
- xfs_qm_dquot_logitem_unpin_remove,
- .iop_trylock = (uint(*)(xfs_log_item_t*))
- xfs_qm_dquot_logitem_trylock,
- .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unlock,
- .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_qm_dquot_logitem_committed,
- .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push,
- .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_abort,
- .iop_pushbuf = (void(*)(xfs_log_item_t*))
- xfs_qm_dquot_logitem_pushbuf,
- .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_qm_dquot_logitem_committing
-};
-
-/*
- * Initialize the dquot log item for a newly allocated dquot.
- * The dquot isn't locked at this point, but it isn't on any of the lists
- * either, so we don't care.
- */
-void
-xfs_qm_dquot_logitem_init(
- struct xfs_dquot *dqp)
-{
- xfs_dq_logitem_t *lp;
- lp = &dqp->q_logitem;
-
- lp->qli_item.li_type = XFS_LI_DQUOT;
- lp->qli_item.li_ops = &xfs_dquot_item_ops;
- lp->qli_item.li_mountp = dqp->q_mount;
- lp->qli_dquot = dqp;
- lp->qli_format.qlf_type = XFS_LI_DQUOT;
- lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
- lp->qli_format.qlf_blkno = dqp->q_blkno;
- lp->qli_format.qlf_len = 1;
- /*
- * This is just the offset of this dquot within its buffer
- * (which is currently 1 FSB and probably won't change).
- * Hence 32 bits for this offset should be just fine.
- * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t))
- * here, and recompute it at recovery time.
- */
- lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset;
-}
-
-/*------------------ QUOTAOFF LOG ITEMS -------------------*/
-
-/*
- * This returns the number of iovecs needed to log the given quotaoff item.
- * We only need 1 iovec for an quotaoff item. It just logs the
- * quotaoff_log_format structure.
- */
-/*ARGSUSED*/
-STATIC uint
-xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf)
-{
- return (1);
-}
-
-/*
- * This is called to fill in the vector of log iovecs for the
- * given quotaoff log item. We use only 1 iovec, and we point that
- * at the quotaoff_log_format structure embedded in the quotaoff item.
- * It is at this point that we assert that all of the extent
- * slots in the quotaoff item have been filled.
- */
-STATIC void
-xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t *qf,
- xfs_log_iovec_t *log_vector)
-{
- ASSERT(qf->qql_format.qf_type == XFS_LI_QUOTAOFF);
-
- log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format);
- log_vector->i_len = sizeof(xfs_qoff_logitem_t);
- XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_QUOTAOFF);
- qf->qql_format.qf_size = 1;
-}
-
-
-/*
- * Pinning has no meaning for an quotaoff item, so just return.
- */
-/*ARGSUSED*/
-STATIC void
-xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf)
-{
- return;
-}
-
-
-/*
- * Since pinning has no meaning for an quotaoff item, unpinning does
- * not either.
- */
-/*ARGSUSED*/
-STATIC void
-xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale)
-{
- return;
-}
-
-/*ARGSUSED*/
-STATIC void
-xfs_qm_qoff_logitem_unpin_remove(xfs_qoff_logitem_t *qf, xfs_trans_t *tp)
-{
- return;
-}
-
-/*
- * Quotaoff items have no locking, so just return success.
- */
-/*ARGSUSED*/
-STATIC uint
-xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf)
-{
- return XFS_ITEM_LOCKED;
-}
-
-/*
- * Quotaoff items have no locking or pushing, so return failure
- * so that the caller doesn't bother with us.
- */
-/*ARGSUSED*/
-STATIC void
-xfs_qm_qoff_logitem_unlock(xfs_qoff_logitem_t *qf)
-{
- return;
-}
-
-/*
- * The quotaoff-start-item is logged only once and cannot be moved in the log,
- * so simply return the lsn at which it's been logged.
- */
-/*ARGSUSED*/
-STATIC xfs_lsn_t
-xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn)
-{
- return (lsn);
-}
-
-/*
- * The transaction of which this QUOTAOFF is a part has been aborted.
- * Just clean up after ourselves.
- * Shouldn't this never happen in the case of qoffend logitems? XXX
- */
-STATIC void
-xfs_qm_qoff_logitem_abort(xfs_qoff_logitem_t *qf)
-{
- kmem_free(qf, sizeof(xfs_qoff_logitem_t));
-}
-
-/*
- * There isn't much you can do to push on an quotaoff item. It is simply
- * stuck waiting for the log to be flushed to disk.
- */
-/*ARGSUSED*/
-STATIC void
-xfs_qm_qoff_logitem_push(xfs_qoff_logitem_t *qf)
-{
- return;
-}
-
-
-/*ARGSUSED*/
-STATIC xfs_lsn_t
-xfs_qm_qoffend_logitem_committed(
- xfs_qoff_logitem_t *qfe,
- xfs_lsn_t lsn)
-{
- xfs_qoff_logitem_t *qfs;
- SPLDECL(s);
-
- qfs = qfe->qql_start_lip;
- AIL_LOCK(qfs->qql_item.li_mountp,s);
- /*
- * Delete the qoff-start logitem from the AIL.
- * xfs_trans_delete_ail() drops the AIL lock.
- */
- xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs, s);
- kmem_free(qfs, sizeof(xfs_qoff_logitem_t));
- kmem_free(qfe, sizeof(xfs_qoff_logitem_t));
- return (xfs_lsn_t)-1;
-}
-
-/*
- * XXX rcc - don't know quite what to do with this. I think we can
- * just ignore it. The only time that isn't the case is if we allow
- * the client to somehow see that quotas have been turned off in which
- * we can't allow that to get back until the quotaoff hits the disk.
- * So how would that happen? Also, do we need different routines for
- * quotaoff start and quotaoff end? I suspect the answer is yes but
- * to be sure, I need to look at the recovery code and see how quota off
- * recovery is handled (do we roll forward or back or do something else).
- * If we roll forwards or backwards, then we need two separate routines,
- * one that does nothing and one that stamps in the lsn that matters
- * (truly makes the quotaoff irrevocable). If we do something else,
- * then maybe we don't need two.
- */
-/* ARGSUSED */
-STATIC void
-xfs_qm_qoff_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
-{
- return;
-}
-
-/* ARGSUSED */
-STATIC void
-xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
-{
- return;
-}
-
-STATIC struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
- .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size,
- .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
- xfs_qm_qoff_logitem_format,
- .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
- .iop_unpin = (void(*)(xfs_log_item_t* ,int))
- xfs_qm_qoff_logitem_unpin,
- .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
- xfs_qm_qoff_logitem_unpin_remove,
- .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
- .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
- .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_qm_qoffend_logitem_committed,
- .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
- .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort,
- .iop_pushbuf = NULL,
- .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_qm_qoffend_logitem_committing
-};
-
-/*
- * This is the ops vector shared by all quotaoff-start log items.
- */
-STATIC struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
- .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size,
- .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
- xfs_qm_qoff_logitem_format,
- .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
- .iop_unpin = (void(*)(xfs_log_item_t*, int))
- xfs_qm_qoff_logitem_unpin,
- .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
- xfs_qm_qoff_logitem_unpin_remove,
- .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
- .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
- .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_qm_qoff_logitem_committed,
- .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
- .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort,
- .iop_pushbuf = NULL,
- .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_qm_qoff_logitem_committing
-};
-
-/*
- * Allocate and initialize an quotaoff item of the correct quota type(s).
- */
-xfs_qoff_logitem_t *
-xfs_qm_qoff_logitem_init(
- struct xfs_mount *mp,
- xfs_qoff_logitem_t *start,
- uint flags)
-{
- xfs_qoff_logitem_t *qf;
-
- qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP);
-
- qf->qql_item.li_type = XFS_LI_QUOTAOFF;
- if (start)
- qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops;
- else
- qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops;
- qf->qql_item.li_mountp = mp;
- qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
- qf->qql_format.qf_flags = flags;
- qf->qql_start_lip = start;
- return (qf);
-}
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
deleted file mode 100644
index 53a00fb217f..00000000000
--- a/fs/xfs/quota/xfs_qm.c
+++ /dev/null
@@ -1,2859 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_clnt.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_bmap.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_trans_space.h"
-#include "xfs_utils.h"
-#include "xfs_qm.h"
-
-/*
- * The global quota manager. There is only one of these for the entire
- * system, _not_ one per file system. XQM keeps track of the overall
- * quota functionality, including maintaining the freelist and hash
- * tables of dquots.
- */
-mutex_t xfs_Gqm_lock;
-struct xfs_qm *xfs_Gqm;
-uint ndquot;
-
-kmem_zone_t *qm_dqzone;
-kmem_zone_t *qm_dqtrxzone;
-STATIC kmem_shaker_t xfs_qm_shaker;
-
-STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
-STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
-
-STATIC void xfs_qm_freelist_init(xfs_frlist_t *);
-STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *);
-STATIC int xfs_qm_mplist_nowait(xfs_mount_t *);
-STATIC int xfs_qm_dqhashlock_nowait(xfs_dquot_t *);
-
-STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
-STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int xfs_qm_shake(int, gfp_t);
-
-#ifdef DEBUG
-extern mutex_t qcheck_lock;
-#endif
-
-#ifdef QUOTADEBUG
-#define XQM_LIST_PRINT(l, NXT, title) \
-{ \
- xfs_dquot_t *dqp; int i = 0; \
- cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \
- for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \
- cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " \
- "bcnt = %d, icnt = %d, refs = %d", \
- ++i, (int) be32_to_cpu(dqp->q_core.d_id), \
- DQFLAGTO_TYPESTR(dqp), \
- (int) be64_to_cpu(dqp->q_core.d_bcount), \
- (int) be64_to_cpu(dqp->q_core.d_icount), \
- (int) dqp->q_nrefs); } \
-}
-#else
-#define XQM_LIST_PRINT(l, NXT, title) do { } while (0)
-#endif
-
-/*
- * Initialize the XQM structure.
- * Note that there is not one quota manager per file system.
- */
-STATIC struct xfs_qm *
-xfs_Gqm_init(void)
-{
- xfs_dqhash_t *udqhash, *gdqhash;
- xfs_qm_t *xqm;
- uint i, hsize, flags = KM_SLEEP | KM_MAYFAIL;
-
- /*
- * Initialize the dquot hash tables.
- */
- hsize = XFS_QM_HASHSIZE_HIGH;
- while (!(udqhash = kmem_zalloc(hsize * sizeof(xfs_dqhash_t), flags))) {
- if ((hsize >>= 1) <= XFS_QM_HASHSIZE_LOW)
- flags = KM_SLEEP;
- }
- gdqhash = kmem_zalloc(hsize * sizeof(xfs_dqhash_t), KM_SLEEP);
- ndquot = hsize << 8;
-
- xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
- xqm->qm_dqhashmask = hsize - 1;
- xqm->qm_usr_dqhtable = udqhash;
- xqm->qm_grp_dqhtable = gdqhash;
- ASSERT(xqm->qm_usr_dqhtable != NULL);
- ASSERT(xqm->qm_grp_dqhtable != NULL);
-
- for (i = 0; i < hsize; i++) {
- xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i);
- xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i);
- }
-
- /*
- * Freelist of all dquots of all file systems
- */
- xfs_qm_freelist_init(&(xqm->qm_dqfreelist));
-
- /*
- * dquot zone. we register our own low-memory callback.
- */
- if (!qm_dqzone) {
- xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t),
- "xfs_dquots");
- qm_dqzone = xqm->qm_dqzone;
- } else
- xqm->qm_dqzone = qm_dqzone;
-
- xfs_qm_shaker = kmem_shake_register(xfs_qm_shake);
-
- /*
- * The t_dqinfo portion of transactions.
- */
- if (!qm_dqtrxzone) {
- xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t),
- "xfs_dqtrx");
- qm_dqtrxzone = xqm->qm_dqtrxzone;
- } else
- xqm->qm_dqtrxzone = qm_dqtrxzone;
-
- atomic_set(&xqm->qm_totaldquots, 0);
- xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO;
- xqm->qm_nrefs = 0;
-#ifdef DEBUG
- mutex_init(&qcheck_lock);
-#endif
- return xqm;
-}
-
-/*
- * Destroy the global quota manager when its reference count goes to zero.
- */
-STATIC void
-xfs_qm_destroy(
- struct xfs_qm *xqm)
-{
- int hsize, i;
-
- ASSERT(xqm != NULL);
- ASSERT(xqm->qm_nrefs == 0);
- kmem_shake_deregister(xfs_qm_shaker);
- hsize = xqm->qm_dqhashmask + 1;
- for (i = 0; i < hsize; i++) {
- xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
- xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
- }
- kmem_free(xqm->qm_usr_dqhtable, hsize * sizeof(xfs_dqhash_t));
- kmem_free(xqm->qm_grp_dqhtable, hsize * sizeof(xfs_dqhash_t));
- xqm->qm_usr_dqhtable = NULL;
- xqm->qm_grp_dqhtable = NULL;
- xqm->qm_dqhashmask = 0;
- xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist));
-#ifdef DEBUG
- mutex_destroy(&qcheck_lock);
-#endif
- kmem_free(xqm, sizeof(xfs_qm_t));
-}
-
-/*
- * Called at mount time to let XQM know that another file system is
- * starting quotas. This isn't crucial information as the individual mount
- * structures are pretty independent, but it helps the XQM keep a
- * global view of what's going on.
- */
-/* ARGSUSED */
-STATIC int
-xfs_qm_hold_quotafs_ref(
- struct xfs_mount *mp)
-{
- /*
- * Need to lock the xfs_Gqm structure for things like this. For example,
- * the structure could disappear between the entry to this routine and
- * a HOLD operation if not locked.
- */
- XFS_QM_LOCK(xfs_Gqm);
-
- if (xfs_Gqm == NULL)
- xfs_Gqm = xfs_Gqm_init();
- /*
- * We can keep a list of all filesystems with quotas mounted for
- * debugging and statistical purposes, but ...
- * Just take a reference and get out.
- */
- XFS_QM_HOLD(xfs_Gqm);
- XFS_QM_UNLOCK(xfs_Gqm);
-
- return 0;
-}
-
-
-/*
- * Release the reference that a filesystem took at mount time,
- * so that we know when we need to destroy the entire quota manager.
- */
-/* ARGSUSED */
-STATIC void
-xfs_qm_rele_quotafs_ref(
- struct xfs_mount *mp)
-{
- xfs_dquot_t *dqp, *nextdqp;
-
- ASSERT(xfs_Gqm);
- ASSERT(xfs_Gqm->qm_nrefs > 0);
-
- /*
- * Go thru the freelist and destroy all inactive dquots.
- */
- xfs_qm_freelist_lock(xfs_Gqm);
-
- for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
- dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) {
- xfs_dqlock(dqp);
- nextdqp = dqp->dq_flnext;
- if (dqp->dq_flags & XFS_DQ_INACTIVE) {
- ASSERT(dqp->q_mount == NULL);
- ASSERT(! XFS_DQ_IS_DIRTY(dqp));
- ASSERT(dqp->HL_PREVP == NULL);
- ASSERT(dqp->MPL_PREVP == NULL);
- XQM_FREELIST_REMOVE(dqp);
- xfs_dqunlock(dqp);
- xfs_qm_dqdestroy(dqp);
- } else {
- xfs_dqunlock(dqp);
- }
- dqp = nextdqp;
- }
- xfs_qm_freelist_unlock(xfs_Gqm);
-
- /*
- * Destroy the entire XQM. If somebody mounts with quotaon, this'll
- * be restarted.
- */
- XFS_QM_LOCK(xfs_Gqm);
- XFS_QM_RELE(xfs_Gqm);
- if (xfs_Gqm->qm_nrefs == 0) {
- xfs_qm_destroy(xfs_Gqm);
- xfs_Gqm = NULL;
- }
- XFS_QM_UNLOCK(xfs_Gqm);
-}
-
-/*
- * This is called at mount time from xfs_mountfs to initialize the quotainfo
- * structure and start the global quotamanager (xfs_Gqm) if it hasn't done
- * so already. Note that the superblock has not been read in yet.
- */
-void
-xfs_qm_mount_quotainit(
- xfs_mount_t *mp,
- uint flags)
-{
- /*
- * User, projects or group quotas has to be on.
- */
- ASSERT(flags & (XFSMNT_UQUOTA | XFSMNT_PQUOTA | XFSMNT_GQUOTA));
-
- /*
- * Initialize the flags in the mount structure. From this point
- * onwards we look at m_qflags to figure out if quotas's ON/OFF, etc.
- * Note that we enforce nothing if accounting is off.
- * ie. XFSMNT_*QUOTA must be ON for XFSMNT_*QUOTAENF.
- * It isn't necessary to take the quotaoff lock to do this; this is
- * called from mount.
- */
- if (flags & XFSMNT_UQUOTA) {
- mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
- if (flags & XFSMNT_UQUOTAENF)
- mp->m_qflags |= XFS_UQUOTA_ENFD;
- }
- if (flags & XFSMNT_GQUOTA) {
- mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
- if (flags & XFSMNT_GQUOTAENF)
- mp->m_qflags |= XFS_OQUOTA_ENFD;
- } else if (flags & XFSMNT_PQUOTA) {
- mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
- if (flags & XFSMNT_PQUOTAENF)
- mp->m_qflags |= XFS_OQUOTA_ENFD;
- }
-}
-
-/*
- * Just destroy the quotainfo structure.
- */
-void
-xfs_qm_unmount_quotadestroy(
- xfs_mount_t *mp)
-{
- if (mp->m_quotainfo)
- xfs_qm_destroy_quotainfo(mp);
-}
-
-
-/*
- * This is called from xfs_mountfs to start quotas and initialize all
- * necessary data structures like quotainfo. This is also responsible for
- * running a quotacheck as necessary. We are guaranteed that the superblock
- * is consistently read in at this point.
- */
-int
-xfs_qm_mount_quotas(
- xfs_mount_t *mp,
- int mfsi_flags)
-{
- unsigned long s;
- int error = 0;
- uint sbf;
-
-
- /*
- * If quotas on realtime volumes is not supported, we disable
- * quotas immediately.
- */
- if (mp->m_sb.sb_rextents) {
- cmn_err(CE_NOTE,
- "Cannot turn on quotas for realtime filesystem %s",
- mp->m_fsname);
- mp->m_qflags = 0;
- goto write_changes;
- }
-
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
- /*
- * Allocate the quotainfo structure inside the mount struct, and
- * create quotainode(s), and change/rev superblock if necessary.
- */
- if ((error = xfs_qm_init_quotainfo(mp))) {
- /*
- * We must turn off quotas.
- */
- ASSERT(mp->m_quotainfo == NULL);
- mp->m_qflags = 0;
- goto write_changes;
- }
- /*
- * If any of the quotas are not consistent, do a quotacheck.
- */
- if (XFS_QM_NEED_QUOTACHECK(mp) &&
- !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
- if ((error = xfs_qm_quotacheck(mp))) {
- /* Quotacheck has failed and quotas have
- * been disabled.
- */
- return XFS_ERROR(error);
- }
- }
-
- write_changes:
- /*
- * We actually don't have to acquire the SB_LOCK at all.
- * This can only be called from mount, and that's single threaded. XXX
- */
- s = XFS_SB_LOCK(mp);
- sbf = mp->m_sb.sb_qflags;
- mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
- XFS_SB_UNLOCK(mp, s);
-
- if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
- if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
- /*
- * We could only have been turning quotas off.
- * We aren't in very good shape actually because
- * the incore structures are convinced that quotas are
- * off, but the on disk superblock doesn't know that !
- */
- ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
- xfs_fs_cmn_err(CE_ALERT, mp,
- "XFS mount_quotas: Superblock update failed!");
- }
- }
-
- if (error) {
- xfs_fs_cmn_err(CE_WARN, mp,
- "Failed to initialize disk quotas.");
- }
- return XFS_ERROR(error);
-}
-
-/*
- * Called from the vfsops layer.
- */
-int
-xfs_qm_unmount_quotas(
- xfs_mount_t *mp)
-{
- xfs_inode_t *uqp, *gqp;
- int error = 0;
-
- /*
- * Release the dquots that root inode, et al might be holding,
- * before we flush quotas and blow away the quotainfo structure.
- */
- ASSERT(mp->m_rootip);
- xfs_qm_dqdetach(mp->m_rootip);
- if (mp->m_rbmip)
- xfs_qm_dqdetach(mp->m_rbmip);
- if (mp->m_rsumip)
- xfs_qm_dqdetach(mp->m_rsumip);
-
- /*
- * Flush out the quota inodes.
- */
- uqp = gqp = NULL;
- if (mp->m_quotainfo) {
- if ((uqp = mp->m_quotainfo->qi_uquotaip) != NULL) {
- xfs_ilock(uqp, XFS_ILOCK_EXCL);
- xfs_iflock(uqp);
- error = xfs_iflush(uqp, XFS_IFLUSH_SYNC);
- xfs_iunlock(uqp, XFS_ILOCK_EXCL);
- if (unlikely(error == EFSCORRUPTED)) {
- XFS_ERROR_REPORT("xfs_qm_unmount_quotas(1)",
- XFS_ERRLEVEL_LOW, mp);
- goto out;
- }
- }
- if ((gqp = mp->m_quotainfo->qi_gquotaip) != NULL) {
- xfs_ilock(gqp, XFS_ILOCK_EXCL);
- xfs_iflock(gqp);
- error = xfs_iflush(gqp, XFS_IFLUSH_SYNC);
- xfs_iunlock(gqp, XFS_ILOCK_EXCL);
- if (unlikely(error == EFSCORRUPTED)) {
- XFS_ERROR_REPORT("xfs_qm_unmount_quotas(2)",
- XFS_ERRLEVEL_LOW, mp);
- goto out;
- }
- }
- }
- if (uqp) {
- XFS_PURGE_INODE(uqp);
- mp->m_quotainfo->qi_uquotaip = NULL;
- }
- if (gqp) {
- XFS_PURGE_INODE(gqp);
- mp->m_quotainfo->qi_gquotaip = NULL;
- }
-out:
- return XFS_ERROR(error);
-}
-
-/*
- * Flush all dquots of the given file system to disk. The dquots are
- * _not_ purged from memory here, just their data written to disk.
- */
-STATIC int
-xfs_qm_dqflush_all(
- xfs_mount_t *mp,
- int flags)
-{
- int recl;
- xfs_dquot_t *dqp;
- int niters;
- int error;
-
- if (mp->m_quotainfo == NULL)
- return 0;
- niters = 0;
-again:
- xfs_qm_mplist_lock(mp);
- FOREACH_DQUOT_IN_MP(dqp, mp) {
- xfs_dqlock(dqp);
- if (! XFS_DQ_IS_DIRTY(dqp)) {
- xfs_dqunlock(dqp);
- continue;
- }
- xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY");
- /* XXX a sentinel would be better */
- recl = XFS_QI_MPLRECLAIMS(mp);
- if (! xfs_qm_dqflock_nowait(dqp)) {
- /*
- * If we can't grab the flush lock then check
- * to see if the dquot has been flushed delayed
- * write. If so, grab its buffer and send it
- * out immediately. We'll be able to acquire
- * the flush lock when the I/O completes.
- */
- xfs_qm_dqflock_pushbuf_wait(dqp);
- }
- /*
- * Let go of the mplist lock. We don't want to hold it
- * across a disk write.
- */
- xfs_qm_mplist_unlock(mp);
- error = xfs_qm_dqflush(dqp, flags);
- xfs_dqunlock(dqp);
- if (error)
- return error;
-
- xfs_qm_mplist_lock(mp);
- if (recl != XFS_QI_MPLRECLAIMS(mp)) {
- xfs_qm_mplist_unlock(mp);
- /* XXX restart limit */
- goto again;
- }
- }
-
- xfs_qm_mplist_unlock(mp);
- /* return ! busy */
- return 0;
-}
-/*
- * Release the group dquot pointers the user dquots may be
- * carrying around as a hint. mplist is locked on entry and exit.
- */
-STATIC void
-xfs_qm_detach_gdquots(
- xfs_mount_t *mp)
-{
- xfs_dquot_t *dqp, *gdqp;
- int nrecl;
-
- again:
- ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
- dqp = XFS_QI_MPLNEXT(mp);
- while (dqp) {
- xfs_dqlock(dqp);
- if ((gdqp = dqp->q_gdquot)) {
- xfs_dqlock(gdqp);
- dqp->q_gdquot = NULL;
- }
- xfs_dqunlock(dqp);
-
- if (gdqp) {
- /*
- * Can't hold the mplist lock across a dqput.
- * XXXmust convert to marker based iterations here.
- */
- nrecl = XFS_QI_MPLRECLAIMS(mp);
- xfs_qm_mplist_unlock(mp);
- xfs_qm_dqput(gdqp);
-
- xfs_qm_mplist_lock(mp);
- if (nrecl != XFS_QI_MPLRECLAIMS(mp))
- goto again;
- }
- dqp = dqp->MPL_NEXT;
- }
-}
-
-/*
- * Go through all the incore dquots of this file system and take them
- * off the mplist and hashlist, if the dquot type matches the dqtype
- * parameter. This is used when turning off quota accounting for
- * users and/or groups, as well as when the filesystem is unmounting.
- */
-STATIC int
-xfs_qm_dqpurge_int(
- xfs_mount_t *mp,
- uint flags) /* QUOTAOFF/UMOUNTING/UQUOTA/PQUOTA/GQUOTA */
-{
- xfs_dquot_t *dqp;
- uint dqtype;
- int nrecl;
- xfs_dquot_t *nextdqp;
- int nmisses;
-
- if (mp->m_quotainfo == NULL)
- return 0;
-
- dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
- dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
- dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
-
- xfs_qm_mplist_lock(mp);
-
- /*
- * In the first pass through all incore dquots of this filesystem,
- * we release the group dquot pointers the user dquots may be
- * carrying around as a hint. We need to do this irrespective of
- * what's being turned off.
- */
- xfs_qm_detach_gdquots(mp);
-
- again:
- nmisses = 0;
- ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
- /*
- * Try to get rid of all of the unwanted dquots. The idea is to
- * get them off mplist and hashlist, but leave them on freelist.
- */
- dqp = XFS_QI_MPLNEXT(mp);
- while (dqp) {
- /*
- * It's OK to look at the type without taking dqlock here.
- * We're holding the mplist lock here, and that's needed for
- * a dqreclaim.
- */
- if ((dqp->dq_flags & dqtype) == 0) {
- dqp = dqp->MPL_NEXT;
- continue;
- }
-
- if (! xfs_qm_dqhashlock_nowait(dqp)) {
- nrecl = XFS_QI_MPLRECLAIMS(mp);
- xfs_qm_mplist_unlock(mp);
- XFS_DQ_HASH_LOCK(dqp->q_hash);
- xfs_qm_mplist_lock(mp);
-
- /*
- * XXXTheoretically, we can get into a very long
- * ping pong game here.
- * No one can be adding dquots to the mplist at
- * this point, but somebody might be taking things off.
- */
- if (nrecl != XFS_QI_MPLRECLAIMS(mp)) {
- XFS_DQ_HASH_UNLOCK(dqp->q_hash);
- goto again;
- }
- }
-
- /*
- * Take the dquot off the mplist and hashlist. It may remain on
- * freelist in INACTIVE state.
- */
- nextdqp = dqp->MPL_NEXT;
- nmisses += xfs_qm_dqpurge(dqp, flags);
- dqp = nextdqp;
- }
- xfs_qm_mplist_unlock(mp);
- return nmisses;
-}
-
-int
-xfs_qm_dqpurge_all(
- xfs_mount_t *mp,
- uint flags)
-{
- int ndquots;
-
- /*
- * Purge the dquot cache.
- * None of the dquots should really be busy at this point.
- */
- if (mp->m_quotainfo) {
- while ((ndquots = xfs_qm_dqpurge_int(mp, flags))) {
- delay(ndquots * 10);
- }
- }
- return 0;
-}
-
-STATIC int
-xfs_qm_dqattach_one(
- xfs_inode_t *ip,
- xfs_dqid_t id,
- uint type,
- uint doalloc,
- uint dolock,
- xfs_dquot_t *udqhint, /* hint */
- xfs_dquot_t **IO_idqpp)
-{
- xfs_dquot_t *dqp;
- int error;
-
- ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
- error = 0;
- /*
- * See if we already have it in the inode itself. IO_idqpp is
- * &i_udquot or &i_gdquot. This made the code look weird, but
- * made the logic a lot simpler.
- */
- if ((dqp = *IO_idqpp)) {
- if (dolock)
- xfs_dqlock(dqp);
- xfs_dqtrace_entry(dqp, "DQATTACH: found in ip");
- goto done;
- }
-
- /*
- * udqhint is the i_udquot field in inode, and is non-NULL only
- * when the type arg is group/project. Its purpose is to save a
- * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
- * the user dquot.
- */
- ASSERT(!udqhint || type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
- if (udqhint && !dolock)
- xfs_dqlock(udqhint);
-
- /*
- * No need to take dqlock to look at the id.
- * The ID can't change until it gets reclaimed, and it won't
- * be reclaimed as long as we have a ref from inode and we hold
- * the ilock.
- */
- if (udqhint &&
- (dqp = udqhint->q_gdquot) &&
- (be32_to_cpu(dqp->q_core.d_id) == id)) {
- ASSERT(XFS_DQ_IS_LOCKED(udqhint));
- xfs_dqlock(dqp);
- XFS_DQHOLD(dqp);
- ASSERT(*IO_idqpp == NULL);
- *IO_idqpp = dqp;
- if (!dolock) {
- xfs_dqunlock(dqp);
- xfs_dqunlock(udqhint);
- }
- goto done;
- }
- /*
- * We can't hold a dquot lock when we call the dqget code.
- * We'll deadlock in no time, because of (not conforming to)
- * lock ordering - the inodelock comes before any dquot lock,
- * and we may drop and reacquire the ilock in xfs_qm_dqget().
- */
- if (udqhint)
- xfs_dqunlock(udqhint);
- /*
- * Find the dquot from somewhere. This bumps the
- * reference count of dquot and returns it locked.
- * This can return ENOENT if dquot didn't exist on
- * disk and we didn't ask it to allocate;
- * ESRCH if quotas got turned off suddenly.
- */
- if ((error = xfs_qm_dqget(ip->i_mount, ip, id, type,
- doalloc|XFS_QMOPT_DOWARN, &dqp))) {
- if (udqhint && dolock)
- xfs_dqlock(udqhint);
- goto done;
- }
-
- xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget");
- /*
- * dqget may have dropped and re-acquired the ilock, but it guarantees
- * that the dquot returned is the one that should go in the inode.
- */
- *IO_idqpp = dqp;
- ASSERT(dqp);
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
- if (! dolock) {
- xfs_dqunlock(dqp);
- goto done;
- }
- if (! udqhint)
- goto done;
-
- ASSERT(udqhint);
- ASSERT(dolock);
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
- if (! xfs_qm_dqlock_nowait(udqhint)) {
- xfs_dqunlock(dqp);
- xfs_dqlock(udqhint);
- xfs_dqlock(dqp);
- }
- done:
-#ifdef QUOTADEBUG
- if (udqhint) {
- if (dolock)
- ASSERT(XFS_DQ_IS_LOCKED(udqhint));
- }
- if (! error) {
- if (dolock)
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
- }
-#endif
- return error;
-}
-
-
-/*
- * Given a udquot and gdquot, attach a ptr to the group dquot in the
- * udquot as a hint for future lookups. The idea sounds simple, but the
- * execution isn't, because the udquot might have a group dquot attached
- * already and getting rid of that gets us into lock ordering contraints.
- * The process is complicated more by the fact that the dquots may or may not
- * be locked on entry.
- */
-STATIC void
-xfs_qm_dqattach_grouphint(
- xfs_dquot_t *udq,
- xfs_dquot_t *gdq,
- uint locked)
-{
- xfs_dquot_t *tmp;
-
-#ifdef QUOTADEBUG
- if (locked) {
- ASSERT(XFS_DQ_IS_LOCKED(udq));
- ASSERT(XFS_DQ_IS_LOCKED(gdq));
- }
-#endif
- if (! locked)
- xfs_dqlock(udq);
-
- if ((tmp = udq->q_gdquot)) {
- if (tmp == gdq) {
- if (! locked)
- xfs_dqunlock(udq);
- return;
- }
-
- udq->q_gdquot = NULL;
- /*
- * We can't keep any dqlocks when calling dqrele,
- * because the freelist lock comes before dqlocks.
- */
- xfs_dqunlock(udq);
- if (locked)
- xfs_dqunlock(gdq);
- /*
- * we took a hard reference once upon a time in dqget,
- * so give it back when the udquot no longer points at it
- * dqput() does the unlocking of the dquot.
- */
- xfs_qm_dqrele(tmp);
-
- xfs_dqlock(udq);
- xfs_dqlock(gdq);
-
- } else {
- ASSERT(XFS_DQ_IS_LOCKED(udq));
- if (! locked) {
- xfs_dqlock(gdq);
- }
- }
-
- ASSERT(XFS_DQ_IS_LOCKED(udq));
- ASSERT(XFS_DQ_IS_LOCKED(gdq));
- /*
- * Somebody could have attached a gdquot here,
- * when we dropped the uqlock. If so, just do nothing.
- */
- if (udq->q_gdquot == NULL) {
- XFS_DQHOLD(gdq);
- udq->q_gdquot = gdq;
- }
- if (! locked) {
- xfs_dqunlock(gdq);
- xfs_dqunlock(udq);
- }
-}
-
-
-/*
- * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
- * into account.
- * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
- * If XFS_QMOPT_DQLOCK, the dquot(s) will be returned locked. This option pretty
- * much made this code a complete mess, but it has been pretty useful.
- * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL.
- * Inode may get unlocked and relocked in here, and the caller must deal with
- * the consequences.
- */
-int
-xfs_qm_dqattach(
- xfs_inode_t *ip,
- uint flags)
-{
- xfs_mount_t *mp = ip->i_mount;
- uint nquotas = 0;
- int error = 0;
-
- if ((! XFS_IS_QUOTA_ON(mp)) ||
- (! XFS_NOT_DQATTACHED(mp, ip)) ||
- (ip->i_ino == mp->m_sb.sb_uquotino) ||
- (ip->i_ino == mp->m_sb.sb_gquotino))
- return 0;
-
- ASSERT((flags & XFS_QMOPT_ILOCKED) == 0 ||
- XFS_ISLOCKED_INODE_EXCL(ip));
-
- if (! (flags & XFS_QMOPT_ILOCKED))
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- if (XFS_IS_UQUOTA_ON(mp)) {
- error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
- flags & XFS_QMOPT_DQALLOC,
- flags & XFS_QMOPT_DQLOCK,
- NULL, &ip->i_udquot);
- if (error)
- goto done;
- nquotas++;
- }
- ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
- if (XFS_IS_OQUOTA_ON(mp)) {
- error = XFS_IS_GQUOTA_ON(mp) ?
- xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
- flags & XFS_QMOPT_DQALLOC,
- flags & XFS_QMOPT_DQLOCK,
- ip->i_udquot, &ip->i_gdquot) :
- xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ,
- flags & XFS_QMOPT_DQALLOC,
- flags & XFS_QMOPT_DQLOCK,
- ip->i_udquot, &ip->i_gdquot);
- /*
- * Don't worry about the udquot that we may have
- * attached above. It'll get detached, if not already.
- */
- if (error)
- goto done;
- nquotas++;
- }
-
- /*
- * Attach this group quota to the user quota as a hint.
- * This WON'T, in general, result in a thrash.
- */
- if (nquotas == 2) {
- ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
- ASSERT(ip->i_udquot);
- ASSERT(ip->i_gdquot);
-
- /*
- * We may or may not have the i_udquot locked at this point,
- * but this check is OK since we don't depend on the i_gdquot to
- * be accurate 100% all the time. It is just a hint, and this
- * will succeed in general.
- */
- if (ip->i_udquot->q_gdquot == ip->i_gdquot)
- goto done;
- /*
- * Attach i_gdquot to the gdquot hint inside the i_udquot.
- */
- xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot,
- flags & XFS_QMOPT_DQLOCK);
- }
-
- done:
-
-#ifdef QUOTADEBUG
- if (! error) {
- if (ip->i_udquot) {
- if (flags & XFS_QMOPT_DQLOCK)
- ASSERT(XFS_DQ_IS_LOCKED(ip->i_udquot));
- }
- if (ip->i_gdquot) {
- if (flags & XFS_QMOPT_DQLOCK)
- ASSERT(XFS_DQ_IS_LOCKED(ip->i_gdquot));
- }
- if (XFS_IS_UQUOTA_ON(mp))
- ASSERT(ip->i_udquot);
- if (XFS_IS_OQUOTA_ON(mp))
- ASSERT(ip->i_gdquot);
- }
-#endif
-
- if (! (flags & XFS_QMOPT_ILOCKED))
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-#ifdef QUOTADEBUG
- else
- ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
-#endif
- return error;
-}
-
-/*
- * Release dquots (and their references) if any.
- * The inode should be locked EXCL except when this's called by
- * xfs_ireclaim.
- */
-void
-xfs_qm_dqdetach(
- xfs_inode_t *ip)
-{
- if (!(ip->i_udquot || ip->i_gdquot))
- return;
-
- ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
- ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
- if (ip->i_udquot) {
- xfs_dqtrace_entry_ino(ip->i_udquot, "DQDETTACH", ip);
- xfs_qm_dqrele(ip->i_udquot);
- ip->i_udquot = NULL;
- }
- if (ip->i_gdquot) {
- xfs_dqtrace_entry_ino(ip->i_gdquot, "DQDETTACH", ip);
- xfs_qm_dqrele(ip->i_gdquot);
- ip->i_gdquot = NULL;
- }
-}
-
-/*
- * This is called by VFS_SYNC and flags arg determines the caller,
- * and its motives, as done in xfs_sync.
- *
- * vfs_sync: SYNC_FSDATA|SYNC_ATTR|SYNC_BDFLUSH 0x31
- * syscall sync: SYNC_FSDATA|SYNC_ATTR|SYNC_DELWRI 0x25
- * umountroot : SYNC_WAIT | SYNC_CLOSE | SYNC_ATTR | SYNC_FSDATA
- */
-
-int
-xfs_qm_sync(
- xfs_mount_t *mp,
- short flags)
-{
- int recl, restarts;
- xfs_dquot_t *dqp;
- uint flush_flags;
- boolean_t nowait;
- int error;
-
- restarts = 0;
- /*
- * We won't block unless we are asked to.
- */
- nowait = (boolean_t)(flags & SYNC_BDFLUSH || (flags & SYNC_WAIT) == 0);
-
- again:
- xfs_qm_mplist_lock(mp);
- /*
- * dqpurge_all() also takes the mplist lock and iterate thru all dquots
- * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
- * when we have the mplist lock, we know that dquots will be consistent
- * as long as we have it locked.
- */
- if (! XFS_IS_QUOTA_ON(mp)) {
- xfs_qm_mplist_unlock(mp);
- return 0;
- }
- FOREACH_DQUOT_IN_MP(dqp, mp) {
- /*
- * If this is vfs_sync calling, then skip the dquots that
- * don't 'seem' to be dirty. ie. don't acquire dqlock.
- * This is very similar to what xfs_sync does with inodes.
- */
- if (flags & SYNC_BDFLUSH) {
- if (! XFS_DQ_IS_DIRTY(dqp))
- continue;
- }
-
- if (nowait) {
- /*
- * Try to acquire the dquot lock. We are NOT out of
- * lock order, but we just don't want to wait for this
- * lock, unless somebody wanted us to.
- */
- if (! xfs_qm_dqlock_nowait(dqp))
- continue;
- } else {
- xfs_dqlock(dqp);
- }
-
- /*
- * Now, find out for sure if this dquot is dirty or not.
- */
- if (! XFS_DQ_IS_DIRTY(dqp)) {
- xfs_dqunlock(dqp);
- continue;
- }
-
- /* XXX a sentinel would be better */
- recl = XFS_QI_MPLRECLAIMS(mp);
- if (! xfs_qm_dqflock_nowait(dqp)) {
- if (nowait) {
- xfs_dqunlock(dqp);
- continue;
- }
- /*
- * If we can't grab the flush lock then if the caller
- * really wanted us to give this our best shot,
- * see if we can give a push to the buffer before we wait
- * on the flush lock. At this point, we know that
- * eventhough the dquot is being flushed,
- * it has (new) dirty data.
- */
- xfs_qm_dqflock_pushbuf_wait(dqp);
- }
- /*
- * Let go of the mplist lock. We don't want to hold it
- * across a disk write
- */
- flush_flags = (nowait) ? XFS_QMOPT_DELWRI : XFS_QMOPT_SYNC;
- xfs_qm_mplist_unlock(mp);
- xfs_dqtrace_entry(dqp, "XQM_SYNC: DQFLUSH");
- error = xfs_qm_dqflush(dqp, flush_flags);
- xfs_dqunlock(dqp);
- if (error && XFS_FORCED_SHUTDOWN(mp))
- return 0; /* Need to prevent umount failure */
- else if (error)
- return error;
-
- xfs_qm_mplist_lock(mp);
- if (recl != XFS_QI_MPLRECLAIMS(mp)) {
- if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
- break;
-
- xfs_qm_mplist_unlock(mp);
- goto again;
- }
- }
-
- xfs_qm_mplist_unlock(mp);
- return 0;
-}
-
-
-/*
- * This initializes all the quota information that's kept in the
- * mount structure
- */
-STATIC int
-xfs_qm_init_quotainfo(
- xfs_mount_t *mp)
-{
- xfs_quotainfo_t *qinf;
- int error;
- xfs_dquot_t *dqp;
-
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
- /*
- * Tell XQM that we exist as soon as possible.
- */
- if ((error = xfs_qm_hold_quotafs_ref(mp))) {
- return error;
- }
-
- qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
-
- /*
- * See if quotainodes are setup, and if not, allocate them,
- * and change the superblock accordingly.
- */
- if ((error = xfs_qm_init_quotainos(mp))) {
- kmem_free(qinf, sizeof(xfs_quotainfo_t));
- mp->m_quotainfo = NULL;
- return error;
- }
-
- spinlock_init(&qinf->qi_pinlock, "xfs_qinf_pin");
- xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
- qinf->qi_dqreclaims = 0;
-
- /* mutex used to serialize quotaoffs */
- mutex_init(&qinf->qi_quotaofflock);
-
- /* Precalc some constants */
- qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
- ASSERT(qinf->qi_dqchunklen);
- qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen);
- do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t));
-
- mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
-
- /*
- * We try to get the limits from the superuser's limits fields.
- * This is quite hacky, but it is standard quota practice.
- * We look at the USR dquot with id == 0 first, but if user quotas
- * are not enabled we goto the GRP dquot with id == 0.
- * We don't really care to keep separate default limits for user
- * and group quotas, at least not at this point.
- */
- error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0,
- XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER :
- (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP :
- XFS_DQ_PROJ),
- XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN,
- &dqp);
- if (! error) {
- xfs_disk_dquot_t *ddqp = &dqp->q_core;
-
- /*
- * The warnings and timers set the grace period given to
- * a user or group before he or she can not perform any
- * more writing. If it is zero, a default is used.
- */
- qinf->qi_btimelimit = ddqp->d_btimer ?
- be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT;
- qinf->qi_itimelimit = ddqp->d_itimer ?
- be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT;
- qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ?
- be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT;
- qinf->qi_bwarnlimit = ddqp->d_bwarns ?
- be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT;
- qinf->qi_iwarnlimit = ddqp->d_iwarns ?
- be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT;
- qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ?
- be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT;
- qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
- qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
- qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
- qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
- qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
- qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
-
- /*
- * We sent the XFS_QMOPT_DQSUSER flag to dqget because
- * we don't want this dquot cached. We haven't done a
- * quotacheck yet, and quotacheck doesn't like incore dquots.
- */
- xfs_qm_dqdestroy(dqp);
- } else {
- qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
- qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
- qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
- qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
- qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
- qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
- }
-
- return 0;
-}
-
-
-/*
- * Gets called when unmounting a filesystem or when all quotas get
- * turned off.
- * This purges the quota inodes, destroys locks and frees itself.
- */
-void
-xfs_qm_destroy_quotainfo(
- xfs_mount_t *mp)
-{
- xfs_quotainfo_t *qi;
-
- qi = mp->m_quotainfo;
- ASSERT(qi != NULL);
- ASSERT(xfs_Gqm != NULL);
-
- /*
- * Release the reference that XQM kept, so that we know
- * when the XQM structure should be freed. We cannot assume
- * that xfs_Gqm is non-null after this point.
- */
- xfs_qm_rele_quotafs_ref(mp);
-
- spinlock_destroy(&qi->qi_pinlock);
- xfs_qm_list_destroy(&qi->qi_dqlist);
-
- if (qi->qi_uquotaip) {
- XFS_PURGE_INODE(qi->qi_uquotaip);
- qi->qi_uquotaip = NULL; /* paranoia */
- }
- if (qi->qi_gquotaip) {
- XFS_PURGE_INODE(qi->qi_gquotaip);
- qi->qi_gquotaip = NULL;
- }
- mutex_destroy(&qi->qi_quotaofflock);
- kmem_free(qi, sizeof(xfs_quotainfo_t));
- mp->m_quotainfo = NULL;
-}
-
-
-
-/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */
-
-/* ARGSUSED */
-STATIC void
-xfs_qm_list_init(
- xfs_dqlist_t *list,
- char *str,
- int n)
-{
- mutex_init(&list->qh_lock);
- list->qh_next = NULL;
- list->qh_version = 0;
- list->qh_nelems = 0;
-}
-
-STATIC void
-xfs_qm_list_destroy(
- xfs_dqlist_t *list)
-{
- mutex_destroy(&(list->qh_lock));
-}
-
-
-/*
- * Stripped down version of dqattach. This doesn't attach, or even look at the
- * dquots attached to the inode. The rationale is that there won't be any
- * attached at the time this is called from quotacheck.
- */
-STATIC int
-xfs_qm_dqget_noattach(
- xfs_inode_t *ip,
- xfs_dquot_t **O_udqpp,
- xfs_dquot_t **O_gdqpp)
-{
- int error;
- xfs_mount_t *mp;
- xfs_dquot_t *udqp, *gdqp;
-
- ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
- mp = ip->i_mount;
- udqp = NULL;
- gdqp = NULL;
-
- if (XFS_IS_UQUOTA_ON(mp)) {
- ASSERT(ip->i_udquot == NULL);
- /*
- * We want the dquot allocated if it doesn't exist.
- */
- if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER,
- XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN,
- &udqp))) {
- /*
- * Shouldn't be able to turn off quotas here.
- */
- ASSERT(error != ESRCH);
- ASSERT(error != ENOENT);
- return error;
- }
- ASSERT(udqp);
- }
-
- if (XFS_IS_OQUOTA_ON(mp)) {
- ASSERT(ip->i_gdquot == NULL);
- if (udqp)
- xfs_dqunlock(udqp);
- error = XFS_IS_GQUOTA_ON(mp) ?
- xfs_qm_dqget(mp, ip,
- ip->i_d.di_gid, XFS_DQ_GROUP,
- XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
- &gdqp) :
- xfs_qm_dqget(mp, ip,
- ip->i_d.di_projid, XFS_DQ_PROJ,
- XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
- &gdqp);
- if (error) {
- if (udqp)
- xfs_qm_dqrele(udqp);
- ASSERT(error != ESRCH);
- ASSERT(error != ENOENT);
- return error;
- }
- ASSERT(gdqp);
-
- /* Reacquire the locks in the right order */
- if (udqp) {
- if (! xfs_qm_dqlock_nowait(udqp)) {
- xfs_dqunlock(gdqp);
- xfs_dqlock(udqp);
- xfs_dqlock(gdqp);
- }
- }
- }
-
- *O_udqpp = udqp;
- *O_gdqpp = gdqp;
-
-#ifdef QUOTADEBUG
- if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp));
- if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp));
-#endif
- return 0;
-}
-
-/*
- * Create an inode and return with a reference already taken, but unlocked
- * This is how we create quota inodes
- */
-STATIC int
-xfs_qm_qino_alloc(
- xfs_mount_t *mp,
- xfs_inode_t **ip,
- __int64_t sbfields,
- uint flags)
-{
- xfs_trans_t *tp;
- int error;
- unsigned long s;
- cred_t zerocr;
- xfs_inode_t zeroino;
- int committed;
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
- if ((error = xfs_trans_reserve(tp,
- XFS_QM_QINOCREATE_SPACE_RES(mp),
- XFS_CREATE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_CREATE_LOG_COUNT))) {
- xfs_trans_cancel(tp, 0);
- return error;
- }
- memset(&zerocr, 0, sizeof(zerocr));
- memset(&zeroino, 0, sizeof(zeroino));
-
- if ((error = xfs_dir_ialloc(&tp, &zeroino, S_IFREG, 1, 0,
- &zerocr, 0, 1, ip, &committed))) {
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
- XFS_TRANS_ABORT);
- return error;
- }
-
- /*
- * Keep an extra reference to this quota inode. This inode is
- * locked exclusively and joined to the transaction already.
- */
- ASSERT(XFS_ISLOCKED_INODE_EXCL(*ip));
- VN_HOLD(XFS_ITOV((*ip)));
-
- /*
- * Make the changes in the superblock, and log those too.
- * sbfields arg may contain fields other than *QUOTINO;
- * VERSIONNUM for example.
- */
- s = XFS_SB_LOCK(mp);
- if (flags & XFS_QMOPT_SBVERSION) {
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
- unsigned oldv = mp->m_sb.sb_versionnum;
-#endif
- ASSERT(!XFS_SB_VERSION_HASQUOTA(&mp->m_sb));
- ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
- XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
- (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
- XFS_SB_GQUOTINO | XFS_SB_QFLAGS));
-
- XFS_SB_VERSION_ADDQUOTA(&mp->m_sb);
- mp->m_sb.sb_uquotino = NULLFSINO;
- mp->m_sb.sb_gquotino = NULLFSINO;
-
- /* qflags will get updated _after_ quotacheck */
- mp->m_sb.sb_qflags = 0;
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
- cmn_err(CE_NOTE,
- "Old superblock version %x, converting to %x.",
- oldv, mp->m_sb.sb_versionnum);
-#endif
- }
- if (flags & XFS_QMOPT_UQUOTA)
- mp->m_sb.sb_uquotino = (*ip)->i_ino;
- else
- mp->m_sb.sb_gquotino = (*ip)->i_ino;
- XFS_SB_UNLOCK(mp, s);
- xfs_mod_sb(tp, sbfields);
-
- if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
- NULL))) {
- xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!");
- return error;
- }
- return 0;
-}
-
-
-STATIC int
-xfs_qm_reset_dqcounts(
- xfs_mount_t *mp,
- xfs_buf_t *bp,
- xfs_dqid_t id,
- uint type)
-{
- xfs_disk_dquot_t *ddq;
- int j;
-
- xfs_buftrace("RESET DQUOTS", bp);
- /*
- * Reset all counters and timers. They'll be
- * started afresh by xfs_qm_quotacheck.
- */
-#ifdef DEBUG
- j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
- do_div(j, sizeof(xfs_dqblk_t));
- ASSERT(XFS_QM_DQPERBLK(mp) == j);
-#endif
- ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
- for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) {
- /*
- * Do a sanity check, and if needed, repair the dqblk. Don't
- * output any warnings because it's perfectly possible to
- * find unitialized dquot blks. See comment in xfs_qm_dqcheck.
- */
- (void) xfs_qm_dqcheck(ddq, id+j, type, XFS_QMOPT_DQREPAIR,
- "xfs_quotacheck");
- ddq->d_bcount = 0;
- ddq->d_icount = 0;
- ddq->d_rtbcount = 0;
- ddq->d_btimer = 0;
- ddq->d_itimer = 0;
- ddq->d_rtbtimer = 0;
- ddq->d_bwarns = 0;
- ddq->d_iwarns = 0;
- ddq->d_rtbwarns = 0;
- ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
- }
-
- return 0;
-}
-
-STATIC int
-xfs_qm_dqiter_bufs(
- xfs_mount_t *mp,
- xfs_dqid_t firstid,
- xfs_fsblock_t bno,
- xfs_filblks_t blkcnt,
- uint flags)
-{
- xfs_buf_t *bp;
- int error;
- int notcommitted;
- int incr;
- int type;
-
- ASSERT(blkcnt > 0);
- notcommitted = 0;
- incr = (blkcnt > XFS_QM_MAX_DQCLUSTER_LOGSZ) ?
- XFS_QM_MAX_DQCLUSTER_LOGSZ : blkcnt;
- type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
- (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP);
- error = 0;
-
- /*
- * Blkcnt arg can be a very big number, and might even be
- * larger than the log itself. So, we have to break it up into
- * manageable-sized transactions.
- * Note that we don't start a permanent transaction here; we might
- * not be able to get a log reservation for the whole thing up front,
- * and we don't really care to either, because we just discard
- * everything if we were to crash in the middle of this loop.
- */
- while (blkcnt--) {
- error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(mp, bno),
- (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp);
- if (error)
- break;
-
- (void) xfs_qm_reset_dqcounts(mp, bp, firstid, type);
- xfs_bdwrite(mp, bp);
- /*
- * goto the next block.
- */
- bno++;
- firstid += XFS_QM_DQPERBLK(mp);
- }
- return error;
-}
-
-/*
- * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a
- * caller supplied function for every chunk of dquots that we find.
- */
-STATIC int
-xfs_qm_dqiterate(
- xfs_mount_t *mp,
- xfs_inode_t *qip,
- uint flags)
-{
- xfs_bmbt_irec_t *map;
- int i, nmaps; /* number of map entries */
- int error; /* return value */
- xfs_fileoff_t lblkno;
- xfs_filblks_t maxlblkcnt;
- xfs_dqid_t firstid;
- xfs_fsblock_t rablkno;
- xfs_filblks_t rablkcnt;
-
- error = 0;
- /*
- * This looks racey, but we can't keep an inode lock across a
- * trans_reserve. But, this gets called during quotacheck, and that
- * happens only at mount time which is single threaded.
- */
- if (qip->i_d.di_nblocks == 0)
- return 0;
-
- map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP);
-
- lblkno = 0;
- maxlblkcnt = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
- do {
- nmaps = XFS_DQITER_MAP_SIZE;
- /*
- * We aren't changing the inode itself. Just changing
- * some of its data. No new blocks are added here, and
- * the inode is never added to the transaction.
- */
- xfs_ilock(qip, XFS_ILOCK_SHARED);
- error = xfs_bmapi(NULL, qip, lblkno,
- maxlblkcnt - lblkno,
- XFS_BMAPI_METADATA,
- NULL,
- 0, map, &nmaps, NULL);
- xfs_iunlock(qip, XFS_ILOCK_SHARED);
- if (error)
- break;
-
- ASSERT(nmaps <= XFS_DQITER_MAP_SIZE);
- for (i = 0; i < nmaps; i++) {
- ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
- ASSERT(map[i].br_blockcount);
-
-
- lblkno += map[i].br_blockcount;
-
- if (map[i].br_startblock == HOLESTARTBLOCK)
- continue;
-
- firstid = (xfs_dqid_t) map[i].br_startoff *
- XFS_QM_DQPERBLK(mp);
- /*
- * Do a read-ahead on the next extent.
- */
- if ((i+1 < nmaps) &&
- (map[i+1].br_startblock != HOLESTARTBLOCK)) {
- rablkcnt = map[i+1].br_blockcount;
- rablkno = map[i+1].br_startblock;
- while (rablkcnt--) {
- xfs_baread(mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(mp, rablkno),
- (int)XFS_QI_DQCHUNKLEN(mp));
- rablkno++;
- }
- }
- /*
- * Iterate thru all the blks in the extent and
- * reset the counters of all the dquots inside them.
- */
- if ((error = xfs_qm_dqiter_bufs(mp,
- firstid,
- map[i].br_startblock,
- map[i].br_blockcount,
- flags))) {
- break;
- }
- }
-
- if (error)
- break;
- } while (nmaps > 0);
-
- kmem_free(map, XFS_DQITER_MAP_SIZE * sizeof(*map));
-
- return error;
-}
-
-/*
- * Called by dqusage_adjust in doing a quotacheck.
- * Given the inode, and a dquot (either USR or GRP, doesn't matter),
- * this updates its incore copy as well as the buffer copy. This is
- * so that once the quotacheck is done, we can just log all the buffers,
- * as opposed to logging numerous updates to individual dquots.
- */
-STATIC void
-xfs_qm_quotacheck_dqadjust(
- xfs_dquot_t *dqp,
- xfs_qcnt_t nblks,
- xfs_qcnt_t rtblks)
-{
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
- xfs_dqtrace_entry(dqp, "QCHECK DQADJUST");
- /*
- * Adjust the inode count and the block count to reflect this inode's
- * resource usage.
- */
- be64_add(&dqp->q_core.d_icount, 1);
- dqp->q_res_icount++;
- if (nblks) {
- be64_add(&dqp->q_core.d_bcount, nblks);
- dqp->q_res_bcount += nblks;
- }
- if (rtblks) {
- be64_add(&dqp->q_core.d_rtbcount, rtblks);
- dqp->q_res_rtbcount += rtblks;
- }
-
- /*
- * Set default limits, adjust timers (since we changed usages)
- */
- if (! XFS_IS_SUSER_DQUOT(dqp)) {
- xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core);
- xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
- }
-
- dqp->dq_flags |= XFS_DQ_DIRTY;
-}
-
-STATIC int
-xfs_qm_get_rtblks(
- xfs_inode_t *ip,
- xfs_qcnt_t *O_rtblks)
-{
- xfs_filblks_t rtblks; /* total rt blks */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extnum_t nextents; /* number of extent entries */
- xfs_bmbt_rec_t *base; /* base of extent array */
- xfs_bmbt_rec_t *ep; /* pointer to an extent entry */
- int error;
-
- ASSERT(XFS_IS_REALTIME_INODE(ip));
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK)))
- return error;
- }
- rtblks = 0;
- nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
- base = &ifp->if_u1.if_extents[0];
- for (ep = base; ep < &base[nextents]; ep++)
- rtblks += xfs_bmbt_get_blockcount(ep);
- *O_rtblks = (xfs_qcnt_t)rtblks;
- return 0;
-}
-
-/*
- * callback routine supplied to bulkstat(). Given an inumber, find its
- * dquots and update them to account for resources taken by that inode.
- */
-/* ARGSUSED */
-STATIC int
-xfs_qm_dqusage_adjust(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t ino, /* inode number to get data for */
- void __user *buffer, /* not used */
- int ubsize, /* not used */
- void *private_data, /* not used */
- xfs_daddr_t bno, /* starting block of inode cluster */
- int *ubused, /* not used */
- void *dip, /* on-disk inode pointer (not used) */
- int *res) /* result code value */
-{
- xfs_inode_t *ip;
- xfs_dquot_t *udqp, *gdqp;
- xfs_qcnt_t nblks, rtblks;
- int error;
-
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
- /*
- * rootino must have its resources accounted for, not so with the quota
- * inodes.
- */
- if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
- *res = BULKSTAT_RV_NOTHING;
- return XFS_ERROR(EINVAL);
- }
-
- /*
- * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget
- * interface expects the inode to be exclusively locked because that's
- * the case in all other instances. It's OK that we do this because
- * quotacheck is done only at mount time.
- */
- if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip, bno))) {
- *res = BULKSTAT_RV_NOTHING;
- return error;
- }
-
- if (ip->i_d.di_mode == 0) {
- xfs_iput_new(ip, XFS_ILOCK_EXCL);
- *res = BULKSTAT_RV_NOTHING;
- return XFS_ERROR(ENOENT);
- }
-
- /*
- * Obtain the locked dquots. In case of an error (eg. allocation
- * fails for ENOSPC), we return the negative of the error number
- * to bulkstat, so that it can get propagated to quotacheck() and
- * making us disable quotas for the file system.
- */
- if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
- xfs_iput(ip, XFS_ILOCK_EXCL);
- *res = BULKSTAT_RV_GIVEUP;
- return error;
- }
-
- rtblks = 0;
- if (! XFS_IS_REALTIME_INODE(ip)) {
- nblks = (xfs_qcnt_t)ip->i_d.di_nblocks;
- } else {
- /*
- * Walk thru the extent list and count the realtime blocks.
- */
- if ((error = xfs_qm_get_rtblks(ip, &rtblks))) {
- xfs_iput(ip, XFS_ILOCK_EXCL);
- if (udqp)
- xfs_qm_dqput(udqp);
- if (gdqp)
- xfs_qm_dqput(gdqp);
- *res = BULKSTAT_RV_GIVEUP;
- return error;
- }
- nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
- }
- ASSERT(ip->i_delayed_blks == 0);
-
- /*
- * We can't release the inode while holding its dquot locks.
- * The inode can go into inactive and might try to acquire the dquotlocks.
- * So, just unlock here and do a vn_rele at the end.
- */
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
- /*
- * Add the (disk blocks and inode) resources occupied by this
- * inode to its dquots. We do this adjustment in the incore dquot,
- * and also copy the changes to its buffer.
- * We don't care about putting these changes in a transaction
- * envelope because if we crash in the middle of a 'quotacheck'
- * we have to start from the beginning anyway.
- * Once we're done, we'll log all the dquot bufs.
- *
- * The *QUOTA_ON checks below may look pretty racey, but quotachecks
- * and quotaoffs don't race. (Quotachecks happen at mount time only).
- */
- if (XFS_IS_UQUOTA_ON(mp)) {
- ASSERT(udqp);
- xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks);
- xfs_qm_dqput(udqp);
- }
- if (XFS_IS_OQUOTA_ON(mp)) {
- ASSERT(gdqp);
- xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks);
- xfs_qm_dqput(gdqp);
- }
- /*
- * Now release the inode. This will send it to 'inactive', and
- * possibly even free blocks.
- */
- VN_RELE(XFS_ITOV(ip));
-
- /*
- * Goto next inode.
- */
- *res = BULKSTAT_RV_DIDONE;
- return 0;
-}
-
-/*
- * Walk thru all the filesystem inodes and construct a consistent view
- * of the disk quota world. If the quotacheck fails, disable quotas.
- */
-int
-xfs_qm_quotacheck(
- xfs_mount_t *mp)
-{
- int done, count, error;
- xfs_ino_t lastino;
- size_t structsz;
- xfs_inode_t *uip, *gip;
- uint flags;
-
- count = INT_MAX;
- structsz = 1;
- lastino = 0;
- flags = 0;
-
- ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp));
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
- /*
- * There should be no cached dquots. The (simplistic) quotacheck
- * algorithm doesn't like that.
- */
- ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0);
-
- cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname);
-
- /*
- * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset
- * their counters to zero. We need a clean slate.
- * We don't log our changes till later.
- */
- if ((uip = XFS_QI_UQIP(mp))) {
- if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA)))
- goto error_return;
- flags |= XFS_UQUOTA_CHKD;
- }
-
- if ((gip = XFS_QI_GQIP(mp))) {
- if ((error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
- XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA)))
- goto error_return;
- flags |= XFS_OQUOTA_CHKD;
- }
-
- do {
- /*
- * Iterate thru all the inodes in the file system,
- * adjusting the corresponding dquot counters in core.
- */
- if ((error = xfs_bulkstat(mp, &lastino, &count,
- xfs_qm_dqusage_adjust, NULL,
- structsz, NULL,
- BULKSTAT_FG_IGET|BULKSTAT_FG_VFSLOCKED,
- &done)))
- break;
-
- } while (! done);
-
- /*
- * We can get this error if we couldn't do a dquot allocation inside
- * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
- * dirty dquots that might be cached, we just want to get rid of them
- * and turn quotaoff. The dquots won't be attached to any of the inodes
- * at this point (because we intentionally didn't in dqget_noattach).
- */
- if (error) {
- xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF);
- goto error_return;
- }
- /*
- * We've made all the changes that we need to make incore.
- * Now flush_them down to disk buffers.
- */
- xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
-
- /*
- * We didn't log anything, because if we crashed, we'll have to
- * start the quotacheck from scratch anyway. However, we must make
- * sure that our dquot changes are secure before we put the
- * quotacheck'd stamp on the superblock. So, here we do a synchronous
- * flush.
- */
- XFS_bflush(mp->m_ddev_targp);
-
- /*
- * If one type of quotas is off, then it will lose its
- * quotachecked status, since we won't be doing accounting for
- * that type anymore.
- */
- mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD);
- mp->m_qflags |= flags;
-
- XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++");
-
- error_return:
- if (error) {
- cmn_err(CE_WARN, "XFS quotacheck %s: Unsuccessful (Error %d): "
- "Disabling quotas.",
- mp->m_fsname, error);
- /*
- * We must turn off quotas.
- */
- ASSERT(mp->m_quotainfo != NULL);
- ASSERT(xfs_Gqm != NULL);
- xfs_qm_destroy_quotainfo(mp);
- (void)xfs_mount_reset_sbqflags(mp);
- } else {
- cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname);
- }
- return (error);
-}
-
-/*
- * This is called after the superblock has been read in and we're ready to
- * iget the quota inodes.
- */
-STATIC int
-xfs_qm_init_quotainos(
- xfs_mount_t *mp)
-{
- xfs_inode_t *uip, *gip;
- int error;
- __int64_t sbflags;
- uint flags;
-
- ASSERT(mp->m_quotainfo);
- uip = gip = NULL;
- sbflags = 0;
- flags = 0;
-
- /*
- * Get the uquota and gquota inodes
- */
- if (XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) {
- if (XFS_IS_UQUOTA_ON(mp) &&
- mp->m_sb.sb_uquotino != NULLFSINO) {
- ASSERT(mp->m_sb.sb_uquotino > 0);
- if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
- 0, 0, &uip, 0)))
- return XFS_ERROR(error);
- }
- if (XFS_IS_OQUOTA_ON(mp) &&
- mp->m_sb.sb_gquotino != NULLFSINO) {
- ASSERT(mp->m_sb.sb_gquotino > 0);
- if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
- 0, 0, &gip, 0))) {
- if (uip)
- VN_RELE(XFS_ITOV(uip));
- return XFS_ERROR(error);
- }
- }
- } else {
- flags |= XFS_QMOPT_SBVERSION;
- sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
- XFS_SB_GQUOTINO | XFS_SB_QFLAGS);
- }
-
- /*
- * Create the two inodes, if they don't exist already. The changes
- * made above will get added to a transaction and logged in one of
- * the qino_alloc calls below. If the device is readonly,
- * temporarily switch to read-write to do this.
- */
- if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
- if ((error = xfs_qm_qino_alloc(mp, &uip,
- sbflags | XFS_SB_UQUOTINO,
- flags | XFS_QMOPT_UQUOTA)))
- return XFS_ERROR(error);
-
- flags &= ~XFS_QMOPT_SBVERSION;
- }
- if (XFS_IS_OQUOTA_ON(mp) && gip == NULL) {
- flags |= (XFS_IS_GQUOTA_ON(mp) ?
- XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
- error = xfs_qm_qino_alloc(mp, &gip,
- sbflags | XFS_SB_GQUOTINO, flags);
- if (error) {
- if (uip)
- VN_RELE(XFS_ITOV(uip));
-
- return XFS_ERROR(error);
- }
- }
-
- XFS_QI_UQIP(mp) = uip;
- XFS_QI_GQIP(mp) = gip;
-
- return 0;
-}
-
-
-/*
- * Traverse the freelist of dquots and attempt to reclaim a maximum of
- * 'howmany' dquots. This operation races with dqlookup(), and attempts to
- * favor the lookup function ...
- * XXXsup merge this with qm_reclaim_one().
- */
-STATIC int
-xfs_qm_shake_freelist(
- int howmany)
-{
- int nreclaimed;
- xfs_dqhash_t *hash;
- xfs_dquot_t *dqp, *nextdqp;
- int restarts;
- int nflushes;
-
- if (howmany <= 0)
- return 0;
-
- nreclaimed = 0;
- restarts = 0;
- nflushes = 0;
-
-#ifdef QUOTADEBUG
- cmn_err(CE_DEBUG, "Shake free 0x%x", howmany);
-#endif
- /* lock order is : hashchainlock, freelistlock, mplistlock */
- tryagain:
- xfs_qm_freelist_lock(xfs_Gqm);
-
- for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
- ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) &&
- nreclaimed < howmany); ) {
- xfs_dqlock(dqp);
-
- /*
- * We are racing with dqlookup here. Naturally we don't
- * want to reclaim a dquot that lookup wants.
- */
- if (dqp->dq_flags & XFS_DQ_WANT) {
- xfs_dqunlock(dqp);
- xfs_qm_freelist_unlock(xfs_Gqm);
- if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
- return nreclaimed;
- XQM_STATS_INC(xqmstats.xs_qm_dqwants);
- goto tryagain;
- }
-
- /*
- * If the dquot is inactive, we are assured that it is
- * not on the mplist or the hashlist, and that makes our
- * life easier.
- */
- if (dqp->dq_flags & XFS_DQ_INACTIVE) {
- ASSERT(dqp->q_mount == NULL);
- ASSERT(! XFS_DQ_IS_DIRTY(dqp));
- ASSERT(dqp->HL_PREVP == NULL);
- ASSERT(dqp->MPL_PREVP == NULL);
- XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
- nextdqp = dqp->dq_flnext;
- goto off_freelist;
- }
-
- ASSERT(dqp->MPL_PREVP);
- /*
- * Try to grab the flush lock. If this dquot is in the process of
- * getting flushed to disk, we don't want to reclaim it.
- */
- if (! xfs_qm_dqflock_nowait(dqp)) {
- xfs_dqunlock(dqp);
- dqp = dqp->dq_flnext;
- continue;
- }
-
- /*
- * We have the flush lock so we know that this is not in the
- * process of being flushed. So, if this is dirty, flush it
- * DELWRI so that we don't get a freelist infested with
- * dirty dquots.
- */
- if (XFS_DQ_IS_DIRTY(dqp)) {
- xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY");
- /*
- * We flush it delayed write, so don't bother
- * releasing the mplock.
- */
- (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
- xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
- dqp = dqp->dq_flnext;
- continue;
- }
- /*
- * We're trying to get the hashlock out of order. This races
- * with dqlookup; so, we giveup and goto the next dquot if
- * we couldn't get the hashlock. This way, we won't starve
- * a dqlookup process that holds the hashlock that is
- * waiting for the freelist lock.
- */
- if (! xfs_qm_dqhashlock_nowait(dqp)) {
- xfs_dqfunlock(dqp);
- xfs_dqunlock(dqp);
- dqp = dqp->dq_flnext;
- continue;
- }
- /*
- * This races with dquot allocation code as well as dqflush_all
- * and reclaim code. So, if we failed to grab the mplist lock,
- * giveup everything and start over.
- */
- hash = dqp->q_hash;
- ASSERT(hash);
- if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
- /* XXX put a sentinel so that we can come back here */
- xfs_dqfunlock(dqp);
- xfs_dqunlock(dqp);
- XFS_DQ_HASH_UNLOCK(hash);
- xfs_qm_freelist_unlock(xfs_Gqm);
- if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
- return nreclaimed;
- goto tryagain;
- }
- xfs_dqtrace_entry(dqp, "DQSHAKE: UNLINKING");
-#ifdef QUOTADEBUG
- cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n",
- dqp, be32_to_cpu(dqp->q_core.d_id));
-#endif
- ASSERT(dqp->q_nrefs == 0);
- nextdqp = dqp->dq_flnext;
- XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
- XQM_HASHLIST_REMOVE(hash, dqp);
- xfs_dqfunlock(dqp);
- xfs_qm_mplist_unlock(dqp->q_mount);
- XFS_DQ_HASH_UNLOCK(hash);
-
- off_freelist:
- XQM_FREELIST_REMOVE(dqp);
- xfs_dqunlock(dqp);
- nreclaimed++;
- XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims);
- xfs_qm_dqdestroy(dqp);
- dqp = nextdqp;
- }
- xfs_qm_freelist_unlock(xfs_Gqm);
- return nreclaimed;
-}
-
-
-/*
- * The kmem_shake interface is invoked when memory is running low.
- */
-/* ARGSUSED */
-STATIC int
-xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
-{
- int ndqused, nfree, n;
-
- if (!kmem_shake_allow(gfp_mask))
- return 0;
- if (!xfs_Gqm)
- return 0;
-
- nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */
- /* incore dquots in all f/s's */
- ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
-
- ASSERT(ndqused >= 0);
-
- if (nfree <= ndqused && nfree < ndquot)
- return 0;
-
- ndqused *= xfs_Gqm->qm_dqfree_ratio; /* target # of free dquots */
- n = nfree - ndqused - ndquot; /* # over target */
-
- return xfs_qm_shake_freelist(MAX(nfree, n));
-}
-
-
-/*
- * Just pop the least recently used dquot off the freelist and
- * recycle it. The returned dquot is locked.
- */
-STATIC xfs_dquot_t *
-xfs_qm_dqreclaim_one(void)
-{
- xfs_dquot_t *dqpout;
- xfs_dquot_t *dqp;
- int restarts;
- int nflushes;
-
- restarts = 0;
- dqpout = NULL;
- nflushes = 0;
-
- /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
- startagain:
- xfs_qm_freelist_lock(xfs_Gqm);
-
- FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) {
- xfs_dqlock(dqp);
-
- /*
- * We are racing with dqlookup here. Naturally we don't
- * want to reclaim a dquot that lookup wants. We release the
- * freelist lock and start over, so that lookup will grab
- * both the dquot and the freelistlock.
- */
- if (dqp->dq_flags & XFS_DQ_WANT) {
- ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
- xfs_dqtrace_entry(dqp, "DQRECLAIM: DQWANT");
- xfs_dqunlock(dqp);
- xfs_qm_freelist_unlock(xfs_Gqm);
- if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
- return NULL;
- XQM_STATS_INC(xqmstats.xs_qm_dqwants);
- goto startagain;
- }
-
- /*
- * If the dquot is inactive, we are assured that it is
- * not on the mplist or the hashlist, and that makes our
- * life easier.
- */
- if (dqp->dq_flags & XFS_DQ_INACTIVE) {
- ASSERT(dqp->q_mount == NULL);
- ASSERT(! XFS_DQ_IS_DIRTY(dqp));
- ASSERT(dqp->HL_PREVP == NULL);
- ASSERT(dqp->MPL_PREVP == NULL);
- XQM_FREELIST_REMOVE(dqp);
- xfs_dqunlock(dqp);
- dqpout = dqp;
- XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
- break;
- }
-
- ASSERT(dqp->q_hash);
- ASSERT(dqp->MPL_PREVP);
-
- /*
- * Try to grab the flush lock. If this dquot is in the process of
- * getting flushed to disk, we don't want to reclaim it.
- */
- if (! xfs_qm_dqflock_nowait(dqp)) {
- xfs_dqunlock(dqp);
- continue;
- }
-
- /*
- * We have the flush lock so we know that this is not in the
- * process of being flushed. So, if this is dirty, flush it
- * DELWRI so that we don't get a freelist infested with
- * dirty dquots.
- */
- if (XFS_DQ_IS_DIRTY(dqp)) {
- xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY");
- /*
- * We flush it delayed write, so don't bother
- * releasing the freelist lock.
- */
- (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
- xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
- continue;
- }
-
- if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
- xfs_dqfunlock(dqp);
- xfs_dqunlock(dqp);
- continue;
- }
-
- if (! xfs_qm_dqhashlock_nowait(dqp))
- goto mplistunlock;
-
- ASSERT(dqp->q_nrefs == 0);
- xfs_dqtrace_entry(dqp, "DQRECLAIM: UNLINKING");
- XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
- XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
- XQM_FREELIST_REMOVE(dqp);
- dqpout = dqp;
- XFS_DQ_HASH_UNLOCK(dqp->q_hash);
- mplistunlock:
- xfs_qm_mplist_unlock(dqp->q_mount);
- xfs_dqfunlock(dqp);
- xfs_dqunlock(dqp);
- if (dqpout)
- break;
- }
-
- xfs_qm_freelist_unlock(xfs_Gqm);
- return dqpout;
-}
-
-
-/*------------------------------------------------------------------*/
-
-/*
- * Return a new incore dquot. Depending on the number of
- * dquots in the system, we either allocate a new one on the kernel heap,
- * or reclaim a free one.
- * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed
- * to reclaim an existing one from the freelist.
- */
-boolean_t
-xfs_qm_dqalloc_incore(
- xfs_dquot_t **O_dqpp)
-{
- xfs_dquot_t *dqp;
-
- /*
- * Check against high water mark to see if we want to pop
- * a nincompoop dquot off the freelist.
- */
- if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) {
- /*
- * Try to recycle a dquot from the freelist.
- */
- if ((dqp = xfs_qm_dqreclaim_one())) {
- XQM_STATS_INC(xqmstats.xs_qm_dqreclaims);
- /*
- * Just zero the core here. The rest will get
- * reinitialized by caller. XXX we shouldn't even
- * do this zero ...
- */
- memset(&dqp->q_core, 0, sizeof(dqp->q_core));
- *O_dqpp = dqp;
- return B_FALSE;
- }
- XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses);
- }
-
- /*
- * Allocate a brand new dquot on the kernel heap and return it
- * to the caller to initialize.
- */
- ASSERT(xfs_Gqm->qm_dqzone != NULL);
- *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
- atomic_inc(&xfs_Gqm->qm_totaldquots);
-
- return B_TRUE;
-}
-
-
-/*
- * Start a transaction and write the incore superblock changes to
- * disk. flags parameter indicates which fields have changed.
- */
-int
-xfs_qm_write_sb_changes(
- xfs_mount_t *mp,
- __int64_t flags)
-{
- xfs_trans_t *tp;
- int error;
-
-#ifdef QUOTADEBUG
- cmn_err(CE_NOTE, "Writing superblock quota changes :%s", mp->m_fsname);
-#endif
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
- if ((error = xfs_trans_reserve(tp, 0,
- mp->m_sb.sb_sectsize + 128, 0,
- 0,
- XFS_DEFAULT_LOG_COUNT))) {
- xfs_trans_cancel(tp, 0);
- return error;
- }
-
- xfs_mod_sb(tp, flags);
- (void) xfs_trans_commit(tp, 0, NULL);
-
- return 0;
-}
-
-
-/* --------------- utility functions for vnodeops ---------------- */
-
-
-/*
- * Given an inode, a uid and gid (from cred_t) make sure that we have
- * allocated relevant dquot(s) on disk, and that we won't exceed inode
- * quotas by creating this file.
- * This also attaches dquot(s) to the given inode after locking it,
- * and returns the dquots corresponding to the uid and/or gid.
- *
- * in : inode (unlocked)
- * out : udquot, gdquot with references taken and unlocked
- */
-int
-xfs_qm_vop_dqalloc(
- xfs_mount_t *mp,
- xfs_inode_t *ip,
- uid_t uid,
- gid_t gid,
- prid_t prid,
- uint flags,
- xfs_dquot_t **O_udqpp,
- xfs_dquot_t **O_gdqpp)
-{
- int error;
- xfs_dquot_t *uq, *gq;
- uint lockflags;
-
- if (!XFS_IS_QUOTA_ON(mp))
- return 0;
-
- lockflags = XFS_ILOCK_EXCL;
- xfs_ilock(ip, lockflags);
-
- if ((flags & XFS_QMOPT_INHERIT) &&
- XFS_INHERIT_GID(ip, XFS_MTOVFS(mp)))
- gid = ip->i_d.di_gid;
-
- /*
- * Attach the dquot(s) to this inode, doing a dquot allocation
- * if necessary. The dquot(s) will not be locked.
- */
- if (XFS_NOT_DQATTACHED(mp, ip)) {
- if ((error = xfs_qm_dqattach(ip, XFS_QMOPT_DQALLOC |
- XFS_QMOPT_ILOCKED))) {
- xfs_iunlock(ip, lockflags);
- return error;
- }
- }
-
- uq = gq = NULL;
- if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
- if (ip->i_d.di_uid != uid) {
- /*
- * What we need is the dquot that has this uid, and
- * if we send the inode to dqget, the uid of the inode
- * takes priority over what's sent in the uid argument.
- * We must unlock inode here before calling dqget if
- * we're not sending the inode, because otherwise
- * we'll deadlock by doing trans_reserve while
- * holding ilock.
- */
- xfs_iunlock(ip, lockflags);
- if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
- XFS_DQ_USER,
- XFS_QMOPT_DQALLOC |
- XFS_QMOPT_DOWARN,
- &uq))) {
- ASSERT(error != ENOENT);
- return error;
- }
- /*
- * Get the ilock in the right order.
- */
- xfs_dqunlock(uq);
- lockflags = XFS_ILOCK_SHARED;
- xfs_ilock(ip, lockflags);
- } else {
- /*
- * Take an extra reference, because we'll return
- * this to caller
- */
- ASSERT(ip->i_udquot);
- uq = ip->i_udquot;
- xfs_dqlock(uq);
- XFS_DQHOLD(uq);
- xfs_dqunlock(uq);
- }
- }
- if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
- if (ip->i_d.di_gid != gid) {
- xfs_iunlock(ip, lockflags);
- if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
- XFS_DQ_GROUP,
- XFS_QMOPT_DQALLOC |
- XFS_QMOPT_DOWARN,
- &gq))) {
- if (uq)
- xfs_qm_dqrele(uq);
- ASSERT(error != ENOENT);
- return error;
- }
- xfs_dqunlock(gq);
- lockflags = XFS_ILOCK_SHARED;
- xfs_ilock(ip, lockflags);
- } else {
- ASSERT(ip->i_gdquot);
- gq = ip->i_gdquot;
- xfs_dqlock(gq);
- XFS_DQHOLD(gq);
- xfs_dqunlock(gq);
- }
- } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
- if (ip->i_d.di_projid != prid) {
- xfs_iunlock(ip, lockflags);
- if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
- XFS_DQ_PROJ,
- XFS_QMOPT_DQALLOC |
- XFS_QMOPT_DOWARN,
- &gq))) {
- if (uq)
- xfs_qm_dqrele(uq);
- ASSERT(error != ENOENT);
- return (error);
- }
- xfs_dqunlock(gq);
- lockflags = XFS_ILOCK_SHARED;
- xfs_ilock(ip, lockflags);
- } else {
- ASSERT(ip->i_gdquot);
- gq = ip->i_gdquot;
- xfs_dqlock(gq);
- XFS_DQHOLD(gq);
- xfs_dqunlock(gq);
- }
- }
- if (uq)
- xfs_dqtrace_entry_ino(uq, "DQALLOC", ip);
-
- xfs_iunlock(ip, lockflags);
- if (O_udqpp)
- *O_udqpp = uq;
- else if (uq)
- xfs_qm_dqrele(uq);
- if (O_gdqpp)
- *O_gdqpp = gq;
- else if (gq)
- xfs_qm_dqrele(gq);
- return 0;
-}
-
-/*
- * Actually transfer ownership, and do dquot modifications.
- * These were already reserved.
- */
-xfs_dquot_t *
-xfs_qm_vop_chown(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- xfs_dquot_t **IO_olddq,
- xfs_dquot_t *newdq)
-{
- xfs_dquot_t *prevdq;
- uint bfield = XFS_IS_REALTIME_INODE(ip) ?
- XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
-
- ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
- ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
-
- /* old dquot */
- prevdq = *IO_olddq;
- ASSERT(prevdq);
- ASSERT(prevdq != newdq);
-
- xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks));
- xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
-
- /* the sparkling new dquot */
- xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks);
- xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
-
- /*
- * Take an extra reference, because the inode
- * is going to keep this dquot pointer even
- * after the trans_commit.
- */
- xfs_dqlock(newdq);
- XFS_DQHOLD(newdq);
- xfs_dqunlock(newdq);
- *IO_olddq = newdq;
-
- return prevdq;
-}
-
-/*
- * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID).
- */
-int
-xfs_qm_vop_chown_reserve(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- xfs_dquot_t *udqp,
- xfs_dquot_t *gdqp,
- uint flags)
-{
- int error;
- xfs_mount_t *mp;
- uint delblks, blkflags;
- xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq;
-
- ASSERT(XFS_ISLOCKED_INODE(ip));
- mp = ip->i_mount;
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
- delblks = ip->i_delayed_blks;
- delblksudq = delblksgdq = unresudq = unresgdq = NULL;
- blkflags = XFS_IS_REALTIME_INODE(ip) ?
- XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
-
- if (XFS_IS_UQUOTA_ON(mp) && udqp &&
- ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
- delblksudq = udqp;
- /*
- * If there are delayed allocation blocks, then we have to
- * unreserve those from the old dquot, and add them to the
- * new dquot.
- */
- if (delblks) {
- ASSERT(ip->i_udquot);
- unresudq = ip->i_udquot;
- }
- }
- if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
- if ((XFS_IS_GQUOTA_ON(ip->i_mount) &&
- ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) ||
- (XFS_IS_PQUOTA_ON(ip->i_mount) &&
- ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id))) {
- delblksgdq = gdqp;
- if (delblks) {
- ASSERT(ip->i_gdquot);
- unresgdq = ip->i_gdquot;
- }
- }
- }
-
- if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
- delblksudq, delblksgdq, ip->i_d.di_nblocks, 1,
- flags | blkflags)))
- return (error);
-
- /*
- * Do the delayed blks reservations/unreservations now. Since, these
- * are done without the help of a transaction, if a reservation fails
- * its previous reservations won't be automatically undone by trans
- * code. So, we have to do it manually here.
- */
- if (delblks) {
- /*
- * Do the reservations first. Unreservation can't fail.
- */
- ASSERT(delblksudq || delblksgdq);
- ASSERT(unresudq || unresgdq);
- if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0,
- flags | blkflags)))
- return (error);
- xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0,
- blkflags);
- }
-
- return (0);
-}
-
-int
-xfs_qm_vop_rename_dqattach(
- xfs_inode_t **i_tab)
-{
- xfs_inode_t *ip;
- int i;
- int error;
-
- ip = i_tab[0];
-
- if (! XFS_IS_QUOTA_ON(ip->i_mount))
- return 0;
-
- if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
- error = xfs_qm_dqattach(ip, 0);
- if (error)
- return error;
- }
- for (i = 1; (i < 4 && i_tab[i]); i++) {
- /*
- * Watch out for duplicate entries in the table.
- */
- if ((ip = i_tab[i]) != i_tab[i-1]) {
- if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
- error = xfs_qm_dqattach(ip, 0);
- if (error)
- return error;
- }
- }
- }
- return 0;
-}
-
-void
-xfs_qm_vop_dqattach_and_dqmod_newinode(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- xfs_dquot_t *udqp,
- xfs_dquot_t *gdqp)
-{
- if (!XFS_IS_QUOTA_ON(tp->t_mountp))
- return;
-
- ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
- ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
-
- if (udqp) {
- xfs_dqlock(udqp);
- XFS_DQHOLD(udqp);
- xfs_dqunlock(udqp);
- ASSERT(ip->i_udquot == NULL);
- ip->i_udquot = udqp;
- ASSERT(XFS_IS_UQUOTA_ON(tp->t_mountp));
- ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
- xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
- }
- if (gdqp) {
- xfs_dqlock(gdqp);
- XFS_DQHOLD(gdqp);
- xfs_dqunlock(gdqp);
- ASSERT(ip->i_gdquot == NULL);
- ip->i_gdquot = gdqp;
- ASSERT(XFS_IS_OQUOTA_ON(tp->t_mountp));
- ASSERT((XFS_IS_GQUOTA_ON(tp->t_mountp) ?
- ip->i_d.di_gid : ip->i_d.di_projid) ==
- be32_to_cpu(gdqp->q_core.d_id));
- xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
- }
-}
-
-/* ------------- list stuff -----------------*/
-STATIC void
-xfs_qm_freelist_init(xfs_frlist_t *ql)
-{
- ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql;
- mutex_init(&ql->qh_lock);
- ql->qh_version = 0;
- ql->qh_nelems = 0;
-}
-
-STATIC void
-xfs_qm_freelist_destroy(xfs_frlist_t *ql)
-{
- xfs_dquot_t *dqp, *nextdqp;
-
- mutex_lock(&ql->qh_lock);
- for (dqp = ql->qh_next;
- dqp != (xfs_dquot_t *)ql; ) {
- xfs_dqlock(dqp);
- nextdqp = dqp->dq_flnext;
-#ifdef QUOTADEBUG
- cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
-#endif
- XQM_FREELIST_REMOVE(dqp);
- xfs_dqunlock(dqp);
- xfs_qm_dqdestroy(dqp);
- dqp = nextdqp;
- }
- /*
- * Don't bother about unlocking.
- */
- mutex_destroy(&ql->qh_lock);
-
- ASSERT(ql->qh_nelems == 0);
-}
-
-STATIC void
-xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq)
-{
- dq->dq_flnext = ql->qh_next;
- dq->dq_flprev = (xfs_dquot_t *)ql;
- ql->qh_next = dq;
- dq->dq_flnext->dq_flprev = dq;
- xfs_Gqm->qm_dqfreelist.qh_nelems++;
- xfs_Gqm->qm_dqfreelist.qh_version++;
-}
-
-void
-xfs_qm_freelist_unlink(xfs_dquot_t *dq)
-{
- xfs_dquot_t *next = dq->dq_flnext;
- xfs_dquot_t *prev = dq->dq_flprev;
-
- next->dq_flprev = prev;
- prev->dq_flnext = next;
- dq->dq_flnext = dq->dq_flprev = dq;
- xfs_Gqm->qm_dqfreelist.qh_nelems--;
- xfs_Gqm->qm_dqfreelist.qh_version++;
-}
-
-void
-xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
-{
- xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
-}
-
-STATIC int
-xfs_qm_dqhashlock_nowait(
- xfs_dquot_t *dqp)
-{
- int locked;
-
- locked = mutex_trylock(&((dqp)->q_hash->qh_lock));
- return locked;
-}
-
-int
-xfs_qm_freelist_lock_nowait(
- xfs_qm_t *xqm)
-{
- int locked;
-
- locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock));
- return locked;
-}
-
-STATIC int
-xfs_qm_mplist_nowait(
- xfs_mount_t *mp)
-{
- int locked;
-
- ASSERT(mp->m_quotainfo);
- locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp)));
- return locked;
-}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
deleted file mode 100644
index 4568deb6da8..00000000000
--- a/fs/xfs/quota/xfs_qm.h
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_QM_H__
-#define __XFS_QM_H__
-
-#include "xfs_dquot_item.h"
-#include "xfs_dquot.h"
-#include "xfs_quota_priv.h"
-#include "xfs_qm_stats.h"
-
-struct xfs_qm;
-struct xfs_inode;
-
-extern uint ndquot;
-extern mutex_t xfs_Gqm_lock;
-extern struct xfs_qm *xfs_Gqm;
-extern kmem_zone_t *qm_dqzone;
-extern kmem_zone_t *qm_dqtrxzone;
-
-/*
- * Used in xfs_qm_sync called by xfs_sync to count the max times that it can
- * iterate over the mountpt's dquot list in one call.
- */
-#define XFS_QM_SYNC_MAX_RESTARTS 7
-
-/*
- * Ditto, for xfs_qm_dqreclaim_one.
- */
-#define XFS_QM_RECLAIM_MAX_RESTARTS 4
-
-/*
- * Ideal ratio of free to in use dquots. Quota manager makes an attempt
- * to keep this balance.
- */
-#define XFS_QM_DQFREE_RATIO 2
-
-/*
- * Dquot hashtable constants/threshold values.
- */
-#define XFS_QM_HASHSIZE_LOW (NBPP / sizeof(xfs_dqhash_t))
-#define XFS_QM_HASHSIZE_HIGH ((NBPP * 4) / sizeof(xfs_dqhash_t))
-
-/*
- * We output a cmn_err when quotachecking a quota file with more than
- * this many fsbs.
- */
-#define XFS_QM_BIG_QCHECK_NBLKS 500
-
-/*
- * This defines the unit of allocation of dquots.
- * Currently, it is just one file system block, and a 4K blk contains 30
- * (136 * 30 = 4080) dquots. It's probably not worth trying to make
- * this more dynamic.
- * XXXsup However, if this number is changed, we have to make sure that we don't
- * implicitly assume that we do allocations in chunks of a single filesystem
- * block in the dquot/xqm code.
- */
-#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
-/*
- * When doing a quotacheck, we log dquot clusters of this many FSBs at most
- * in a single transaction. We don't want to ask for too huge a log reservation.
- */
-#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3
-
-typedef xfs_dqhash_t xfs_dqlist_t;
-/*
- * The freelist head. The first two fields match the first two in the
- * xfs_dquot_t structure (in xfs_dqmarker_t)
- */
-typedef struct xfs_frlist {
- struct xfs_dquot *qh_next;
- struct xfs_dquot *qh_prev;
- mutex_t qh_lock;
- uint qh_version;
- uint qh_nelems;
-} xfs_frlist_t;
-
-/*
- * Quota Manager (global) structure. Lives only in core.
- */
-typedef struct xfs_qm {
- xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */
- xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */
- uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */
- xfs_frlist_t qm_dqfreelist; /* freelist of dquots */
- atomic_t qm_totaldquots; /* total incore dquots */
- uint qm_nrefs; /* file systems with quota on */
- int qm_dqfree_ratio;/* ratio of free to inuse dquots */
- kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */
- kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */
-} xfs_qm_t;
-
-/*
- * Various quota information for individual filesystems.
- * The mount structure keeps a pointer to this.
- */
-typedef struct xfs_quotainfo {
- xfs_inode_t *qi_uquotaip; /* user quota inode */
- xfs_inode_t *qi_gquotaip; /* group quota inode */
- lock_t qi_pinlock; /* dquot pinning mutex */
- xfs_dqlist_t qi_dqlist; /* all dquots in filesys */
- int qi_dqreclaims; /* a change here indicates
- a removal in the dqlist */
- time_t qi_btimelimit; /* limit for blks timer */
- time_t qi_itimelimit; /* limit for inodes timer */
- time_t qi_rtbtimelimit;/* limit for rt blks timer */
- xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */
- xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */
- xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */
- mutex_t qi_quotaofflock;/* to serialize quotaoff */
- xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */
- uint qi_dqperchunk; /* # ondisk dqs in above chunk */
- xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */
- xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */
- xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */
- xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */
- xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */
- xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */
-} xfs_quotainfo_t;
-
-
-extern xfs_dqtrxops_t xfs_trans_dquot_ops;
-
-extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
-extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
- xfs_dquot_t *, xfs_dquot_t *, long, long, uint);
-extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *);
-extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *);
-
-/*
- * We keep the usr and grp dquots separately so that locking will be easier
- * to do at commit time. All transactions that we know of at this point
- * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
- */
-#define XFS_QM_TRANS_MAXDQS 2
-typedef struct xfs_dquot_acct {
- xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS];
- xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS];
-} xfs_dquot_acct_t;
-
-/*
- * Users are allowed to have a usage exceeding their softlimit for
- * a period this long.
- */
-#define XFS_QM_BTIMELIMIT (7 * 24*60*60) /* 1 week */
-#define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */
-#define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */
-
-#define XFS_QM_BWARNLIMIT 5
-#define XFS_QM_IWARNLIMIT 5
-#define XFS_QM_RTBWARNLIMIT 5
-
-#define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock))
-#define XFS_QM_UNLOCK(xqm) (mutex_unlock(&xqm##_lock))
-#define XFS_QM_HOLD(xqm) ((xqm)->qm_nrefs++)
-#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--)
-
-extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
-extern int xfs_qm_mount_quotas(xfs_mount_t *, int);
-extern void xfs_qm_mount_quotainit(xfs_mount_t *, uint);
-extern int xfs_qm_quotacheck(xfs_mount_t *);
-extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *);
-extern int xfs_qm_unmount_quotas(xfs_mount_t *);
-extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
-extern int xfs_qm_sync(xfs_mount_t *, short);
-
-/* dquot stuff */
-extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **);
-extern int xfs_qm_dqattach(xfs_inode_t *, uint);
-extern void xfs_qm_dqdetach(xfs_inode_t *);
-extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint);
-extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
-
-/* vop stuff */
-extern int xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *,
- uid_t, gid_t, prid_t, uint,
- xfs_dquot_t **, xfs_dquot_t **);
-extern void xfs_qm_vop_dqattach_and_dqmod_newinode(
- xfs_trans_t *, xfs_inode_t *,
- xfs_dquot_t *, xfs_dquot_t *);
-extern int xfs_qm_vop_rename_dqattach(xfs_inode_t **);
-extern xfs_dquot_t * xfs_qm_vop_chown(xfs_trans_t *, xfs_inode_t *,
- xfs_dquot_t **, xfs_dquot_t *);
-extern int xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *,
- xfs_dquot_t *, xfs_dquot_t *, uint);
-
-/* list stuff */
-extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
-extern void xfs_qm_freelist_unlink(xfs_dquot_t *);
-extern int xfs_qm_freelist_lock_nowait(xfs_qm_t *);
-
-/* system call interface */
-extern int xfs_qm_quotactl(bhv_desc_t *, int, int, xfs_caddr_t);
-
-#ifdef DEBUG
-extern int xfs_qm_internalqcheck(xfs_mount_t *);
-#else
-#define xfs_qm_internalqcheck(mp) (0)
-#endif
-
-#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
deleted file mode 100644
index 90402a1c398..00000000000
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_clnt.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
-#include "xfs_btree.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_qm.h"
-
-#define MNTOPT_QUOTA "quota" /* disk quotas (user) */
-#define MNTOPT_NOQUOTA "noquota" /* no quotas */
-#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */
-#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */
-#define MNTOPT_PRJQUOTA "prjquota" /* project quota enabled */
-#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */
-#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */
-#define MNTOPT_PQUOTA "pquota" /* project quota (IRIX variant) */
-#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */
-#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
-#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
-#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
-
-STATIC int
-xfs_qm_parseargs(
- struct bhv_desc *bhv,
- char *options,
- struct xfs_mount_args *args,
- int update)
-{
- size_t length;
- char *local_options = options;
- char *this_char;
- int error;
- int referenced = update;
-
- while ((this_char = strsep(&local_options, ",")) != NULL) {
- length = strlen(this_char);
- if (local_options)
- length++;
-
- if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
- args->flags &= ~(XFSMNT_UQUOTAENF|XFSMNT_UQUOTA);
- args->flags &= ~(XFSMNT_GQUOTAENF|XFSMNT_GQUOTA);
- referenced = update;
- } else if (!strcmp(this_char, MNTOPT_QUOTA) ||
- !strcmp(this_char, MNTOPT_UQUOTA) ||
- !strcmp(this_char, MNTOPT_USRQUOTA)) {
- args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF;
- referenced = 1;
- } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) ||
- !strcmp(this_char, MNTOPT_UQUOTANOENF)) {
- args->flags |= XFSMNT_UQUOTA;
- args->flags &= ~XFSMNT_UQUOTAENF;
- referenced = 1;
- } else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
- !strcmp(this_char, MNTOPT_PRJQUOTA)) {
- args->flags |= XFSMNT_PQUOTA | XFSMNT_PQUOTAENF;
- referenced = 1;
- } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
- args->flags |= XFSMNT_PQUOTA;
- args->flags &= ~XFSMNT_PQUOTAENF;
- referenced = 1;
- } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
- !strcmp(this_char, MNTOPT_GRPQUOTA)) {
- args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF;
- referenced = 1;
- } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
- args->flags |= XFSMNT_GQUOTA;
- args->flags &= ~XFSMNT_GQUOTAENF;
- referenced = 1;
- } else {
- if (local_options)
- *(local_options-1) = ',';
- continue;
- }
-
- while (length--)
- *this_char++ = ',';
- }
-
- if ((args->flags & XFSMNT_GQUOTA) && (args->flags & XFSMNT_PQUOTA)) {
- cmn_err(CE_WARN,
- "XFS: cannot mount with both project and group quota");
- return XFS_ERROR(EINVAL);
- }
-
- PVFS_PARSEARGS(BHV_NEXT(bhv), options, args, update, error);
- if (!error && !referenced)
- bhv_remove_vfsops(bhvtovfs(bhv), VFS_POSITION_QM);
- return error;
-}
-
-STATIC int
-xfs_qm_showargs(
- struct bhv_desc *bhv,
- struct seq_file *m)
-{
- struct vfs *vfsp = bhvtovfs(bhv);
- struct xfs_mount *mp = XFS_VFSTOM(vfsp);
- int error;
-
- if (mp->m_qflags & XFS_UQUOTA_ACCT) {
- (mp->m_qflags & XFS_UQUOTA_ENFD) ?
- seq_puts(m, "," MNTOPT_USRQUOTA) :
- seq_puts(m, "," MNTOPT_UQUOTANOENF);
- }
-
- if (mp->m_qflags & XFS_PQUOTA_ACCT) {
- (mp->m_qflags & XFS_OQUOTA_ENFD) ?
- seq_puts(m, "," MNTOPT_PRJQUOTA) :
- seq_puts(m, "," MNTOPT_PQUOTANOENF);
- }
-
- if (mp->m_qflags & XFS_GQUOTA_ACCT) {
- (mp->m_qflags & XFS_OQUOTA_ENFD) ?
- seq_puts(m, "," MNTOPT_GRPQUOTA) :
- seq_puts(m, "," MNTOPT_GQUOTANOENF);
- }
-
- if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
- seq_puts(m, "," MNTOPT_NOQUOTA);
-
- PVFS_SHOWARGS(BHV_NEXT(bhv), m, error);
- return error;
-}
-
-STATIC int
-xfs_qm_mount(
- struct bhv_desc *bhv,
- struct xfs_mount_args *args,
- struct cred *cr)
-{
- struct vfs *vfsp = bhvtovfs(bhv);
- struct xfs_mount *mp = XFS_VFSTOM(vfsp);
- int error;
-
- if (args->flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA | XFSMNT_PQUOTA))
- xfs_qm_mount_quotainit(mp, args->flags);
- PVFS_MOUNT(BHV_NEXT(bhv), args, cr, error);
- return error;
-}
-
-STATIC int
-xfs_qm_syncall(
- struct bhv_desc *bhv,
- int flags,
- cred_t *credp)
-{
- struct vfs *vfsp = bhvtovfs(bhv);
- struct xfs_mount *mp = XFS_VFSTOM(vfsp);
- int error;
-
- /*
- * Get the Quota Manager to flush the dquots.
- */
- if (XFS_IS_QUOTA_ON(mp)) {
- if ((error = xfs_qm_sync(mp, flags))) {
- /*
- * If we got an IO error, we will be shutting down.
- * So, there's nothing more for us to do here.
- */
- ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
- if (XFS_FORCED_SHUTDOWN(mp)) {
- return XFS_ERROR(error);
- }
- }
- }
- PVFS_SYNC(BHV_NEXT(bhv), flags, credp, error);
- return error;
-}
-
-STATIC int
-xfs_qm_newmount(
- xfs_mount_t *mp,
- uint *needquotamount,
- uint *quotaflags)
-{
- uint quotaondisk;
- uint uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0;
-
- *quotaflags = 0;
- *needquotamount = B_FALSE;
-
- quotaondisk = XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
- (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT);
-
- if (quotaondisk) {
- uquotaondisk = mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT;
- pquotaondisk = mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT;
- gquotaondisk = mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT;
- }
-
- /*
- * If the device itself is read-only, we can't allow
- * the user to change the state of quota on the mount -
- * this would generate a transaction on the ro device,
- * which would lead to an I/O error and shutdown
- */
-
- if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) ||
- (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) ||
- (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
- (!pquotaondisk && XFS_IS_PQUOTA_ON(mp)) ||
- (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
- (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) &&
- xfs_dev_is_read_only(mp, "changing quota state")) {
- cmn_err(CE_WARN,
- "XFS: please mount with%s%s%s%s.",
- (!quotaondisk ? "out quota" : ""),
- (uquotaondisk ? " usrquota" : ""),
- (pquotaondisk ? " prjquota" : ""),
- (gquotaondisk ? " grpquota" : ""));
- return XFS_ERROR(EPERM);
- }
-
- if (XFS_IS_QUOTA_ON(mp) || quotaondisk) {
- /*
- * Call mount_quotas at this point only if we won't have to do
- * a quotacheck.
- */
- if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) {
- /*
- * If an error occured, qm_mount_quotas code
- * has already disabled quotas. So, just finish
- * mounting, and get on with the boring life
- * without disk quotas.
- */
- xfs_qm_mount_quotas(mp, 0);
- } else {
- /*
- * Clear the quota flags, but remember them. This
- * is so that the quota code doesn't get invoked
- * before we're ready. This can happen when an
- * inode goes inactive and wants to free blocks,
- * or via xfs_log_mount_finish.
- */
- *needquotamount = B_TRUE;
- *quotaflags = mp->m_qflags;
- mp->m_qflags = 0;
- }
- }
-
- return 0;
-}
-
-STATIC int
-xfs_qm_endmount(
- xfs_mount_t *mp,
- uint needquotamount,
- uint quotaflags,
- int mfsi_flags)
-{
- if (needquotamount) {
- ASSERT(mp->m_qflags == 0);
- mp->m_qflags = quotaflags;
- xfs_qm_mount_quotas(mp, mfsi_flags);
- }
-
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
- if (! (XFS_IS_QUOTA_ON(mp)))
- xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
- else
- xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
-#endif
-
-#ifdef QUOTADEBUG
- if (XFS_IS_QUOTA_ON(mp) && xfs_qm_internalqcheck(mp))
- cmn_err(CE_WARN, "XFS: mount internalqcheck failed");
-#endif
-
- return 0;
-}
-
-STATIC void
-xfs_qm_dqrele_null(
- xfs_dquot_t *dq)
-{
- /*
- * Called from XFS, where we always check first for a NULL dquot.
- */
- if (!dq)
- return;
- xfs_qm_dqrele(dq);
-}
-
-
-STATIC struct xfs_qmops xfs_qmcore_xfs = {
- .xfs_qminit = xfs_qm_newmount,
- .xfs_qmdone = xfs_qm_unmount_quotadestroy,
- .xfs_qmmount = xfs_qm_endmount,
- .xfs_qmunmount = xfs_qm_unmount_quotas,
- .xfs_dqrele = xfs_qm_dqrele_null,
- .xfs_dqattach = xfs_qm_dqattach,
- .xfs_dqdetach = xfs_qm_dqdetach,
- .xfs_dqpurgeall = xfs_qm_dqpurge_all,
- .xfs_dqvopalloc = xfs_qm_vop_dqalloc,
- .xfs_dqvopcreate = xfs_qm_vop_dqattach_and_dqmod_newinode,
- .xfs_dqvoprename = xfs_qm_vop_rename_dqattach,
- .xfs_dqvopchown = xfs_qm_vop_chown,
- .xfs_dqvopchownresv = xfs_qm_vop_chown_reserve,
- .xfs_dqtrxops = &xfs_trans_dquot_ops,
-};
-
-struct bhv_vfsops xfs_qmops = { {
- BHV_IDENTITY_INIT(VFS_BHV_QM, VFS_POSITION_QM),
- .vfs_parseargs = xfs_qm_parseargs,
- .vfs_showargs = xfs_qm_showargs,
- .vfs_mount = xfs_qm_mount,
- .vfs_sync = xfs_qm_syncall,
- .vfs_quotactl = xfs_qm_quotactl, },
-};
-
-
-void __init
-xfs_qm_init(void)
-{
- static char message[] __initdata =
- KERN_INFO "SGI XFS Quota Management subsystem\n";
-
- printk(message);
- mutex_init(&xfs_Gqm_lock);
- vfs_bhv_set_custom(&xfs_qmops, &xfs_qmcore_xfs);
- xfs_qm_init_procfs();
-}
-
-void __exit
-xfs_qm_exit(void)
-{
- vfs_bhv_clr_custom(&xfs_qmops);
- xfs_qm_cleanup_procfs();
- if (qm_dqzone)
- kmem_cache_destroy(qm_dqzone);
- if (qm_dqtrxzone)
- kmem_cache_destroy(qm_dqtrxzone);
-}
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
deleted file mode 100644
index 0570f773355..00000000000
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_qm.h"
-
-struct xqmstats xqmstats;
-
-STATIC int
-xfs_qm_read_xfsquota(
- char *buffer,
- char **start,
- off_t offset,
- int count,
- int *eof,
- void *data)
-{
- int len;
-
- /* maximum; incore; ratio free to inuse; freelist */
- len = sprintf(buffer, "%d\t%d\t%d\t%u\n",
- ndquot,
- xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
- xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
- xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
-
- if (offset >= len) {
- *start = buffer;
- *eof = 1;
- return 0;
- }
- *start = buffer + offset;
- if ((len -= offset) > count)
- return count;
- *eof = 1;
-
- return len;
-}
-
-STATIC int
-xfs_qm_read_stats(
- char *buffer,
- char **start,
- off_t offset,
- int count,
- int *eof,
- void *data)
-{
- int len;
-
- /* quota performance statistics */
- len = sprintf(buffer, "qm %u %u %u %u %u %u %u %u\n",
- xqmstats.xs_qm_dqreclaims,
- xqmstats.xs_qm_dqreclaim_misses,
- xqmstats.xs_qm_dquot_dups,
- xqmstats.xs_qm_dqcachemisses,
- xqmstats.xs_qm_dqcachehits,
- xqmstats.xs_qm_dqwants,
- xqmstats.xs_qm_dqshake_reclaims,
- xqmstats.xs_qm_dqinact_reclaims);
-
- if (offset >= len) {
- *start = buffer;
- *eof = 1;
- return 0;
- }
- *start = buffer + offset;
- if ((len -= offset) > count)
- return count;
- *eof = 1;
-
- return len;
-}
-
-void
-xfs_qm_init_procfs(void)
-{
- create_proc_read_entry("fs/xfs/xqmstat", 0, NULL, xfs_qm_read_stats, NULL);
- create_proc_read_entry("fs/xfs/xqm", 0, NULL, xfs_qm_read_xfsquota, NULL);
-}
-
-void
-xfs_qm_cleanup_procfs(void)
-{
- remove_proc_entry("fs/xfs/xqm", NULL);
- remove_proc_entry("fs/xfs/xqmstat", NULL);
-}
diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/quota/xfs_qm_stats.h
deleted file mode 100644
index a50ffabcf55..00000000000
--- a/fs/xfs/quota/xfs_qm_stats.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2002 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_QM_STATS_H__
-#define __XFS_QM_STATS_H__
-
-#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
-
-/*
- * XQM global statistics
- */
-struct xqmstats {
- __uint32_t xs_qm_dqreclaims;
- __uint32_t xs_qm_dqreclaim_misses;
- __uint32_t xs_qm_dquot_dups;
- __uint32_t xs_qm_dqcachemisses;
- __uint32_t xs_qm_dqcachehits;
- __uint32_t xs_qm_dqwants;
- __uint32_t xs_qm_dqshake_reclaims;
- __uint32_t xs_qm_dqinact_reclaims;
-};
-
-extern struct xqmstats xqmstats;
-
-# define XQM_STATS_INC(count) ( (count)++ )
-
-extern void xfs_qm_init_procfs(void);
-extern void xfs_qm_cleanup_procfs(void);
-
-#else
-
-# define XQM_STATS_INC(count) do { } while (0)
-
-static __inline void xfs_qm_init_procfs(void) { };
-static __inline void xfs_qm_cleanup_procfs(void) { };
-
-#endif
-
-#endif /* __XFS_QM_STATS_H__ */
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
deleted file mode 100644
index 676884394aa..00000000000
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ /dev/null
@@ -1,1483 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <linux/capability.h>
-
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_qm.h"
-
-#ifdef DEBUG
-# define qdprintk(s, args...) cmn_err(CE_DEBUG, s, ## args)
-#else
-# define qdprintk(s, args...) do { } while (0)
-#endif
-
-STATIC int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
-STATIC int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
- fs_disk_quota_t *);
-STATIC int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
-STATIC int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
- fs_disk_quota_t *);
-STATIC int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
-STATIC int xfs_qm_scall_quotaoff(xfs_mount_t *, uint, boolean_t);
-STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
-STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
- uint);
-STATIC uint xfs_qm_import_flags(uint);
-STATIC uint xfs_qm_export_flags(uint);
-STATIC uint xfs_qm_import_qtype_flags(uint);
-STATIC uint xfs_qm_export_qtype_flags(uint);
-STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
- fs_disk_quota_t *);
-
-
-/*
- * The main distribution switch of all XFS quotactl system calls.
- */
-int
-xfs_qm_quotactl(
- struct bhv_desc *bdp,
- int cmd,
- int id,
- xfs_caddr_t addr)
-{
- xfs_mount_t *mp;
- int error;
- struct vfs *vfsp;
-
- vfsp = bhvtovfs(bdp);
- mp = XFS_VFSTOM(vfsp);
-
- ASSERT(addr != NULL || cmd == Q_XQUOTASYNC);
-
- /*
- * The following commands are valid even when quotaoff.
- */
- switch (cmd) {
- case Q_XQUOTARM:
- /*
- * Truncate quota files. quota must be off.
- */
- if (XFS_IS_QUOTA_ON(mp))
- return XFS_ERROR(EINVAL);
- if (vfsp->vfs_flag & VFS_RDONLY)
- return XFS_ERROR(EROFS);
- return (xfs_qm_scall_trunc_qfiles(mp,
- xfs_qm_import_qtype_flags(*(uint *)addr)));
-
- case Q_XGETQSTAT:
- /*
- * Get quota status information.
- */
- return (xfs_qm_scall_getqstat(mp, (fs_quota_stat_t *)addr));
-
- case Q_XQUOTAON:
- /*
- * QUOTAON - enabling quota enforcement.
- * Quota accounting must be turned on at mount time.
- */
- if (vfsp->vfs_flag & VFS_RDONLY)
- return XFS_ERROR(EROFS);
- return (xfs_qm_scall_quotaon(mp,
- xfs_qm_import_flags(*(uint *)addr)));
-
- case Q_XQUOTAOFF:
- if (vfsp->vfs_flag & VFS_RDONLY)
- return XFS_ERROR(EROFS);
- break;
-
- case Q_XQUOTASYNC:
- return (xfs_sync_inodes(mp, SYNC_DELWRI, 0, NULL));
-
- default:
- break;
- }
-
- if (! XFS_IS_QUOTA_ON(mp))
- return XFS_ERROR(ESRCH);
-
- switch (cmd) {
- case Q_XQUOTAOFF:
- if (vfsp->vfs_flag & VFS_RDONLY)
- return XFS_ERROR(EROFS);
- error = xfs_qm_scall_quotaoff(mp,
- xfs_qm_import_flags(*(uint *)addr),
- B_FALSE);
- break;
-
- case Q_XGETQUOTA:
- error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_USER,
- (fs_disk_quota_t *)addr);
- break;
- case Q_XGETGQUOTA:
- error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
- (fs_disk_quota_t *)addr);
- break;
- case Q_XGETPQUOTA:
- error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_PROJ,
- (fs_disk_quota_t *)addr);
- break;
-
- case Q_XSETQLIM:
- if (vfsp->vfs_flag & VFS_RDONLY)
- return XFS_ERROR(EROFS);
- error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_USER,
- (fs_disk_quota_t *)addr);
- break;
- case Q_XSETGQLIM:
- if (vfsp->vfs_flag & VFS_RDONLY)
- return XFS_ERROR(EROFS);
- error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
- (fs_disk_quota_t *)addr);
- break;
- case Q_XSETPQLIM:
- if (vfsp->vfs_flag & VFS_RDONLY)
- return XFS_ERROR(EROFS);
- error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_PROJ,
- (fs_disk_quota_t *)addr);
- break;
-
- default:
- error = XFS_ERROR(EINVAL);
- break;
- }
-
- return (error);
-}
-
-/*
- * Turn off quota accounting and/or enforcement for all udquots and/or
- * gdquots. Called only at unmount time.
- *
- * This assumes that there are no dquots of this file system cached
- * incore, and modifies the ondisk dquot directly. Therefore, for example,
- * it is an error to call this twice, without purging the cache.
- */
-STATIC int
-xfs_qm_scall_quotaoff(
- xfs_mount_t *mp,
- uint flags,
- boolean_t force)
-{
- uint dqtype;
- unsigned long s;
- int error;
- uint inactivate_flags;
- xfs_qoff_logitem_t *qoffstart;
- int nculprits;
-
- if (!force && !capable(CAP_SYS_ADMIN))
- return XFS_ERROR(EPERM);
- /*
- * No file system can have quotas enabled on disk but not in core.
- * Note that quota utilities (like quotaoff) _expect_
- * errno == EEXIST here.
- */
- if ((mp->m_qflags & flags) == 0)
- return XFS_ERROR(EEXIST);
- error = 0;
-
- flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
-
- /*
- * We don't want to deal with two quotaoffs messing up each other,
- * so we're going to serialize it. quotaoff isn't exactly a performance
- * critical thing.
- * If quotaoff, then we must be dealing with the root filesystem.
- */
- ASSERT(mp->m_quotainfo);
- if (mp->m_quotainfo)
- mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
-
- ASSERT(mp->m_quotainfo);
-
- /*
- * If we're just turning off quota enforcement, change mp and go.
- */
- if ((flags & XFS_ALL_QUOTA_ACCT) == 0) {
- mp->m_qflags &= ~(flags);
-
- s = XFS_SB_LOCK(mp);
- mp->m_sb.sb_qflags = mp->m_qflags;
- XFS_SB_UNLOCK(mp, s);
- mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
-
- /* XXX what to do if error ? Revert back to old vals incore ? */
- error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
- return (error);
- }
-
- dqtype = 0;
- inactivate_flags = 0;
- /*
- * If accounting is off, we must turn enforcement off, clear the
- * quota 'CHKD' certificate to make it known that we have to
- * do a quotacheck the next time this quota is turned on.
- */
- if (flags & XFS_UQUOTA_ACCT) {
- dqtype |= XFS_QMOPT_UQUOTA;
- flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD);
- inactivate_flags |= XFS_UQUOTA_ACTIVE;
- }
- if (flags & XFS_GQUOTA_ACCT) {
- dqtype |= XFS_QMOPT_GQUOTA;
- flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
- inactivate_flags |= XFS_GQUOTA_ACTIVE;
- } else if (flags & XFS_PQUOTA_ACCT) {
- dqtype |= XFS_QMOPT_PQUOTA;
- flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
- inactivate_flags |= XFS_PQUOTA_ACTIVE;
- }
-
- /*
- * Nothing to do? Don't complain. This happens when we're just
- * turning off quota enforcement.
- */
- if ((mp->m_qflags & flags) == 0) {
- mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
- return (0);
- }
-
- /*
- * Write the LI_QUOTAOFF log record, and do SB changes atomically,
- * and synchronously.
- */
- xfs_qm_log_quotaoff(mp, &qoffstart, flags);
-
- /*
- * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
- * to take care of the race between dqget and quotaoff. We don't take
- * any special locks to reset these bits. All processes need to check
- * these bits *after* taking inode lock(s) to see if the particular
- * quota type is in the process of being turned off. If *ACTIVE, it is
- * guaranteed that all dquot structures and all quotainode ptrs will all
- * stay valid as long as that inode is kept locked.
- *
- * There is no turning back after this.
- */
- mp->m_qflags &= ~inactivate_flags;
-
- /*
- * Give back all the dquot reference(s) held by inodes.
- * Here we go thru every single incore inode in this file system, and
- * do a dqrele on the i_udquot/i_gdquot that it may have.
- * Essentially, as long as somebody has an inode locked, this guarantees
- * that quotas will not be turned off. This is handy because in a
- * transaction once we lock the inode(s) and check for quotaon, we can
- * depend on the quota inodes (and other things) being valid as long as
- * we keep the lock(s).
- */
- xfs_qm_dqrele_all_inodes(mp, flags);
-
- /*
- * Next we make the changes in the quota flag in the mount struct.
- * This isn't protected by a particular lock directly, because we
- * don't want to take a mrlock everytime we depend on quotas being on.
- */
- mp->m_qflags &= ~(flags);
-
- /*
- * Go through all the dquots of this file system and purge them,
- * according to what was turned off. We may not be able to get rid
- * of all dquots, because dquots can have temporary references that
- * are not attached to inodes. eg. xfs_setattr, xfs_create.
- * So, if we couldn't purge all the dquots from the filesystem,
- * we can't get rid of the incore data structures.
- */
- while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF)))
- delay(10 * nculprits);
-
- /*
- * Transactions that had started before ACTIVE state bit was cleared
- * could have logged many dquots, so they'd have higher LSNs than
- * the first QUOTAOFF log record does. If we happen to crash when
- * the tail of the log has gone past the QUOTAOFF record, but
- * before the last dquot modification, those dquots __will__
- * recover, and that's not good.
- *
- * So, we have QUOTAOFF start and end logitems; the start
- * logitem won't get overwritten until the end logitem appears...
- */
- xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
-
- /*
- * If quotas is completely disabled, close shop.
- */
- if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) ||
- ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
- mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
- xfs_qm_destroy_quotainfo(mp);
- return (0);
- }
-
- /*
- * Release our quotainode references, and vn_purge them,
- * if we don't need them anymore.
- */
- if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) {
- XFS_PURGE_INODE(XFS_QI_UQIP(mp));
- XFS_QI_UQIP(mp) = NULL;
- }
- if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) {
- XFS_PURGE_INODE(XFS_QI_GQIP(mp));
- XFS_QI_GQIP(mp) = NULL;
- }
- mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
-
- return (error);
-}
-
-STATIC int
-xfs_qm_scall_trunc_qfiles(
- xfs_mount_t *mp,
- uint flags)
-{
- int error;
- xfs_inode_t *qip;
-
- if (!capable(CAP_SYS_ADMIN))
- return XFS_ERROR(EPERM);
- error = 0;
- if (!XFS_SB_VERSION_HASQUOTA(&mp->m_sb) || flags == 0) {
- qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
- return XFS_ERROR(EINVAL);
- }
-
- if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) {
- error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0);
- if (! error) {
- (void) xfs_truncate_file(mp, qip);
- VN_RELE(XFS_ITOV(qip));
- }
- }
-
- if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) &&
- mp->m_sb.sb_gquotino != NULLFSINO) {
- error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
- if (! error) {
- (void) xfs_truncate_file(mp, qip);
- VN_RELE(XFS_ITOV(qip));
- }
- }
-
- return (error);
-}
-
-
-/*
- * Switch on (a given) quota enforcement for a filesystem. This takes
- * effect immediately.
- * (Switching on quota accounting must be done at mount time.)
- */
-STATIC int
-xfs_qm_scall_quotaon(
- xfs_mount_t *mp,
- uint flags)
-{
- int error;
- unsigned long s;
- uint qf;
- uint accflags;
- __int64_t sbflags;
-
- if (!capable(CAP_SYS_ADMIN))
- return XFS_ERROR(EPERM);
-
- flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
- /*
- * Switching on quota accounting must be done at mount time.
- */
- accflags = flags & XFS_ALL_QUOTA_ACCT;
- flags &= ~(XFS_ALL_QUOTA_ACCT);
-
- sbflags = 0;
-
- if (flags == 0) {
- qdprintk("quotaon: zero flags, m_qflags=%x\n", mp->m_qflags);
- return XFS_ERROR(EINVAL);
- }
-
- /* No fs can turn on quotas with a delayed effect */
- ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0);
-
- /*
- * Can't enforce without accounting. We check the superblock
- * qflags here instead of m_qflags because rootfs can have
- * quota acct on ondisk without m_qflags' knowing.
- */
- if (((flags & XFS_UQUOTA_ACCT) == 0 &&
- (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
- (flags & XFS_UQUOTA_ENFD))
- ||
- ((flags & XFS_PQUOTA_ACCT) == 0 &&
- (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
- (flags & XFS_OQUOTA_ENFD))
- ||
- ((flags & XFS_GQUOTA_ACCT) == 0 &&
- (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
- (flags & XFS_OQUOTA_ENFD))) {
- qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n",
- flags, mp->m_sb.sb_qflags);
- return XFS_ERROR(EINVAL);
- }
- /*
- * If everything's upto-date incore, then don't waste time.
- */
- if ((mp->m_qflags & flags) == flags)
- return XFS_ERROR(EEXIST);
-
- /*
- * Change sb_qflags on disk but not incore mp->qflags
- * if this is the root filesystem.
- */
- s = XFS_SB_LOCK(mp);
- qf = mp->m_sb.sb_qflags;
- mp->m_sb.sb_qflags = qf | flags;
- XFS_SB_UNLOCK(mp, s);
-
- /*
- * There's nothing to change if it's the same.
- */
- if ((qf & flags) == flags && sbflags == 0)
- return XFS_ERROR(EEXIST);
- sbflags |= XFS_SB_QFLAGS;
-
- if ((error = xfs_qm_write_sb_changes(mp, sbflags)))
- return (error);
- /*
- * If we aren't trying to switch on quota enforcement, we are done.
- */
- if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) !=
- (mp->m_qflags & XFS_UQUOTA_ACCT)) ||
- ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) !=
- (mp->m_qflags & XFS_PQUOTA_ACCT)) ||
- ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) !=
- (mp->m_qflags & XFS_GQUOTA_ACCT)) ||
- (flags & XFS_ALL_QUOTA_ENFD) == 0)
- return (0);
-
- if (! XFS_IS_QUOTA_RUNNING(mp))
- return XFS_ERROR(ESRCH);
-
- /*
- * Switch on quota enforcement in core.
- */
- mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
- mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
- mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
-
- return (0);
-}
-
-
-/*
- * Return quota status information, such as uquota-off, enforcements, etc.
- */
-STATIC int
-xfs_qm_scall_getqstat(
- xfs_mount_t *mp,
- fs_quota_stat_t *out)
-{
- xfs_inode_t *uip, *gip;
- boolean_t tempuqip, tempgqip;
-
- uip = gip = NULL;
- tempuqip = tempgqip = B_FALSE;
- memset(out, 0, sizeof(fs_quota_stat_t));
-
- out->qs_version = FS_QSTAT_VERSION;
- if (! XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) {
- out->qs_uquota.qfs_ino = NULLFSINO;
- out->qs_gquota.qfs_ino = NULLFSINO;
- return (0);
- }
- out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
- (XFS_ALL_QUOTA_ACCT|
- XFS_ALL_QUOTA_ENFD));
- out->qs_pad = 0;
- out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
- out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
-
- if (mp->m_quotainfo) {
- uip = mp->m_quotainfo->qi_uquotaip;
- gip = mp->m_quotainfo->qi_gquotaip;
- }
- if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
- if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
- 0, 0, &uip, 0) == 0)
- tempuqip = B_TRUE;
- }
- if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
- if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
- 0, 0, &gip, 0) == 0)
- tempgqip = B_TRUE;
- }
- if (uip) {
- out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
- out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
- if (tempuqip)
- VN_RELE(XFS_ITOV(uip));
- }
- if (gip) {
- out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
- out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
- if (tempgqip)
- VN_RELE(XFS_ITOV(gip));
- }
- if (mp->m_quotainfo) {
- out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp);
- out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp);
- out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp);
- out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp);
- out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp);
- out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp);
- }
- return (0);
-}
-
-/*
- * Adjust quota limits, and start/stop timers accordingly.
- */
-STATIC int
-xfs_qm_scall_setqlim(
- xfs_mount_t *mp,
- xfs_dqid_t id,
- uint type,
- fs_disk_quota_t *newlim)
-{
- xfs_disk_dquot_t *ddq;
- xfs_dquot_t *dqp;
- xfs_trans_t *tp;
- int error;
- xfs_qcnt_t hard, soft;
-
- if (!capable(CAP_SYS_ADMIN))
- return XFS_ERROR(EPERM);
-
- if ((newlim->d_fieldmask &
- (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0)
- return (0);
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
- if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
- 0, 0, XFS_DEFAULT_LOG_COUNT))) {
- xfs_trans_cancel(tp, 0);
- return (error);
- }
-
- /*
- * We don't want to race with a quotaoff so take the quotaoff lock.
- * (We don't hold an inode lock, so there's nothing else to stop
- * a quotaoff from happening). (XXXThis doesn't currently happen
- * because we take the vfslock before calling xfs_qm_sysent).
- */
- mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
-
- /*
- * Get the dquot (locked), and join it to the transaction.
- * Allocate the dquot if this doesn't exist.
- */
- if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
- xfs_trans_cancel(tp, XFS_TRANS_ABORT);
- mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
- ASSERT(error != ENOENT);
- return (error);
- }
- xfs_dqtrace_entry(dqp, "Q_SETQLIM: AFT DQGET");
- xfs_trans_dqjoin(tp, dqp);
- ddq = &dqp->q_core;
-
- /*
- * Make sure that hardlimits are >= soft limits before changing.
- */
- hard = (newlim->d_fieldmask & FS_DQ_BHARD) ?
- (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) :
- be64_to_cpu(ddq->d_blk_hardlimit);
- soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ?
- (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) :
- be64_to_cpu(ddq->d_blk_softlimit);
- if (hard == 0 || hard >= soft) {
- ddq->d_blk_hardlimit = cpu_to_be64(hard);
- ddq->d_blk_softlimit = cpu_to_be64(soft);
- if (id == 0) {
- mp->m_quotainfo->qi_bhardlimit = hard;
- mp->m_quotainfo->qi_bsoftlimit = soft;
- }
- } else {
- qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft);
- }
- hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
- (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
- be64_to_cpu(ddq->d_rtb_hardlimit);
- soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ?
- (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) :
- be64_to_cpu(ddq->d_rtb_softlimit);
- if (hard == 0 || hard >= soft) {
- ddq->d_rtb_hardlimit = cpu_to_be64(hard);
- ddq->d_rtb_softlimit = cpu_to_be64(soft);
- if (id == 0) {
- mp->m_quotainfo->qi_rtbhardlimit = hard;
- mp->m_quotainfo->qi_rtbsoftlimit = soft;
- }
- } else {
- qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
- }
-
- hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
- (xfs_qcnt_t) newlim->d_ino_hardlimit :
- be64_to_cpu(ddq->d_ino_hardlimit);
- soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ?
- (xfs_qcnt_t) newlim->d_ino_softlimit :
- be64_to_cpu(ddq->d_ino_softlimit);
- if (hard == 0 || hard >= soft) {
- ddq->d_ino_hardlimit = cpu_to_be64(hard);
- ddq->d_ino_softlimit = cpu_to_be64(soft);
- if (id == 0) {
- mp->m_quotainfo->qi_ihardlimit = hard;
- mp->m_quotainfo->qi_isoftlimit = soft;
- }
- } else {
- qdprintk("ihard %Ld < isoft %Ld\n", hard, soft);
- }
-
- /*
- * Update warnings counter(s) if requested
- */
- if (newlim->d_fieldmask & FS_DQ_BWARNS)
- ddq->d_bwarns = cpu_to_be16(newlim->d_bwarns);
- if (newlim->d_fieldmask & FS_DQ_IWARNS)
- ddq->d_iwarns = cpu_to_be16(newlim->d_iwarns);
- if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
- ddq->d_rtbwarns = cpu_to_be16(newlim->d_rtbwarns);
-
- if (id == 0) {
- /*
- * Timelimits for the super user set the relative time
- * the other users can be over quota for this file system.
- * If it is zero a default is used. Ditto for the default
- * soft and hard limit values (already done, above), and
- * for warnings.
- */
- if (newlim->d_fieldmask & FS_DQ_BTIMER) {
- mp->m_quotainfo->qi_btimelimit = newlim->d_btimer;
- ddq->d_btimer = cpu_to_be32(newlim->d_btimer);
- }
- if (newlim->d_fieldmask & FS_DQ_ITIMER) {
- mp->m_quotainfo->qi_itimelimit = newlim->d_itimer;
- ddq->d_itimer = cpu_to_be32(newlim->d_itimer);
- }
- if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
- mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer;
- ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer);
- }
- if (newlim->d_fieldmask & FS_DQ_BWARNS)
- mp->m_quotainfo->qi_bwarnlimit = newlim->d_bwarns;
- if (newlim->d_fieldmask & FS_DQ_IWARNS)
- mp->m_quotainfo->qi_iwarnlimit = newlim->d_iwarns;
- if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
- mp->m_quotainfo->qi_rtbwarnlimit = newlim->d_rtbwarns;
- } else {
- /*
- * If the user is now over quota, start the timelimit.
- * The user will not be 'warned'.
- * Note that we keep the timers ticking, whether enforcement
- * is on or off. We don't really want to bother with iterating
- * over all ondisk dquots and turning the timers on/off.
- */
- xfs_qm_adjust_dqtimers(mp, ddq);
- }
- dqp->dq_flags |= XFS_DQ_DIRTY;
- xfs_trans_log_dquot(tp, dqp);
-
- xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT");
- xfs_trans_commit(tp, 0, NULL);
- xfs_qm_dqprint(dqp);
- xfs_qm_dqrele(dqp);
- mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
-
- return (0);
-}
-
-STATIC int
-xfs_qm_scall_getquota(
- xfs_mount_t *mp,
- xfs_dqid_t id,
- uint type,
- fs_disk_quota_t *out)
-{
- xfs_dquot_t *dqp;
- int error;
-
- /*
- * Try to get the dquot. We don't want it allocated on disk, so
- * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
- * exist, we'll get ENOENT back.
- */
- if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) {
- return (error);
- }
-
- xfs_dqtrace_entry(dqp, "Q_GETQUOTA SUCCESS");
- /*
- * If everything's NULL, this dquot doesn't quite exist as far as
- * our utility programs are concerned.
- */
- if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
- xfs_qm_dqput(dqp);
- return XFS_ERROR(ENOENT);
- }
- /* xfs_qm_dqprint(dqp); */
- /*
- * Convert the disk dquot to the exportable format
- */
- xfs_qm_export_dquot(mp, &dqp->q_core, out);
- xfs_qm_dqput(dqp);
- return (error ? XFS_ERROR(EFAULT) : 0);
-}
-
-
-STATIC int
-xfs_qm_log_quotaoff_end(
- xfs_mount_t *mp,
- xfs_qoff_logitem_t *startqoff,
- uint flags)
-{
- xfs_trans_t *tp;
- int error;
- xfs_qoff_logitem_t *qoffi;
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
-
- if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2,
- 0, 0, XFS_DEFAULT_LOG_COUNT))) {
- xfs_trans_cancel(tp, 0);
- return (error);
- }
-
- qoffi = xfs_trans_get_qoff_item(tp, startqoff,
- flags & XFS_ALL_QUOTA_ACCT);
- xfs_trans_log_quotaoff_item(tp, qoffi);
-
- /*
- * We have to make sure that the transaction is secure on disk before we
- * return and actually stop quota accounting. So, make it synchronous.
- * We don't care about quotoff's performance.
- */
- xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0, NULL);
- return (error);
-}
-
-
-STATIC int
-xfs_qm_log_quotaoff(
- xfs_mount_t *mp,
- xfs_qoff_logitem_t **qoffstartp,
- uint flags)
-{
- xfs_trans_t *tp;
- int error;
- unsigned long s;
- xfs_qoff_logitem_t *qoffi=NULL;
- uint oldsbqflag=0;
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
- if ((error = xfs_trans_reserve(tp, 0,
- sizeof(xfs_qoff_logitem_t) * 2 +
- mp->m_sb.sb_sectsize + 128,
- 0,
- 0,
- XFS_DEFAULT_LOG_COUNT))) {
- goto error0;
- }
-
- qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
- xfs_trans_log_quotaoff_item(tp, qoffi);
-
- s = XFS_SB_LOCK(mp);
- oldsbqflag = mp->m_sb.sb_qflags;
- mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
- XFS_SB_UNLOCK(mp, s);
-
- xfs_mod_sb(tp, XFS_SB_QFLAGS);
-
- /*
- * We have to make sure that the transaction is secure on disk before we
- * return and actually stop quota accounting. So, make it synchronous.
- * We don't care about quotoff's performance.
- */
- xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0, NULL);
-
-error0:
- if (error) {
- xfs_trans_cancel(tp, 0);
- /*
- * No one else is modifying sb_qflags, so this is OK.
- * We still hold the quotaofflock.
- */
- s = XFS_SB_LOCK(mp);
- mp->m_sb.sb_qflags = oldsbqflag;
- XFS_SB_UNLOCK(mp, s);
- }
- *qoffstartp = qoffi;
- return (error);
-}
-
-
-/*
- * Translate an internal style on-disk-dquot to the exportable format.
- * The main differences are that the counters/limits are all in Basic
- * Blocks (BBs) instead of the internal FSBs, and all on-disk data has
- * to be converted to the native endianness.
- */
-STATIC void
-xfs_qm_export_dquot(
- xfs_mount_t *mp,
- xfs_disk_dquot_t *src,
- struct fs_disk_quota *dst)
-{
- memset(dst, 0, sizeof(*dst));
- dst->d_version = FS_DQUOT_VERSION; /* different from src->d_version */
- dst->d_flags = xfs_qm_export_qtype_flags(src->d_flags);
- dst->d_id = be32_to_cpu(src->d_id);
- dst->d_blk_hardlimit =
- XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_hardlimit));
- dst->d_blk_softlimit =
- XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_softlimit));
- dst->d_ino_hardlimit = be64_to_cpu(src->d_ino_hardlimit);
- dst->d_ino_softlimit = be64_to_cpu(src->d_ino_softlimit);
- dst->d_bcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_bcount));
- dst->d_icount = be64_to_cpu(src->d_icount);
- dst->d_btimer = be32_to_cpu(src->d_btimer);
- dst->d_itimer = be32_to_cpu(src->d_itimer);
- dst->d_iwarns = be16_to_cpu(src->d_iwarns);
- dst->d_bwarns = be16_to_cpu(src->d_bwarns);
- dst->d_rtb_hardlimit =
- XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_hardlimit));
- dst->d_rtb_softlimit =
- XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_softlimit));
- dst->d_rtbcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtbcount));
- dst->d_rtbtimer = be32_to_cpu(src->d_rtbtimer);
- dst->d_rtbwarns = be16_to_cpu(src->d_rtbwarns);
-
- /*
- * Internally, we don't reset all the timers when quota enforcement
- * gets turned off. No need to confuse the userlevel code,
- * so return zeroes in that case.
- */
- if (! XFS_IS_QUOTA_ENFORCED(mp)) {
- dst->d_btimer = 0;
- dst->d_itimer = 0;
- dst->d_rtbtimer = 0;
- }
-
-#ifdef DEBUG
- if (XFS_IS_QUOTA_ENFORCED(mp) && dst->d_id != 0) {
- if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
- (dst->d_blk_softlimit > 0)) {
- ASSERT(dst->d_btimer != 0);
- }
- if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) &&
- (dst->d_ino_softlimit > 0)) {
- ASSERT(dst->d_itimer != 0);
- }
- }
-#endif
-}
-
-STATIC uint
-xfs_qm_import_qtype_flags(
- uint uflags)
-{
- uint oflags = 0;
-
- /*
- * Can't be more than one, or none.
- */
- if (((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ==
- (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ||
- ((uflags & (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) ==
- (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) ||
- ((uflags & (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) ==
- (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) ||
- ((uflags & (XFS_GROUP_QUOTA|XFS_USER_QUOTA|XFS_PROJ_QUOTA)) == 0))
- return (0);
-
- oflags |= (uflags & XFS_USER_QUOTA) ? XFS_DQ_USER : 0;
- oflags |= (uflags & XFS_PROJ_QUOTA) ? XFS_DQ_PROJ : 0;
- oflags |= (uflags & XFS_GROUP_QUOTA) ? XFS_DQ_GROUP: 0;
- return oflags;
-}
-
-STATIC uint
-xfs_qm_export_qtype_flags(
- uint flags)
-{
- /*
- * Can't be more than one, or none.
- */
- ASSERT((flags & (XFS_PROJ_QUOTA | XFS_USER_QUOTA)) !=
- (XFS_PROJ_QUOTA | XFS_USER_QUOTA));
- ASSERT((flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)) !=
- (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA));
- ASSERT((flags & (XFS_USER_QUOTA | XFS_GROUP_QUOTA)) !=
- (XFS_USER_QUOTA | XFS_GROUP_QUOTA));
- ASSERT((flags & (XFS_PROJ_QUOTA|XFS_USER_QUOTA|XFS_GROUP_QUOTA)) != 0);
-
- return (flags & XFS_DQ_USER) ?
- XFS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
- XFS_PROJ_QUOTA : XFS_GROUP_QUOTA;
-}
-
-STATIC uint
-xfs_qm_import_flags(
- uint uflags)
-{
- uint flags = 0;
-
- if (uflags & XFS_QUOTA_UDQ_ACCT)
- flags |= XFS_UQUOTA_ACCT;
- if (uflags & XFS_QUOTA_PDQ_ACCT)
- flags |= XFS_PQUOTA_ACCT;
- if (uflags & XFS_QUOTA_GDQ_ACCT)
- flags |= XFS_GQUOTA_ACCT;
- if (uflags & XFS_QUOTA_UDQ_ENFD)
- flags |= XFS_UQUOTA_ENFD;
- if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
- flags |= XFS_OQUOTA_ENFD;
- return (flags);
-}
-
-
-STATIC uint
-xfs_qm_export_flags(
- uint flags)
-{
- uint uflags;
-
- uflags = 0;
- if (flags & XFS_UQUOTA_ACCT)
- uflags |= XFS_QUOTA_UDQ_ACCT;
- if (flags & XFS_PQUOTA_ACCT)
- uflags |= XFS_QUOTA_PDQ_ACCT;
- if (flags & XFS_GQUOTA_ACCT)
- uflags |= XFS_QUOTA_GDQ_ACCT;
- if (flags & XFS_UQUOTA_ENFD)
- uflags |= XFS_QUOTA_UDQ_ENFD;
- if (flags & (XFS_OQUOTA_ENFD)) {
- uflags |= (flags & XFS_GQUOTA_ACCT) ?
- XFS_QUOTA_GDQ_ENFD : XFS_QUOTA_PDQ_ENFD;
- }
- return (uflags);
-}
-
-
-/*
- * Go thru all the inodes in the file system, releasing their dquots.
- * Note that the mount structure gets modified to indicate that quotas are off
- * AFTER this, in the case of quotaoff. This also gets called from
- * xfs_rootumount.
- */
-void
-xfs_qm_dqrele_all_inodes(
- struct xfs_mount *mp,
- uint flags)
-{
- xfs_inode_t *ip, *topino;
- uint ireclaims;
- vnode_t *vp;
- boolean_t vnode_refd;
-
- ASSERT(mp->m_quotainfo);
-
- XFS_MOUNT_ILOCK(mp);
-again:
- ip = mp->m_inodes;
- if (ip == NULL) {
- XFS_MOUNT_IUNLOCK(mp);
- return;
- }
- do {
- /* Skip markers inserted by xfs_sync */
- if (ip->i_mount == NULL) {
- ip = ip->i_mnext;
- continue;
- }
- /* Root inode, rbmip and rsumip have associated blocks */
- if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) {
- ASSERT(ip->i_udquot == NULL);
- ASSERT(ip->i_gdquot == NULL);
- ip = ip->i_mnext;
- continue;
- }
- vp = XFS_ITOV_NULL(ip);
- if (!vp) {
- ASSERT(ip->i_udquot == NULL);
- ASSERT(ip->i_gdquot == NULL);
- ip = ip->i_mnext;
- continue;
- }
- vnode_refd = B_FALSE;
- if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
- ireclaims = mp->m_ireclaims;
- topino = mp->m_inodes;
- vp = vn_grab(vp);
- if (!vp)
- goto again;
-
- XFS_MOUNT_IUNLOCK(mp);
- /* XXX restart limit ? */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- vnode_refd = B_TRUE;
- } else {
- ireclaims = mp->m_ireclaims;
- topino = mp->m_inodes;
- XFS_MOUNT_IUNLOCK(mp);
- }
-
- /*
- * We don't keep the mountlock across the dqrele() call,
- * since it can take a while..
- */
- if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
- xfs_qm_dqrele(ip->i_udquot);
- ip->i_udquot = NULL;
- }
- if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) {
- xfs_qm_dqrele(ip->i_gdquot);
- ip->i_gdquot = NULL;
- }
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- /*
- * Wait until we've dropped the ilock and mountlock to
- * do the vn_rele. Or be condemned to an eternity in the
- * inactive code in hell.
- */
- if (vnode_refd)
- VN_RELE(vp);
- XFS_MOUNT_ILOCK(mp);
- /*
- * If an inode was inserted or removed, we gotta
- * start over again.
- */
- if (topino != mp->m_inodes || mp->m_ireclaims != ireclaims) {
- /* XXX use a sentinel */
- goto again;
- }
- ip = ip->i_mnext;
- } while (ip != mp->m_inodes);
-
- XFS_MOUNT_IUNLOCK(mp);
-}
-
-/*------------------------------------------------------------------------*/
-#ifdef DEBUG
-/*
- * This contains all the test functions for XFS disk quotas.
- * Currently it does a quota accounting check. ie. it walks through
- * all inodes in the file system, calculating the dquot accounting fields,
- * and prints out any inconsistencies.
- */
-xfs_dqhash_t *qmtest_udqtab;
-xfs_dqhash_t *qmtest_gdqtab;
-int qmtest_hashmask;
-int qmtest_nfails;
-mutex_t qcheck_lock;
-
-#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
- (__psunsigned_t)(id)) & \
- (qmtest_hashmask - 1))
-
-#define DQTEST_HASH(mp, id, type) ((type & XFS_DQ_USER) ? \
- (qmtest_udqtab + \
- DQTEST_HASHVAL(mp, id)) : \
- (qmtest_gdqtab + \
- DQTEST_HASHVAL(mp, id)))
-
-#define DQTEST_LIST_PRINT(l, NXT, title) \
-{ \
- xfs_dqtest_t *dqp; int i = 0;\
- cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \
- for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \
- dqp = (xfs_dqtest_t *)dqp->NXT) { \
- cmn_err(CE_DEBUG, " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \
- ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \
- dqp->d_bcount, dqp->d_icount); } \
-}
-
-typedef struct dqtest {
- xfs_dqmarker_t q_lists;
- xfs_dqhash_t *q_hash; /* the hashchain header */
- xfs_mount_t *q_mount; /* filesystem this relates to */
- xfs_dqid_t d_id; /* user id or group id */
- xfs_qcnt_t d_bcount; /* # disk blocks owned by the user */
- xfs_qcnt_t d_icount; /* # inodes owned by the user */
-} xfs_dqtest_t;
-
-STATIC void
-xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
-{
- xfs_dquot_t *d;
- if (((d) = (h)->qh_next))
- (d)->HL_PREVP = &((dqp)->HL_NEXT);
- (dqp)->HL_NEXT = d;
- (dqp)->HL_PREVP = &((h)->qh_next);
- (h)->qh_next = (xfs_dquot_t *)dqp;
- (h)->qh_version++;
- (h)->qh_nelems++;
-}
-STATIC void
-xfs_qm_dqtest_print(
- xfs_dqtest_t *d)
-{
- cmn_err(CE_DEBUG, "-----------DQTEST DQUOT----------------");
- cmn_err(CE_DEBUG, "---- dquot ID = %d", d->d_id);
- cmn_err(CE_DEBUG, "---- fs = 0x%p", d->q_mount);
- cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)",
- d->d_bcount, (int)d->d_bcount);
- cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)",
- d->d_icount, (int)d->d_icount);
- cmn_err(CE_DEBUG, "---------------------------");
-}
-
-STATIC void
-xfs_qm_dqtest_failed(
- xfs_dqtest_t *d,
- xfs_dquot_t *dqp,
- char *reason,
- xfs_qcnt_t a,
- xfs_qcnt_t b,
- int error)
-{
- qmtest_nfails++;
- if (error)
- cmn_err(CE_DEBUG, "quotacheck failed id=%d, err=%d\nreason: %s",
- d->d_id, error, reason);
- else
- cmn_err(CE_DEBUG, "quotacheck failed id=%d (%s) [%d != %d]",
- d->d_id, reason, (int)a, (int)b);
- xfs_qm_dqtest_print(d);
- if (dqp)
- xfs_qm_dqprint(dqp);
-}
-
-STATIC int
-xfs_dqtest_cmp2(
- xfs_dqtest_t *d,
- xfs_dquot_t *dqp)
-{
- int err = 0;
- if (be64_to_cpu(dqp->q_core.d_icount) != d->d_icount) {
- xfs_qm_dqtest_failed(d, dqp, "icount mismatch",
- be64_to_cpu(dqp->q_core.d_icount),
- d->d_icount, 0);
- err++;
- }
- if (be64_to_cpu(dqp->q_core.d_bcount) != d->d_bcount) {
- xfs_qm_dqtest_failed(d, dqp, "bcount mismatch",
- be64_to_cpu(dqp->q_core.d_bcount),
- d->d_bcount, 0);
- err++;
- }
- if (dqp->q_core.d_blk_softlimit &&
- be64_to_cpu(dqp->q_core.d_bcount) >=
- be64_to_cpu(dqp->q_core.d_blk_softlimit)) {
- if (!dqp->q_core.d_btimer && dqp->q_core.d_id) {
- cmn_err(CE_DEBUG,
- "%d [%s] [0x%p] BLK TIMER NOT STARTED",
- d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount);
- err++;
- }
- }
- if (dqp->q_core.d_ino_softlimit &&
- be64_to_cpu(dqp->q_core.d_icount) >=
- be64_to_cpu(dqp->q_core.d_ino_softlimit)) {
- if (!dqp->q_core.d_itimer && dqp->q_core.d_id) {
- cmn_err(CE_DEBUG,
- "%d [%s] [0x%p] INO TIMER NOT STARTED",
- d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount);
- err++;
- }
- }
-#ifdef QUOTADEBUG
- if (!err) {
- cmn_err(CE_DEBUG, "%d [%s] [0x%p] qchecked",
- d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount);
- }
-#endif
- return (err);
-}
-
-STATIC void
-xfs_dqtest_cmp(
- xfs_dqtest_t *d)
-{
- xfs_dquot_t *dqp;
- int error;
-
- /* xfs_qm_dqtest_print(d); */
- if ((error = xfs_qm_dqget(d->q_mount, NULL, d->d_id, d->dq_flags, 0,
- &dqp))) {
- xfs_qm_dqtest_failed(d, NULL, "dqget failed", 0, 0, error);
- return;
- }
- xfs_dqtest_cmp2(d, dqp);
- xfs_qm_dqput(dqp);
-}
-
-STATIC int
-xfs_qm_internalqcheck_dqget(
- xfs_mount_t *mp,
- xfs_dqid_t id,
- uint type,
- xfs_dqtest_t **O_dq)
-{
- xfs_dqtest_t *d;
- xfs_dqhash_t *h;
-
- h = DQTEST_HASH(mp, id, type);
- for (d = (xfs_dqtest_t *) h->qh_next; d != NULL;
- d = (xfs_dqtest_t *) d->HL_NEXT) {
- /* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */
- if (d->d_id == id && mp == d->q_mount) {
- *O_dq = d;
- return (0);
- }
- }
- d = kmem_zalloc(sizeof(xfs_dqtest_t), KM_SLEEP);
- d->dq_flags = type;
- d->d_id = id;
- d->q_mount = mp;
- d->q_hash = h;
- xfs_qm_hashinsert(h, d);
- *O_dq = d;
- return (0);
-}
-
-STATIC void
-xfs_qm_internalqcheck_get_dquots(
- xfs_mount_t *mp,
- xfs_dqid_t uid,
- xfs_dqid_t projid,
- xfs_dqid_t gid,
- xfs_dqtest_t **ud,
- xfs_dqtest_t **gd)
-{
- if (XFS_IS_UQUOTA_ON(mp))
- xfs_qm_internalqcheck_dqget(mp, uid, XFS_DQ_USER, ud);
- if (XFS_IS_GQUOTA_ON(mp))
- xfs_qm_internalqcheck_dqget(mp, gid, XFS_DQ_GROUP, gd);
- else if (XFS_IS_PQUOTA_ON(mp))
- xfs_qm_internalqcheck_dqget(mp, projid, XFS_DQ_PROJ, gd);
-}
-
-
-STATIC void
-xfs_qm_internalqcheck_dqadjust(
- xfs_inode_t *ip,
- xfs_dqtest_t *d)
-{
- d->d_icount++;
- d->d_bcount += (xfs_qcnt_t)ip->i_d.di_nblocks;
-}
-
-STATIC int
-xfs_qm_internalqcheck_adjust(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t ino, /* inode number to get data for */
- void __user *buffer, /* not used */
- int ubsize, /* not used */
- void *private_data, /* not used */
- xfs_daddr_t bno, /* starting block of inode cluster */
- int *ubused, /* not used */
- void *dip, /* not used */
- int *res) /* bulkstat result code */
-{
- xfs_inode_t *ip;
- xfs_dqtest_t *ud, *gd;
- uint lock_flags;
- boolean_t ipreleased;
- int error;
-
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
- if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
- *res = BULKSTAT_RV_NOTHING;
- qdprintk("internalqcheck: ino=%llu, uqino=%llu, gqino=%llu\n",
- (unsigned long long) ino,
- (unsigned long long) mp->m_sb.sb_uquotino,
- (unsigned long long) mp->m_sb.sb_gquotino);
- return XFS_ERROR(EINVAL);
- }
- ipreleased = B_FALSE;
- again:
- lock_flags = XFS_ILOCK_SHARED;
- if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip, bno))) {
- *res = BULKSTAT_RV_NOTHING;
- return (error);
- }
-
- if (ip->i_d.di_mode == 0) {
- xfs_iput_new(ip, lock_flags);
- *res = BULKSTAT_RV_NOTHING;
- return XFS_ERROR(ENOENT);
- }
-
- /*
- * This inode can have blocks after eof which can get released
- * when we send it to inactive. Since we don't check the dquot
- * until the after all our calculations are done, we must get rid
- * of those now.
- */
- if (! ipreleased) {
- xfs_iput(ip, lock_flags);
- ipreleased = B_TRUE;
- goto again;
- }
- xfs_qm_internalqcheck_get_dquots(mp,
- (xfs_dqid_t) ip->i_d.di_uid,
- (xfs_dqid_t) ip->i_d.di_projid,
- (xfs_dqid_t) ip->i_d.di_gid,
- &ud, &gd);
- if (XFS_IS_UQUOTA_ON(mp)) {
- ASSERT(ud);
- xfs_qm_internalqcheck_dqadjust(ip, ud);
- }
- if (XFS_IS_OQUOTA_ON(mp)) {
- ASSERT(gd);
- xfs_qm_internalqcheck_dqadjust(ip, gd);
- }
- xfs_iput(ip, lock_flags);
- *res = BULKSTAT_RV_DIDONE;
- return (0);
-}
-
-
-/* PRIVATE, debugging */
-int
-xfs_qm_internalqcheck(
- xfs_mount_t *mp)
-{
- xfs_ino_t lastino;
- int done, count;
- int i;
- xfs_dqtest_t *d, *e;
- xfs_dqhash_t *h1;
- int error;
-
- lastino = 0;
- qmtest_hashmask = 32;
- count = 5;
- done = 0;
- qmtest_nfails = 0;
-
- if (! XFS_IS_QUOTA_ON(mp))
- return XFS_ERROR(ESRCH);
-
- xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
- XFS_bflush(mp->m_ddev_targp);
- xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
- XFS_bflush(mp->m_ddev_targp);
-
- mutex_lock(&qcheck_lock);
- /* There should be absolutely no quota activity while this
- is going on. */
- qmtest_udqtab = kmem_zalloc(qmtest_hashmask *
- sizeof(xfs_dqhash_t), KM_SLEEP);
- qmtest_gdqtab = kmem_zalloc(qmtest_hashmask *
- sizeof(xfs_dqhash_t), KM_SLEEP);
- do {
- /*
- * Iterate thru all the inodes in the file system,
- * adjusting the corresponding dquot counters
- */
- if ((error = xfs_bulkstat(mp, &lastino, &count,
- xfs_qm_internalqcheck_adjust, NULL,
- 0, NULL, BULKSTAT_FG_IGET, &done))) {
- break;
- }
- } while (! done);
- if (error) {
- cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error);
- }
- cmn_err(CE_DEBUG, "Checking results against system dquots");
- for (i = 0; i < qmtest_hashmask; i++) {
- h1 = &qmtest_udqtab[i];
- for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
- xfs_dqtest_cmp(d);
- e = (xfs_dqtest_t *) d->HL_NEXT;
- kmem_free(d, sizeof(xfs_dqtest_t));
- d = e;
- }
- h1 = &qmtest_gdqtab[i];
- for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
- xfs_dqtest_cmp(d);
- e = (xfs_dqtest_t *) d->HL_NEXT;
- kmem_free(d, sizeof(xfs_dqtest_t));
- d = e;
- }
- }
-
- if (qmtest_nfails) {
- cmn_err(CE_DEBUG, "******** quotacheck failed ********");
- cmn_err(CE_DEBUG, "failures = %d", qmtest_nfails);
- } else {
- cmn_err(CE_DEBUG, "******** quotacheck successful! ********");
- }
- kmem_free(qmtest_udqtab, qmtest_hashmask * sizeof(xfs_dqhash_t));
- kmem_free(qmtest_gdqtab, qmtest_hashmask * sizeof(xfs_dqhash_t));
- mutex_unlock(&qcheck_lock);
- return (qmtest_nfails);
-}
-
-#endif /* DEBUG */
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
deleted file mode 100644
index b7ddd04aae3..00000000000
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_QUOTA_PRIV_H__
-#define __XFS_QUOTA_PRIV_H__
-
-/*
- * Number of bmaps that we ask from bmapi when doing a quotacheck.
- * We make this restriction to keep the memory usage to a minimum.
- */
-#define XFS_DQITER_MAP_SIZE 10
-
-/* Number of dquots that fit in to a dquot block */
-#define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk)
-
-#define XFS_ISLOCKED_INODE(ip) (ismrlocked(&(ip)->i_lock, \
- MR_UPDATE | MR_ACCESS) != 0)
-#define XFS_ISLOCKED_INODE_EXCL(ip) (ismrlocked(&(ip)->i_lock, \
- MR_UPDATE) != 0)
-
-#define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t))
-
-#define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims)
-#define XFS_QI_UQIP(mp) ((mp)->m_quotainfo->qi_uquotaip)
-#define XFS_QI_GQIP(mp) ((mp)->m_quotainfo->qi_gquotaip)
-#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen)
-#define XFS_QI_BTIMELIMIT(mp) ((mp)->m_quotainfo->qi_btimelimit)
-#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit)
-#define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit)
-#define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit)
-#define XFS_QI_RTBWARNLIMIT(mp) ((mp)->m_quotainfo->qi_rtbwarnlimit)
-#define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit)
-#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock)
-
-#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist)
-#define XFS_QI_MPLLOCK(mp) ((mp)->m_quotainfo->qi_dqlist.qh_lock)
-#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next)
-#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
-
-#define XQMLCK(h) (mutex_lock(&((h)->qh_lock)))
-#define XQMUNLCK(h) (mutex_unlock(&((h)->qh_lock)))
-#ifdef DEBUG
-struct xfs_dqhash;
-static inline int XQMISLCKD(struct xfs_dqhash *h)
-{
- if (mutex_trylock(&h->qh_lock)) {
- mutex_unlock(&h->qh_lock);
- return 0;
- }
- return 1;
-}
-#endif
-
-#define XFS_DQ_HASH_LOCK(h) XQMLCK(h)
-#define XFS_DQ_HASH_UNLOCK(h) XQMUNLCK(h)
-#define XFS_DQ_IS_HASH_LOCKED(h) XQMISLCKD(h)
-
-#define xfs_qm_mplist_lock(mp) XQMLCK(&(XFS_QI_MPL_LIST(mp)))
-#define xfs_qm_mplist_unlock(mp) XQMUNLCK(&(XFS_QI_MPL_LIST(mp)))
-#define XFS_QM_IS_MPLIST_LOCKED(mp) XQMISLCKD(&(XFS_QI_MPL_LIST(mp)))
-
-#define xfs_qm_freelist_lock(qm) XQMLCK(&((qm)->qm_dqfreelist))
-#define xfs_qm_freelist_unlock(qm) XQMUNLCK(&((qm)->qm_dqfreelist))
-#define XFS_QM_IS_FREELIST_LOCKED(qm) XQMISLCKD(&((qm)->qm_dqfreelist))
-
-/*
- * Hash into a bucket in the dquot hash table, based on <mp, id>.
- */
-#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
- (__psunsigned_t)(id)) & \
- (xfs_Gqm->qm_dqhashmask - 1))
-#define XFS_DQ_HASH(mp, id, type) (type == XFS_DQ_USER ? \
- (xfs_Gqm->qm_usr_dqhtable + \
- XFS_DQ_HASHVAL(mp, id)) : \
- (xfs_Gqm->qm_grp_dqhtable + \
- XFS_DQ_HASHVAL(mp, id)))
-#define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? \
- XFS_IS_UQUOTA_ON(mp) : \
- XFS_IS_OQUOTA_ON(mp))
-#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
- !dqp->q_core.d_blk_hardlimit && \
- !dqp->q_core.d_blk_softlimit && \
- !dqp->q_core.d_rtb_hardlimit && \
- !dqp->q_core.d_rtb_softlimit && \
- !dqp->q_core.d_ino_hardlimit && \
- !dqp->q_core.d_ino_softlimit && \
- !dqp->q_core.d_bcount && \
- !dqp->q_core.d_rtbcount && \
- !dqp->q_core.d_icount)
-
-#define HL_PREVP dq_hashlist.ql_prevp
-#define HL_NEXT dq_hashlist.ql_next
-#define MPL_PREVP dq_mplist.ql_prevp
-#define MPL_NEXT dq_mplist.ql_next
-
-
-#define _LIST_REMOVE(h, dqp, PVP, NXT) \
- { \
- xfs_dquot_t *d; \
- if (((d) = (dqp)->NXT)) \
- (d)->PVP = (dqp)->PVP; \
- *((dqp)->PVP) = d; \
- (dqp)->NXT = NULL; \
- (dqp)->PVP = NULL; \
- (h)->qh_version++; \
- (h)->qh_nelems--; \
- }
-
-#define _LIST_INSERT(h, dqp, PVP, NXT) \
- { \
- xfs_dquot_t *d; \
- if (((d) = (h)->qh_next)) \
- (d)->PVP = &((dqp)->NXT); \
- (dqp)->NXT = d; \
- (dqp)->PVP = &((h)->qh_next); \
- (h)->qh_next = dqp; \
- (h)->qh_version++; \
- (h)->qh_nelems++; \
- }
-
-#define FOREACH_DQUOT_IN_MP(dqp, mp) \
- for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT)
-
-#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist) \
-for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
- (dqp) = (dqp)->dq_flnext)
-
-#define XQM_HASHLIST_INSERT(h, dqp) \
- _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT)
-
-#define XQM_FREELIST_INSERT(h, dqp) \
- xfs_qm_freelist_append(h, dqp)
-
-#define XQM_MPLIST_INSERT(h, dqp) \
- _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT)
-
-#define XQM_HASHLIST_REMOVE(h, dqp) \
- _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT)
-#define XQM_FREELIST_REMOVE(dqp) \
- xfs_qm_freelist_unlink(dqp)
-#define XQM_MPLIST_REMOVE(h, dqp) \
- { _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \
- XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; }
-
-#define XFS_DQ_IS_LOGITEM_INITD(dqp) ((dqp)->q_logitem.qli_dquot == (dqp))
-
-#define XFS_QM_DQP_TO_DQACCT(tp, dqp) (XFS_QM_ISUDQ(dqp) ? \
- (tp)->t_dqinfo->dqa_usrdquots : \
- (tp)->t_dqinfo->dqa_grpdquots)
-#define XFS_IS_SUSER_DQUOT(dqp) \
- (!((dqp)->q_core.d_id))
-
-#define XFS_PURGE_INODE(ip) \
- IRELE(ip);
-
-#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
- (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
- (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
-#define DQFLAGTO_DIRTYSTR(d) (XFS_DQ_IS_DIRTY(d) ? "DIRTY" : "NOTDIRTY")
-
-#endif /* __XFS_QUOTA_PRIV_H__ */
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
deleted file mode 100644
index b08b3d9345b..00000000000
--- a/fs/xfs/support/debug.c
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "debug.h"
-#include "spin.h"
-#include <asm/page.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-
-static char message[256]; /* keep it off the stack */
-static DEFINE_SPINLOCK(xfs_err_lock);
-
-/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
-#define XFS_MAX_ERR_LEVEL 7
-#define XFS_ERR_MASK ((1 << 3) - 1)
-static const char * const err_level[XFS_MAX_ERR_LEVEL+1] =
- {KERN_EMERG, KERN_ALERT, KERN_CRIT,
- KERN_ERR, KERN_WARNING, KERN_NOTICE,
- KERN_INFO, KERN_DEBUG};
-
-void
-cmn_err(register int level, char *fmt, ...)
-{
- char *fp = fmt;
- int len;
- ulong flags;
- va_list ap;
-
- level &= XFS_ERR_MASK;
- if (level > XFS_MAX_ERR_LEVEL)
- level = XFS_MAX_ERR_LEVEL;
- spin_lock_irqsave(&xfs_err_lock,flags);
- va_start(ap, fmt);
- if (*fmt == '!') fp++;
- len = vsprintf(message, fp, ap);
- if (message[len-1] != '\n')
- strcat(message, "\n");
- printk("%s%s", err_level[level], message);
- va_end(ap);
- spin_unlock_irqrestore(&xfs_err_lock,flags);
-
- if (level == CE_PANIC)
- BUG();
-}
-
-void
-icmn_err(register int level, char *fmt, va_list ap)
-{
- ulong flags;
- int len;
-
- level &= XFS_ERR_MASK;
- if(level > XFS_MAX_ERR_LEVEL)
- level = XFS_MAX_ERR_LEVEL;
- spin_lock_irqsave(&xfs_err_lock,flags);
- len = vsprintf(message, fmt, ap);
- if (message[len-1] != '\n')
- strcat(message, "\n");
- spin_unlock_irqrestore(&xfs_err_lock,flags);
- printk("%s%s", err_level[level], message);
- if (level == CE_PANIC)
- BUG();
-}
-
-void
-assfail(char *expr, char *file, int line)
-{
- printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line);
- BUG();
-}
-
-#if ((defined(DEBUG) || defined(INDUCE_IO_ERRROR)) && !defined(NO_WANT_RANDOM))
-unsigned long random(void)
-{
- static unsigned long RandomValue = 1;
- /* cycles pseudo-randomly through all values between 1 and 2^31 - 2 */
- register long rv = RandomValue;
- register long lo;
- register long hi;
-
- hi = rv / 127773;
- lo = rv % 127773;
- rv = 16807 * lo - 2836 * hi;
- if (rv <= 0) rv += 2147483647;
- return RandomValue = rv;
-}
-#endif /* DEBUG || INDUCE_IO_ERRROR || !NO_WANT_RANDOM */
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
deleted file mode 100644
index e3bf58112e7..00000000000
--- a/fs/xfs/support/debug.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_SUPPORT_DEBUG_H__
-#define __XFS_SUPPORT_DEBUG_H__
-
-#include <stdarg.h>
-
-#define CE_DEBUG 7 /* debug */
-#define CE_CONT 6 /* continuation */
-#define CE_NOTE 5 /* notice */
-#define CE_WARN 4 /* warning */
-#define CE_ALERT 1 /* alert */
-#define CE_PANIC 0 /* panic */
-
-extern void icmn_err(int, char *, va_list)
- __attribute__ ((format (printf, 2, 0)));
-extern void cmn_err(int, char *, ...)
- __attribute__ ((format (printf, 2, 3)));
-extern void assfail(char *expr, char *f, int l);
-
-#define prdev(fmt,targ,args...) \
- printk("Device %s - " fmt "\n", XFS_BUFTARG_NAME(targ), ## args)
-
-#define ASSERT_ALWAYS(expr) \
- (unlikely((expr) != 0) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
-
-#ifndef DEBUG
-# define ASSERT(expr) ((void)0)
-#else
-# define ASSERT(expr) ASSERT_ALWAYS(expr)
-extern unsigned long random(void);
-#endif
-
-#ifndef STATIC
-# define STATIC static
-#endif
-
-#endif /* __XFS_SUPPORT_DEBUG_H__ */
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
deleted file mode 100644
index 841aa4c15b8..00000000000
--- a/fs/xfs/support/ktrace.c
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include <xfs.h>
-
-static kmem_zone_t *ktrace_hdr_zone;
-static kmem_zone_t *ktrace_ent_zone;
-static int ktrace_zentries;
-
-void
-ktrace_init(int zentries)
-{
- ktrace_zentries = zentries;
-
- ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t),
- "ktrace_hdr");
- ASSERT(ktrace_hdr_zone);
-
- ktrace_ent_zone = kmem_zone_init(ktrace_zentries
- * sizeof(ktrace_entry_t),
- "ktrace_ent");
- ASSERT(ktrace_ent_zone);
-}
-
-void
-ktrace_uninit(void)
-{
- kmem_cache_destroy(ktrace_hdr_zone);
- kmem_cache_destroy(ktrace_ent_zone);
-}
-
-/*
- * ktrace_alloc()
- *
- * Allocate a ktrace header and enough buffering for the given
- * number of entries.
- */
-ktrace_t *
-ktrace_alloc(int nentries, unsigned int __nocast sleep)
-{
- ktrace_t *ktp;
- ktrace_entry_t *ktep;
-
- ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep);
-
- if (ktp == (ktrace_t*)NULL) {
- /*
- * KM_SLEEP callers don't expect failure.
- */
- if (sleep & KM_SLEEP)
- panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
-
- return NULL;
- }
-
- /*
- * Special treatment for buffers with the ktrace_zentries entries
- */
- if (nentries == ktrace_zentries) {
- ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone,
- sleep);
- } else {
- ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)),
- sleep);
- }
-
- if (ktep == NULL) {
- /*
- * KM_SLEEP callers don't expect failure.
- */
- if (sleep & KM_SLEEP)
- panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
-
- kmem_free(ktp, sizeof(*ktp));
-
- return NULL;
- }
-
- spinlock_init(&(ktp->kt_lock), "kt_lock");
-
- ktp->kt_entries = ktep;
- ktp->kt_nentries = nentries;
- ktp->kt_index = 0;
- ktp->kt_rollover = 0;
- return ktp;
-}
-
-
-/*
- * ktrace_free()
- *
- * Free up the ktrace header and buffer. It is up to the caller
- * to ensure that no-one is referencing it.
- */
-void
-ktrace_free(ktrace_t *ktp)
-{
- int entries_size;
-
- if (ktp == (ktrace_t *)NULL)
- return;
-
- spinlock_destroy(&ktp->kt_lock);
-
- /*
- * Special treatment for the Vnode trace buffer.
- */
- if (ktp->kt_nentries == ktrace_zentries) {
- kmem_zone_free(ktrace_ent_zone, ktp->kt_entries);
- } else {
- entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t));
-
- kmem_free(ktp->kt_entries, entries_size);
- }
-
- kmem_zone_free(ktrace_hdr_zone, ktp);
-}
-
-
-/*
- * Enter the given values into the "next" entry in the trace buffer.
- * kt_index is always the index of the next entry to be filled.
- */
-void
-ktrace_enter(
- ktrace_t *ktp,
- void *val0,
- void *val1,
- void *val2,
- void *val3,
- void *val4,
- void *val5,
- void *val6,
- void *val7,
- void *val8,
- void *val9,
- void *val10,
- void *val11,
- void *val12,
- void *val13,
- void *val14,
- void *val15)
-{
- static DEFINE_SPINLOCK(wrap_lock);
- unsigned long flags;
- int index;
- ktrace_entry_t *ktep;
-
- ASSERT(ktp != NULL);
-
- /*
- * Grab an entry by pushing the index up to the next one.
- */
- spin_lock_irqsave(&wrap_lock, flags);
- index = ktp->kt_index;
- if (++ktp->kt_index == ktp->kt_nentries)
- ktp->kt_index = 0;
- spin_unlock_irqrestore(&wrap_lock, flags);
-
- if (!ktp->kt_rollover && index == ktp->kt_nentries - 1)
- ktp->kt_rollover = 1;
-
- ASSERT((index >= 0) && (index < ktp->kt_nentries));
-
- ktep = &(ktp->kt_entries[index]);
-
- ktep->val[0] = val0;
- ktep->val[1] = val1;
- ktep->val[2] = val2;
- ktep->val[3] = val3;
- ktep->val[4] = val4;
- ktep->val[5] = val5;
- ktep->val[6] = val6;
- ktep->val[7] = val7;
- ktep->val[8] = val8;
- ktep->val[9] = val9;
- ktep->val[10] = val10;
- ktep->val[11] = val11;
- ktep->val[12] = val12;
- ktep->val[13] = val13;
- ktep->val[14] = val14;
- ktep->val[15] = val15;
-}
-
-/*
- * Return the number of entries in the trace buffer.
- */
-int
-ktrace_nentries(
- ktrace_t *ktp)
-{
- if (ktp == NULL) {
- return 0;
- }
-
- return (ktp->kt_rollover ? ktp->kt_nentries : ktp->kt_index);
-}
-
-/*
- * ktrace_first()
- *
- * This is used to find the start of the trace buffer.
- * In conjunction with ktrace_next() it can be used to
- * iterate through the entire trace buffer. This code does
- * not do any locking because it is assumed that it is called
- * from the debugger.
- *
- * The caller must pass in a pointer to a ktrace_snap
- * structure in which we will keep some state used to
- * iterate through the buffer. This state must not touched
- * by any code outside of this module.
- */
-ktrace_entry_t *
-ktrace_first(ktrace_t *ktp, ktrace_snap_t *ktsp)
-{
- ktrace_entry_t *ktep;
- int index;
- int nentries;
-
- if (ktp->kt_rollover)
- index = ktp->kt_index;
- else
- index = 0;
-
- ktsp->ks_start = index;
- ktep = &(ktp->kt_entries[index]);
-
- nentries = ktrace_nentries(ktp);
- index++;
- if (index < nentries) {
- ktsp->ks_index = index;
- } else {
- ktsp->ks_index = 0;
- if (index > nentries)
- ktep = NULL;
- }
- return ktep;
-}
-
-/*
- * ktrace_next()
- *
- * This is used to iterate through the entries of the given
- * trace buffer. The caller must pass in the ktrace_snap_t
- * structure initialized by ktrace_first(). The return value
- * will be either a pointer to the next ktrace_entry or NULL
- * if all of the entries have been traversed.
- */
-ktrace_entry_t *
-ktrace_next(
- ktrace_t *ktp,
- ktrace_snap_t *ktsp)
-{
- int index;
- ktrace_entry_t *ktep;
-
- index = ktsp->ks_index;
- if (index == ktsp->ks_start) {
- ktep = NULL;
- } else {
- ktep = &ktp->kt_entries[index];
- }
-
- index++;
- if (index == ktrace_nentries(ktp)) {
- ktsp->ks_index = 0;
- } else {
- ktsp->ks_index = index;
- }
-
- return ktep;
-}
-
-/*
- * ktrace_skip()
- *
- * Skip the next "count" entries and return the entry after that.
- * Return NULL if this causes us to iterate past the beginning again.
- */
-ktrace_entry_t *
-ktrace_skip(
- ktrace_t *ktp,
- int count,
- ktrace_snap_t *ktsp)
-{
- int index;
- int new_index;
- ktrace_entry_t *ktep;
- int nentries = ktrace_nentries(ktp);
-
- index = ktsp->ks_index;
- new_index = index + count;
- while (new_index >= nentries) {
- new_index -= nentries;
- }
- if (index == ktsp->ks_start) {
- /*
- * We've iterated around to the start, so we're done.
- */
- ktep = NULL;
- } else if ((new_index < index) && (index < ktsp->ks_index)) {
- /*
- * We've skipped past the start again, so we're done.
- */
- ktep = NULL;
- ktsp->ks_index = ktsp->ks_start;
- } else {
- ktep = &(ktp->kt_entries[new_index]);
- new_index++;
- if (new_index == nentries) {
- ktsp->ks_index = 0;
- } else {
- ktsp->ks_index = new_index;
- }
- }
- return ktep;
-}
diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h
deleted file mode 100644
index 0d73216287c..00000000000
--- a/fs/xfs/support/ktrace.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_SUPPORT_KTRACE_H__
-#define __XFS_SUPPORT_KTRACE_H__
-
-#include <spin.h>
-
-/*
- * Trace buffer entry structure.
- */
-typedef struct ktrace_entry {
- void *val[16];
-} ktrace_entry_t;
-
-/*
- * Trace buffer header structure.
- */
-typedef struct ktrace {
- lock_t kt_lock; /* mutex to guard counters */
- int kt_nentries; /* number of entries in trace buf */
- int kt_index; /* current index in entries */
- int kt_rollover;
- ktrace_entry_t *kt_entries; /* buffer of entries */
-} ktrace_t;
-
-/*
- * Trace buffer snapshot structure.
- */
-typedef struct ktrace_snap {
- int ks_start; /* kt_index at time of snap */
- int ks_index; /* current index */
-} ktrace_snap_t;
-
-
-#ifdef CONFIG_XFS_TRACE
-
-extern void ktrace_init(int zentries);
-extern void ktrace_uninit(void);
-
-extern ktrace_t *ktrace_alloc(int, unsigned int __nocast);
-extern void ktrace_free(ktrace_t *);
-
-extern void ktrace_enter(
- ktrace_t *,
- void *,
- void *,
- void *,
- void *,
- void *,
- void *,
- void *,
- void *,
- void *,
- void *,
- void *,
- void *,
- void *,
- void *,
- void *,
- void *);
-
-extern ktrace_entry_t *ktrace_first(ktrace_t *, ktrace_snap_t *);
-extern int ktrace_nentries(ktrace_t *);
-extern ktrace_entry_t *ktrace_next(ktrace_t *, ktrace_snap_t *);
-extern ktrace_entry_t *ktrace_skip(ktrace_t *, int, ktrace_snap_t *);
-
-#else
-#define ktrace_init(x) do { } while (0)
-#define ktrace_uninit() do { } while (0)
-#endif /* CONFIG_XFS_TRACE */
-
-#endif /* __XFS_SUPPORT_KTRACE_H__ */
diff --git a/fs/xfs/support/move.c b/fs/xfs/support/move.c
deleted file mode 100644
index caefa17b80f..00000000000
--- a/fs/xfs/support/move.c
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include <xfs.h>
-
-/* Read from kernel buffer at src to user/kernel buffer defined
- * by the uio structure. Advance the pointer in the uio struct
- * as we go.
- */
-int
-uio_read(caddr_t src, size_t len, struct uio *uio)
-{
- size_t count;
-
- if (!len || !uio->uio_resid)
- return 0;
-
- count = uio->uio_iov->iov_len;
- if (!count)
- return 0;
- if (count > len)
- count = len;
-
- if (uio->uio_segflg == UIO_USERSPACE) {
- if (copy_to_user(uio->uio_iov->iov_base, src, count))
- return EFAULT;
- } else {
- ASSERT(uio->uio_segflg == UIO_SYSSPACE);
- memcpy(uio->uio_iov->iov_base, src, count);
- }
-
- uio->uio_iov->iov_base = (void*)((char*)uio->uio_iov->iov_base + count);
- uio->uio_iov->iov_len -= count;
- uio->uio_offset += count;
- uio->uio_resid -= count;
- return 0;
-}
diff --git a/fs/xfs/support/move.h b/fs/xfs/support/move.h
deleted file mode 100644
index 97a2498d2da..00000000000
--- a/fs/xfs/support/move.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * Portions Copyright (c) 1982, 1986, 1993, 1994
- * The Regents of the University of California. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-#ifndef __XFS_SUPPORT_MOVE_H__
-#define __XFS_SUPPORT_MOVE_H__
-
-#include <linux/uio.h>
-#include <asm/uaccess.h>
-
-/* Segment flag values. */
-enum uio_seg {
- UIO_USERSPACE, /* from user data space */
- UIO_SYSSPACE, /* from system space */
-};
-
-struct uio {
- struct iovec *uio_iov; /* pointer to array of iovecs */
- int uio_iovcnt; /* number of iovecs in array */
- xfs_off_t uio_offset; /* offset in file this uio corresponds to */
- int uio_resid; /* residual i/o count */
- enum uio_seg uio_segflg; /* see above */
-};
-
-typedef struct uio uio_t;
-typedef struct iovec iovec_t;
-
-extern int uio_read (caddr_t, size_t, uio_t *);
-
-#endif /* __XFS_SUPPORT_MOVE_H__ */
diff --git a/fs/xfs/linux-2.6/time.h b/fs/xfs/time.h
index 387e695a184..387e695a184 100644
--- a/fs/xfs/linux-2.6/time.h
+++ b/fs/xfs/time.h
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/uuid.c
index a3d565a6773..b83f76b6d41 100644
--- a/fs/xfs/support/uuid.c
+++ b/fs/xfs/uuid.c
@@ -17,17 +17,6 @@
*/
#include <xfs.h>
-static mutex_t uuid_monitor;
-static int uuid_table_size;
-static uuid_t *uuid_table;
-
-void
-uuid_init(void)
-{
- mutex_init(&uuid_monitor);
-}
-
-
/* IRIX interpretation of an uuid_t */
typedef struct {
__be32 uu_timelow;
@@ -50,13 +39,7 @@ uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
fsid[0] = (be16_to_cpu(uup->uu_clockseq) << 16) |
be16_to_cpu(uup->uu_timemid);
- fsid[1] = be16_to_cpu(uup->uu_timelow);
-}
-
-void
-uuid_create_nil(uuid_t *uuid)
-{
- memset(uuid, 0, sizeof(*uuid));
+ fsid[1] = be32_to_cpu(uup->uu_timelow);
}
int
@@ -78,64 +61,3 @@ uuid_equal(uuid_t *uuid1, uuid_t *uuid2)
{
return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1;
}
-
-/*
- * Given a 128-bit uuid, return a 64-bit value by adding the top and bottom
- * 64-bit words. NOTE: This function can not be changed EVER. Although
- * brain-dead, some applications depend on this 64-bit value remaining
- * persistent. Specifically, DMI vendors store the value as a persistent
- * filehandle.
- */
-__uint64_t
-uuid_hash64(uuid_t *uuid)
-{
- __uint64_t *sp = (__uint64_t *)uuid;
-
- return sp[0] + sp[1];
-}
-
-int
-uuid_table_insert(uuid_t *uuid)
-{
- int i, hole;
-
- mutex_lock(&uuid_monitor);
- for (i = 0, hole = -1; i < uuid_table_size; i++) {
- if (uuid_is_nil(&uuid_table[i])) {
- hole = i;
- continue;
- }
- if (uuid_equal(uuid, &uuid_table[i])) {
- mutex_unlock(&uuid_monitor);
- return 0;
- }
- }
- if (hole < 0) {
- uuid_table = kmem_realloc(uuid_table,
- (uuid_table_size + 1) * sizeof(*uuid_table),
- uuid_table_size * sizeof(*uuid_table),
- KM_SLEEP);
- hole = uuid_table_size++;
- }
- uuid_table[hole] = *uuid;
- mutex_unlock(&uuid_monitor);
- return 1;
-}
-
-void
-uuid_table_remove(uuid_t *uuid)
-{
- int i;
-
- mutex_lock(&uuid_monitor);
- for (i = 0; i < uuid_table_size; i++) {
- if (uuid_is_nil(&uuid_table[i]))
- continue;
- if (!uuid_equal(uuid, &uuid_table[i]))
- continue;
- uuid_create_nil(&uuid_table[i]);
- break;
- }
- ASSERT(i < uuid_table_size);
- mutex_unlock(&uuid_monitor);
-}
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/uuid.h
index b6f5922199b..104db0f3bed 100644
--- a/fs/xfs/support/uuid.h
+++ b/fs/xfs/uuid.h
@@ -22,13 +22,14 @@ typedef struct {
unsigned char __u_bits[16];
} uuid_t;
-extern void uuid_init(void);
-extern void uuid_create_nil(uuid_t *uuid);
extern int uuid_is_nil(uuid_t *uuid);
extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
-extern __uint64_t uuid_hash64(uuid_t *uuid);
-extern int uuid_table_insert(uuid_t *uuid);
-extern void uuid_table_remove(uuid_t *uuid);
+
+static inline void
+uuid_copy(uuid_t *dst, uuid_t *src)
+{
+ memcpy(dst, src, sizeof(uuid_t));
+}
#endif /* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 1a48dbb902a..a742c47f7d5 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -17,5 +17,18 @@
*/
#ifndef __XFS_H__
#define __XFS_H__
-#include <linux-2.6/xfs_linux.h>
+
+#ifdef CONFIG_XFS_DEBUG
+#define STATIC
+#define DEBUG 1
+#define XFS_BUF_LOCK_TRACKING 1
+#endif
+
+#ifdef CONFIG_XFS_WARN
+#define XFS_WARN 1
+#endif
+
+
+#include "xfs_linux.h"
+
#endif /* __XFS_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 4ff0f4e41c6..6888ad886ff 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2008, Christoph Hellwig
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -16,912 +16,291 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_inum.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
#include "xfs_inode.h"
-#include "xfs_btree.h"
#include "xfs_acl.h"
-#include "xfs_mac.h"
#include "xfs_attr.h"
-
-#include <linux/capability.h>
+#include "xfs_trace.h"
+#include <linux/slab.h>
+#include <linux/xattr.h>
#include <linux/posix_acl_xattr.h>
-STATIC int xfs_acl_setmode(vnode_t *, xfs_acl_t *, int *);
-STATIC void xfs_acl_filter_mode(mode_t, xfs_acl_t *);
-STATIC void xfs_acl_get_endian(xfs_acl_t *);
-STATIC int xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
-STATIC int xfs_acl_invalid(xfs_acl_t *);
-STATIC void xfs_acl_sync_mode(mode_t, xfs_acl_t *);
-STATIC void xfs_acl_get_attr(vnode_t *, xfs_acl_t *, int, int, int *);
-STATIC void xfs_acl_set_attr(vnode_t *, xfs_acl_t *, int, int *);
-STATIC int xfs_acl_allow_set(vnode_t *, int);
-
-kmem_zone_t *xfs_acl_zone;
-
-
-/*
- * Test for existence of access ACL attribute as efficiently as possible.
- */
-int
-xfs_acl_vhasacl_access(
- vnode_t *vp)
-{
- int error;
-
- xfs_acl_get_attr(vp, NULL, _ACL_TYPE_ACCESS, ATTR_KERNOVAL, &error);
- return (error == 0);
-}
/*
- * Test for existence of default ACL attribute as efficiently as possible.
+ * Locking scheme:
+ * - all ACL updates are protected by inode->i_mutex, which is taken before
+ * calling into this file.
*/
-int
-xfs_acl_vhasacl_default(
- vnode_t *vp)
-{
- int error;
- if (!VN_ISDIR(vp))
- return 0;
- xfs_acl_get_attr(vp, NULL, _ACL_TYPE_DEFAULT, ATTR_KERNOVAL, &error);
- return (error == 0);
-}
-
-/*
- * Convert from extended attribute representation to in-memory for XFS.
- */
-STATIC int
-posix_acl_xattr_to_xfs(
- posix_acl_xattr_header *src,
- size_t size,
- xfs_acl_t *dest)
+STATIC struct posix_acl *
+xfs_acl_from_disk(
+ struct xfs_acl *aclp,
+ int max_entries)
{
- posix_acl_xattr_entry *src_entry;
- xfs_acl_entry_t *dest_entry;
- int n;
-
- if (!src || !dest)
- return EINVAL;
-
- if (size < sizeof(posix_acl_xattr_header))
- return EINVAL;
+ struct posix_acl_entry *acl_e;
+ struct posix_acl *acl;
+ struct xfs_acl_entry *ace;
+ unsigned int count, i;
- if (src->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
- return EOPNOTSUPP;
+ count = be32_to_cpu(aclp->acl_cnt);
+ if (count > max_entries)
+ return ERR_PTR(-EFSCORRUPTED);
- memset(dest, 0, sizeof(xfs_acl_t));
- dest->acl_cnt = posix_acl_xattr_count(size);
- if (dest->acl_cnt < 0 || dest->acl_cnt > XFS_ACL_MAX_ENTRIES)
- return EINVAL;
+ acl = posix_acl_alloc(count, GFP_KERNEL);
+ if (!acl)
+ return ERR_PTR(-ENOMEM);
- /*
- * acl_set_file(3) may request that we set default ACLs with
- * zero length -- defend (gracefully) against that here.
- */
- if (!dest->acl_cnt)
- return 0;
+ for (i = 0; i < count; i++) {
+ acl_e = &acl->a_entries[i];
+ ace = &aclp->acl_entry[i];
- src_entry = (posix_acl_xattr_entry *)((char *)src + sizeof(*src));
- dest_entry = &dest->acl_entry[0];
+ /*
+ * The tag is 32 bits on disk and 16 bits in core.
+ *
+ * Because every access to it goes through the core
+ * format first this is not a problem.
+ */
+ acl_e->e_tag = be32_to_cpu(ace->ae_tag);
+ acl_e->e_perm = be16_to_cpu(ace->ae_perm);
- for (n = 0; n < dest->acl_cnt; n++, src_entry++, dest_entry++) {
- dest_entry->ae_perm = le16_to_cpu(src_entry->e_perm);
- if (_ACL_PERM_INVALID(dest_entry->ae_perm))
- return EINVAL;
- dest_entry->ae_tag = le16_to_cpu(src_entry->e_tag);
- switch(dest_entry->ae_tag) {
+ switch (acl_e->e_tag) {
case ACL_USER:
+ acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id));
+ break;
case ACL_GROUP:
- dest_entry->ae_id = le32_to_cpu(src_entry->e_id);
+ acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id));
break;
case ACL_USER_OBJ:
case ACL_GROUP_OBJ:
case ACL_MASK:
case ACL_OTHER:
- dest_entry->ae_id = ACL_UNDEFINED_ID;
break;
default:
- return EINVAL;
+ goto fail;
}
}
- if (xfs_acl_invalid(dest))
- return EINVAL;
+ return acl;
- return 0;
+fail:
+ posix_acl_release(acl);
+ return ERR_PTR(-EINVAL);
}
-/*
- * Comparison function called from xfs_sort().
- * Primary key is ae_tag, secondary key is ae_id.
- */
-STATIC int
-xfs_acl_entry_compare(
- const void *va,
- const void *vb)
+STATIC void
+xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
{
- xfs_acl_entry_t *a = (xfs_acl_entry_t *)va,
- *b = (xfs_acl_entry_t *)vb;
+ const struct posix_acl_entry *acl_e;
+ struct xfs_acl_entry *ace;
+ int i;
- if (a->ae_tag == b->ae_tag)
- return (a->ae_id - b->ae_id);
- return (a->ae_tag - b->ae_tag);
-}
+ aclp->acl_cnt = cpu_to_be32(acl->a_count);
+ for (i = 0; i < acl->a_count; i++) {
+ ace = &aclp->acl_entry[i];
+ acl_e = &acl->a_entries[i];
-/*
- * Convert from in-memory XFS to extended attribute representation.
- */
-STATIC int
-posix_acl_xfs_to_xattr(
- xfs_acl_t *src,
- posix_acl_xattr_header *dest,
- size_t size)
-{
- int n;
- size_t new_size = posix_acl_xattr_size(src->acl_cnt);
- posix_acl_xattr_entry *dest_entry;
- xfs_acl_entry_t *src_entry;
-
- if (size < new_size)
- return -ERANGE;
-
- /* Need to sort src XFS ACL by <ae_tag,ae_id> */
- xfs_sort(src->acl_entry, src->acl_cnt, sizeof(src->acl_entry[0]),
- xfs_acl_entry_compare);
-
- dest->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
- dest_entry = &dest->a_entries[0];
- src_entry = &src->acl_entry[0];
- for (n = 0; n < src->acl_cnt; n++, dest_entry++, src_entry++) {
- dest_entry->e_perm = cpu_to_le16(src_entry->ae_perm);
- if (_ACL_PERM_INVALID(src_entry->ae_perm))
- return -EINVAL;
- dest_entry->e_tag = cpu_to_le16(src_entry->ae_tag);
- switch (src_entry->ae_tag) {
+ ace->ae_tag = cpu_to_be32(acl_e->e_tag);
+ switch (acl_e->e_tag) {
case ACL_USER:
+ ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid));
+ break;
case ACL_GROUP:
- dest_entry->e_id = cpu_to_le32(src_entry->ae_id);
- break;
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- dest_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
+ ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid));
break;
default:
- return -EINVAL;
- }
- }
- return new_size;
-}
-
-int
-xfs_acl_vget(
- vnode_t *vp,
- void *acl,
- size_t size,
- int kind)
-{
- int error;
- xfs_acl_t *xfs_acl = NULL;
- posix_acl_xattr_header *ext_acl = acl;
- int flags = 0;
-
- VN_HOLD(vp);
- if(size) {
- if (!(_ACL_ALLOC(xfs_acl))) {
- error = ENOMEM;
- goto out;
+ ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID);
+ break;
}
- memset(xfs_acl, 0, sizeof(xfs_acl_t));
- } else
- flags = ATTR_KERNOVAL;
- xfs_acl_get_attr(vp, xfs_acl, kind, flags, &error);
- if (error)
- goto out;
-
- if (!size) {
- error = -posix_acl_xattr_size(XFS_ACL_MAX_ENTRIES);
- } else {
- if (xfs_acl_invalid(xfs_acl)) {
- error = EINVAL;
- goto out;
- }
- if (kind == _ACL_TYPE_ACCESS) {
- vattr_t va;
-
- va.va_mask = XFS_AT_MODE;
- VOP_GETATTR(vp, &va, 0, sys_cred, error);
- if (error)
- goto out;
- xfs_acl_sync_mode(va.va_mode, xfs_acl);
- }
- error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size);
+ ace->ae_perm = cpu_to_be16(acl_e->e_perm);
}
-out:
- VN_RELE(vp);
- if(xfs_acl)
- _ACL_FREE(xfs_acl);
- return -error;
}
-int
-xfs_acl_vremove(
- vnode_t *vp,
- int kind)
+struct posix_acl *
+xfs_get_acl(struct inode *inode, int type)
{
- int error;
-
- VN_HOLD(vp);
- error = xfs_acl_allow_set(vp, kind);
- if (!error) {
- VOP_ATTR_REMOVE(vp, kind == _ACL_TYPE_DEFAULT?
- SGI_ACL_DEFAULT: SGI_ACL_FILE,
- ATTR_ROOT, sys_cred, error);
- if (error == ENOATTR)
- error = 0; /* 'scool */
+ struct xfs_inode *ip = XFS_I(inode);
+ struct posix_acl *acl = NULL;
+ struct xfs_acl *xfs_acl;
+ unsigned char *ea_name;
+ int error;
+ int len;
+
+ trace_xfs_get_acl(ip);
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ ea_name = SGI_ACL_FILE;
+ break;
+ case ACL_TYPE_DEFAULT:
+ ea_name = SGI_ACL_DEFAULT;
+ break;
+ default:
+ BUG();
}
- VN_RELE(vp);
- return -error;
-}
-int
-xfs_acl_vset(
- vnode_t *vp,
- void *acl,
- size_t size,
- int kind)
-{
- posix_acl_xattr_header *ext_acl = acl;
- xfs_acl_t *xfs_acl;
- int error;
- int basicperms = 0; /* more than std unix perms? */
-
- if (!acl)
- return -EINVAL;
-
- if (!(_ACL_ALLOC(xfs_acl)))
- return -ENOMEM;
+ /*
+ * If we have a cached ACLs value just return it, not need to
+ * go out to the disk.
+ */
+ len = XFS_ACL_MAX_SIZE(ip->i_mount);
+ xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
+ if (!xfs_acl)
+ return ERR_PTR(-ENOMEM);
- error = posix_acl_xattr_to_xfs(ext_acl, size, xfs_acl);
+ error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
+ &len, ATTR_ROOT);
if (error) {
- _ACL_FREE(xfs_acl);
- return -error;
- }
- if (!xfs_acl->acl_cnt) {
- _ACL_FREE(xfs_acl);
- return 0;
+ /*
+ * If the attribute doesn't exist make sure we have a negative
+ * cache entry, for any other error assume it is transient and
+ * leave the cache entry as ACL_NOT_CACHED.
+ */
+ if (error == -ENOATTR)
+ goto out_update_cache;
+ goto out;
}
- VN_HOLD(vp);
- error = xfs_acl_allow_set(vp, kind);
- if (error)
+ acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount));
+ if (IS_ERR(acl))
goto out;
- /* Incoming ACL exists, set file mode based on its value */
- if (kind == _ACL_TYPE_ACCESS)
- xfs_acl_setmode(vp, xfs_acl, &basicperms);
-
- /*
- * If we have more than std unix permissions, set up the actual attr.
- * Otherwise, delete any existing attr. This prevents us from
- * having actual attrs for permissions that can be stored in the
- * standard permission bits.
- */
- if (!basicperms) {
- xfs_acl_set_attr(vp, xfs_acl, kind, &error);
- } else {
- xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
- }
-
+out_update_cache:
+ set_cached_acl(inode, type, acl);
out:
- VN_RELE(vp);
- _ACL_FREE(xfs_acl);
- return -error;
+ kmem_free(xfs_acl);
+ return acl;
}
-int
-xfs_acl_iaccess(
- xfs_inode_t *ip,
- mode_t mode,
- cred_t *cr)
+STATIC int
+__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
{
- xfs_acl_t *acl;
- int rval;
-
- if (!(_ACL_ALLOC(acl)))
- return -1;
-
- /* If the file has no ACL return -1. */
- rval = sizeof(xfs_acl_t);
- if (xfs_attr_fetch(ip, SGI_ACL_FILE, SGI_ACL_FILE_SIZE,
- (char *)acl, &rval, ATTR_ROOT | ATTR_KERNACCESS, cr)) {
- _ACL_FREE(acl);
- return -1;
- }
- xfs_acl_get_endian(acl);
+ struct xfs_inode *ip = XFS_I(inode);
+ unsigned char *ea_name;
+ int error;
- /* If the file has an empty ACL return -1. */
- if (acl->acl_cnt == XFS_ACL_NOT_PRESENT) {
- _ACL_FREE(acl);
- return -1;
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ ea_name = SGI_ACL_FILE;
+ break;
+ case ACL_TYPE_DEFAULT:
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ ea_name = SGI_ACL_DEFAULT;
+ break;
+ default:
+ return -EINVAL;
}
- /* Synchronize ACL with mode bits */
- xfs_acl_sync_mode(ip->i_d.di_mode, acl);
+ if (acl) {
+ struct xfs_acl *xfs_acl;
+ int len = XFS_ACL_MAX_SIZE(ip->i_mount);
- rval = xfs_acl_access(ip->i_d.di_uid, ip->i_d.di_gid, acl, mode, cr);
- _ACL_FREE(acl);
- return rval;
-}
+ xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
+ if (!xfs_acl)
+ return -ENOMEM;
-STATIC int
-xfs_acl_allow_set(
- vnode_t *vp,
- int kind)
-{
- vattr_t va;
- int error;
-
- if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
- return EPERM;
- if (kind == _ACL_TYPE_DEFAULT && !VN_ISDIR(vp))
- return ENOTDIR;
- if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
- return EROFS;
- va.va_mask = XFS_AT_UID;
- VOP_GETATTR(vp, &va, 0, NULL, error);
- if (error)
- return error;
- if (va.va_uid != current->fsuid && !capable(CAP_FOWNER))
- return EPERM;
- return error;
-}
-
-/*
- * The access control process to determine the access permission:
- * if uid == file owner id, use the file owner bits.
- * if gid == file owner group id, use the file group bits.
- * scan ACL for a maching user or group, and use matched entry
- * permission. Use total permissions of all matching group entries,
- * until all acl entries are exhausted. The final permission produced
- * by matching acl entry or entries needs to be & with group permission.
- * if not owner, owning group, or matching entry in ACL, use file
- * other bits.
- */
-STATIC int
-xfs_acl_capability_check(
- mode_t mode,
- cred_t *cr)
-{
- if ((mode & ACL_READ) && !capable_cred(cr, CAP_DAC_READ_SEARCH))
- return EACCES;
- if ((mode & ACL_WRITE) && !capable_cred(cr, CAP_DAC_OVERRIDE))
- return EACCES;
- if ((mode & ACL_EXECUTE) && !capable_cred(cr, CAP_DAC_OVERRIDE))
- return EACCES;
-
- return 0;
-}
+ xfs_acl_to_disk(xfs_acl, acl);
-/*
- * Note: cr is only used here for the capability check if the ACL test fails.
- * It is not used to find out the credentials uid or groups etc, as was
- * done in IRIX. It is assumed that the uid and groups for the current
- * thread are taken from "current" instead of the cr parameter.
- */
-STATIC int
-xfs_acl_access(
- uid_t fuid,
- gid_t fgid,
- xfs_acl_t *fap,
- mode_t md,
- cred_t *cr)
-{
- xfs_acl_entry_t matched;
- int i, allows;
- int maskallows = -1; /* true, but not 1, either */
- int seen_userobj = 0;
+ /* subtract away the unused acl entries */
+ len -= sizeof(struct xfs_acl_entry) *
+ (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
- matched.ae_tag = 0; /* Invalid type */
- matched.ae_perm = 0;
- md >>= 6; /* Normalize the bits for comparison */
+ error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
+ len, ATTR_ROOT);
- for (i = 0; i < fap->acl_cnt; i++) {
+ kmem_free(xfs_acl);
+ } else {
/*
- * Break out if we've got a user_obj entry or
- * a user entry and the mask (and have processed USER_OBJ)
+ * A NULL ACL argument means we want to remove the ACL.
*/
- if (matched.ae_tag == ACL_USER_OBJ)
- break;
- if (matched.ae_tag == ACL_USER) {
- if (maskallows != -1 && seen_userobj)
- break;
- if (fap->acl_entry[i].ae_tag != ACL_MASK &&
- fap->acl_entry[i].ae_tag != ACL_USER_OBJ)
- continue;
- }
- /* True if this entry allows the requested access */
- allows = ((fap->acl_entry[i].ae_perm & md) == md);
+ error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT);
- switch (fap->acl_entry[i].ae_tag) {
- case ACL_USER_OBJ:
- seen_userobj = 1;
- if (fuid != current->fsuid)
- continue;
- matched.ae_tag = ACL_USER_OBJ;
- matched.ae_perm = allows;
- break;
- case ACL_USER:
- if (fap->acl_entry[i].ae_id != current->fsuid)
- continue;
- matched.ae_tag = ACL_USER;
- matched.ae_perm = allows;
- break;
- case ACL_GROUP_OBJ:
- if ((matched.ae_tag == ACL_GROUP_OBJ ||
- matched.ae_tag == ACL_GROUP) && !allows)
- continue;
- if (!in_group_p(fgid))
- continue;
- matched.ae_tag = ACL_GROUP_OBJ;
- matched.ae_perm = allows;
- break;
- case ACL_GROUP:
- if ((matched.ae_tag == ACL_GROUP_OBJ ||
- matched.ae_tag == ACL_GROUP) && !allows)
- continue;
- if (!in_group_p(fap->acl_entry[i].ae_id))
- continue;
- matched.ae_tag = ACL_GROUP;
- matched.ae_perm = allows;
- break;
- case ACL_MASK:
- maskallows = allows;
- break;
- case ACL_OTHER:
- if (matched.ae_tag != 0)
- continue;
- matched.ae_tag = ACL_OTHER;
- matched.ae_perm = allows;
- break;
- }
- }
- /*
- * First possibility is that no matched entry allows access.
- * The capability to override DAC may exist, so check for it.
- */
- switch (matched.ae_tag) {
- case ACL_OTHER:
- case ACL_USER_OBJ:
- if (matched.ae_perm)
- return 0;
- break;
- case ACL_USER:
- case ACL_GROUP_OBJ:
- case ACL_GROUP:
- if (maskallows && matched.ae_perm)
- return 0;
- break;
- case 0:
- break;
+ /*
+ * If the attribute didn't exist to start with that's fine.
+ */
+ if (error == -ENOATTR)
+ error = 0;
}
- return xfs_acl_capability_check(md, cr);
+ if (!error)
+ set_cached_acl(inode, type, acl);
+ return error;
}
-/*
- * ACL validity checker.
- * This acl validation routine checks each ACL entry read in makes sense.
- */
-STATIC int
-xfs_acl_invalid(
- xfs_acl_t *aclp)
+static int
+xfs_set_mode(struct inode *inode, umode_t mode)
{
- xfs_acl_entry_t *entry, *e;
- int user = 0, group = 0, other = 0, mask = 0;
- int mask_required = 0;
- int i, j;
+ int error = 0;
- if (!aclp)
- goto acl_invalid;
+ if (mode != inode->i_mode) {
+ struct iattr iattr;
- if (aclp->acl_cnt > XFS_ACL_MAX_ENTRIES)
- goto acl_invalid;
-
- for (i = 0; i < aclp->acl_cnt; i++) {
- entry = &aclp->acl_entry[i];
- switch (entry->ae_tag) {
- case ACL_USER_OBJ:
- if (user++)
- goto acl_invalid;
- break;
- case ACL_GROUP_OBJ:
- if (group++)
- goto acl_invalid;
- break;
- case ACL_OTHER:
- if (other++)
- goto acl_invalid;
- break;
- case ACL_USER:
- case ACL_GROUP:
- for (j = i + 1; j < aclp->acl_cnt; j++) {
- e = &aclp->acl_entry[j];
- if (e->ae_id == entry->ae_id &&
- e->ae_tag == entry->ae_tag)
- goto acl_invalid;
- }
- mask_required++;
- break;
- case ACL_MASK:
- if (mask++)
- goto acl_invalid;
- break;
- default:
- goto acl_invalid;
- }
- }
- if (!user || !group || !other || (mask_required && !mask))
- goto acl_invalid;
- else
- return 0;
-acl_invalid:
- return EINVAL;
-}
+ iattr.ia_valid = ATTR_MODE | ATTR_CTIME;
+ iattr.ia_mode = mode;
+ iattr.ia_ctime = current_fs_time(inode->i_sb);
-/*
- * Do ACL endian conversion.
- */
-STATIC void
-xfs_acl_get_endian(
- xfs_acl_t *aclp)
-{
- xfs_acl_entry_t *ace, *end;
-
- INT_SET(aclp->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
- end = &aclp->acl_entry[0]+aclp->acl_cnt;
- for (ace = &aclp->acl_entry[0]; ace < end; ace++) {
- INT_SET(ace->ae_tag, ARCH_CONVERT, ace->ae_tag);
- INT_SET(ace->ae_id, ARCH_CONVERT, ace->ae_id);
- INT_SET(ace->ae_perm, ARCH_CONVERT, ace->ae_perm);
+ error = -xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
}
-}
-/*
- * Get the ACL from the EA and do endian conversion.
- */
-STATIC void
-xfs_acl_get_attr(
- vnode_t *vp,
- xfs_acl_t *aclp,
- int kind,
- int flags,
- int *error)
-{
- int len = sizeof(xfs_acl_t);
-
- ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1);
- flags |= ATTR_ROOT;
- VOP_ATTR_GET(vp,
- kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE : SGI_ACL_DEFAULT,
- (char *)aclp, &len, flags, sys_cred, *error);
- if (*error || (flags & ATTR_KERNOVAL))
- return;
- xfs_acl_get_endian(aclp);
+ return error;
}
-/*
- * Set the EA with the ACL and do endian conversion.
- */
-STATIC void
-xfs_acl_set_attr(
- vnode_t *vp,
- xfs_acl_t *aclp,
- int kind,
- int *error)
+static int
+xfs_acl_exists(struct inode *inode, unsigned char *name)
{
- xfs_acl_entry_t *ace, *newace, *end;
- xfs_acl_t *newacl;
- int len;
+ int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb));
- if (!(_ACL_ALLOC(newacl))) {
- *error = ENOMEM;
- return;
- }
-
- len = sizeof(xfs_acl_t) -
- (sizeof(xfs_acl_entry_t) * (XFS_ACL_MAX_ENTRIES - aclp->acl_cnt));
- end = &aclp->acl_entry[0]+aclp->acl_cnt;
- for (ace = &aclp->acl_entry[0], newace = &newacl->acl_entry[0];
- ace < end;
- ace++, newace++) {
- INT_SET(newace->ae_tag, ARCH_CONVERT, ace->ae_tag);
- INT_SET(newace->ae_id, ARCH_CONVERT, ace->ae_id);
- INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
- }
- INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
- VOP_ATTR_SET(vp,
- kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE: SGI_ACL_DEFAULT,
- (char *)newacl, len, ATTR_ROOT, sys_cred, *error);
- _ACL_FREE(newacl);
+ return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
+ ATTR_ROOT|ATTR_KERNOVAL) == 0);
}
int
-xfs_acl_vtoacl(
- vnode_t *vp,
- xfs_acl_t *access_acl,
- xfs_acl_t *default_acl)
+posix_acl_access_exists(struct inode *inode)
{
- vattr_t va;
- int error = 0;
-
- if (access_acl) {
- /*
- * Get the Access ACL and the mode. If either cannot
- * be obtained for some reason, invalidate the access ACL.
- */
- xfs_acl_get_attr(vp, access_acl, _ACL_TYPE_ACCESS, 0, &error);
- if (!error) {
- /* Got the ACL, need the mode... */
- va.va_mask = XFS_AT_MODE;
- VOP_GETATTR(vp, &va, 0, sys_cred, error);
- }
-
- if (error)
- access_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
- else /* We have a good ACL and the file mode, synchronize. */
- xfs_acl_sync_mode(va.va_mode, access_acl);
- }
-
- if (default_acl) {
- xfs_acl_get_attr(vp, default_acl, _ACL_TYPE_DEFAULT, 0, &error);
- if (error)
- default_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
- }
- return error;
+ return xfs_acl_exists(inode, SGI_ACL_FILE);
}
-/*
- * This function retrieves the parent directory's acl, processes it
- * and lets the child inherit the acl(s) that it should.
- */
int
-xfs_acl_inherit(
- vnode_t *vp,
- vattr_t *vap,
- xfs_acl_t *pdaclp)
+posix_acl_default_exists(struct inode *inode)
{
- xfs_acl_t *cacl;
- int error = 0;
- int basicperms = 0;
-
- /*
- * If the parent does not have a default ACL, or it's an
- * invalid ACL, we're done.
- */
- if (!vp)
- return 0;
- if (!pdaclp || xfs_acl_invalid(pdaclp))
+ if (!S_ISDIR(inode->i_mode))
return 0;
-
- /*
- * Copy the default ACL of the containing directory to
- * the access ACL of the new file and use the mode that
- * was passed in to set up the correct initial values for
- * the u::,g::[m::], and o:: entries. This is what makes
- * umask() "work" with ACL's.
- */
-
- if (!(_ACL_ALLOC(cacl)))
- return ENOMEM;
-
- memcpy(cacl, pdaclp, sizeof(xfs_acl_t));
- xfs_acl_filter_mode(vap->va_mode, cacl);
- xfs_acl_setmode(vp, cacl, &basicperms);
-
- /*
- * Set the Default and Access ACL on the file. The mode is already
- * set on the file, so we don't need to worry about that.
- *
- * If the new file is a directory, its default ACL is a copy of
- * the containing directory's default ACL.
- */
- if (VN_ISDIR(vp))
- xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error);
- if (!error && !basicperms)
- xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error);
- _ACL_FREE(cacl);
- return error;
+ return xfs_acl_exists(inode, SGI_ACL_DEFAULT);
}
-/*
- * Set up the correct mode on the file based on the supplied ACL. This
- * makes sure that the mode on the file reflects the state of the
- * u::,g::[m::], and o:: entries in the ACL. Since the mode is where
- * the ACL is going to get the permissions for these entries, we must
- * synchronize the mode whenever we set the ACL on a file.
- */
-STATIC int
-xfs_acl_setmode(
- vnode_t *vp,
- xfs_acl_t *acl,
- int *basicperms)
+int
+xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- vattr_t va;
- xfs_acl_entry_t *ap;
- xfs_acl_entry_t *gap = NULL;
- int i, error, nomask = 1;
+ int error = 0;
- *basicperms = 1;
-
- if (acl->acl_cnt == XFS_ACL_NOT_PRESENT)
- return 0;
+ if (!acl)
+ goto set_acl;
- /*
- * Copy the u::, g::, o::, and m:: bits from the ACL into the
- * mode. The m:: bits take precedence over the g:: bits.
- */
- va.va_mask = XFS_AT_MODE;
- VOP_GETATTR(vp, &va, 0, sys_cred, error);
- if (error)
+ error = -E2BIG;
+ if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
return error;
- va.va_mask = XFS_AT_MODE;
- va.va_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO);
- ap = acl->acl_entry;
- for (i = 0; i < acl->acl_cnt; ++i) {
- switch (ap->ae_tag) {
- case ACL_USER_OBJ:
- va.va_mode |= ap->ae_perm << 6;
- break;
- case ACL_GROUP_OBJ:
- gap = ap;
- break;
- case ACL_MASK: /* more than just standard modes */
- nomask = 0;
- va.va_mode |= ap->ae_perm << 3;
- *basicperms = 0;
- break;
- case ACL_OTHER:
- va.va_mode |= ap->ae_perm;
- break;
- default: /* more than just standard modes */
- *basicperms = 0;
- break;
- }
- ap++;
- }
-
- /* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */
- if (gap && nomask)
- va.va_mode |= gap->ae_perm << 3;
+ if (type == ACL_TYPE_ACCESS) {
+ umode_t mode = inode->i_mode;
+ error = posix_acl_equiv_mode(acl, &mode);
- VOP_SETATTR(vp, &va, 0, sys_cred, error);
- return error;
-}
+ if (error <= 0) {
+ acl = NULL;
-/*
- * The permissions for the special ACL entries (u::, g::[m::], o::) are
- * actually stored in the file mode (if there is both a group and a mask,
- * the group is stored in the ACL entry and the mask is stored on the file).
- * This allows the mode to remain automatically in sync with the ACL without
- * the need for a call-back to the ACL system at every point where the mode
- * could change. This function takes the permissions from the specified mode
- * and places it in the supplied ACL.
- *
- * This implementation draws its validity from the fact that, when the ACL
- * was assigned, the mode was copied from the ACL.
- * If the mode did not change, therefore, the mode remains exactly what was
- * taken from the special ACL entries at assignment.
- * If a subsequent chmod() was done, the POSIX spec says that the change in
- * mode must cause an update to the ACL seen at user level and used for
- * access checks. Before and after a mode change, therefore, the file mode
- * most accurately reflects what the special ACL entries should permit/deny.
- *
- * CAVEAT: If someone sets the SGI_ACL_FILE attribute directly,
- * the existing mode bits will override whatever is in the
- * ACL. Similarly, if there is a pre-existing ACL that was
- * never in sync with its mode (owing to a bug in 6.5 and
- * before), it will now magically (or mystically) be
- * synchronized. This could cause slight astonishment, but
- * it is better than inconsistent permissions.
- *
- * The supplied ACL is a template that may contain any combination
- * of special entries. These are treated as place holders when we fill
- * out the ACL. This routine does not add or remove special entries, it
- * simply unites each special entry with its associated set of permissions.
- */
-STATIC void
-xfs_acl_sync_mode(
- mode_t mode,
- xfs_acl_t *acl)
-{
- int i, nomask = 1;
- xfs_acl_entry_t *ap;
- xfs_acl_entry_t *gap = NULL;
-
- /*
- * Set ACL entries. POSIX1003.1eD16 requires that the MASK
- * be set instead of the GROUP entry, if there is a MASK.
- */
- for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
- switch (ap->ae_tag) {
- case ACL_USER_OBJ:
- ap->ae_perm = (mode >> 6) & 0x7;
- break;
- case ACL_GROUP_OBJ:
- gap = ap;
- break;
- case ACL_MASK:
- nomask = 0;
- ap->ae_perm = (mode >> 3) & 0x7;
- break;
- case ACL_OTHER:
- ap->ae_perm = mode & 0x7;
- break;
- default:
- break;
+ if (error < 0)
+ return error;
}
- }
- /* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
- if (gap && nomask)
- gap->ae_perm = (mode >> 3) & 0x7;
-}
-
-/*
- * When inheriting an Access ACL from a directory Default ACL,
- * the ACL bits are set to the intersection of the ACL default
- * permission bits and the file permission bits in mode. If there
- * are no permission bits on the file then we must not give them
- * the ACL. This is what what makes umask() work with ACLs.
- */
-STATIC void
-xfs_acl_filter_mode(
- mode_t mode,
- xfs_acl_t *acl)
-{
- int i, nomask = 1;
- xfs_acl_entry_t *ap;
- xfs_acl_entry_t *gap = NULL;
- /*
- * Set ACL entries. POSIX1003.1eD16 requires that the MASK
- * be merged with GROUP entry, if there is a MASK.
- */
- for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
- switch (ap->ae_tag) {
- case ACL_USER_OBJ:
- ap->ae_perm &= (mode >> 6) & 0x7;
- break;
- case ACL_GROUP_OBJ:
- gap = ap;
- break;
- case ACL_MASK:
- nomask = 0;
- ap->ae_perm &= (mode >> 3) & 0x7;
- break;
- case ACL_OTHER:
- ap->ae_perm &= mode & 0x7;
- break;
- default:
- break;
- }
+ error = xfs_set_mode(inode, mode);
+ if (error)
+ return error;
}
- /* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
- if (gap && nomask)
- gap->ae_perm &= (mode >> 3) & 0x7;
+
+ set_acl:
+ return __xfs_set_acl(inode, type, acl);
}
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index f9315bc960c..5dc16374451 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -18,85 +18,58 @@
#ifndef __XFS_ACL_H__
#define __XFS_ACL_H__
-/*
- * Access Control Lists
- */
-typedef __uint16_t xfs_acl_perm_t;
-typedef __int32_t xfs_acl_type_t;
-typedef __int32_t xfs_acl_tag_t;
-typedef __int32_t xfs_acl_id_t;
+struct inode;
+struct posix_acl;
+struct xfs_inode;
-#define XFS_ACL_MAX_ENTRIES 25
#define XFS_ACL_NOT_PRESENT (-1)
-typedef struct xfs_acl_entry {
- xfs_acl_tag_t ae_tag;
- xfs_acl_id_t ae_id;
- xfs_acl_perm_t ae_perm;
-} xfs_acl_entry_t;
+/* On-disk XFS access control list structure */
+struct xfs_acl_entry {
+ __be32 ae_tag;
+ __be32 ae_id;
+ __be16 ae_perm;
+ __be16 ae_pad; /* fill the implicit hole in the structure */
+};
-typedef struct xfs_acl {
- __int32_t acl_cnt;
- xfs_acl_entry_t acl_entry[XFS_ACL_MAX_ENTRIES];
-} xfs_acl_t;
+struct xfs_acl {
+ __be32 acl_cnt;
+ struct xfs_acl_entry acl_entry[0];
+};
+
+/*
+ * The number of ACL entries allowed is defined by the on-disk format.
+ * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
+ * limited only by the maximum size of the xattr that stores the information.
+ */
+#define XFS_ACL_MAX_ENTRIES(mp) \
+ (xfs_sb_version_hascrc(&mp->m_sb) \
+ ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
+ sizeof(struct xfs_acl_entry) \
+ : 25)
+
+#define XFS_ACL_MAX_SIZE(mp) \
+ (sizeof(struct xfs_acl) + \
+ sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
/* On-disk XFS extended attribute names */
-#define SGI_ACL_FILE "SGI_ACL_FILE"
-#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT"
#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
-
#ifdef CONFIG_XFS_POSIX_ACL
-
-struct vattr;
-struct vnode;
-struct xfs_inode;
-
-extern struct kmem_zone *xfs_acl_zone;
-#define xfs_acl_zone_init(zone, name) \
- (zone) = kmem_zone_init(sizeof(xfs_acl_t), name)
-#define xfs_acl_zone_destroy(zone) kmem_cache_destroy(zone)
-
-extern int xfs_acl_inherit(struct vnode *, struct vattr *, xfs_acl_t *);
-extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
-extern int xfs_acl_vtoacl(struct vnode *, xfs_acl_t *, xfs_acl_t *);
-extern int xfs_acl_vhasacl_access(struct vnode *);
-extern int xfs_acl_vhasacl_default(struct vnode *);
-extern int xfs_acl_vset(struct vnode *, void *, size_t, int);
-extern int xfs_acl_vget(struct vnode *, void *, size_t, int);
-extern int xfs_acl_vremove(struct vnode *vp, int);
-
-#define _ACL_TYPE_ACCESS 1
-#define _ACL_TYPE_DEFAULT 2
-#define _ACL_PERM_INVALID(perm) ((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
-
-#define _ACL_INHERIT(c,v,d) (xfs_acl_inherit(c,v,d))
-#define _ACL_GET_ACCESS(pv,pa) (xfs_acl_vtoacl(pv,pa,NULL) == 0)
-#define _ACL_GET_DEFAULT(pv,pd) (xfs_acl_vtoacl(pv,NULL,pd) == 0)
-#define _ACL_ACCESS_EXISTS xfs_acl_vhasacl_access
-#define _ACL_DEFAULT_EXISTS xfs_acl_vhasacl_default
-#define _ACL_XFS_IACCESS(i,m,c) (XFS_IFORK_Q(i) ? xfs_acl_iaccess(i,m,c) : -1)
-
-#define _ACL_ALLOC(a) ((a) = kmem_zone_alloc(xfs_acl_zone, KM_SLEEP))
-#define _ACL_FREE(a) ((a)? kmem_zone_free(xfs_acl_zone, (a)):(void)0)
-
+extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
+extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int posix_acl_access_exists(struct inode *inode);
+extern int posix_acl_default_exists(struct inode *inode);
#else
-#define xfs_acl_zone_init(zone,name)
-#define xfs_acl_zone_destroy(zone)
-#define xfs_acl_vset(v,p,sz,t) (-EOPNOTSUPP)
-#define xfs_acl_vget(v,p,sz,t) (-EOPNOTSUPP)
-#define xfs_acl_vremove(v,t) (-EOPNOTSUPP)
-#define xfs_acl_vhasacl_access(v) (0)
-#define xfs_acl_vhasacl_default(v) (0)
-#define _ACL_ALLOC(a) (1) /* successfully allocate nothing */
-#define _ACL_FREE(a) ((void)0)
-#define _ACL_INHERIT(c,v,d) (0)
-#define _ACL_GET_ACCESS(pv,pa) (0)
-#define _ACL_GET_DEFAULT(pv,pd) (0)
-#define _ACL_ACCESS_EXISTS (NULL)
-#define _ACL_DEFAULT_EXISTS (NULL)
-#define _ACL_XFS_IACCESS(i,m,c) (-1)
-#endif
-
+static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
+{
+ return NULL;
+}
+# define xfs_set_acl NULL
+# define posix_acl_access_exists(inode) 0
+# define posix_acl_default_exists(inode) 0
+#endif /* CONFIG_XFS_POSIX_ACL */
#endif /* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index a96e2ffce0c..6e247a99f5d 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -30,6 +30,7 @@ struct xfs_trans;
#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */
#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */
+#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */
#define XFS_AGF_VERSION 1
#define XFS_AGI_VERSION 1
@@ -63,13 +64,33 @@ typedef struct xfs_agf {
__be32 agf_spare0; /* spare field */
__be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */
__be32 agf_spare1; /* spare field */
+
__be32 agf_flfirst; /* first freelist block's index */
__be32 agf_fllast; /* last freelist block's index */
__be32 agf_flcount; /* count of blocks in freelist */
__be32 agf_freeblks; /* total free blocks */
+
__be32 agf_longest; /* longest free space */
+ __be32 agf_btreeblks; /* # of blocks held in AGF btrees */
+ uuid_t agf_uuid; /* uuid of filesystem */
+
+ /*
+ * reserve some contiguous space for future logged fields before we add
+ * the unlogged fields. This makes the range logging via flags and
+ * structure offsets much simpler.
+ */
+ __be64 agf_spare64[16];
+
+ /* unlogged fields, written during buffer writeback. */
+ __be64 agf_lsn; /* last write sequence */
+ __be32 agf_crc; /* crc of agf sector */
+ __be32 agf_spare2;
+
+ /* structure must be padded to 64 bit alignment */
} xfs_agf_t;
+#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
+
#define XFS_AGF_MAGICNUM 0x00000001
#define XFS_AGF_VERSIONNUM 0x00000002
#define XFS_AGF_SEQNO 0x00000004
@@ -81,14 +102,33 @@ typedef struct xfs_agf {
#define XFS_AGF_FLCOUNT 0x00000100
#define XFS_AGF_FREEBLKS 0x00000200
#define XFS_AGF_LONGEST 0x00000400
-#define XFS_AGF_NUM_BITS 11
+#define XFS_AGF_BTREEBLKS 0x00000800
+#define XFS_AGF_UUID 0x00001000
+#define XFS_AGF_NUM_BITS 13
#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
+#define XFS_AGF_FLAGS \
+ { XFS_AGF_MAGICNUM, "MAGICNUM" }, \
+ { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \
+ { XFS_AGF_SEQNO, "SEQNO" }, \
+ { XFS_AGF_LENGTH, "LENGTH" }, \
+ { XFS_AGF_ROOTS, "ROOTS" }, \
+ { XFS_AGF_LEVELS, "LEVELS" }, \
+ { XFS_AGF_FLFIRST, "FLFIRST" }, \
+ { XFS_AGF_FLLAST, "FLLAST" }, \
+ { XFS_AGF_FLCOUNT, "FLCOUNT" }, \
+ { XFS_AGF_FREEBLKS, "FREEBLKS" }, \
+ { XFS_AGF_LONGEST, "LONGEST" }, \
+ { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \
+ { XFS_AGF_UUID, "UUID" }
+
/* disk block (xfs_daddr_t) in the AG */
#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
-#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp))
+#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr))
+extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
/*
* Size of the unlinked inode hash table in the agi.
@@ -112,6 +152,7 @@ typedef struct xfs_agi {
__be32 agi_root; /* root of inode btree */
__be32 agi_level; /* levels in inode btree */
__be32 agi_freecount; /* number of free inodes */
+
__be32 agi_newino; /* new inode just allocated */
__be32 agi_dirino; /* last directory inode chunk */
/*
@@ -119,26 +160,46 @@ typedef struct xfs_agi {
* still being referenced.
*/
__be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
+ /*
+ * This marks the end of logging region 1 and start of logging region 2.
+ */
+ uuid_t agi_uuid; /* uuid of filesystem */
+ __be32 agi_crc; /* crc of agi sector */
+ __be32 agi_pad32;
+ __be64 agi_lsn; /* last write sequence */
+
+ __be32 agi_free_root; /* root of the free inode btree */
+ __be32 agi_free_level;/* levels in free inode btree */
+
+ /* structure must be padded to 64 bit alignment */
} xfs_agi_t;
-#define XFS_AGI_MAGICNUM 0x00000001
-#define XFS_AGI_VERSIONNUM 0x00000002
-#define XFS_AGI_SEQNO 0x00000004
-#define XFS_AGI_LENGTH 0x00000008
-#define XFS_AGI_COUNT 0x00000010
-#define XFS_AGI_ROOT 0x00000020
-#define XFS_AGI_LEVEL 0x00000040
-#define XFS_AGI_FREECOUNT 0x00000080
-#define XFS_AGI_NEWINO 0x00000100
-#define XFS_AGI_DIRINO 0x00000200
-#define XFS_AGI_UNLINKED 0x00000400
-#define XFS_AGI_NUM_BITS 11
-#define XFS_AGI_ALL_BITS ((1 << XFS_AGI_NUM_BITS) - 1)
+#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
+
+#define XFS_AGI_MAGICNUM (1 << 0)
+#define XFS_AGI_VERSIONNUM (1 << 1)
+#define XFS_AGI_SEQNO (1 << 2)
+#define XFS_AGI_LENGTH (1 << 3)
+#define XFS_AGI_COUNT (1 << 4)
+#define XFS_AGI_ROOT (1 << 5)
+#define XFS_AGI_LEVEL (1 << 6)
+#define XFS_AGI_FREECOUNT (1 << 7)
+#define XFS_AGI_NEWINO (1 << 8)
+#define XFS_AGI_DIRINO (1 << 9)
+#define XFS_AGI_UNLINKED (1 << 10)
+#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */
+#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1)
+#define XFS_AGI_FREE_ROOT (1 << 11)
+#define XFS_AGI_FREE_LEVEL (1 << 12)
+#define XFS_AGI_NUM_BITS_R2 13
/* disk block (xfs_daddr_t) in the AG */
#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
-#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp))
+#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr))
+
+extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_agnumber_t agno, struct xfs_buf **bpp);
/*
* The third a.g. block contains the a.g. freelist, an array
@@ -146,53 +207,42 @@ typedef struct xfs_agi {
*/
#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
-#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t))
-#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp))
+#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr))
-typedef struct xfs_agfl {
- xfs_agblock_t agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */
-} xfs_agfl_t;
+#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
+ (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
+ (__be32 *)(bp)->b_addr)
/*
- * Busy block/extent entry. Used in perag to mark blocks that have been freed
- * but whose transactions aren't committed to disk yet.
+ * Size of the AGFL. For CRC-enabled filesystes we steal a couple of
+ * slots in the beginning of the block for a proper header with the
+ * location information and CRC.
*/
-typedef struct xfs_perag_busy {
- xfs_agblock_t busy_start;
- xfs_extlen_t busy_length;
- struct xfs_trans *busy_tp; /* transaction that did the free */
-} xfs_perag_busy_t;
+#define XFS_AGFL_SIZE(mp) \
+ (((mp)->m_sb.sb_sectsize - \
+ (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ sizeof(struct xfs_agfl) : 0)) / \
+ sizeof(xfs_agblock_t))
+
+typedef struct xfs_agfl {
+ __be32 agfl_magicnum;
+ __be32 agfl_seqno;
+ uuid_t agfl_uuid;
+ __be64 agfl_lsn;
+ __be32 agfl_crc;
+ __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
+} xfs_agfl_t;
+
+#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
/*
- * Per-ag incore structure, copies of information in agf and agi,
- * to improve the performance of allocation group selection.
- *
- * pick sizes which fit in allocation buckets well
+ * tags for inode radix tree
*/
-#if (BITS_PER_LONG == 32)
-#define XFS_PAGB_NUM_SLOTS 84
-#elif (BITS_PER_LONG == 64)
-#define XFS_PAGB_NUM_SLOTS 128
-#endif
-
-typedef struct xfs_perag
-{
- char pagf_init; /* this agf's entry is initialized */
- char pagi_init; /* this agi's entry is initialized */
- char pagf_metadata; /* the agf is prefered to be metadata */
- char pagi_inodeok; /* The agi is ok for inodes */
- __uint8_t pagf_levels[XFS_BTNUM_AGF];
- /* # of levels in bno & cnt btree */
- __uint32_t pagf_flcount; /* count of blocks in freelist */
- xfs_extlen_t pagf_freeblks; /* total free blocks */
- xfs_extlen_t pagf_longest; /* longest free space */
- xfs_agino_t pagi_freecount; /* number of free inodes */
-#ifdef __KERNEL__
- lock_t pagb_lock; /* lock for pagb_list */
-#endif
- int pagb_count; /* pagb slots in use */
- xfs_perag_busy_t *pagb_list; /* unstable blocks */
-} xfs_perag_t;
+#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
+ in xfs_inode_ag_iterator */
+#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
+#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
@@ -203,15 +253,15 @@ typedef struct xfs_perag
be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
#define XFS_MIN_FREELIST_PAG(pag,mp) \
(XFS_MIN_FREELIST_RAW( \
- (uint_t)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
- (uint_t)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
+ (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
+ (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
#define XFS_AGB_TO_FSB(mp,agno,agbno) \
(((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
#define XFS_FSB_TO_AGNO(mp,fsbno) \
((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
#define XFS_FSB_TO_AGBNO(mp,fsbno) \
- ((xfs_agblock_t)((fsbno) & XFS_MASK32LO((mp)->m_sb.sb_agblklog)))
+ ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
#define XFS_AGB_TO_DADDR(mp,agno,agbno) \
((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
(xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
@@ -224,8 +274,8 @@ typedef struct xfs_perag
#define XFS_AG_CHECK_DADDR(mp,d,len) \
((len) == 1 ? \
ASSERT((d) == XFS_SB_DADDR || \
- XFS_DADDR_TO_AGBNO(mp, d) != XFS_SB_DADDR) : \
- ASSERT(XFS_DADDR_TO_AGNO(mp, d) == \
- XFS_DADDR_TO_AGNO(mp, (d) + (len) - 1)))
+ xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
+ ASSERT(xfs_daddr_to_agno(mp, d) == \
+ xfs_daddr_to_agno(mp, (d) + (len) - 1)))
#endif /* __XFS_AG_H__ */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index f4328e1e2a7..d43813267a8 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -17,108 +17,153 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
-#include "xfs_ialloc.h"
+#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+struct workqueue_struct *xfs_alloc_wq;
#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
#define XFSA_FIXUP_BNO_OK 1
#define XFSA_FIXUP_CNT_OK 2
-STATIC int
-xfs_alloc_search_busy(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len);
-
-#if defined(XFS_ALLOC_TRACE)
-ktrace_t *xfs_alloc_trace_buf;
-
-#define TRACE_ALLOC(s,a) \
- xfs_alloc_trace_alloc(fname, s, a, __LINE__)
-#define TRACE_FREE(s,a,b,x,f) \
- xfs_alloc_trace_free(fname, s, mp, a, b, x, f, __LINE__)
-#define TRACE_MODAGF(s,a,f) \
- xfs_alloc_trace_modagf(fname, s, mp, a, f, __LINE__)
-#define TRACE_BUSY(fname,s,ag,agb,l,sl,tp) \
- xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
-#define TRACE_UNBUSY(fname,s,ag,sl,tp) \
- xfs_alloc_trace_busy(fname, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
-#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp) \
- xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
-#else
-#define TRACE_ALLOC(s,a)
-#define TRACE_FREE(s,a,b,x,f)
-#define TRACE_MODAGF(s,a,f)
-#define TRACE_BUSY(s,a,ag,agb,l,sl,tp)
-#define TRACE_UNBUSY(fname,s,ag,sl,tp)
-#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp)
-#endif /* XFS_ALLOC_TRACE */
-
-/*
- * Prototypes for per-ag allocation routines
- */
-
STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
- xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+ xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+
+/*
+ * Lookup the record equal to [bno, len] in the btree given by cur.
+ */
+STATIC int /* error */
+xfs_alloc_lookup_eq(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.a.ar_startblock = bno;
+ cur->bc_rec.a.ar_blockcount = len;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Lookup the first record greater than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int /* error */
+xfs_alloc_lookup_ge(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.a.ar_startblock = bno;
+ cur->bc_rec.a.ar_blockcount = len;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
/*
- * Internal functions.
+ * Lookup the first record less than or equal to [bno, len]
+ * in the btree given by cur.
*/
+int /* error */
+xfs_alloc_lookup_le(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.a.ar_startblock = bno;
+ cur->bc_rec.a.ar_blockcount = len;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int /* error */
+xfs_alloc_update(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len) /* length of extent */
+{
+ union xfs_btree_rec rec;
+
+ rec.alloc.ar_startblock = cpu_to_be32(bno);
+ rec.alloc.ar_blockcount = cpu_to_be32(len);
+ return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int /* error */
+xfs_alloc_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t *bno, /* output: starting block of extent */
+ xfs_extlen_t *len, /* output: length of extent */
+ int *stat) /* output: success/failure */
+{
+ union xfs_btree_rec *rec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (!error && *stat == 1) {
+ *bno = be32_to_cpu(rec->alloc.ar_startblock);
+ *len = be32_to_cpu(rec->alloc.ar_blockcount);
+ }
+ return error;
+}
/*
* Compute aligned version of the found extent.
* Takes alignment and min length into account.
*/
-STATIC int /* success (>= minlen) */
+STATIC void
xfs_alloc_compute_aligned(
+ xfs_alloc_arg_t *args, /* allocation argument structure */
xfs_agblock_t foundbno, /* starting block in found extent */
xfs_extlen_t foundlen, /* length in found extent */
- xfs_extlen_t alignment, /* alignment for allocation */
- xfs_extlen_t minlen, /* minimum length for allocation */
xfs_agblock_t *resbno, /* result block number */
xfs_extlen_t *reslen) /* result length */
{
xfs_agblock_t bno;
- xfs_extlen_t diff;
xfs_extlen_t len;
- if (alignment > 1 && foundlen >= minlen) {
- bno = roundup(foundbno, alignment);
- diff = bno - foundbno;
- len = diff >= foundlen ? 0 : foundlen - diff;
+ /* Trim busy sections out of found extent */
+ xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
+
+ if (args->alignment > 1 && len >= args->minlen) {
+ xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
+ xfs_extlen_t diff = aligned_bno - bno;
+
+ *resbno = aligned_bno;
+ *reslen = diff >= len ? 0 : len - diff;
} else {
- bno = foundbno;
- len = foundlen;
+ *resbno = bno;
+ *reslen = len;
}
- *resbno = bno;
- *reslen = len;
- return len >= minlen;
}
/*
@@ -130,6 +175,7 @@ xfs_alloc_compute_diff(
xfs_agblock_t wantbno, /* target starting block */
xfs_extlen_t wantlen, /* target length */
xfs_extlen_t alignment, /* target alignment */
+ char userdata, /* are we allocating data? */
xfs_agblock_t freebno, /* freespace's starting block */
xfs_extlen_t freelen, /* freespace's length */
xfs_agblock_t *newbnop) /* result: best start block from free */
@@ -144,7 +190,14 @@ xfs_alloc_compute_diff(
ASSERT(freelen >= wantlen);
freeend = freebno + freelen;
wantend = wantbno + wantlen;
- if (freebno >= wantbno) {
+ /*
+ * We want to allocate from the start of a free extent if it is past
+ * the desired block or if we are allocating user data and the free
+ * extent is before desired block. The second case is there to allow
+ * for contiguous allocation from the remaining free space if the file
+ * grows in the short term.
+ */
+ if (freebno >= wantbno || (userdata && freeend < wantend)) {
if ((newbno1 = roundup(freebno, alignment)) >= freeend)
newbno1 = NULLAGBLOCK;
} else if (freeend >= wantend && alignment > 1) {
@@ -204,16 +257,14 @@ xfs_alloc_fix_len(
k = rlen % args->prod;
if (k == args->mod)
return;
- if (k > args->mod) {
- if ((int)(rlen = rlen - k - args->mod) < (int)args->minlen)
- return;
- } else {
- if ((int)(rlen = rlen - args->prod - (args->mod - k)) <
- (int)args->minlen)
- return;
- }
- ASSERT(rlen >= args->minlen);
- ASSERT(rlen <= args->maxlen);
+ if (k > args->mod)
+ rlen = rlen - (k - args->mod);
+ else
+ rlen = rlen - args->prod + (args->mod - k);
+ if ((int)rlen < (int)args->minlen)
+ return;
+ ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
+ ASSERT(rlen % args->prod == args->mod);
args->len = rlen;
}
@@ -232,7 +283,6 @@ xfs_alloc_fix_minleft(
return 1;
agf = XFS_BUF_TO_AGF(args->agbp);
diff = be32_to_cpu(agf->agf_freeblks)
- + be32_to_cpu(agf->agf_flcount)
- args->len - args->minleft;
if (diff >= 0)
return 1;
@@ -297,21 +347,20 @@ xfs_alloc_fixup_trees(
return error;
XFS_WANT_CORRUPTED_RETURN(i == 1);
}
+
#ifdef DEBUG
- {
- xfs_alloc_block_t *bnoblock;
- xfs_alloc_block_t *cntblock;
-
- if (bno_cur->bc_nlevels == 1 &&
- cnt_cur->bc_nlevels == 1) {
- bnoblock = XFS_BUF_TO_ALLOC_BLOCK(bno_cur->bc_bufs[0]);
- cntblock = XFS_BUF_TO_ALLOC_BLOCK(cnt_cur->bc_bufs[0]);
- XFS_WANT_CORRUPTED_RETURN(
- be16_to_cpu(bnoblock->bb_numrecs) ==
- be16_to_cpu(cntblock->bb_numrecs));
- }
+ if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
+ struct xfs_btree_block *bnoblock;
+ struct xfs_btree_block *cntblock;
+
+ bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
+ cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
+
+ XFS_WANT_CORRUPTED_RETURN(
+ bnoblock->bb_numrecs == cntblock->bb_numrecs);
}
#endif
+
/*
* Deal with all four cases: the allocated record is contained
* within the freespace record, so we can have new freespace
@@ -336,7 +385,7 @@ xfs_alloc_fixup_trees(
/*
* Delete the entry from the by-size btree.
*/
- if ((error = xfs_alloc_delete(cnt_cur, &i)))
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
return error;
XFS_WANT_CORRUPTED_RETURN(i == 1);
/*
@@ -346,7 +395,7 @@ xfs_alloc_fixup_trees(
if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
return error;
XFS_WANT_CORRUPTED_RETURN(i == 0);
- if ((error = xfs_alloc_insert(cnt_cur, &i)))
+ if ((error = xfs_btree_insert(cnt_cur, &i)))
return error;
XFS_WANT_CORRUPTED_RETURN(i == 1);
}
@@ -354,7 +403,7 @@ xfs_alloc_fixup_trees(
if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
return error;
XFS_WANT_CORRUPTED_RETURN(i == 0);
- if ((error = xfs_alloc_insert(cnt_cur, &i)))
+ if ((error = xfs_btree_insert(cnt_cur, &i)))
return error;
XFS_WANT_CORRUPTED_RETURN(i == 1);
}
@@ -365,7 +414,7 @@ xfs_alloc_fixup_trees(
/*
* No remaining freespace, just delete the by-block tree entry.
*/
- if ((error = xfs_alloc_delete(bno_cur, &i)))
+ if ((error = xfs_btree_delete(bno_cur, &i)))
return error;
XFS_WANT_CORRUPTED_RETURN(i == 1);
} else {
@@ -382,13 +431,94 @@ xfs_alloc_fixup_trees(
if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
return error;
XFS_WANT_CORRUPTED_RETURN(i == 0);
- if ((error = xfs_alloc_insert(bno_cur, &i)))
+ if ((error = xfs_btree_insert(bno_cur, &i)))
return error;
XFS_WANT_CORRUPTED_RETURN(i == 1);
}
return 0;
}
+static bool
+xfs_agfl_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
+ int i;
+
+ if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+ return false;
+ /*
+ * during growfs operations, the perag is not fully initialised,
+ * so we can't use it for any useful checking. growfs ensures we can't
+ * use it by using uncached buffers that don't have the perag attached
+ * so we can detect and avoid this problem.
+ */
+ if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
+ return false;
+
+ for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
+ if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
+ be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+ return false;
+ }
+ return true;
+}
+
+static void
+xfs_agfl_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ /*
+ * There is no verification of non-crc AGFLs because mkfs does not
+ * initialise the AGFL to zero or NULL. Hence the only valid part of the
+ * AGFL is what the AGF says is active. We can't get to the AGF, so we
+ * can't verify just those entries are valid.
+ */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_agfl_verify(bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
+}
+
+static void
+xfs_agfl_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ /* no verification of non-crc AGFLs */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (!xfs_agfl_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (bip)
+ XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agfl_buf_ops = {
+ .verify_read = xfs_agfl_read_verify,
+ .verify_write = xfs_agfl_write_verify,
+};
+
/*
* Read in the allocation group free block array.
*/
@@ -406,133 +536,34 @@ xfs_alloc_read_agfl(
error = xfs_trans_read_buf(
mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
if (error)
return error;
- ASSERT(bp);
- ASSERT(!XFS_BUF_GETERROR(bp));
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF);
+ xfs_buf_set_ref(bp, XFS_AGFL_REF);
*bpp = bp;
return 0;
}
-#if defined(XFS_ALLOC_TRACE)
-/*
- * Add an allocation trace entry for an alloc call.
- */
-STATIC void
-xfs_alloc_trace_alloc(
- char *name, /* function tag string */
- char *str, /* additional string */
- xfs_alloc_arg_t *args, /* allocation argument structure */
- int line) /* source line number */
+STATIC int
+xfs_alloc_update_counters(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_buf *agbp,
+ long len)
{
- ktrace_enter(xfs_alloc_trace_buf,
- (void *)(__psint_t)(XFS_ALLOC_KTRACE_ALLOC | (line << 16)),
- (void *)name,
- (void *)str,
- (void *)args->mp,
- (void *)(__psunsigned_t)args->agno,
- (void *)(__psunsigned_t)args->agbno,
- (void *)(__psunsigned_t)args->minlen,
- (void *)(__psunsigned_t)args->maxlen,
- (void *)(__psunsigned_t)args->mod,
- (void *)(__psunsigned_t)args->prod,
- (void *)(__psunsigned_t)args->minleft,
- (void *)(__psunsigned_t)args->total,
- (void *)(__psunsigned_t)args->alignment,
- (void *)(__psunsigned_t)args->len,
- (void *)((((__psint_t)args->type) << 16) |
- (__psint_t)args->otype),
- (void *)(__psint_t)((args->wasdel << 3) |
- (args->wasfromfl << 2) |
- (args->isfl << 1) |
- (args->userdata << 0)));
-}
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
-/*
- * Add an allocation trace entry for a free call.
- */
-STATIC void
-xfs_alloc_trace_free(
- char *name, /* function tag string */
- char *str, /* additional string */
- xfs_mount_t *mp, /* file system mount point */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* a.g. relative block number */
- xfs_extlen_t len, /* length of extent */
- int isfl, /* set if is freelist allocation/free */
- int line) /* source line number */
-{
- ktrace_enter(xfs_alloc_trace_buf,
- (void *)(__psint_t)(XFS_ALLOC_KTRACE_FREE | (line << 16)),
- (void *)name,
- (void *)str,
- (void *)mp,
- (void *)(__psunsigned_t)agno,
- (void *)(__psunsigned_t)agbno,
- (void *)(__psunsigned_t)len,
- (void *)(__psint_t)isfl,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
-}
+ pag->pagf_freeblks += len;
+ be32_add_cpu(&agf->agf_freeblks, len);
-/*
- * Add an allocation trace entry for modifying an agf.
- */
-STATIC void
-xfs_alloc_trace_modagf(
- char *name, /* function tag string */
- char *str, /* additional string */
- xfs_mount_t *mp, /* file system mount point */
- xfs_agf_t *agf, /* new agf value */
- int flags, /* logging flags for agf */
- int line) /* source line number */
-{
- ktrace_enter(xfs_alloc_trace_buf,
- (void *)(__psint_t)(XFS_ALLOC_KTRACE_MODAGF | (line << 16)),
- (void *)name,
- (void *)str,
- (void *)mp,
- (void *)(__psint_t)flags,
- (void *)(__psunsigned_t)be32_to_cpu(agf->agf_seqno),
- (void *)(__psunsigned_t)be32_to_cpu(agf->agf_length),
- (void *)(__psunsigned_t)be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]),
- (void *)(__psunsigned_t)be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]),
- (void *)(__psunsigned_t)be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]),
- (void *)(__psunsigned_t)be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]),
- (void *)(__psunsigned_t)be32_to_cpu(agf->agf_flfirst),
- (void *)(__psunsigned_t)be32_to_cpu(agf->agf_fllast),
- (void *)(__psunsigned_t)be32_to_cpu(agf->agf_flcount),
- (void *)(__psunsigned_t)be32_to_cpu(agf->agf_freeblks),
- (void *)(__psunsigned_t)be32_to_cpu(agf->agf_longest));
-}
+ xfs_trans_agblocks_delta(tp, len);
+ if (unlikely(be32_to_cpu(agf->agf_freeblks) >
+ be32_to_cpu(agf->agf_length)))
+ return EFSCORRUPTED;
-STATIC void
-xfs_alloc_trace_busy(
- char *name, /* function tag string */
- char *str, /* additional string */
- xfs_mount_t *mp, /* file system mount poing */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* a.g. relative block number */
- xfs_extlen_t len, /* length of extent */
- int slot, /* perag Busy slot */
- xfs_trans_t *tp,
- int trtype, /* type: add, delete, search */
- int line) /* source line number */
-{
- ktrace_enter(xfs_alloc_trace_buf,
- (void *)(__psint_t)(trtype | (line << 16)),
- (void *)name,
- (void *)str,
- (void *)mp,
- (void *)(__psunsigned_t)agno,
- (void *)(__psunsigned_t)agbno,
- (void *)(__psunsigned_t)len,
- (void *)(__psint_t)slot,
- (void *)tp,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
+ return 0;
}
-#endif /* XFS_ALLOC_TRACE */
/*
* Allocation group level functions.
@@ -551,9 +582,6 @@ xfs_alloc_ag_vextent(
xfs_alloc_arg_t *args) /* argument structure for allocation */
{
int error=0;
-#ifdef XFS_ALLOC_TRACE
- static char fname[] = "xfs_alloc_ag_vextent";
-#endif
ASSERT(args->minlen > 0);
ASSERT(args->maxlen > 0);
@@ -578,46 +606,36 @@ xfs_alloc_ag_vextent(
ASSERT(0);
/* NOTREACHED */
}
- if (error)
+
+ if (error || args->agbno == NULLAGBLOCK)
return error;
- /*
- * If the allocation worked, need to change the agf structure
- * (and log it), and the superblock.
- */
- if (args->agbno != NULLAGBLOCK) {
- xfs_agf_t *agf; /* allocation group freelist header */
-#ifdef XFS_ALLOC_TRACE
- xfs_mount_t *mp = args->mp;
-#endif
- long slen = (long)args->len;
- ASSERT(args->len >= args->minlen && args->len <= args->maxlen);
- ASSERT(!(args->wasfromfl) || !args->isfl);
- ASSERT(args->agbno % args->alignment == 0);
- if (!(args->wasfromfl)) {
-
- agf = XFS_BUF_TO_AGF(args->agbp);
- be32_add(&agf->agf_freeblks, -(args->len));
- xfs_trans_agblocks_delta(args->tp,
- -((long)(args->len)));
- args->pag->pagf_freeblks -= args->len;
- ASSERT(be32_to_cpu(agf->agf_freeblks) <=
- be32_to_cpu(agf->agf_length));
- TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS);
- xfs_alloc_log_agf(args->tp, args->agbp,
- XFS_AGF_FREEBLKS);
- /* search the busylist for these blocks */
- xfs_alloc_search_busy(args->tp, args->agno,
- args->agbno, args->len);
- }
- if (!args->isfl)
- xfs_trans_mod_sb(args->tp,
- args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
- XFS_TRANS_SB_FDBLOCKS, -slen);
- XFS_STATS_INC(xs_allocx);
- XFS_STATS_ADD(xs_allocb, args->len);
+ ASSERT(args->len >= args->minlen);
+ ASSERT(args->len <= args->maxlen);
+ ASSERT(!args->wasfromfl || !args->isfl);
+ ASSERT(args->agbno % args->alignment == 0);
+
+ if (!args->wasfromfl) {
+ error = xfs_alloc_update_counters(args->tp, args->pag,
+ args->agbp,
+ -((long)(args->len)));
+ if (error)
+ return error;
+
+ ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
+ args->agbno, args->len));
}
- return 0;
+
+ if (!args->isfl) {
+ xfs_trans_mod_sb(args->tp, args->wasdel ?
+ XFS_TRANS_SB_RES_FDBLOCKS :
+ XFS_TRANS_SB_FDBLOCKS,
+ -((long)(args->len)));
+ }
+
+ XFS_STATS_INC(xs_allocx);
+ XFS_STATS_ADD(xs_allocb, args->len);
+ return error;
}
/*
@@ -632,97 +650,194 @@ xfs_alloc_ag_vextent_exact(
{
xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
- xfs_agblock_t end; /* end of allocated extent */
int error;
xfs_agblock_t fbno; /* start block of found extent */
- xfs_agblock_t fend; /* end block of found extent */
xfs_extlen_t flen; /* length of found extent */
-#ifdef XFS_ALLOC_TRACE
- static char fname[] = "xfs_alloc_ag_vextent_exact";
-#endif
+ xfs_agblock_t tbno; /* start block of trimmed extent */
+ xfs_extlen_t tlen; /* length of trimmed extent */
+ xfs_agblock_t tend; /* end block of trimmed extent */
int i; /* success/failure of operation */
- xfs_agblock_t maxend; /* end of maximal extent */
- xfs_agblock_t minend; /* end of minimal extent */
- xfs_extlen_t rlen; /* length of returned extent */
ASSERT(args->alignment == 1);
+
/*
* Allocate/initialize a cursor for the by-number freespace btree.
*/
- bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_BNO, NULL, 0);
+ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->agno, XFS_BTNUM_BNO);
+
/*
* Lookup bno and minlen in the btree (minlen is irrelevant, really).
* Look for the closest free block <= bno, it must contain bno
* if any free block does.
*/
- if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i)))
+ error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
+ if (error)
goto error0;
- if (!i) {
- /*
- * Didn't find it, return null.
- */
- xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
- args->agbno = NULLAGBLOCK;
- return 0;
- }
+ if (!i)
+ goto not_found;
+
/*
* Grab the freespace record.
*/
- if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i)))
+ error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
+ if (error)
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
ASSERT(fbno <= args->agbno);
- minend = args->agbno + args->minlen;
- maxend = args->agbno + args->maxlen;
- fend = fbno + flen;
+
/*
- * Give up if the freespace isn't long enough for the minimum request.
+ * Check for overlapping busy extents.
*/
- if (fend < minend) {
- xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
- args->agbno = NULLAGBLOCK;
- return 0;
- }
+ xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);
+
/*
- * End of extent will be smaller of the freespace end and the
- * maximal requested end.
+ * Give up if the start of the extent is busy, or the freespace isn't
+ * long enough for the minimum request.
*/
- end = XFS_AGBLOCK_MIN(fend, maxend);
+ if (tbno > args->agbno)
+ goto not_found;
+ if (tlen < args->minlen)
+ goto not_found;
+ tend = tbno + tlen;
+ if (tend < args->agbno + args->minlen)
+ goto not_found;
+
/*
+ * End of extent will be smaller of the freespace end and the
+ * maximal requested end.
+ *
* Fix the length according to mod and prod if given.
*/
- args->len = end - args->agbno;
+ args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
+ - args->agbno;
xfs_alloc_fix_len(args);
- if (!xfs_alloc_fix_minleft(args)) {
- xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
- return 0;
- }
- rlen = args->len;
- ASSERT(args->agbno + rlen <= fend);
- end = args->agbno + rlen;
+ if (!xfs_alloc_fix_minleft(args))
+ goto not_found;
+
+ ASSERT(args->agbno + args->len <= tend);
+
/*
- * We are allocating agbno for rlen [agbno .. end]
+ * We are allocating agbno for args->len
* Allocate/initialize a cursor for the by-size btree.
*/
- cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_CNT, NULL, 0);
+ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->agno, XFS_BTNUM_CNT);
ASSERT(args->agbno + args->len <=
be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
- if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
- args->agbno, args->len, XFSA_FIXUP_BNO_OK))) {
+ error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
+ args->len, XFSA_FIXUP_BNO_OK);
+ if (error) {
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
goto error0;
}
+
xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- TRACE_ALLOC("normal", args);
+
args->wasfromfl = 0;
+ trace_xfs_alloc_exact_done(args);
+ return 0;
+
+not_found:
+ /* Didn't find it, return null. */
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+ args->agbno = NULLAGBLOCK;
+ trace_xfs_alloc_exact_notfound(args);
return 0;
error0:
xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
- TRACE_ALLOC("error", args);
+ trace_xfs_alloc_exact_error(args);
+ return error;
+}
+
+/*
+ * Search the btree in a given direction via the search cursor and compare
+ * the records found against the good extent we've already found.
+ */
+STATIC int
+xfs_alloc_find_best_extent(
+ struct xfs_alloc_arg *args, /* allocation argument structure */
+ struct xfs_btree_cur **gcur, /* good cursor */
+ struct xfs_btree_cur **scur, /* searching cursor */
+ xfs_agblock_t gdiff, /* difference for search comparison */
+ xfs_agblock_t *sbno, /* extent found by search */
+ xfs_extlen_t *slen, /* extent length */
+ xfs_agblock_t *sbnoa, /* aligned extent found by search */
+ xfs_extlen_t *slena, /* aligned extent length */
+ int dir) /* 0 = search right, 1 = search left */
+{
+ xfs_agblock_t new;
+ xfs_agblock_t sdiff;
+ int error;
+ int i;
+
+ /* The good extent is perfect, no need to search. */
+ if (!gdiff)
+ goto out_use_good;
+
+ /*
+ * Look until we find a better one, run out of space or run off the end.
+ */
+ do {
+ error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
+
+ /*
+ * The good extent is closer than this one.
+ */
+ if (!dir) {
+ if (*sbnoa >= args->agbno + gdiff)
+ goto out_use_good;
+ } else {
+ if (*sbnoa <= args->agbno - gdiff)
+ goto out_use_good;
+ }
+
+ /*
+ * Same distance, compare length and pick the best.
+ */
+ if (*slena >= args->minlen) {
+ args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
+ xfs_alloc_fix_len(args);
+
+ sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+ args->alignment,
+ args->userdata, *sbnoa,
+ *slena, &new);
+
+ /*
+ * Choose closer size and invalidate other cursor.
+ */
+ if (sdiff < gdiff)
+ goto out_use_search;
+ goto out_use_good;
+ }
+
+ if (!dir)
+ error = xfs_btree_increment(*scur, 0, &i);
+ else
+ error = xfs_btree_decrement(*scur, 0, &i);
+ if (error)
+ goto error0;
+ } while (i);
+
+out_use_good:
+ xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
+ *scur = NULL;
+ return 0;
+
+out_use_search:
+ xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
+ *gcur = NULL;
+ return 0;
+
+error0:
+ /* caller invalidates cursors */
return error;
}
@@ -739,9 +854,6 @@ xfs_alloc_ag_vextent_near(
xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */
xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */
xfs_btree_cur_t *cnt_cur; /* cursor for count btree */
-#ifdef XFS_ALLOC_TRACE
- static char fname[] = "xfs_alloc_ag_vextent_near";
-#endif
xfs_agblock_t gtbno; /* start bno of right side entry */
xfs_agblock_t gtbnoa; /* aligned ... */
xfs_extlen_t gtdiff; /* difference to right side entry */
@@ -754,27 +866,33 @@ xfs_alloc_ag_vextent_near(
xfs_agblock_t ltbno; /* start bno of left side entry */
xfs_agblock_t ltbnoa; /* aligned ... */
xfs_extlen_t ltdiff; /* difference to left side entry */
- /*REFERENCED*/
- xfs_agblock_t ltend; /* end bno of left side entry */
xfs_extlen_t ltlen; /* length of left side entry */
xfs_extlen_t ltlena; /* aligned ... */
xfs_agblock_t ltnew; /* useful start bno of left side */
xfs_extlen_t rlen; /* length of returned extent */
-#if defined(DEBUG) && defined(__KERNEL__)
+ int forced = 0;
+#ifdef DEBUG
/*
* Randomly don't execute the first algorithm.
*/
int dofirst; /* set to do first algorithm */
- dofirst = random() & 1;
+ dofirst = prandom_u32() & 1;
#endif
+
+restart:
+ bno_cur_lt = NULL;
+ bno_cur_gt = NULL;
+ ltlen = 0;
+ gtlena = 0;
+ ltlena = 0;
+
/*
* Get a cursor for the by-size btree.
*/
- cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_CNT, NULL, 0);
- ltlen = 0;
- bno_cur_lt = bno_cur_gt = NULL;
+ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->agno, XFS_BTNUM_CNT);
+
/*
* See if there are any free extents as big as maxlen.
*/
@@ -790,11 +908,13 @@ xfs_alloc_ag_vextent_near(
goto error0;
if (i == 0 || ltlen == 0) {
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ trace_xfs_alloc_near_noentry(args);
return 0;
}
ASSERT(i == 1);
}
args->wasfromfl = 0;
+
/*
* First algorithm.
* If the requested extent is large wrt the freespaces available
@@ -811,8 +931,8 @@ xfs_alloc_ag_vextent_near(
xfs_extlen_t blen=0;
xfs_agblock_t bnew=0;
-#if defined(DEBUG) && defined(__KERNEL__)
- if (!dofirst)
+#ifdef DEBUG
+ if (dofirst)
break;
#endif
/*
@@ -830,7 +950,7 @@ xfs_alloc_ag_vextent_near(
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
if (ltlen >= args->minlen)
break;
- if ((error = xfs_alloc_increment(cnt_cur, 0, &i)))
+ if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
goto error0;
} while (i);
ASSERT(ltlen >= args->minlen);
@@ -840,7 +960,7 @@ xfs_alloc_ag_vextent_near(
i = cnt_cur->bc_ptrs[0];
for (j = 1, blen = 0, bdiff = 0;
!error && j && (blen < args->maxlen || bdiff > 0);
- error = xfs_alloc_increment(cnt_cur, 0, &j)) {
+ error = xfs_btree_increment(cnt_cur, 0, &j)) {
/*
* For each entry, decide if it's better than
* the previous best entry.
@@ -848,9 +968,9 @@ xfs_alloc_ag_vextent_near(
if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if (!xfs_alloc_compute_aligned(ltbno, ltlen,
- args->alignment, args->minlen,
- &ltbnoa, &ltlena))
+ xfs_alloc_compute_aligned(args, ltbno, ltlen,
+ &ltbnoa, &ltlena);
+ if (ltlena < args->minlen)
continue;
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
@@ -858,7 +978,8 @@ xfs_alloc_ag_vextent_near(
if (args->len < blen)
continue;
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, ltbno, ltlen, &ltnew);
+ args->alignment, args->userdata, ltbnoa,
+ ltlena, &ltnew);
if (ltnew != NULLAGBLOCK &&
(args->len > blen || ltdiff < bdiff)) {
bdiff = ltdiff;
@@ -880,12 +1001,11 @@ xfs_alloc_ag_vextent_near(
if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- ltend = ltbno + ltlen;
- ASSERT(ltend <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+ ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
args->len = blen;
if (!xfs_alloc_fix_minleft(args)) {
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- TRACE_ALLOC("nominleft", args);
+ trace_xfs_alloc_near_nominleft(args);
return 0;
}
blen = args->len;
@@ -894,12 +1014,12 @@ xfs_alloc_ag_vextent_near(
*/
args->agbno = bnew;
ASSERT(bnew >= ltbno);
- ASSERT(bnew + blen <= ltend);
+ ASSERT(bnew + blen <= ltbno + ltlen);
/*
* Set up a cursor for the by-bno tree.
*/
- bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp,
- args->agbp, args->agno, XFS_BTNUM_BNO, NULL, 0);
+ bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->agno, XFS_BTNUM_BNO);
/*
* Fix up the btree entries.
*/
@@ -908,7 +1028,8 @@ xfs_alloc_ag_vextent_near(
goto error0;
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
- TRACE_ALLOC("first", args);
+
+ trace_xfs_alloc_near_first(args);
return 0;
}
/*
@@ -926,8 +1047,8 @@ xfs_alloc_ag_vextent_near(
/*
* Allocate and initialize the cursor for the leftward search.
*/
- bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_BNO, NULL, 0);
+ bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->agno, XFS_BTNUM_BNO);
/*
* Lookup <= bno to find the leftward search's starting point.
*/
@@ -950,7 +1071,7 @@ xfs_alloc_ag_vextent_near(
* Increment the cursor, so we will point at the entry just right
* of the leftward entry if any, or to the leftmost entry.
*/
- if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
+ if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
goto error0;
if (!i) {
/*
@@ -969,11 +1090,11 @@ xfs_alloc_ag_vextent_near(
if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if (xfs_alloc_compute_aligned(ltbno, ltlen,
- args->alignment, args->minlen,
- &ltbnoa, &ltlena))
+ xfs_alloc_compute_aligned(args, ltbno, ltlen,
+ &ltbnoa, &ltlena);
+ if (ltlena >= args->minlen)
break;
- if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i)))
+ if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
goto error0;
if (!i) {
xfs_btree_del_cursor(bno_cur_lt,
@@ -985,11 +1106,11 @@ xfs_alloc_ag_vextent_near(
if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if (xfs_alloc_compute_aligned(gtbno, gtlen,
- args->alignment, args->minlen,
- &gtbnoa, &gtlena))
+ xfs_alloc_compute_aligned(args, gtbno, gtlen,
+ &gtbnoa, &gtlena);
+ if (gtlena >= args->minlen)
break;
- if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
+ if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
goto error0;
if (!i) {
xfs_btree_del_cursor(bno_cur_gt,
@@ -998,211 +1119,65 @@ xfs_alloc_ag_vextent_near(
}
}
} while (bno_cur_lt || bno_cur_gt);
+
/*
* Got both cursors still active, need to find better entry.
*/
if (bno_cur_lt && bno_cur_gt) {
- /*
- * Left side is long enough, look for a right side entry.
- */
if (ltlena >= args->minlen) {
/*
- * Fix up the length.
+ * Left side is good, look for a right side entry.
*/
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
- rlen = args->len;
- ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
- args->alignment, ltbno, ltlen, &ltnew);
- /*
- * Not perfect.
- */
- if (ltdiff) {
- /*
- * Look until we find a better one, run out of
- * space, or run off the end.
- */
- while (bno_cur_lt && bno_cur_gt) {
- if ((error = xfs_alloc_get_rec(
- bno_cur_gt, &gtbno,
- &gtlen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- xfs_alloc_compute_aligned(gtbno, gtlen,
- args->alignment, args->minlen,
- &gtbnoa, &gtlena);
- /*
- * The left one is clearly better.
- */
- if (gtbnoa >= args->agbno + ltdiff) {
- xfs_btree_del_cursor(
- bno_cur_gt,
- XFS_BTREE_NOERROR);
- bno_cur_gt = NULL;
- break;
- }
- /*
- * If we reach a big enough entry,
- * compare the two and pick the best.
- */
- if (gtlena >= args->minlen) {
- args->len =
- XFS_EXTLEN_MIN(gtlena,
- args->maxlen);
- xfs_alloc_fix_len(args);
- rlen = args->len;
- gtdiff = xfs_alloc_compute_diff(
- args->agbno, rlen,
- args->alignment,
- gtbno, gtlen, &gtnew);
- /*
- * Right side is better.
- */
- if (gtdiff < ltdiff) {
- xfs_btree_del_cursor(
- bno_cur_lt,
- XFS_BTREE_NOERROR);
- bno_cur_lt = NULL;
- }
- /*
- * Left side is better.
- */
- else {
- xfs_btree_del_cursor(
- bno_cur_gt,
- XFS_BTREE_NOERROR);
- bno_cur_gt = NULL;
- }
- break;
- }
- /*
- * Fell off the right end.
- */
- if ((error = xfs_alloc_increment(
- bno_cur_gt, 0, &i)))
- goto error0;
- if (!i) {
- xfs_btree_del_cursor(
- bno_cur_gt,
- XFS_BTREE_NOERROR);
- bno_cur_gt = NULL;
- break;
- }
- }
- }
- /*
- * The left side is perfect, trash the right side.
- */
- else {
- xfs_btree_del_cursor(bno_cur_gt,
- XFS_BTREE_NOERROR);
- bno_cur_gt = NULL;
- }
- }
- /*
- * It's the right side that was found first, look left.
- */
- else {
+ ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+ args->alignment, args->userdata, ltbnoa,
+ ltlena, &ltnew);
+
+ error = xfs_alloc_find_best_extent(args,
+ &bno_cur_lt, &bno_cur_gt,
+ ltdiff, &gtbno, &gtlen,
+ &gtbnoa, &gtlena,
+ 0 /* search right */);
+ } else {
+ ASSERT(gtlena >= args->minlen);
+
/*
- * Fix up the length.
+ * Right side is good, look for a left side entry.
*/
args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
xfs_alloc_fix_len(args);
- rlen = args->len;
- gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
- args->alignment, gtbno, gtlen, &gtnew);
- /*
- * Right side entry isn't perfect.
- */
- if (gtdiff) {
- /*
- * Look until we find a better one, run out of
- * space, or run off the end.
- */
- while (bno_cur_lt && bno_cur_gt) {
- if ((error = xfs_alloc_get_rec(
- bno_cur_lt, &ltbno,
- &ltlen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- xfs_alloc_compute_aligned(ltbno, ltlen,
- args->alignment, args->minlen,
- &ltbnoa, &ltlena);
- /*
- * The right one is clearly better.
- */
- if (ltbnoa <= args->agbno - gtdiff) {
- xfs_btree_del_cursor(
- bno_cur_lt,
- XFS_BTREE_NOERROR);
- bno_cur_lt = NULL;
- break;
- }
- /*
- * If we reach a big enough entry,
- * compare the two and pick the best.
- */
- if (ltlena >= args->minlen) {
- args->len = XFS_EXTLEN_MIN(
- ltlena, args->maxlen);
- xfs_alloc_fix_len(args);
- rlen = args->len;
- ltdiff = xfs_alloc_compute_diff(
- args->agbno, rlen,
- args->alignment,
- ltbno, ltlen, &ltnew);
- /*
- * Left side is better.
- */
- if (ltdiff < gtdiff) {
- xfs_btree_del_cursor(
- bno_cur_gt,
- XFS_BTREE_NOERROR);
- bno_cur_gt = NULL;
- }
- /*
- * Right side is better.
- */
- else {
- xfs_btree_del_cursor(
- bno_cur_lt,
- XFS_BTREE_NOERROR);
- bno_cur_lt = NULL;
- }
- break;
- }
- /*
- * Fell off the left end.
- */
- if ((error = xfs_alloc_decrement(
- bno_cur_lt, 0, &i)))
- goto error0;
- if (!i) {
- xfs_btree_del_cursor(bno_cur_lt,
- XFS_BTREE_NOERROR);
- bno_cur_lt = NULL;
- break;
- }
- }
- }
- /*
- * The right side is perfect, trash the left side.
- */
- else {
- xfs_btree_del_cursor(bno_cur_lt,
- XFS_BTREE_NOERROR);
- bno_cur_lt = NULL;
- }
+ gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+ args->alignment, args->userdata, gtbnoa,
+ gtlena, &gtnew);
+
+ error = xfs_alloc_find_best_extent(args,
+ &bno_cur_gt, &bno_cur_lt,
+ gtdiff, &ltbno, &ltlen,
+ &ltbnoa, &ltlena,
+ 1 /* search left */);
}
+
+ if (error)
+ goto error0;
}
+
/*
* If we couldn't get anything, give up.
*/
if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
- TRACE_ALLOC("neither", args);
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+
+ if (!forced++) {
+ trace_xfs_alloc_near_busy(args);
+ xfs_log_force(args->mp, XFS_LOG_SYNC);
+ goto restart;
+ }
+ trace_xfs_alloc_size_neither(args);
args->agbno = NULLAGBLOCK;
return 0;
}
+
/*
* At this point we have selected a freespace entry, either to the
* left or to the right. If it's on the right, copy all the
@@ -1219,35 +1194,41 @@ xfs_alloc_ag_vextent_near(
j = 1;
} else
j = 0;
+
/*
* Fix up the length and compute the useful address.
*/
- ltend = ltbno + ltlen;
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
if (!xfs_alloc_fix_minleft(args)) {
- TRACE_ALLOC("nominleft", args);
+ trace_xfs_alloc_near_nominleft(args);
xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
return 0;
}
rlen = args->len;
- (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno,
- ltlen, &ltnew);
+ (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
+ args->userdata, ltbnoa, ltlena, &ltnew);
ASSERT(ltnew >= ltbno);
- ASSERT(ltnew + rlen <= ltend);
+ ASSERT(ltnew + rlen <= ltbnoa + ltlena);
ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
args->agbno = ltnew;
+
if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
ltnew, rlen, XFSA_FIXUP_BNO_OK)))
goto error0;
- TRACE_ALLOC(j ? "gt" : "lt", args);
+
+ if (j)
+ trace_xfs_alloc_near_greater(args);
+ else
+ trace_xfs_alloc_near_lesser(args);
+
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
return 0;
error0:
- TRACE_ALLOC("error", args);
+ trace_xfs_alloc_near_error(args);
if (cnt_cur != NULL)
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
if (bno_cur_lt != NULL)
@@ -1272,56 +1253,95 @@ xfs_alloc_ag_vextent_size(
int error; /* error result */
xfs_agblock_t fbno; /* start of found freespace */
xfs_extlen_t flen; /* length of found freespace */
-#ifdef XFS_ALLOC_TRACE
- static char fname[] = "xfs_alloc_ag_vextent_size";
-#endif
int i; /* temp status variable */
xfs_agblock_t rbno; /* returned block number */
xfs_extlen_t rlen; /* length of returned extent */
+ int forced = 0;
+restart:
/*
* Allocate and initialize a cursor for the by-size btree.
*/
- cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_CNT, NULL, 0);
+ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->agno, XFS_BTNUM_CNT);
bno_cur = NULL;
+
/*
* Look for an entry >= maxlen+alignment-1 blocks.
*/
if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
args->maxlen + args->alignment - 1, &i)))
goto error0;
+
/*
- * If none, then pick up the last entry in the tree unless the
- * tree is empty.
+ * If none or we have busy extents that we cannot allocate from, then
+ * we have to settle for a smaller extent. In the case that there are
+ * no large extents, this will return the last entry in the tree unless
+ * the tree is empty. In the case that there are only busy large
+ * extents, this will return the largest small extent unless there
+ * are no smaller extents available.
*/
- if (!i) {
- if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno,
- &flen, &i)))
+ if (!i || forced > 1) {
+ error = xfs_alloc_ag_vextent_small(args, cnt_cur,
+ &fbno, &flen, &i);
+ if (error)
goto error0;
if (i == 0 || flen == 0) {
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- TRACE_ALLOC("noentry", args);
+ trace_xfs_alloc_size_noentry(args);
return 0;
}
ASSERT(i == 1);
+ xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
+ } else {
+ /*
+ * Search for a non-busy extent that is large enough.
+ * If we are at low space, don't check, or if we fall of
+ * the end of the btree, turn off the busy check and
+ * restart.
+ */
+ for (;;) {
+ error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ xfs_alloc_compute_aligned(args, fbno, flen,
+ &rbno, &rlen);
+
+ if (rlen >= args->maxlen)
+ break;
+
+ error = xfs_btree_increment(cnt_cur, 0, &i);
+ if (error)
+ goto error0;
+ if (i == 0) {
+ /*
+ * Our only valid extents must have been busy.
+ * Make it unbusy by forcing the log out and
+ * retrying. If we've been here before, forcing
+ * the log isn't making the extents available,
+ * which means they have probably been freed in
+ * this transaction. In that case, we have to
+ * give up on them and we'll attempt a minlen
+ * allocation the next time around.
+ */
+ xfs_btree_del_cursor(cnt_cur,
+ XFS_BTREE_NOERROR);
+ trace_xfs_alloc_size_busy(args);
+ if (!forced++)
+ xfs_log_force(args->mp, XFS_LOG_SYNC);
+ goto restart;
+ }
+ }
}
- /*
- * There's a freespace as big as maxlen+alignment-1, get it.
- */
- else {
- if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- }
+
/*
* In the first case above, we got the last entry in the
* by-size btree. Now we check to see if the space hits maxlen
* once aligned; if not, we search left for something better.
* This can't happen in the second case above.
*/
- xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen,
- &rbno, &rlen);
rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
(rlen <= flen && rbno + rlen <= fbno + flen), error0);
@@ -1336,7 +1356,7 @@ xfs_alloc_ag_vextent_size(
bestflen = flen;
bestfbno = fbno;
for (;;) {
- if ((error = xfs_alloc_decrement(cnt_cur, 0, &i)))
+ if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
goto error0;
if (i == 0)
break;
@@ -1346,8 +1366,8 @@ xfs_alloc_ag_vextent_size(
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
if (flen < bestrlen)
break;
- xfs_alloc_compute_aligned(fbno, flen, args->alignment,
- args->minlen, &rbno, &rlen);
+ xfs_alloc_compute_aligned(args, fbno, flen,
+ &rbno, &rlen);
rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
(rlen <= flen && rbno + rlen <= fbno + flen),
@@ -1375,20 +1395,26 @@ xfs_alloc_ag_vextent_size(
* Fix up the length.
*/
args->len = rlen;
- xfs_alloc_fix_len(args);
- if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) {
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- TRACE_ALLOC("nominleft", args);
- args->agbno = NULLAGBLOCK;
- return 0;
+ if (rlen < args->minlen) {
+ if (!forced++) {
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ trace_xfs_alloc_size_busy(args);
+ xfs_log_force(args->mp, XFS_LOG_SYNC);
+ goto restart;
+ }
+ goto out_nominleft;
}
+ xfs_alloc_fix_len(args);
+
+ if (!xfs_alloc_fix_minleft(args))
+ goto out_nominleft;
rlen = args->len;
XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
/*
* Allocate and initialize a cursor for the by-block tree.
*/
- bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_BNO, NULL, 0);
+ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->agno, XFS_BTNUM_BNO);
if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
rbno, rlen, XFSA_FIXUP_CNT_OK)))
goto error0;
@@ -1401,16 +1427,22 @@ xfs_alloc_ag_vextent_size(
args->agbno + args->len <=
be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
error0);
- TRACE_ALLOC("normal", args);
+ trace_xfs_alloc_size_done(args);
return 0;
error0:
- TRACE_ALLOC("error", args);
+ trace_xfs_alloc_size_error(args);
if (cnt_cur)
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
if (bno_cur)
xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
return error;
+
+out_nominleft:
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ trace_xfs_alloc_size_nominleft(args);
+ args->agbno = NULLAGBLOCK;
+ return 0;
}
/*
@@ -1429,12 +1461,9 @@ xfs_alloc_ag_vextent_small(
int error;
xfs_agblock_t fbno;
xfs_extlen_t flen;
-#ifdef XFS_ALLOC_TRACE
- static char fname[] = "xfs_alloc_ag_vextent_small";
-#endif
int i;
- if ((error = xfs_alloc_decrement(ccur, 0, &i)))
+ if ((error = xfs_btree_decrement(ccur, 0, &i)))
goto error0;
if (i) {
if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
@@ -1449,9 +1478,13 @@ xfs_alloc_ag_vextent_small(
else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
(be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
> args->minleft)) {
- if ((error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno)))
+ error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
+ if (error)
goto error0;
if (fbno != NULLAGBLOCK) {
+ xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
+ args->userdata);
+
if (args->userdata) {
xfs_buf_t *bp;
@@ -1466,7 +1499,7 @@ xfs_alloc_ag_vextent_small(
be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
error0);
args->wasfromfl = 1;
- TRACE_ALLOC("freelist", args);
+ trace_xfs_alloc_small_freelist(args);
*stat = 0;
return 0;
}
@@ -1479,24 +1512,26 @@ xfs_alloc_ag_vextent_small(
/*
* Can't allocate from the freelist for some reason.
*/
- else
+ else {
+ fbno = NULLAGBLOCK;
flen = 0;
+ }
/*
* Can't do the allocation, give up.
*/
if (flen < args->minlen) {
args->agbno = NULLAGBLOCK;
- TRACE_ALLOC("notenough", args);
+ trace_xfs_alloc_small_notenough(args);
flen = 0;
}
*fbnop = fbno;
*flenp = flen;
*stat = 1;
- TRACE_ALLOC("normal", args);
+ trace_xfs_alloc_small_done(args);
return 0;
error0:
- TRACE_ALLOC("error", args);
+ trace_xfs_alloc_small_error(args);
return error;
}
@@ -1515,9 +1550,6 @@ xfs_free_ag_extent(
xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */
xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */
int error; /* error return value */
-#ifdef XFS_ALLOC_TRACE
- static char fname[] = "xfs_free_ag_extent";
-#endif
xfs_agblock_t gtbno; /* start of right neighbor block */
xfs_extlen_t gtlen; /* length of right neighbor block */
int haveleft; /* have a left neighbor block */
@@ -1528,13 +1560,13 @@ xfs_free_ag_extent(
xfs_mount_t *mp; /* mount point struct for filesystem */
xfs_agblock_t nbno; /* new starting block of freespace */
xfs_extlen_t nlen; /* new length of freespace */
+ xfs_perag_t *pag; /* per allocation group data */
mp = tp->t_mountp;
/*
* Allocate and initialize a cursor for the by-block btree.
*/
- bno_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO, NULL,
- 0);
+ bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
cnt_cur = NULL;
/*
* Look for a neighboring block on the left (lower block numbers)
@@ -1567,7 +1599,7 @@ xfs_free_ag_extent(
* Look for a neighboring block on the right (higher block numbers)
* that is contiguous with this space.
*/
- if ((error = xfs_alloc_increment(bno_cur, 0, &haveright)))
+ if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
goto error0;
if (haveright) {
/*
@@ -1593,8 +1625,7 @@ xfs_free_ag_extent(
/*
* Now allocate and initialize a cursor for the by-size tree.
*/
- cnt_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT, NULL,
- 0);
+ cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
/*
* Have both left and right contiguous neighbors.
* Merge all three into a single free block.
@@ -1606,7 +1637,7 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_alloc_delete(cnt_cur, &i)))
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
/*
@@ -1615,19 +1646,19 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_alloc_delete(cnt_cur, &i)))
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
/*
* Delete the old by-block entry for the right block.
*/
- if ((error = xfs_alloc_delete(bno_cur, &i)))
+ if ((error = xfs_btree_delete(bno_cur, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
/*
* Move the by-block cursor back to the left neighbor.
*/
- if ((error = xfs_alloc_decrement(bno_cur, 0, &i)))
+ if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
#ifdef DEBUG
@@ -1666,14 +1697,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_alloc_delete(cnt_cur, &i)))
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
/*
* Back up the by-block cursor to the left neighbor, and
* update its length.
*/
- if ((error = xfs_alloc_decrement(bno_cur, 0, &i)))
+ if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
nbno = ltbno;
@@ -1692,7 +1723,7 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_alloc_delete(cnt_cur, &i)))
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
/*
@@ -1711,7 +1742,7 @@ xfs_free_ag_extent(
else {
nbno = bno;
nlen = len;
- if ((error = xfs_alloc_insert(bno_cur, &i)))
+ if ((error = xfs_btree_insert(bno_cur, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
}
@@ -1723,55 +1754,32 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
- if ((error = xfs_alloc_insert(cnt_cur, &i)))
+ if ((error = xfs_btree_insert(cnt_cur, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
cnt_cur = NULL;
+
/*
* Update the freespace totals in the ag and superblock.
*/
- {
- xfs_agf_t *agf;
- xfs_perag_t *pag; /* per allocation group data */
-
- agf = XFS_BUF_TO_AGF(agbp);
- pag = &mp->m_perag[agno];
- be32_add(&agf->agf_freeblks, len);
- xfs_trans_agblocks_delta(tp, len);
- pag->pagf_freeblks += len;
- XFS_WANT_CORRUPTED_GOTO(
- be32_to_cpu(agf->agf_freeblks) <=
- be32_to_cpu(agf->agf_length),
- error0);
- TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS);
- xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
- if (!isfl)
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
- XFS_STATS_INC(xs_freex);
- XFS_STATS_ADD(xs_freeb, len);
- }
- TRACE_FREE(haveleft ?
- (haveright ? "both" : "left") :
- (haveright ? "right" : "none"),
- agno, bno, len, isfl);
-
- /*
- * Since blocks move to the free list without the coordination
- * used in xfs_bmap_finish, we can't allow block to be available
- * for reallocation and non-transaction writing (user data)
- * until we know that the transaction that moved it to the free
- * list is permanently on disk. We track the blocks by declaring
- * these blocks as "busy"; the busy list is maintained on a per-ag
- * basis and each transaction records which entries should be removed
- * when the iclog commits to disk. If a busy block is allocated,
- * the iclog is pushed up to the LSN that freed the block.
- */
- xfs_alloc_mark_busy(tp, agno, bno, len);
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_alloc_update_counters(tp, pag, agbp, len);
+ xfs_perag_put(pag);
+ if (error)
+ goto error0;
+
+ if (!isfl)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
+ XFS_STATS_INC(xs_freex);
+ XFS_STATS_ADD(xs_freeb, len);
+
+ trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
+
return 0;
error0:
- TRACE_FREE("error", agno, bno, len, isfl);
+ trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
if (bno_cur)
xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
if (cnt_cur)
@@ -1807,6 +1815,25 @@ xfs_alloc_compute_maxlevels(
}
/*
+ * Find the length of the longest extent in an AG.
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag)
+{
+ xfs_extlen_t need, delta = 0;
+
+ need = XFS_MIN_FREELIST_PAG(pag, mp);
+ if (need > pag->pagf_flcount)
+ delta = need - pag->pagf_flcount;
+
+ if (pag->pagf_longest > delta)
+ return pag->pagf_longest - delta;
+ return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
+}
+
+/*
* Decide whether to use this allocation group for this allocation.
* If so, fix up the btree freelist's size.
*/
@@ -1837,40 +1864,44 @@ xfs_alloc_fix_freelist(
&agbp)))
return error;
if (!pag->pagf_init) {
+ ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
+ ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
args->agbp = NULL;
return 0;
}
} else
agbp = NULL;
- /* If this is a metadata prefered pag and we are user data
+ /*
+ * If this is a metadata preferred pag and we are user data
* then try somewhere else if we are not being asked to
* try harder at this point
*/
- if (pag->pagf_metadata && args->userdata && flags) {
+ if (pag->pagf_metadata && args->userdata &&
+ (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
+ ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
args->agbp = NULL;
return 0;
}
- need = XFS_MIN_FREELIST_PAG(pag, mp);
- delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
- /*
- * If it looks like there isn't a long enough extent, or enough
- * total blocks, reject it.
- */
- longest = (pag->pagf_longest > delta) ?
- (pag->pagf_longest - delta) :
- (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
- if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
- (args->minleft &&
- (int)(pag->pagf_freeblks + pag->pagf_flcount -
- need - args->total) <
- (int)args->minleft)) {
- if (agbp)
- xfs_trans_brelse(tp, agbp);
- args->agbp = NULL;
- return 0;
+ if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
+ /*
+ * If it looks like there isn't a long enough extent, or enough
+ * total blocks, reject it.
+ */
+ need = XFS_MIN_FREELIST_PAG(pag, mp);
+ longest = xfs_alloc_longest_free_extent(mp, pag);
+ if ((args->minlen + args->alignment + args->minalignslop - 1) >
+ longest ||
+ ((int)(pag->pagf_freeblks + pag->pagf_flcount -
+ need - args->total) < (int)args->minleft)) {
+ if (agbp)
+ xfs_trans_brelse(tp, agbp);
+ args->agbp = NULL;
+ return 0;
+ }
}
+
/*
* Get the a.g. freespace buffer.
* Can fail if we're not blocking on locks, and it's held.
@@ -1880,6 +1911,8 @@ xfs_alloc_fix_freelist(
&agbp)))
return error;
if (agbp == NULL) {
+ ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
+ ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
args->agbp = NULL;
return 0;
}
@@ -1889,22 +1922,24 @@ xfs_alloc_fix_freelist(
*/
agf = XFS_BUF_TO_AGF(agbp);
need = XFS_MIN_FREELIST(agf, mp);
- delta = need > be32_to_cpu(agf->agf_flcount) ?
- (need - be32_to_cpu(agf->agf_flcount)) : 0;
/*
* If there isn't enough total or single-extent, reject it.
*/
- longest = be32_to_cpu(agf->agf_longest);
- longest = (longest > delta) ? (longest - delta) :
- (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
- if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
- (args->minleft &&
- (int)(be32_to_cpu(agf->agf_freeblks) +
- be32_to_cpu(agf->agf_flcount) - need - args->total) <
- (int)args->minleft)) {
- xfs_trans_brelse(tp, agbp);
- args->agbp = NULL;
- return 0;
+ if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
+ delta = need > be32_to_cpu(agf->agf_flcount) ?
+ (need - be32_to_cpu(agf->agf_flcount)) : 0;
+ longest = be32_to_cpu(agf->agf_longest);
+ longest = (longest > delta) ? (longest - delta) :
+ (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
+ if ((args->minlen + args->alignment + args->minalignslop - 1) >
+ longest ||
+ ((int)(be32_to_cpu(agf->agf_freeblks) +
+ be32_to_cpu(agf->agf_flcount) - need - args->total) <
+ (int)args->minleft)) {
+ xfs_trans_brelse(tp, agbp);
+ args->agbp = NULL;
+ return 0;
+ }
}
/*
* Make the freelist shorter if it's too long.
@@ -1912,7 +1947,8 @@ xfs_alloc_fix_freelist(
while (be32_to_cpu(agf->agf_flcount) > need) {
xfs_buf_t *bp;
- if ((error = xfs_alloc_get_freelist(tp, agbp, &bno)))
+ error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
+ if (error)
return error;
if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
return error;
@@ -1922,12 +1958,11 @@ xfs_alloc_fix_freelist(
/*
* Initialize the args structure.
*/
+ memset(&targs, 0, sizeof(targs));
targs.tp = tp;
targs.mp = mp;
targs.agbp = agbp;
targs.agno = args->agno;
- targs.mod = targs.minleft = targs.wasdel = targs.userdata =
- targs.minalignslop = 0;
targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
targs.type = XFS_ALLOCTYPE_THIS_AG;
targs.pag = pag;
@@ -1942,24 +1977,33 @@ xfs_alloc_fix_freelist(
/*
* Allocate as many blocks as possible at once.
*/
- if ((error = xfs_alloc_ag_vextent(&targs)))
+ if ((error = xfs_alloc_ag_vextent(&targs))) {
+ xfs_trans_brelse(tp, agflbp);
return error;
+ }
/*
* Stop if we run out. Won't happen if callers are obeying
* the restrictions correctly. Can happen for free calls
* on a completely full ag.
*/
- if (targs.agbno == NULLAGBLOCK)
- break;
+ if (targs.agbno == NULLAGBLOCK) {
+ if (flags & XFS_ALLOC_FLAG_FREEING)
+ break;
+ xfs_trans_brelse(tp, agflbp);
+ args->agbp = NULL;
+ return 0;
+ }
/*
* Put each allocated block on the list.
*/
for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
- if ((error = xfs_alloc_put_freelist(tp, agbp, agflbp,
- bno)))
+ error = xfs_alloc_put_freelist(tp, agbp,
+ agflbp, bno, 0);
+ if (error)
return error;
}
}
+ xfs_trans_brelse(tp, agflbp);
args->agbp = agbp;
return 0;
}
@@ -1972,23 +2016,22 @@ int /* error */
xfs_alloc_get_freelist(
xfs_trans_t *tp, /* transaction pointer */
xfs_buf_t *agbp, /* buffer containing the agf structure */
- xfs_agblock_t *bnop) /* block address retrieved from freelist */
+ xfs_agblock_t *bnop, /* block address retrieved from freelist */
+ int btreeblk) /* destination is a AGF btree */
{
xfs_agf_t *agf; /* a.g. freespace structure */
- xfs_agfl_t *agfl; /* a.g. freelist structure */
xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */
xfs_agblock_t bno; /* block number returned */
+ __be32 *agfl_bno;
int error;
-#ifdef XFS_ALLOC_TRACE
- static char fname[] = "xfs_alloc_get_freelist";
-#endif
- xfs_mount_t *mp; /* mount structure */
+ int logflags;
+ xfs_mount_t *mp = tp->t_mountp;
xfs_perag_t *pag; /* per allocation group data */
- agf = XFS_BUF_TO_AGF(agbp);
/*
* Freelist is empty, give up.
*/
+ agf = XFS_BUF_TO_AGF(agbp);
if (!agf->agf_flcount) {
*bnop = NULLAGBLOCK;
return 0;
@@ -1996,36 +2039,38 @@ xfs_alloc_get_freelist(
/*
* Read the array of free blocks.
*/
- mp = tp->t_mountp;
- if ((error = xfs_alloc_read_agfl(mp, tp,
- be32_to_cpu(agf->agf_seqno), &agflbp)))
+ error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
+ &agflbp);
+ if (error)
return error;
- agfl = XFS_BUF_TO_AGFL(agflbp);
+
+
/*
* Get the block number and update the data structures.
*/
- bno = INT_GET(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)], ARCH_CONVERT);
- be32_add(&agf->agf_flfirst, 1);
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
+ be32_add_cpu(&agf->agf_flfirst, 1);
xfs_trans_brelse(tp, agflbp);
if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
agf->agf_flfirst = 0;
- pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)];
- be32_add(&agf->agf_flcount, -1);
+
+ pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+ be32_add_cpu(&agf->agf_flcount, -1);
xfs_trans_agflist_delta(tp, -1);
pag->pagf_flcount--;
- TRACE_MODAGF(NULL, agf, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT);
- xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT);
+ xfs_perag_put(pag);
+
+ logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
+ if (btreeblk) {
+ be32_add_cpu(&agf->agf_btreeblks, 1);
+ pag->pagf_btreeblks++;
+ logflags |= XFS_AGF_BTREEBLKS;
+ }
+
+ xfs_alloc_log_agf(tp, agbp, logflags);
*bnop = bno;
- /*
- * As blocks are freed, they are added to the per-ag busy list
- * and remain there until the freeing transaction is committed to
- * disk. Now that we have allocated blocks, this list must be
- * searched to see if a block is being reused. If one is, then
- * the freeing transaction must be pushed to disk NOW by forcing
- * to disk all iclogs up that transaction's LSN.
- */
- xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1);
return 0;
}
@@ -2052,9 +2097,15 @@ xfs_alloc_log_agf(
offsetof(xfs_agf_t, agf_flcount),
offsetof(xfs_agf_t, agf_freeblks),
offsetof(xfs_agf_t, agf_longest),
+ offsetof(xfs_agf_t, agf_btreeblks),
+ offsetof(xfs_agf_t, agf_uuid),
sizeof(xfs_agf_t)
};
+ trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
+
xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
}
@@ -2087,17 +2138,17 @@ xfs_alloc_put_freelist(
xfs_trans_t *tp, /* transaction pointer */
xfs_buf_t *agbp, /* buffer for a.g. freelist header */
xfs_buf_t *agflbp,/* buffer for a.g. free block array */
- xfs_agblock_t bno) /* block being freed */
+ xfs_agblock_t bno, /* block being freed */
+ int btreeblk) /* block came from a AGF btree */
{
xfs_agf_t *agf; /* a.g. freespace structure */
- xfs_agfl_t *agfl; /* a.g. free block array */
- xfs_agblock_t *blockp;/* pointer to array entry */
+ __be32 *blockp;/* pointer to array entry */
int error;
-#ifdef XFS_ALLOC_TRACE
- static char fname[] = "xfs_alloc_put_freelist";
-#endif
+ int logflags;
xfs_mount_t *mp; /* mount structure */
xfs_perag_t *pag; /* per allocation group data */
+ __be32 *agfl_bno;
+ int startoff;
agf = XFS_BUF_TO_AGF(agbp);
mp = tp->t_mountp;
@@ -2105,92 +2156,198 @@ xfs_alloc_put_freelist(
if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
be32_to_cpu(agf->agf_seqno), &agflbp)))
return error;
- agfl = XFS_BUF_TO_AGFL(agflbp);
- be32_add(&agf->agf_fllast, 1);
+ be32_add_cpu(&agf->agf_fllast, 1);
if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
agf->agf_fllast = 0;
- pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)];
- be32_add(&agf->agf_flcount, 1);
+
+ pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+ be32_add_cpu(&agf->agf_flcount, 1);
xfs_trans_agflist_delta(tp, 1);
pag->pagf_flcount++;
+
+ logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
+ if (btreeblk) {
+ be32_add_cpu(&agf->agf_btreeblks, -1);
+ pag->pagf_btreeblks--;
+ logflags |= XFS_AGF_BTREEBLKS;
+ }
+ xfs_perag_put(pag);
+
+ xfs_alloc_log_agf(tp, agbp, logflags);
+
ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
- blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)];
- INT_SET(*blockp, ARCH_CONVERT, bno);
- TRACE_MODAGF(NULL, agf, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
- xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
- xfs_trans_log_buf(tp, agflbp,
- (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl),
- (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl +
- sizeof(xfs_agblock_t) - 1));
+
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
+ *blockp = cpu_to_be32(bno);
+ startoff = (char *)blockp - (char *)agflbp->b_addr;
+
+ xfs_alloc_log_agf(tp, agbp, logflags);
+
+ xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
+ xfs_trans_log_buf(tp, agflbp, startoff,
+ startoff + sizeof(xfs_agblock_t) - 1);
return 0;
}
+static bool
+xfs_agf_verify(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp)
+ {
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid))
+ return false;
+
+ if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
+ XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+ be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+ be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
+ be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
+ be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
+ return false;
+
+ /*
+ * during growfs operations, the perag is not fully initialised,
+ * so we can't use it for any useful checking. growfs ensures we can't
+ * use it by using uncached buffers that don't have the perag attached
+ * so we can detect and avoid this problem.
+ */
+ if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
+ return false;
+
+ if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
+ be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
+ return false;
+
+ return true;;
+
+}
+
+static void
+xfs_agf_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
+ XFS_ERRTAG_ALLOC_READ_AGF,
+ XFS_RANDOM_ALLOC_READ_AGF))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
+}
+
+static void
+xfs_agf_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ if (!xfs_agf_verify(mp, bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agf_buf_ops = {
+ .verify_read = xfs_agf_read_verify,
+ .verify_write = xfs_agf_write_verify,
+};
+
/*
* Read in the allocation group header (free/alloc section).
*/
int /* error */
-xfs_alloc_read_agf(
- xfs_mount_t *mp, /* mount point structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- int flags, /* XFS_ALLOC_FLAG_... */
- xfs_buf_t **bpp) /* buffer for the ag freelist header */
+xfs_read_agf(
+ struct xfs_mount *mp, /* mount point structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ int flags, /* XFS_BUF_ */
+ struct xfs_buf **bpp) /* buffer for the ag freelist header */
{
- xfs_agf_t *agf; /* ag freelist header */
- int agf_ok; /* set if agf is consistent */
- xfs_buf_t *bp; /* return value */
- xfs_perag_t *pag; /* per allocation group data */
int error;
+ trace_xfs_read_agf(mp, agno);
+
ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(
mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1),
- (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0U,
- &bp);
+ XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
if (error)
return error;
- ASSERT(!bp || !XFS_BUF_GETERROR(bp));
- if (!bp) {
- *bpp = NULL;
+ if (!*bpp)
return 0;
- }
- /*
- * Validate the magic number of the agf block.
- */
- agf = XFS_BUF_TO_AGF(bp);
- agf_ok =
- be32_to_cpu(agf->agf_magicnum) == XFS_AGF_MAGIC &&
- XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
- be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
- be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp);
- if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
- XFS_RANDOM_ALLOC_READ_AGF))) {
- XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
- XFS_ERRLEVEL_LOW, mp, agf);
- xfs_trans_brelse(tp, bp);
- return XFS_ERROR(EFSCORRUPTED);
- }
- pag = &mp->m_perag[agno];
+
+ ASSERT(!(*bpp)->b_error);
+ xfs_buf_set_ref(*bpp, XFS_AGF_REF);
+ return 0;
+}
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int /* error */
+xfs_alloc_read_agf(
+ struct xfs_mount *mp, /* mount point structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ int flags, /* XFS_ALLOC_FLAG_... */
+ struct xfs_buf **bpp) /* buffer for the ag freelist header */
+{
+ struct xfs_agf *agf; /* ag freelist header */
+ struct xfs_perag *pag; /* per allocation group data */
+ int error;
+
+ trace_xfs_alloc_read_agf(mp, agno);
+
+ ASSERT(agno != NULLAGNUMBER);
+ error = xfs_read_agf(mp, tp, agno,
+ (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
+ bpp);
+ if (error)
+ return error;
+ if (!*bpp)
+ return 0;
+ ASSERT(!(*bpp)->b_error);
+
+ agf = XFS_BUF_TO_AGF(*bpp);
+ pag = xfs_perag_get(mp, agno);
if (!pag->pagf_init) {
pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
+ pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
pag->pagf_longest = be32_to_cpu(agf->agf_longest);
pag->pagf_levels[XFS_BTNUM_BNOi] =
be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
pag->pagf_levels[XFS_BTNUM_CNTi] =
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
- spinlock_init(&pag->pagb_lock, "xfspagb");
- pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS *
- sizeof(xfs_perag_busy_t), KM_SLEEP);
+ spin_lock_init(&pag->pagb_lock);
+ pag->pagb_count = 0;
+ pag->pagb_tree = RB_ROOT;
pag->pagf_init = 1;
}
#ifdef DEBUG
else if (!XFS_FORCED_SHUTDOWN(mp)) {
ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
+ ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
@@ -2199,8 +2356,7 @@ xfs_alloc_read_agf(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
}
#endif
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGF, XFS_AGF_REF);
- *bpp = bp;
+ xfs_perag_put(pag);
return 0;
}
@@ -2216,9 +2372,6 @@ xfs_alloc_vextent(
xfs_agblock_t agsize; /* allocation group size */
int error;
int flags; /* XFS_ALLOC_FLAG_... locking flags */
-#ifdef XFS_ALLOC_TRACE
- static char fname[] = "xfs_alloc_vextent";
-#endif
xfs_extlen_t minleft;/* minimum left value, temp copy */
xfs_mount_t *mp; /* mount structure pointer */
xfs_agnumber_t sagno; /* starting allocation group number */
@@ -2250,7 +2403,7 @@ xfs_alloc_vextent(
args->minlen > args->maxlen || args->minlen > agsize ||
args->mod >= args->prod) {
args->fsbno = NULLFSBLOCK;
- TRACE_ALLOC("badargs", args);
+ trace_xfs_alloc_vextent_badargs(args);
return 0;
}
minleft = args->minleft;
@@ -2263,24 +2416,21 @@ xfs_alloc_vextent(
* These three force us into a single a.g.
*/
args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
- down_read(&mp->m_peraglock);
- args->pag = &mp->m_perag[args->agno];
+ args->pag = xfs_perag_get(mp, args->agno);
args->minleft = 0;
error = xfs_alloc_fix_freelist(args, 0);
args->minleft = minleft;
if (error) {
- TRACE_ALLOC("nofix", args);
+ trace_xfs_alloc_vextent_nofix(args);
goto error0;
}
if (!args->agbp) {
- up_read(&mp->m_peraglock);
- TRACE_ALLOC("noagbp", args);
+ trace_xfs_alloc_vextent_noagbp(args);
break;
}
args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
if ((error = xfs_alloc_ag_vextent(args)))
goto error0;
- up_read(&mp->m_peraglock);
break;
case XFS_ALLOCTYPE_START_BNO:
/*
@@ -2332,14 +2482,13 @@ xfs_alloc_vextent(
* Loop over allocation groups twice; first time with
* trylock set, second time without.
*/
- down_read(&mp->m_peraglock);
for (;;) {
- args->pag = &mp->m_perag[args->agno];
+ args->pag = xfs_perag_get(mp, args->agno);
if (no_min) args->minleft = 0;
error = xfs_alloc_fix_freelist(args, flags);
args->minleft = minleft;
if (error) {
- TRACE_ALLOC("nofix", args);
+ trace_xfs_alloc_vextent_nofix(args);
goto error0;
}
/*
@@ -2350,15 +2499,28 @@ xfs_alloc_vextent(
goto error0;
break;
}
- TRACE_ALLOC("loopfailed", args);
+
+ trace_xfs_alloc_vextent_loopfailed(args);
+
/*
* Didn't work, figure out the next iteration.
*/
if (args->agno == sagno &&
type == XFS_ALLOCTYPE_START_BNO)
args->type = XFS_ALLOCTYPE_THIS_AG;
- if (++(args->agno) == mp->m_sb.sb_agcount)
- args->agno = 0;
+ /*
+ * For the first allocation, we can try any AG to get
+ * space. However, if we already have allocated a
+ * block, we don't want to try AGs whose number is below
+ * sagno. Otherwise, we may end up with out-of-order
+ * locking of AGF, which might cause deadlock.
+ */
+ if (++(args->agno) == mp->m_sb.sb_agcount) {
+ if (args->firstblock != NULLFSBLOCK)
+ args->agno = sagno;
+ else
+ args->agno = 0;
+ }
/*
* Reached the starting a.g., must either be done
* or switch to non-trylock mode.
@@ -2366,7 +2528,7 @@ xfs_alloc_vextent(
if (args->agno == sagno) {
if (no_min == 1) {
args->agbno = NULLAGBLOCK;
- TRACE_ALLOC("allfailed", args);
+ trace_xfs_alloc_vextent_allfailed(args);
break;
}
if (flags == 0) {
@@ -2380,8 +2542,8 @@ xfs_alloc_vextent(
}
}
}
+ xfs_perag_put(args->pag);
}
- up_read(&mp->m_peraglock);
if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
if (args->agno == sagno)
mp->m_agfrotor = (mp->m_agfrotor + 1) %
@@ -2407,9 +2569,10 @@ xfs_alloc_vextent(
args->len);
#endif
}
+ xfs_perag_put(args->pag);
return 0;
error0:
- up_read(&mp->m_peraglock);
+ xfs_perag_put(args->pag);
return error;
}
@@ -2424,178 +2587,44 @@ xfs_free_extent(
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len) /* length of extent */
{
-#ifdef DEBUG
- xfs_agf_t *agf; /* a.g. freespace header */
-#endif
- xfs_alloc_arg_t args; /* allocation argument structure */
+ xfs_alloc_arg_t args;
int error;
ASSERT(len != 0);
+ memset(&args, 0, sizeof(xfs_alloc_arg_t));
args.tp = tp;
args.mp = tp->t_mountp;
- args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
- ASSERT(args.agno < args.mp->m_sb.sb_agcount);
- args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
- args.alignment = 1;
- args.minlen = args.minleft = args.minalignslop = 0;
- down_read(&args.mp->m_peraglock);
- args.pag = &args.mp->m_perag[args.agno];
- if ((error = xfs_alloc_fix_freelist(&args, 0)))
- goto error0;
-#ifdef DEBUG
- ASSERT(args.agbp != NULL);
- agf = XFS_BUF_TO_AGF(args.agbp);
- ASSERT(args.agbno + len <= be32_to_cpu(agf->agf_length));
-#endif
- error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno,
- len, 0);
-error0:
- up_read(&args.mp->m_peraglock);
- return error;
-}
-
-
-/*
- * AG Busy list management
- * The busy list contains block ranges that have been freed but whose
- * transacations have not yet hit disk. If any block listed in a busy
- * list is reused, the transaction that freed it must be forced to disk
- * before continuing to use the block.
- *
- * xfs_alloc_mark_busy - add to the per-ag busy list
- * xfs_alloc_clear_busy - remove an item from the per-ag busy list
- */
-void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len)
-{
- xfs_mount_t *mp;
- xfs_perag_busy_t *bsy;
- int n;
- SPLDECL(s);
-
- mp = tp->t_mountp;
- s = mutex_spinlock(&mp->m_perag[agno].pagb_lock);
-
- /* search pagb_list for an open slot */
- for (bsy = mp->m_perag[agno].pagb_list, n = 0;
- n < XFS_PAGB_NUM_SLOTS;
- bsy++, n++) {
- if (bsy->busy_tp == NULL) {
- break;
- }
- }
-
- if (n < XFS_PAGB_NUM_SLOTS) {
- bsy = &mp->m_perag[agno].pagb_list[n];
- mp->m_perag[agno].pagb_count++;
- TRACE_BUSY("xfs_alloc_mark_busy", "got", agno, bno, len, n, tp);
- bsy->busy_start = bno;
- bsy->busy_length = len;
- bsy->busy_tp = tp;
- xfs_trans_add_busy(tp, agno, n);
- } else {
- TRACE_BUSY("xfs_alloc_mark_busy", "FULL", agno, bno, len, -1, tp);
- /*
- * The busy list is full! Since it is now not possible to
- * track the free block, make this a synchronous transaction
- * to insure that the block is not reused before this
- * transaction commits.
- */
- xfs_trans_set_sync(tp);
- }
-
- mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
-}
-
-void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- int idx)
-{
- xfs_mount_t *mp;
- xfs_perag_busy_t *list;
- SPLDECL(s);
-
- mp = tp->t_mountp;
-
- s = mutex_spinlock(&mp->m_perag[agno].pagb_lock);
- list = mp->m_perag[agno].pagb_list;
- ASSERT(idx < XFS_PAGB_NUM_SLOTS);
- if (list[idx].busy_tp == tp) {
- TRACE_UNBUSY("xfs_alloc_clear_busy", "found", agno, idx, tp);
- list[idx].busy_tp = NULL;
- mp->m_perag[agno].pagb_count--;
- } else {
- TRACE_UNBUSY("xfs_alloc_clear_busy", "missing", agno, idx, tp);
- }
-
- mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
-}
-
-
-/*
- * returns non-zero if any of (agno,bno):len is in a busy list
- */
-STATIC int
-xfs_alloc_search_busy(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len)
-{
- xfs_mount_t *mp;
- xfs_perag_busy_t *bsy;
- int n;
- xfs_agblock_t uend, bend;
- xfs_lsn_t lsn;
- int cnt;
- SPLDECL(s);
-
- mp = tp->t_mountp;
-
- s = mutex_spinlock(&mp->m_perag[agno].pagb_lock);
- cnt = mp->m_perag[agno].pagb_count;
+ /*
+ * validate that the block number is legal - the enables us to detect
+ * and handle a silent filesystem corruption rather than crashing.
+ */
+ args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
+ if (args.agno >= args.mp->m_sb.sb_agcount)
+ return EFSCORRUPTED;
- uend = bno + len - 1;
+ args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
+ if (args.agbno >= args.mp->m_sb.sb_agblocks)
+ return EFSCORRUPTED;
- /* search pagb_list for this slot, skipping open slots */
- for (bsy = mp->m_perag[agno].pagb_list, n = 0;
- cnt; bsy++, n++) {
+ args.pag = xfs_perag_get(args.mp, args.agno);
+ ASSERT(args.pag);
- /*
- * (start1,length1) within (start2, length2)
- */
- if (bsy->busy_tp != NULL) {
- bend = bsy->busy_start + bsy->busy_length - 1;
- if ((bno > bend) ||
- (uend < bsy->busy_start)) {
- cnt--;
- } else {
- TRACE_BUSYSEARCH("xfs_alloc_search_busy",
- "found1", agno, bno, len, n,
- tp);
- break;
- }
- }
- }
+ error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
+ if (error)
+ goto error0;
- /*
- * If a block was found, force the log through the LSN of the
- * transaction that freed the block
- */
- if (cnt) {
- TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, n, tp);
- lsn = bsy->busy_tp->t_commit_lsn;
- mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
- xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
- } else {
- TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, n, tp);
- n = -1;
- mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
+ /* validate the extent size is legal now we have the agf locked */
+ if (args.agbno + len >
+ be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
+ error = EFSCORRUPTED;
+ goto error0;
}
- return n;
+ error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
+ if (!error)
+ xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0);
+error0:
+ xfs_perag_put(args.pag);
+ return error;
}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 3546dea27b7..feacb061bab 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -19,28 +19,77 @@
#define __XFS_ALLOC_H__
struct xfs_buf;
+struct xfs_btree_cur;
struct xfs_mount;
struct xfs_perag;
struct xfs_trans;
+extern struct workqueue_struct *xfs_alloc_wq;
+
/*
* Freespace allocation types. Argument to xfs_alloc_[v]extent.
*/
-typedef enum xfs_alloctype
-{
- XFS_ALLOCTYPE_ANY_AG, /* allocate anywhere, use rotor */
- XFS_ALLOCTYPE_FIRST_AG, /* ... start at ag 0 */
- XFS_ALLOCTYPE_START_AG, /* anywhere, start in this a.g. */
- XFS_ALLOCTYPE_THIS_AG, /* anywhere in this a.g. */
- XFS_ALLOCTYPE_START_BNO, /* near this block else anywhere */
- XFS_ALLOCTYPE_NEAR_BNO, /* in this a.g. and near this block */
- XFS_ALLOCTYPE_THIS_BNO /* at exactly this block */
-} xfs_alloctype_t;
+#define XFS_ALLOCTYPE_ANY_AG 0x01 /* allocate anywhere, use rotor */
+#define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */
+#define XFS_ALLOCTYPE_START_AG 0x04 /* anywhere, start in this a.g. */
+#define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */
+#define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */
+#define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. and near this block */
+#define XFS_ALLOCTYPE_THIS_BNO 0x40 /* at exactly this block */
+
+/* this should become an enum again when the tracing code is fixed */
+typedef unsigned int xfs_alloctype_t;
+
+#define XFS_ALLOC_TYPES \
+ { XFS_ALLOCTYPE_ANY_AG, "ANY_AG" }, \
+ { XFS_ALLOCTYPE_FIRST_AG, "FIRST_AG" }, \
+ { XFS_ALLOCTYPE_START_AG, "START_AG" }, \
+ { XFS_ALLOCTYPE_THIS_AG, "THIS_AG" }, \
+ { XFS_ALLOCTYPE_START_BNO, "START_BNO" }, \
+ { XFS_ALLOCTYPE_NEAR_BNO, "NEAR_BNO" }, \
+ { XFS_ALLOCTYPE_THIS_BNO, "THIS_BNO" }
/*
* Flags for xfs_alloc_fix_freelist.
*/
#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */
+#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/
+
+/*
+ * In order to avoid ENOSPC-related deadlock caused by
+ * out-of-order locking of AGF buffer (PV 947395), we place
+ * constraints on the relationship among actual allocations for
+ * data blocks, freelist blocks, and potential file data bmap
+ * btree blocks. However, these restrictions may result in no
+ * actual space allocated for a delayed extent, for example, a data
+ * block in a certain AG is allocated but there is no additional
+ * block for the additional bmap btree block due to a split of the
+ * bmap btree of the file. The result of this may lead to an
+ * infinite loop in xfssyncd when the file gets flushed to disk and
+ * all delayed extents need to be actually allocated. To get around
+ * this, we explicitly set aside a few blocks which will not be
+ * reserved in delayed allocation. Considering the minimum number of
+ * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap
+ * btree requires 1 fsb, so we set the number of set-aside blocks
+ * to 4 + 4*agcount.
+ */
+#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4))
+
+/*
+ * When deciding how much space to allocate out of an AG, we limit the
+ * allocation maximum size to the size the AG. However, we cannot use all the
+ * blocks in the AG - some are permanently used by metadata. These
+ * blocks are generally:
+ * - the AG superblock, AGF, AGI and AGFL
+ * - the AGF (bno and cnt) and AGI btree root blocks
+ * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
+ *
+ * The AG headers are sector sized, so the amount of space they take up is
+ * dependent on filesystem geometry. The others are all single blocks.
+ */
+#define XFS_ALLOC_AG_MAX_USABLE(mp) \
+ ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
+
/*
* Argument structure for xfs_alloc routines.
@@ -68,8 +117,9 @@ typedef struct xfs_alloc_arg {
xfs_alloctype_t otype; /* original allocation type */
char wasdel; /* set if allocation was prev delayed */
char wasfromfl; /* set if allocation is from freelist */
- char isfl; /* set if is freelist blocks - !actg */
+ char isfl; /* set if is freelist blocks - !acctg */
char userdata; /* set if this is user data */
+ xfs_fsblock_t firstblock; /* io first block allocated */
} xfs_alloc_arg_t;
/*
@@ -78,26 +128,12 @@ typedef struct xfs_alloc_arg {
#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/
#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */
-
-#ifdef __KERNEL__
-
-#if defined(XFS_ALLOC_TRACE)
/*
- * Allocation tracing buffer size.
+ * Find the length of the longest extent in an AG.
*/
-#define XFS_ALLOC_TRACE_SIZE 4096
-extern ktrace_t *xfs_alloc_trace_buf;
-
-/*
- * Types for alloc tracing.
- */
-#define XFS_ALLOC_KTRACE_ALLOC 1
-#define XFS_ALLOC_KTRACE_FREE 2
-#define XFS_ALLOC_KTRACE_MODAGF 3
-#define XFS_ALLOC_KTRACE_BUSY 4
-#define XFS_ALLOC_KTRACE_UNBUSY 5
-#define XFS_ALLOC_KTRACE_BUSYSEARCH 6
-#endif
+xfs_extlen_t
+xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+ struct xfs_perag *pag);
/*
* Compute and fill in value of m_ag_maxlevels.
@@ -114,7 +150,8 @@ int /* error */
xfs_alloc_get_freelist(
struct xfs_trans *tp, /* transaction pointer */
struct xfs_buf *agbp, /* buffer containing the agf structure */
- xfs_agblock_t *bnop); /* block address retrieved from freelist */
+ xfs_agblock_t *bnop, /* block address retrieved from freelist */
+ int btreeblk); /* destination is a AGF btree */
/*
* Log the given fields from the agf structure.
@@ -143,7 +180,8 @@ xfs_alloc_put_freelist(
struct xfs_trans *tp, /* transaction pointer */
struct xfs_buf *agbp, /* buffer for a.g. freelist header */
struct xfs_buf *agflbp,/* buffer for a.g. free block array */
- xfs_agblock_t bno); /* block being freed */
+ xfs_agblock_t bno, /* block being freed */
+ int btreeblk); /* owner was a AGF btree */
/*
* Read in the allocation group header (free/alloc section).
@@ -172,18 +210,25 @@ xfs_free_extent(
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len); /* length of extent */
-void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len);
+int /* error */
+xfs_alloc_lookup_le(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat); /* success/failure */
-void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
- xfs_agnumber_t ag,
- int idx);
-
-
-#endif /* __KERNEL__ */
+int /* error */
+xfs_alloc_lookup_ge(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat); /* success/failure */
+
+int /* error */
+xfs_alloc_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t *bno, /* output: starting block of extent */
+ xfs_extlen_t *len, /* output: length of extent */
+ int *stat); /* output: success/failure */
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index a1d92da86cc..8358f1ded94 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -17,2187 +17,488 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
#include "xfs_btree.h"
-#include "xfs_ialloc.h"
+#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
-/*
- * Prototypes for internal functions.
- */
-
-STATIC void xfs_alloc_log_block(xfs_trans_t *, xfs_buf_t *, int);
-STATIC void xfs_alloc_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_alloc_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_alloc_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC int xfs_alloc_lshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_alloc_newroot(xfs_btree_cur_t *, int *);
-STATIC int xfs_alloc_rshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_alloc_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
- xfs_alloc_key_t *, xfs_btree_cur_t **, int *);
-STATIC int xfs_alloc_updkey(xfs_btree_cur_t *, xfs_alloc_key_t *, int);
-/*
- * Internal functions.
- */
+STATIC struct xfs_btree_cur *
+xfs_allocbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_private.a.agbp, cur->bc_private.a.agno,
+ cur->bc_btnum);
+}
-/*
- * Single level of the xfs_alloc_delete record deletion routine.
- * Delete record pointed to by cur/level.
- * Remove the record from its block then rebalance the tree.
- * Return 0 for error, 1 for done, 2 to go on to the next level.
- */
-STATIC int /* error */
-xfs_alloc_delrec(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level removing record from */
- int *stat) /* fail/done/go-on */
+STATIC void
+xfs_allocbt_set_root(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int inc)
{
- xfs_agf_t *agf; /* allocation group freelist header */
- xfs_alloc_block_t *block; /* btree block record/key lives in */
- xfs_agblock_t bno; /* btree block number */
- xfs_buf_t *bp; /* buffer for block */
- int error; /* error return value */
- int i; /* loop index */
- xfs_alloc_key_t key; /* kp points here if block is level 0 */
- xfs_agblock_t lbno; /* left block's block number */
- xfs_buf_t *lbp; /* left block's buffer pointer */
- xfs_alloc_block_t *left; /* left btree block */
- xfs_alloc_key_t *lkp=NULL; /* left block key pointer */
- xfs_alloc_ptr_t *lpp=NULL; /* left block address pointer */
- int lrecs=0; /* number of records in left block */
- xfs_alloc_rec_t *lrp; /* left block record pointer */
- xfs_mount_t *mp; /* mount structure */
- int ptr; /* index in btree block for this rec */
- xfs_agblock_t rbno; /* right block's block number */
- xfs_buf_t *rbp; /* right block's buffer pointer */
- xfs_alloc_block_t *right; /* right btree block */
- xfs_alloc_key_t *rkp; /* right block key pointer */
- xfs_alloc_ptr_t *rpp; /* right block address pointer */
- int rrecs=0; /* number of records in right block */
- xfs_alloc_rec_t *rrp; /* right block record pointer */
- xfs_btree_cur_t *tcur; /* temporary btree cursor */
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
+ int btnum = cur->bc_btnum;
+ struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
- /*
- * Get the index of the entry being deleted, check for nothing there.
- */
- ptr = cur->bc_ptrs[level];
- if (ptr == 0) {
- *stat = 0;
- return 0;
- }
- /*
- * Get the buffer & block containing the record or key/ptr.
- */
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
- return error;
-#endif
- /*
- * Fail if we're off the end of the block.
- */
- if (ptr > be16_to_cpu(block->bb_numrecs)) {
- *stat = 0;
- return 0;
- }
- XFS_STATS_INC(xs_abt_delrec);
- /*
- * It's a nonleaf. Excise the key and ptr being deleted, by
- * sliding the entries past them down one.
- * Log the changed areas of the block.
- */
- if (level > 0) {
- lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
- lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
-#ifdef DEBUG
- for (i = ptr; i < be16_to_cpu(block->bb_numrecs); i++) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
- return error;
- }
-#endif
- if (ptr < be16_to_cpu(block->bb_numrecs)) {
- memmove(&lkp[ptr - 1], &lkp[ptr],
- (be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lkp));
- memmove(&lpp[ptr - 1], &lpp[ptr],
- (be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lpp));
- xfs_alloc_log_ptrs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1);
- xfs_alloc_log_keys(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1);
- }
- }
- /*
- * It's a leaf. Excise the record being deleted, by sliding the
- * entries past it down one. Log the changed areas of the block.
- */
- else {
- lrp = XFS_ALLOC_REC_ADDR(block, 1, cur);
- if (ptr < be16_to_cpu(block->bb_numrecs)) {
- memmove(&lrp[ptr - 1], &lrp[ptr],
- (be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lrp));
- xfs_alloc_log_recs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1);
- }
- /*
- * If it's the first record in the block, we'll need a key
- * structure to pass up to the next level (updkey).
- */
- if (ptr == 1) {
- key.ar_startblock = lrp->ar_startblock;
- key.ar_blockcount = lrp->ar_blockcount;
- lkp = &key;
- }
- }
- /*
- * Decrement and log the number of entries in the block.
- */
- be16_add(&block->bb_numrecs, -1);
- xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
- /*
- * See if the longest free extent in the allocation group was
- * changed by this operation. True if it's the by-size btree, and
- * this is the leaf level, and there is no right sibling block,
- * and this was the last record.
- */
- agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
- mp = cur->bc_mp;
-
- if (level == 0 &&
- cur->bc_btnum == XFS_BTNUM_CNT &&
- be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
- ptr > be16_to_cpu(block->bb_numrecs)) {
- ASSERT(ptr == be16_to_cpu(block->bb_numrecs) + 1);
- /*
- * There are still records in the block. Grab the size
- * from the last one.
- */
- if (be16_to_cpu(block->bb_numrecs)) {
- rrp = XFS_ALLOC_REC_ADDR(block, be16_to_cpu(block->bb_numrecs), cur);
- agf->agf_longest = rrp->ar_blockcount;
- }
- /*
- * No free extents left.
- */
- else
- agf->agf_longest = 0;
- mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest =
- be32_to_cpu(agf->agf_longest);
- xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
- XFS_AGF_LONGEST);
- }
- /*
- * Is this the root level? If so, we're almost done.
- */
- if (level == cur->bc_nlevels - 1) {
- /*
- * If this is the root level,
- * and there's only one entry left,
- * and it's NOT the leaf level,
- * then we can get rid of this level.
- */
- if (be16_to_cpu(block->bb_numrecs) == 1 && level > 0) {
- /*
- * lpp is still set to the first pointer in the block.
- * Make it the new root of the btree.
- */
- bno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
- agf->agf_roots[cur->bc_btnum] = *lpp;
- be32_add(&agf->agf_levels[cur->bc_btnum], -1);
- mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_levels[cur->bc_btnum]--;
- /*
- * Put this buffer/block on the ag's freelist.
- */
- if ((error = xfs_alloc_put_freelist(cur->bc_tp,
- cur->bc_private.a.agbp, NULL, bno)))
- return error;
- /*
- * Since blocks move to the free list without the
- * coordination used in xfs_bmap_finish, we can't allow
- * block to be available for reallocation and
- * non-transaction writing (user data) until we know
- * that the transaction that moved it to the free list
- * is permanently on disk. We track the blocks by
- * declaring these blocks as "busy"; the busy list is
- * maintained on a per-ag basis and each transaction
- * records which entries should be removed when the
- * iclog commits to disk. If a busy block is
- * allocated, the iclog is pushed up to the LSN
- * that freed the block.
- */
- xfs_alloc_mark_busy(cur->bc_tp,
- be32_to_cpu(agf->agf_seqno), bno, 1);
-
- xfs_trans_agbtree_delta(cur->bc_tp, -1);
- xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
- XFS_AGF_ROOTS | XFS_AGF_LEVELS);
- /*
- * Update the cursor so there's one fewer level.
- */
- xfs_btree_setbuf(cur, level, NULL);
- cur->bc_nlevels--;
- } else if (level > 0 &&
- (error = xfs_alloc_decrement(cur, level, &i)))
- return error;
- *stat = 1;
- return 0;
- }
- /*
- * If we deleted the leftmost entry in the block, update the
- * key values above us in the tree.
- */
- if (ptr == 1 && (error = xfs_alloc_updkey(cur, lkp, level + 1)))
- return error;
- /*
- * If the number of records remaining in the block is at least
- * the minimum, we're done.
- */
- if (be16_to_cpu(block->bb_numrecs) >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
- if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
- return error;
- *stat = 1;
- return 0;
- }
- /*
- * Otherwise, we have to move some records around to keep the
- * tree balanced. Look at the left and right sibling blocks to
- * see if we can re-balance by moving only one record.
- */
- rbno = be32_to_cpu(block->bb_rightsib);
- lbno = be32_to_cpu(block->bb_leftsib);
- bno = NULLAGBLOCK;
- ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
- /*
- * Duplicate the cursor so our btree manipulations here won't
- * disrupt the next level up.
- */
- if ((error = xfs_btree_dup_cursor(cur, &tcur)))
- return error;
- /*
- * If there's a right sibling, see if it's ok to shift an entry
- * out of it.
- */
- if (rbno != NULLAGBLOCK) {
- /*
- * Move the temp cursor to the last entry in the next block.
- * Actually any entry but the first would suffice.
- */
- i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_alloc_increment(tcur, level, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- /*
- * Grab a pointer to the block.
- */
- rbp = tcur->bc_bufs[level];
- right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
- goto error0;
-#endif
- /*
- * Grab the current block number, for future use.
- */
- bno = be32_to_cpu(right->bb_leftsib);
- /*
- * If right block is full enough so that removing one entry
- * won't make it too empty, and left-shifting an entry out
- * of right to us works, we're done.
- */
- if (be16_to_cpu(right->bb_numrecs) - 1 >=
- XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
- if ((error = xfs_alloc_lshift(tcur, level, &i)))
- goto error0;
- if (i) {
- ASSERT(be16_to_cpu(block->bb_numrecs) >=
- XFS_ALLOC_BLOCK_MINRECS(level, cur));
- xfs_btree_del_cursor(tcur,
- XFS_BTREE_NOERROR);
- if (level > 0 &&
- (error = xfs_alloc_decrement(cur, level,
- &i)))
- return error;
- *stat = 1;
- return 0;
- }
- }
- /*
- * Otherwise, grab the number of records in right for
- * future reference, and fix up the temp cursor to point
- * to our block again (last record).
- */
- rrecs = be16_to_cpu(right->bb_numrecs);
- if (lbno != NULLAGBLOCK) {
- i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_alloc_decrement(tcur, level, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- }
- }
- /*
- * If there's a left sibling, see if it's ok to shift an entry
- * out of it.
- */
- if (lbno != NULLAGBLOCK) {
- /*
- * Move the temp cursor to the first entry in the
- * previous block.
- */
- i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_alloc_decrement(tcur, level, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- xfs_btree_firstrec(tcur, level);
- /*
- * Grab a pointer to the block.
- */
- lbp = tcur->bc_bufs[level];
- left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- goto error0;
-#endif
- /*
- * Grab the current block number, for future use.
- */
- bno = be32_to_cpu(left->bb_rightsib);
- /*
- * If left block is full enough so that removing one entry
- * won't make it too empty, and right-shifting an entry out
- * of left to us works, we're done.
- */
- if (be16_to_cpu(left->bb_numrecs) - 1 >=
- XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
- if ((error = xfs_alloc_rshift(tcur, level, &i)))
- goto error0;
- if (i) {
- ASSERT(be16_to_cpu(block->bb_numrecs) >=
- XFS_ALLOC_BLOCK_MINRECS(level, cur));
- xfs_btree_del_cursor(tcur,
- XFS_BTREE_NOERROR);
- if (level == 0)
- cur->bc_ptrs[0]++;
- *stat = 1;
- return 0;
- }
- }
- /*
- * Otherwise, grab the number of records in right for
- * future reference.
- */
- lrecs = be16_to_cpu(left->bb_numrecs);
- }
- /*
- * Delete the temp cursor, we're done with it.
- */
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- /*
- * If here, we need to do a join to keep the tree balanced.
- */
- ASSERT(bno != NULLAGBLOCK);
- /*
- * See if we can join with the left neighbor block.
- */
- if (lbno != NULLAGBLOCK &&
- lrecs + be16_to_cpu(block->bb_numrecs) <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
- /*
- * Set "right" to be the starting block,
- * "left" to be the left neighbor.
- */
- rbno = bno;
- right = block;
- rbp = bp;
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.a.agno, lbno, 0, &lbp,
- XFS_ALLOC_BTREE_REF)))
- return error;
- left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- return error;
- }
- /*
- * If that won't work, see if we can join with the right neighbor block.
- */
- else if (rbno != NULLAGBLOCK &&
- rrecs + be16_to_cpu(block->bb_numrecs) <=
- XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
- /*
- * Set "left" to be the starting block,
- * "right" to be the right neighbor.
- */
- lbno = bno;
- left = block;
- lbp = bp;
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.a.agno, rbno, 0, &rbp,
- XFS_ALLOC_BTREE_REF)))
- return error;
- right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
- if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
- return error;
- }
- /*
- * Otherwise, we can't fix the imbalance.
- * Just return. This is probably a logic error, but it's not fatal.
- */
- else {
- if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
- return error;
- *stat = 1;
- return 0;
- }
- /*
- * We're now going to join "left" and "right" by moving all the stuff
- * in "right" to "left" and deleting "right".
- */
- if (level > 0) {
- /*
- * It's a non-leaf. Move keys and pointers.
- */
- lkp = XFS_ALLOC_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur);
- lpp = XFS_ALLOC_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur);
- rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
- rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
- for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
- return error;
- }
-#endif
- memcpy(lkp, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*lkp));
- memcpy(lpp, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*lpp));
- xfs_alloc_log_keys(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1,
- be16_to_cpu(left->bb_numrecs) +
- be16_to_cpu(right->bb_numrecs));
- xfs_alloc_log_ptrs(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1,
- be16_to_cpu(left->bb_numrecs) +
- be16_to_cpu(right->bb_numrecs));
- } else {
- /*
- * It's a leaf. Move records.
- */
- lrp = XFS_ALLOC_REC_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur);
- rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
- memcpy(lrp, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*lrp));
- xfs_alloc_log_recs(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1,
- be16_to_cpu(left->bb_numrecs) +
- be16_to_cpu(right->bb_numrecs));
- }
- /*
- * If we joined with the left neighbor, set the buffer in the
- * cursor to the left block, and fix up the index.
- */
- if (bp != lbp) {
- xfs_btree_setbuf(cur, level, lbp);
- cur->bc_ptrs[level] += be16_to_cpu(left->bb_numrecs);
- }
- /*
- * If we joined with the right neighbor and there's a level above
- * us, increment the cursor at that level.
- */
- else if (level + 1 < cur->bc_nlevels &&
- (error = xfs_alloc_increment(cur, level + 1, &i)))
- return error;
- /*
- * Fix up the number of records in the surviving block.
- */
- be16_add(&left->bb_numrecs, be16_to_cpu(right->bb_numrecs));
- /*
- * Fix up the right block pointer in the surviving block, and log it.
- */
- left->bb_rightsib = right->bb_rightsib;
- xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
- /*
- * If there is a right sibling now, make it point to the
- * remaining block.
- */
- if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
- xfs_alloc_block_t *rrblock;
- xfs_buf_t *rrbp;
-
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0,
- &rrbp, XFS_ALLOC_BTREE_REF)))
- return error;
- rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
- if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
- return error;
- rrblock->bb_leftsib = cpu_to_be32(lbno);
- xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
- }
- /*
- * Free the deleting block by putting it on the freelist.
- */
- if ((error = xfs_alloc_put_freelist(cur->bc_tp, cur->bc_private.a.agbp,
- NULL, rbno)))
- return error;
- /*
- * Since blocks move to the free list without the coordination
- * used in xfs_bmap_finish, we can't allow block to be available
- * for reallocation and non-transaction writing (user data)
- * until we know that the transaction that moved it to the free
- * list is permanently on disk. We track the blocks by declaring
- * these blocks as "busy"; the busy list is maintained on a
- * per-ag basis and each transaction records which entries
- * should be removed when the iclog commits to disk. If a
- * busy block is allocated, the iclog is pushed up to the
- * LSN that freed the block.
- */
- xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
- xfs_trans_agbtree_delta(cur->bc_tp, -1);
+ ASSERT(ptr->s != 0);
- /*
- * Adjust the current level's cursor so that we're left referring
- * to the right node, after we're done.
- * If this leaves the ptr value 0 our caller will fix it up.
- */
- if (level > 0)
- cur->bc_ptrs[level]--;
- /*
- * Return value means the next level up has something to do.
- */
- *stat = 2;
- return 0;
+ agf->agf_roots[btnum] = ptr->s;
+ be32_add_cpu(&agf->agf_levels[btnum], inc);
+ pag->pagf_levels[btnum] += inc;
+ xfs_perag_put(pag);
-error0:
- xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
- return error;
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}
-/*
- * Insert one record/level. Return information to the caller
- * allowing the next level up to proceed if necessary.
- */
-STATIC int /* error */
-xfs_alloc_insrec(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level to insert record at */
- xfs_agblock_t *bnop, /* i/o: block number inserted */
- xfs_alloc_rec_t *recp, /* i/o: record data inserted */
- xfs_btree_cur_t **curp, /* output: new cursor replacing cur */
- int *stat) /* output: success/failure */
+STATIC int
+xfs_allocbt_alloc_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
{
- xfs_agf_t *agf; /* allocation group freelist header */
- xfs_alloc_block_t *block; /* btree block record/key lives in */
- xfs_buf_t *bp; /* buffer for block */
- int error; /* error return value */
- int i; /* loop index */
- xfs_alloc_key_t key; /* key value being inserted */
- xfs_alloc_key_t *kp; /* pointer to btree keys */
- xfs_agblock_t nbno; /* block number of allocated block */
- xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */
- xfs_alloc_key_t nkey; /* new key value, from split */
- xfs_alloc_rec_t nrec; /* new record value, for caller */
- int optr; /* old ptr value */
- xfs_alloc_ptr_t *pp; /* pointer to btree addresses */
- int ptr; /* index in btree block for this rec */
- xfs_alloc_rec_t *rp; /* pointer to btree records */
-
- ASSERT(be32_to_cpu(recp->ar_blockcount) > 0);
+ int error;
+ xfs_agblock_t bno;
- /*
- * GCC doesn't understand the (arguably complex) control flow in
- * this function and complains about uninitialized structure fields
- * without this.
- */
- memset(&nrec, 0, sizeof(nrec));
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- /*
- * If we made it to the root level, allocate a new root block
- * and we're done.
- */
- if (level >= cur->bc_nlevels) {
- XFS_STATS_INC(xs_abt_insrec);
- if ((error = xfs_alloc_newroot(cur, &i)))
- return error;
- *bnop = NULLAGBLOCK;
- *stat = i;
- return 0;
+ /* Allocate the new block from the freelist. If we can't, give up. */
+ error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+ &bno, 1);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
}
- /*
- * Make a key out of the record data to be inserted, and save it.
- */
- key.ar_startblock = recp->ar_startblock;
- key.ar_blockcount = recp->ar_blockcount;
- optr = ptr = cur->bc_ptrs[level];
- /*
- * If we're off the left edge, return failure.
- */
- if (ptr == 0) {
+
+ if (bno == NULLAGBLOCK) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
}
- XFS_STATS_INC(xs_abt_insrec);
- /*
- * Get pointers to the btree buffer and block.
- */
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
- return error;
- /*
- * Check that the new entry is being inserted in the right place.
- */
- if (ptr <= be16_to_cpu(block->bb_numrecs)) {
- if (level == 0) {
- rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
- xfs_btree_check_rec(cur->bc_btnum, recp, rp);
- } else {
- kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
- xfs_btree_check_key(cur->bc_btnum, &key, kp);
- }
- }
-#endif
- nbno = NULLAGBLOCK;
- ncur = (xfs_btree_cur_t *)0;
- /*
- * If the block is full, we can't insert the new entry until we
- * make the block un-full.
- */
- if (be16_to_cpu(block->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
- /*
- * First, try shifting an entry to the right neighbor.
- */
- if ((error = xfs_alloc_rshift(cur, level, &i)))
- return error;
- if (i) {
- /* nothing */
- }
- /*
- * Next, try shifting an entry to the left neighbor.
- */
- else {
- if ((error = xfs_alloc_lshift(cur, level, &i)))
- return error;
- if (i)
- optr = ptr = cur->bc_ptrs[level];
- else {
- /*
- * Next, try splitting the current block in
- * half. If this works we have to re-set our
- * variables because we could be in a
- * different block now.
- */
- if ((error = xfs_alloc_split(cur, level, &nbno,
- &nkey, &ncur, &i)))
- return error;
- if (i) {
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
- if ((error =
- xfs_btree_check_sblock(cur,
- block, level, bp)))
- return error;
-#endif
- ptr = cur->bc_ptrs[level];
- nrec.ar_startblock = nkey.ar_startblock;
- nrec.ar_blockcount = nkey.ar_blockcount;
- }
- /*
- * Otherwise the insert fails.
- */
- else {
- *stat = 0;
- return 0;
- }
- }
- }
- }
- /*
- * At this point we know there's room for our new entry in the block
- * we're pointing at.
- */
- if (level > 0) {
- /*
- * It's a non-leaf entry. Make a hole for the new data
- * in the key and ptr regions of the block.
- */
- kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
- pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
-#ifdef DEBUG
- for (i = be16_to_cpu(block->bb_numrecs); i >= ptr; i--) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
- return error;
- }
-#endif
- memmove(&kp[ptr], &kp[ptr - 1],
- (be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*kp));
- memmove(&pp[ptr], &pp[ptr - 1],
- (be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*pp));
-#ifdef DEBUG
- if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
- return error;
-#endif
- /*
- * Now stuff the new data in, bump numrecs and log the new data.
- */
- kp[ptr - 1] = key;
- pp[ptr - 1] = cpu_to_be32(*bnop);
- be16_add(&block->bb_numrecs, 1);
- xfs_alloc_log_keys(cur, bp, ptr, be16_to_cpu(block->bb_numrecs));
- xfs_alloc_log_ptrs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs));
-#ifdef DEBUG
- if (ptr < be16_to_cpu(block->bb_numrecs))
- xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
- kp + ptr);
-#endif
- } else {
- /*
- * It's a leaf entry. Make a hole for the new record.
- */
- rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
- memmove(&rp[ptr], &rp[ptr - 1],
- (be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*rp));
- /*
- * Now stuff the new record in, bump numrecs
- * and log the new data.
- */
- rp[ptr - 1] = *recp; /* INT_: struct copy */
- be16_add(&block->bb_numrecs, 1);
- xfs_alloc_log_recs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs));
-#ifdef DEBUG
- if (ptr < be16_to_cpu(block->bb_numrecs))
- xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
- rp + ptr);
-#endif
- }
- /*
- * Log the new number of records in the btree header.
- */
- xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
- /*
- * If we inserted at the start of a block, update the parents' keys.
- */
- if (optr == 1 && (error = xfs_alloc_updkey(cur, &key, level + 1)))
- return error;
- /*
- * Look to see if the longest extent in the allocation group
- * needs to be updated.
- */
- agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
- if (level == 0 &&
- cur->bc_btnum == XFS_BTNUM_CNT &&
- be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
- be32_to_cpu(recp->ar_blockcount) > be32_to_cpu(agf->agf_longest)) {
- /*
- * If this is a leaf in the by-size btree and there
- * is no right sibling block and this block is bigger
- * than the previous longest block, update it.
- */
- agf->agf_longest = recp->ar_blockcount;
- cur->bc_mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest
- = be32_to_cpu(recp->ar_blockcount);
- xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
- XFS_AGF_LONGEST);
- }
- /*
- * Return the new block number, if any.
- * If there is one, give back a record value and a cursor too.
- */
- *bnop = nbno;
- if (nbno != NULLAGBLOCK) {
- *recp = nrec; /* INT_: struct copy */
- *curp = ncur; /* INT_: struct copy */
- }
+ xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
+
+ xfs_trans_agbtree_delta(cur->bc_tp, 1);
+ new->s = cpu_to_be32(bno);
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 1;
return 0;
}
-/*
- * Log header fields from a btree block.
- */
-STATIC void
-xfs_alloc_log_block(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_buf_t *bp, /* buffer containing btree block */
- int fields) /* mask of fields: XFS_BB_... */
+STATIC int
+xfs_allocbt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
{
- int first; /* first byte offset logged */
- int last; /* last byte offset logged */
- static const short offsets[] = { /* table of offsets */
- offsetof(xfs_alloc_block_t, bb_magic),
- offsetof(xfs_alloc_block_t, bb_level),
- offsetof(xfs_alloc_block_t, bb_numrecs),
- offsetof(xfs_alloc_block_t, bb_leftsib),
- offsetof(xfs_alloc_block_t, bb_rightsib),
- sizeof(xfs_alloc_block_t)
- };
-
- xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
- xfs_trans_log_buf(tp, bp, first, last);
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ xfs_agblock_t bno;
+ int error;
+
+ bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
+ error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
+ if (error)
+ return error;
+
+ xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+ XFS_EXTENT_BUSY_SKIP_DISCARD);
+ xfs_trans_agbtree_delta(cur->bc_tp, -1);
+
+ xfs_trans_binval(cur->bc_tp, bp);
+ return 0;
}
/*
- * Log keys from a btree block (nonleaf).
+ * Update the longest extent in the AGF
*/
STATIC void
-xfs_alloc_log_keys(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_buf_t *bp, /* buffer containing btree block */
- int kfirst, /* index of first key to log */
- int klast) /* index of last key to log */
+xfs_allocbt_update_lastrec(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_rec *rec,
+ int ptr,
+ int reason)
{
- xfs_alloc_block_t *block; /* btree block to log from */
- int first; /* first byte offset logged */
- xfs_alloc_key_t *kp; /* key pointer in btree block */
- int last; /* last byte offset logged */
-
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
- kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
- first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
- last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
- xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+ xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
+ struct xfs_perag *pag;
+ __be32 len;
+ int numrecs;
+
+ ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
+
+ switch (reason) {
+ case LASTREC_UPDATE:
+ /*
+ * If this is the last leaf block and it's the last record,
+ * then update the size of the longest extent in the AG.
+ */
+ if (ptr != xfs_btree_get_numrecs(block))
+ return;
+ len = rec->alloc.ar_blockcount;
+ break;
+ case LASTREC_INSREC:
+ if (be32_to_cpu(rec->alloc.ar_blockcount) <=
+ be32_to_cpu(agf->agf_longest))
+ return;
+ len = rec->alloc.ar_blockcount;
+ break;
+ case LASTREC_DELREC:
+ numrecs = xfs_btree_get_numrecs(block);
+ if (ptr <= numrecs)
+ return;
+ ASSERT(ptr == numrecs + 1);
+
+ if (numrecs) {
+ xfs_alloc_rec_t *rrp;
+
+ rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
+ len = rrp->ar_blockcount;
+ } else {
+ len = 0;
+ }
+
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ agf->agf_longest = len;
+ pag = xfs_perag_get(cur->bc_mp, seqno);
+ pag->pagf_longest = be32_to_cpu(len);
+ xfs_perag_put(pag);
+ xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
}
-/*
- * Log block pointer fields from a btree block (nonleaf).
- */
-STATIC void
-xfs_alloc_log_ptrs(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_buf_t *bp, /* buffer containing btree block */
- int pfirst, /* index of first pointer to log */
- int plast) /* index of last pointer to log */
+STATIC int
+xfs_allocbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
{
- xfs_alloc_block_t *block; /* btree block to log from */
- int first; /* first byte offset logged */
- int last; /* last byte offset logged */
- xfs_alloc_ptr_t *pp; /* block-pointer pointer in btree blk */
-
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
- pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
- first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
- last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
- xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+ return cur->bc_mp->m_alloc_mnr[level != 0];
}
-/*
- * Log records from a btree block (leaf).
- */
-STATIC void
-xfs_alloc_log_recs(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_buf_t *bp, /* buffer containing btree block */
- int rfirst, /* index of first record to log */
- int rlast) /* index of last record to log */
+STATIC int
+xfs_allocbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
{
- xfs_alloc_block_t *block; /* btree block to log from */
- int first; /* first byte offset logged */
- int last; /* last byte offset logged */
- xfs_alloc_rec_t *rp; /* record pointer for btree block */
-
-
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
- rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
-#ifdef DEBUG
- {
- xfs_agf_t *agf;
- xfs_alloc_rec_t *p;
-
- agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
- for (p = &rp[rfirst - 1]; p <= &rp[rlast - 1]; p++)
- ASSERT(be32_to_cpu(p->ar_startblock) +
- be32_to_cpu(p->ar_blockcount) <=
- be32_to_cpu(agf->agf_length));
- }
-#endif
- first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
- last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
- xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+ return cur->bc_mp->m_alloc_mxr[level != 0];
}
-/*
- * Lookup the record. The cursor is made to point to it, based on dir.
- * Return 0 if can't find any such record, 1 for success.
- */
-STATIC int /* error */
-xfs_alloc_lookup(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_lookup_t dir, /* <=, ==, or >= */
- int *stat) /* success/failure */
+STATIC void
+xfs_allocbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
{
- xfs_agblock_t agbno; /* a.g. relative btree block number */
- xfs_agnumber_t agno; /* allocation group number */
- xfs_alloc_block_t *block=NULL; /* current btree block */
- int diff; /* difference for the current key */
- int error; /* error return value */
- int keyno=0; /* current key number */
- int level; /* level in the btree */
- xfs_mount_t *mp; /* file system mount point */
-
- XFS_STATS_INC(xs_abt_lookup);
- /*
- * Get the allocation group header, and the root block number.
- */
- mp = cur->bc_mp;
+ ASSERT(rec->alloc.ar_startblock != 0);
- {
- xfs_agf_t *agf; /* a.g. freespace header */
+ key->alloc.ar_startblock = rec->alloc.ar_startblock;
+ key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
+}
- agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
- agno = be32_to_cpu(agf->agf_seqno);
- agbno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
- }
- /*
- * Iterate over each level in the btree, starting at the root.
- * For each level above the leaves, find the key we need, based
- * on the lookup record, then follow the corresponding block
- * pointer down to the next level.
- */
- for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
- xfs_buf_t *bp; /* buffer pointer for btree block */
- xfs_daddr_t d; /* disk address of btree block */
+STATIC void
+xfs_allocbt_init_rec_from_key(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ ASSERT(key->alloc.ar_startblock != 0);
- /*
- * Get the disk address we're looking for.
- */
- d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- /*
- * If the old buffer at this level is for a different block,
- * throw it away, otherwise just use it.
- */
- bp = cur->bc_bufs[level];
- if (bp && XFS_BUF_ADDR(bp) != d)
- bp = (xfs_buf_t *)0;
- if (!bp) {
- /*
- * Need to get a new buffer. Read it, then
- * set it in the cursor, releasing the old one.
- */
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, agno,
- agbno, 0, &bp, XFS_ALLOC_BTREE_REF)))
- return error;
- xfs_btree_setbuf(cur, level, bp);
- /*
- * Point to the btree block, now that we have the buffer
- */
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
- if ((error = xfs_btree_check_sblock(cur, block, level,
- bp)))
- return error;
- } else
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
- /*
- * If we already had a key match at a higher level, we know
- * we need to use the first entry in this block.
- */
- if (diff == 0)
- keyno = 1;
- /*
- * Otherwise we need to search this block. Do a binary search.
- */
- else {
- int high; /* high entry number */
- xfs_alloc_key_t *kkbase=NULL;/* base of keys in block */
- xfs_alloc_rec_t *krbase=NULL;/* base of records in block */
- int low; /* low entry number */
-
- /*
- * Get a pointer to keys or records.
- */
- if (level > 0)
- kkbase = XFS_ALLOC_KEY_ADDR(block, 1, cur);
- else
- krbase = XFS_ALLOC_REC_ADDR(block, 1, cur);
- /*
- * Set low and high entry numbers, 1-based.
- */
- low = 1;
- if (!(high = be16_to_cpu(block->bb_numrecs))) {
- /*
- * If the block is empty, the tree must
- * be an empty leaf.
- */
- ASSERT(level == 0 && cur->bc_nlevels == 1);
- cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
- *stat = 0;
- return 0;
- }
- /*
- * Binary search the block.
- */
- while (low <= high) {
- xfs_extlen_t blockcount; /* key value */
- xfs_agblock_t startblock; /* key value */
-
- XFS_STATS_INC(xs_abt_compare);
- /*
- * keyno is average of low and high.
- */
- keyno = (low + high) >> 1;
- /*
- * Get startblock & blockcount.
- */
- if (level > 0) {
- xfs_alloc_key_t *kkp;
-
- kkp = kkbase + keyno - 1;
- startblock = be32_to_cpu(kkp->ar_startblock);
- blockcount = be32_to_cpu(kkp->ar_blockcount);
- } else {
- xfs_alloc_rec_t *krp;
-
- krp = krbase + keyno - 1;
- startblock = be32_to_cpu(krp->ar_startblock);
- blockcount = be32_to_cpu(krp->ar_blockcount);
- }
- /*
- * Compute difference to get next direction.
- */
- if (cur->bc_btnum == XFS_BTNUM_BNO)
- diff = (int)startblock -
- (int)cur->bc_rec.a.ar_startblock;
- else if (!(diff = (int)blockcount -
- (int)cur->bc_rec.a.ar_blockcount))
- diff = (int)startblock -
- (int)cur->bc_rec.a.ar_startblock;
- /*
- * Less than, move right.
- */
- if (diff < 0)
- low = keyno + 1;
- /*
- * Greater than, move left.
- */
- else if (diff > 0)
- high = keyno - 1;
- /*
- * Equal, we're done.
- */
- else
- break;
- }
- }
- /*
- * If there are more levels, set up for the next level
- * by getting the block number and filling in the cursor.
- */
- if (level > 0) {
- /*
- * If we moved left, need the previous key number,
- * unless there isn't one.
- */
- if (diff > 0 && --keyno < 1)
- keyno = 1;
- agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, keyno, cur));
-#ifdef DEBUG
- if ((error = xfs_btree_check_sptr(cur, agbno, level)))
- return error;
-#endif
- cur->bc_ptrs[level] = keyno;
- }
- }
- /*
- * Done with the search.
- * See if we need to adjust the results.
- */
- if (dir != XFS_LOOKUP_LE && diff < 0) {
- keyno++;
- /*
- * If ge search and we went off the end of the block, but it's
- * not the last block, we're in the wrong block.
- */
- if (dir == XFS_LOOKUP_GE &&
- keyno > be16_to_cpu(block->bb_numrecs) &&
- be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
- int i;
-
- cur->bc_ptrs[0] = keyno;
- if ((error = xfs_alloc_increment(cur, 0, &i)))
- return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
- *stat = 1;
- return 0;
- }
- }
- else if (dir == XFS_LOOKUP_LE && diff > 0)
- keyno--;
- cur->bc_ptrs[0] = keyno;
- /*
- * Return if we succeeded or not.
- */
- if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs))
- *stat = 0;
- else
- *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
- return 0;
+ rec->alloc.ar_startblock = key->alloc.ar_startblock;
+ rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
}
-/*
- * Move 1 record left from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int /* error */
-xfs_alloc_lshift(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level to shift record on */
- int *stat) /* success/failure */
+STATIC void
+xfs_allocbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
{
- int error; /* error return value */
-#ifdef DEBUG
- int i; /* loop index */
-#endif
- xfs_alloc_key_t key; /* key value for leaf level upward */
- xfs_buf_t *lbp; /* buffer for left neighbor block */
- xfs_alloc_block_t *left; /* left neighbor btree block */
- int nrec; /* new number of left block entries */
- xfs_buf_t *rbp; /* buffer for right (current) block */
- xfs_alloc_block_t *right; /* right (current) btree block */
- xfs_alloc_key_t *rkp=NULL; /* key pointer for right block */
- xfs_alloc_ptr_t *rpp=NULL; /* address pointer for right block */
- xfs_alloc_rec_t *rrp=NULL; /* record pointer for right block */
+ ASSERT(cur->bc_rec.a.ar_startblock != 0);
- /*
- * Set up variables for this block as "right".
- */
- rbp = cur->bc_bufs[level];
- right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
- return error;
-#endif
- /*
- * If we've got no left sibling then we can't shift an entry left.
- */
- if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- /*
- * If the cursor entry is the one that would be moved, don't
- * do it... it's too complicated.
- */
- if (cur->bc_ptrs[level] <= 1) {
- *stat = 0;
- return 0;
- }
- /*
- * Set up the left neighbor as "left".
- */
- if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
- 0, &lbp, XFS_ALLOC_BTREE_REF)))
- return error;
- left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- return error;
- /*
- * If it's full, it can't take another entry.
- */
- if (be16_to_cpu(left->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
- *stat = 0;
- return 0;
- }
- nrec = be16_to_cpu(left->bb_numrecs) + 1;
- /*
- * If non-leaf, copy a key and a ptr to the left block.
- */
- if (level > 0) {
- xfs_alloc_key_t *lkp; /* key pointer for left block */
- xfs_alloc_ptr_t *lpp; /* address pointer for left block */
-
- lkp = XFS_ALLOC_KEY_ADDR(left, nrec, cur);
- rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
- *lkp = *rkp;
- xfs_alloc_log_keys(cur, lbp, nrec, nrec);
- lpp = XFS_ALLOC_PTR_ADDR(left, nrec, cur);
- rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
- return error;
-#endif
- *lpp = *rpp; /* INT_: copy */
- xfs_alloc_log_ptrs(cur, lbp, nrec, nrec);
- xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
- }
- /*
- * If leaf, copy a record to the left block.
- */
- else {
- xfs_alloc_rec_t *lrp; /* record pointer for left block */
-
- lrp = XFS_ALLOC_REC_ADDR(left, nrec, cur);
- rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
- *lrp = *rrp;
- xfs_alloc_log_recs(cur, lbp, nrec, nrec);
- xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
- }
- /*
- * Bump and log left's numrecs, decrement and log right's numrecs.
- */
- be16_add(&left->bb_numrecs, 1);
- xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
- be16_add(&right->bb_numrecs, -1);
- xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
- /*
- * Slide the contents of right down one entry.
- */
- if (level > 0) {
-#ifdef DEBUG
- for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]),
- level)))
- return error;
- }
-#endif
- memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
- memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
- xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- } else {
- memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
- xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- key.ar_startblock = rrp->ar_startblock;
- key.ar_blockcount = rrp->ar_blockcount;
- rkp = &key;
- }
- /*
- * Update the parent key values of right.
- */
- if ((error = xfs_alloc_updkey(cur, rkp, level + 1)))
- return error;
- /*
- * Slide the cursor value left one.
- */
- cur->bc_ptrs[level]--;
- *stat = 1;
- return 0;
+ rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
+ rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
}
-/*
- * Allocate a new root block, fill it in.
- */
-STATIC int /* error */
-xfs_alloc_newroot(
- xfs_btree_cur_t *cur, /* btree cursor */
- int *stat) /* success/failure */
+STATIC void
+xfs_allocbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
{
- int error; /* error return value */
- xfs_agblock_t lbno; /* left block number */
- xfs_buf_t *lbp; /* left btree buffer */
- xfs_alloc_block_t *left; /* left btree block */
- xfs_mount_t *mp; /* mount structure */
- xfs_agblock_t nbno; /* new block number */
- xfs_buf_t *nbp; /* new (root) buffer */
- xfs_alloc_block_t *new; /* new (root) btree block */
- int nptr; /* new value for key index, 1 or 2 */
- xfs_agblock_t rbno; /* right block number */
- xfs_buf_t *rbp; /* right btree buffer */
- xfs_alloc_block_t *right; /* right btree block */
-
- mp = cur->bc_mp;
-
- ASSERT(cur->bc_nlevels < XFS_AG_MAXLEVELS(mp));
- /*
- * Get a buffer from the freelist blocks, for the new root.
- */
- if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
- &nbno)))
- return error;
- /*
- * None available, we fail.
- */
- if (nbno == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- xfs_trans_agbtree_delta(cur->bc_tp, 1);
- nbp = xfs_btree_get_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, nbno,
- 0);
- new = XFS_BUF_TO_ALLOC_BLOCK(nbp);
- /*
- * Set the root data in the a.g. freespace structure.
- */
- {
- xfs_agf_t *agf; /* a.g. freespace header */
- xfs_agnumber_t seqno;
-
- agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
- agf->agf_roots[cur->bc_btnum] = cpu_to_be32(nbno);
- be32_add(&agf->agf_levels[cur->bc_btnum], 1);
- seqno = be32_to_cpu(agf->agf_seqno);
- mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++;
- xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
- XFS_AGF_ROOTS | XFS_AGF_LEVELS);
- }
- /*
- * At the previous root level there are now two blocks: the old
- * root, and the new block generated when it was split.
- * We don't know which one the cursor is pointing at, so we
- * set up variables "left" and "right" for each case.
- */
- lbp = cur->bc_bufs[cur->bc_nlevels - 1];
- left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, left, cur->bc_nlevels - 1, lbp)))
- return error;
-#endif
- if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
- /*
- * Our block is left, pick up the right block.
- */
- lbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(lbp));
- rbno = be32_to_cpu(left->bb_rightsib);
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.a.agno, rbno, 0, &rbp,
- XFS_ALLOC_BTREE_REF)))
- return error;
- right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
- if ((error = xfs_btree_check_sblock(cur, right,
- cur->bc_nlevels - 1, rbp)))
- return error;
- nptr = 1;
- } else {
- /*
- * Our block is right, pick up the left block.
- */
- rbp = lbp;
- right = left;
- rbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(rbp));
- lbno = be32_to_cpu(right->bb_leftsib);
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.a.agno, lbno, 0, &lbp,
- XFS_ALLOC_BTREE_REF)))
- return error;
- left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
- if ((error = xfs_btree_check_sblock(cur, left,
- cur->bc_nlevels - 1, lbp)))
- return error;
- nptr = 2;
- }
- /*
- * Fill in the new block's btree header and log it.
- */
- new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
- new->bb_level = cpu_to_be16(cur->bc_nlevels);
- new->bb_numrecs = cpu_to_be16(2);
- new->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
- new->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
- xfs_alloc_log_block(cur->bc_tp, nbp, XFS_BB_ALL_BITS);
- ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
- /*
- * Fill in the key data in the new root.
- */
- {
- xfs_alloc_key_t *kp; /* btree key pointer */
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
- kp = XFS_ALLOC_KEY_ADDR(new, 1, cur);
- if (be16_to_cpu(left->bb_level) > 0) {
- kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur); /* INT_: structure copy */
- kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);/* INT_: structure copy */
- } else {
- xfs_alloc_rec_t *rp; /* btree record pointer */
-
- rp = XFS_ALLOC_REC_ADDR(left, 1, cur);
- kp[0].ar_startblock = rp->ar_startblock;
- kp[0].ar_blockcount = rp->ar_blockcount;
- rp = XFS_ALLOC_REC_ADDR(right, 1, cur);
- kp[1].ar_startblock = rp->ar_startblock;
- kp[1].ar_blockcount = rp->ar_blockcount;
- }
- }
- xfs_alloc_log_keys(cur, nbp, 1, 2);
- /*
- * Fill in the pointer data in the new root.
- */
- {
- xfs_alloc_ptr_t *pp; /* btree address pointer */
+ ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(agf->agf_roots[cur->bc_btnum] != 0);
- pp = XFS_ALLOC_PTR_ADDR(new, 1, cur);
- pp[0] = cpu_to_be32(lbno);
- pp[1] = cpu_to_be32(rbno);
- }
- xfs_alloc_log_ptrs(cur, nbp, 1, 2);
- /*
- * Fix up the cursor.
- */
- xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
- cur->bc_ptrs[cur->bc_nlevels] = nptr;
- cur->bc_nlevels++;
- *stat = 1;
- return 0;
+ ptr->s = agf->agf_roots[cur->bc_btnum];
}
-/*
- * Move 1 record right from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int /* error */
-xfs_alloc_rshift(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level to shift record on */
- int *stat) /* success/failure */
+STATIC __int64_t
+xfs_allocbt_key_diff(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
{
- int error; /* error return value */
- int i; /* loop index */
- xfs_alloc_key_t key; /* key value for leaf level upward */
- xfs_buf_t *lbp; /* buffer for left (current) block */
- xfs_alloc_block_t *left; /* left (current) btree block */
- xfs_buf_t *rbp; /* buffer for right neighbor block */
- xfs_alloc_block_t *right; /* right neighbor btree block */
- xfs_alloc_key_t *rkp; /* key pointer for right block */
- xfs_btree_cur_t *tcur; /* temporary cursor */
+ xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
+ xfs_alloc_key_t *kp = &key->alloc;
+ __int64_t diff;
- /*
- * Set up variables for this block as "left".
- */
- lbp = cur->bc_bufs[level];
- left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- return error;
-#endif
- /*
- * If we've got no right sibling then we can't shift an entry right.
- */
- if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- /*
- * If the cursor entry is the one that would be moved, don't
- * do it... it's too complicated.
- */
- if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
- *stat = 0;
- return 0;
+ if (cur->bc_btnum == XFS_BTNUM_BNO) {
+ return (__int64_t)be32_to_cpu(kp->ar_startblock) -
+ rec->ar_startblock;
}
- /*
- * Set up the right neighbor as "right".
- */
- if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
- 0, &rbp, XFS_ALLOC_BTREE_REF)))
- return error;
- right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
- if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
- return error;
- /*
- * If it's full, it can't take another entry.
- */
- if (be16_to_cpu(right->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
- *stat = 0;
- return 0;
- }
- /*
- * Make a hole at the start of the right neighbor block, then
- * copy the last left block entry to the hole.
- */
- if (level > 0) {
- xfs_alloc_key_t *lkp; /* key pointer for left block */
- xfs_alloc_ptr_t *lpp; /* address pointer for left block */
- xfs_alloc_ptr_t *rpp; /* address pointer for right block */
-
- lkp = XFS_ALLOC_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
- lpp = XFS_ALLOC_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
- rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
- rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
- for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
- return error;
- }
-#endif
- memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
- memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-#ifdef DEBUG
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
- return error;
-#endif
- *rkp = *lkp; /* INT_: copy */
- *rpp = *lpp; /* INT_: copy */
- xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
- xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
- xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
- } else {
- xfs_alloc_rec_t *lrp; /* record pointer for left block */
- xfs_alloc_rec_t *rrp; /* record pointer for right block */
-
- lrp = XFS_ALLOC_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
- rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
- memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
- *rrp = *lrp;
- xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
- key.ar_startblock = rrp->ar_startblock;
- key.ar_blockcount = rrp->ar_blockcount;
- rkp = &key;
- xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
- }
- /*
- * Decrement and log left's numrecs, bump and log right's numrecs.
- */
- be16_add(&left->bb_numrecs, -1);
- xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
- be16_add(&right->bb_numrecs, 1);
- xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
- /*
- * Using a temporary cursor, update the parent key values of the
- * block on the right.
- */
- if ((error = xfs_btree_dup_cursor(cur, &tcur)))
- return error;
- i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_alloc_increment(tcur, level, &i)) ||
- (error = xfs_alloc_updkey(tcur, rkp, level + 1)))
- goto error0;
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- *stat = 1;
- return 0;
-error0:
- xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
- return error;
-}
-/*
- * Split cur/level block in half.
- * Return new block number and its first record (to be inserted into parent).
- */
-STATIC int /* error */
-xfs_alloc_split(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level to split */
- xfs_agblock_t *bnop, /* output: block number allocated */
- xfs_alloc_key_t *keyp, /* output: first key of new block */
- xfs_btree_cur_t **curp, /* output: new cursor */
- int *stat) /* success/failure */
-{
- int error; /* error return value */
- int i; /* loop index/record number */
- xfs_agblock_t lbno; /* left (current) block number */
- xfs_buf_t *lbp; /* buffer for left block */
- xfs_alloc_block_t *left; /* left (current) btree block */
- xfs_agblock_t rbno; /* right (new) block number */
- xfs_buf_t *rbp; /* buffer for right block */
- xfs_alloc_block_t *right; /* right (new) btree block */
+ diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
+ if (diff)
+ return diff;
- /*
- * Allocate the new block from the freelist.
- * If we can't do it, we're toast. Give up.
- */
- if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
- &rbno)))
- return error;
- if (rbno == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- xfs_trans_agbtree_delta(cur->bc_tp, 1);
- rbp = xfs_btree_get_bufs(cur->bc_mp, cur->bc_tp, cur->bc_private.a.agno,
- rbno, 0);
- /*
- * Set up the new block as "right".
- */
- right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
- /*
- * "Left" is the current (according to the cursor) block.
- */
- lbp = cur->bc_bufs[level];
- left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- return error;
-#endif
- /*
- * Fill in the btree header for the new block.
- */
- right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
- right->bb_level = left->bb_level;
- right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
- /*
- * Make sure that if there's an odd number of entries now, that
- * each new block will have the same number of entries.
- */
- if ((be16_to_cpu(left->bb_numrecs) & 1) &&
- cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
- be16_add(&right->bb_numrecs, 1);
- i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
- /*
- * For non-leaf blocks, copy keys and addresses over to the new block.
- */
- if (level > 0) {
- xfs_alloc_key_t *lkp; /* left btree key pointer */
- xfs_alloc_ptr_t *lpp; /* left btree address pointer */
- xfs_alloc_key_t *rkp; /* right btree key pointer */
- xfs_alloc_ptr_t *rpp; /* right btree address pointer */
-
- lkp = XFS_ALLOC_KEY_ADDR(left, i, cur);
- lpp = XFS_ALLOC_PTR_ADDR(left, i, cur);
- rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
- rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
- for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
- return error;
- }
-#endif
- memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
- memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
- xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- *keyp = *rkp;
- }
- /*
- * For leaf blocks, copy records over to the new block.
- */
- else {
- xfs_alloc_rec_t *lrp; /* left btree record pointer */
- xfs_alloc_rec_t *rrp; /* right btree record pointer */
-
- lrp = XFS_ALLOC_REC_ADDR(left, i, cur);
- rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
- memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
- xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- keyp->ar_startblock = rrp->ar_startblock;
- keyp->ar_blockcount = rrp->ar_blockcount;
- }
- /*
- * Find the left block number by looking in the buffer.
- * Adjust numrecs, sibling pointers.
- */
- lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp));
- be16_add(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
- right->bb_rightsib = left->bb_rightsib;
- left->bb_rightsib = cpu_to_be32(rbno);
- right->bb_leftsib = cpu_to_be32(lbno);
- xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_ALL_BITS);
- xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
- /*
- * If there's a block to the new block's right, make that block
- * point back to right instead of to left.
- */
- if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) {
- xfs_alloc_block_t *rrblock; /* rr btree block */
- xfs_buf_t *rrbp; /* buffer for rrblock */
-
- if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agno, be32_to_cpu(right->bb_rightsib), 0,
- &rrbp, XFS_ALLOC_BTREE_REF)))
- return error;
- rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
- if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
- return error;
- rrblock->bb_leftsib = cpu_to_be32(rbno);
- xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
- }
- /*
- * If the cursor is really in the right block, move it there.
- * If it's just pointing past the last entry in left, then we'll
- * insert there, so don't change anything in that case.
- */
- if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
- xfs_btree_setbuf(cur, level, rbp);
- cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
- }
- /*
- * If there are more levels, we'll need another cursor which refers to
- * the right block, no matter where this cursor was.
- */
- if (level + 1 < cur->bc_nlevels) {
- if ((error = xfs_btree_dup_cursor(cur, curp)))
- return error;
- (*curp)->bc_ptrs[level + 1]++;
- }
- *bnop = rbno;
- *stat = 1;
- return 0;
+ return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
}
-/*
- * Update keys at all levels from here to the root along the cursor's path.
- */
-STATIC int /* error */
-xfs_alloc_updkey(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_alloc_key_t *keyp, /* new key value to update to */
- int level) /* starting level for update */
+static bool
+xfs_allocbt_verify(
+ struct xfs_buf *bp)
{
- int ptr; /* index of key in block */
-
- /*
- * Go up the tree from this level toward the root.
- * At each level, update the key value to the value input.
- * Stop when we reach a level where the cursor isn't pointing
- * at the first entry in the block.
- */
- for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
- xfs_alloc_block_t *block; /* btree block */
- xfs_buf_t *bp; /* buffer for block */
-#ifdef DEBUG
- int error; /* error return value */
-#endif
- xfs_alloc_key_t *kp; /* ptr to btree block keys */
-
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
- return error;
-#endif
- ptr = cur->bc_ptrs[level];
- kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
- *kp = *keyp;
- xfs_alloc_log_keys(cur, bp, ptr, ptr);
- }
- return 0;
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+ unsigned int level;
+
+ /*
+ * magic number and level verification
+ *
+ * During growfs operations, we can't verify the exact level or owner as
+ * the perag is not fully initialised and hence not attached to the
+ * buffer. In this case, check against the maximum tree depth.
+ *
+ * Similarly, during log recovery we will have a perag structure
+ * attached, but the agf information will not yet have been initialised
+ * from the on disk AGF. Again, we can only check against maximum limits
+ * in this case.
+ */
+ level = be16_to_cpu(block->bb_level);
+ switch (block->bb_magic) {
+ case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+ return false;
+ if (pag &&
+ be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ return false;
+ /* fall through */
+ case cpu_to_be32(XFS_ABTB_MAGIC):
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
+ return false;
+ } else if (level >= mp->m_ag_maxlevels)
+ return false;
+ break;
+ case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+ return false;
+ if (pag &&
+ be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ return false;
+ /* fall through */
+ case cpu_to_be32(XFS_ABTC_MAGIC):
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
+ return false;
+ } else if (level >= mp->m_ag_maxlevels)
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ /* numrecs verification */
+ if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0])
+ return false;
+
+ /* sibling pointer verification */
+ if (!block->bb_u.s.bb_leftsib ||
+ (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+ block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+ return false;
+ if (!block->bb_u.s.bb_rightsib ||
+ (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+ block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+ return false;
+
+ return true;
}
-/*
- * Externally visible routines.
- */
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int /* error */
-xfs_alloc_decrement(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level in btree, 0 is leaf */
- int *stat) /* success/failure */
+static void
+xfs_allocbt_read_verify(
+ struct xfs_buf *bp)
{
- xfs_alloc_block_t *block; /* btree block */
- int error; /* error return value */
- int lev; /* btree level */
+ if (!xfs_btree_sblock_verify_crc(bp))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_allocbt_verify(bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
- ASSERT(level < cur->bc_nlevels);
- /*
- * Read-ahead to the left at this level.
- */
- xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
- /*
- * Decrement the ptr at this level. If we're still in the block
- * then we're done.
- */
- if (--cur->bc_ptrs[level] > 0) {
- *stat = 1;
- return 0;
- }
- /*
- * Get a pointer to the btree block.
- */
- block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[level]);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level,
- cur->bc_bufs[level])))
- return error;
-#endif
- /*
- * If we just went off the left edge of the tree, return failure.
- */
- if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- /*
- * March up the tree decrementing pointers.
- * Stop when we don't go off the left edge of a block.
- */
- for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
- if (--cur->bc_ptrs[lev] > 0)
- break;
- /*
- * Read-ahead the left block, we're going to read it
- * in the next loop.
- */
- xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
- }
- /*
- * If we went off the root then we are seriously confused.
- */
- ASSERT(lev < cur->bc_nlevels);
- /*
- * Now walk back down the tree, fixing up the cursor's buffer
- * pointers and key numbers.
- */
- for (block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
- xfs_agblock_t agbno; /* block number of btree block */
- xfs_buf_t *bp; /* buffer pointer for block */
-
- agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
- if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agno, agbno, 0, &bp,
- XFS_ALLOC_BTREE_REF)))
- return error;
- lev--;
- xfs_btree_setbuf(cur, lev, bp);
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
- if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
- return error;
- cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
+ if (bp->b_error) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp);
}
- *stat = 1;
- return 0;
}
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-int /* error */
-xfs_alloc_delete(
- xfs_btree_cur_t *cur, /* btree cursor */
- int *stat) /* success/failure */
+static void
+xfs_allocbt_write_verify(
+ struct xfs_buf *bp)
{
- int error; /* error return value */
- int i; /* result code */
- int level; /* btree level */
-
- /*
- * Go up the tree, starting at leaf level.
- * If 2 is returned then a join was done; go to the next level.
- * Otherwise we are done.
- */
- for (level = 0, i = 2; i == 2; level++) {
- if ((error = xfs_alloc_delrec(cur, level, &i)))
- return error;
+ if (!xfs_allocbt_verify(bp)) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
}
- if (i == 0) {
- for (level = 1; level < cur->bc_nlevels; level++) {
- if (cur->bc_ptrs[level] == 0) {
- if ((error = xfs_alloc_decrement(cur, level, &i)))
- return error;
- break;
- }
- }
- }
- *stat = i;
- return 0;
+ xfs_btree_sblock_calc_crc(bp);
+
}
-/*
- * Get the data from the pointed-to record.
- */
-int /* error */
-xfs_alloc_get_rec(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agblock_t *bno, /* output: starting block of extent */
- xfs_extlen_t *len, /* output: length of extent */
- int *stat) /* output: success/failure */
-{
- xfs_alloc_block_t *block; /* btree block */
-#ifdef DEBUG
- int error; /* error return value */
-#endif
- int ptr; /* record number */
+const struct xfs_buf_ops xfs_allocbt_buf_ops = {
+ .verify_read = xfs_allocbt_read_verify,
+ .verify_write = xfs_allocbt_write_verify,
+};
- ptr = cur->bc_ptrs[0];
- block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
- return error;
-#endif
- /*
- * Off the right end or left end, return failure.
- */
- if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) {
- *stat = 0;
- return 0;
- }
- /*
- * Point to the record and extract its data.
- */
- {
- xfs_alloc_rec_t *rec; /* record data */
- rec = XFS_ALLOC_REC_ADDR(block, ptr, cur);
- *bno = be32_to_cpu(rec->ar_startblock);
- *len = be32_to_cpu(rec->ar_blockcount);
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int
+xfs_allocbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ if (cur->bc_btnum == XFS_BTNUM_BNO) {
+ return be32_to_cpu(k1->alloc.ar_startblock) <
+ be32_to_cpu(k2->alloc.ar_startblock);
+ } else {
+ return be32_to_cpu(k1->alloc.ar_blockcount) <
+ be32_to_cpu(k2->alloc.ar_blockcount) ||
+ (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
+ be32_to_cpu(k1->alloc.ar_startblock) <
+ be32_to_cpu(k2->alloc.ar_startblock));
}
- *stat = 1;
- return 0;
}
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int /* error */
-xfs_alloc_increment(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level in btree, 0 is leaf */
- int *stat) /* success/failure */
+STATIC int
+xfs_allocbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *r1,
+ union xfs_btree_rec *r2)
{
- xfs_alloc_block_t *block; /* btree block */
- xfs_buf_t *bp; /* tree block buffer */
- int error; /* error return value */
- int lev; /* btree level */
-
- ASSERT(level < cur->bc_nlevels);
- /*
- * Read-ahead to the right at this level.
- */
- xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
- /*
- * Get a pointer to the btree block.
- */
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
- return error;
-#endif
- /*
- * Increment the ptr at this level. If we're still in the block
- * then we're done.
- */
- if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
- *stat = 1;
- return 0;
- }
- /*
- * If we just went off the right edge of the tree, return failure.
- */
- if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- /*
- * March up the tree incrementing pointers.
- * Stop when we don't go off the right edge of a block.
- */
- for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
- bp = cur->bc_bufs[lev];
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
- return error;
-#endif
- if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
- break;
- /*
- * Read-ahead the right block, we're going to read it
- * in the next loop.
- */
- xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
- }
- /*
- * If we went off the root then we are seriously confused.
- */
- ASSERT(lev < cur->bc_nlevels);
- /*
- * Now walk back down the tree, fixing up the cursor's buffer
- * pointers and key numbers.
- */
- for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_ALLOC_BLOCK(bp);
- lev > level; ) {
- xfs_agblock_t agbno; /* block number of btree block */
-
- agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
- if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agno, agbno, 0, &bp,
- XFS_ALLOC_BTREE_REF)))
- return error;
- lev--;
- xfs_btree_setbuf(cur, lev, bp);
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
- if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
- return error;
- cur->bc_ptrs[lev] = 1;
+ if (cur->bc_btnum == XFS_BTNUM_BNO) {
+ return be32_to_cpu(r1->alloc.ar_startblock) +
+ be32_to_cpu(r1->alloc.ar_blockcount) <=
+ be32_to_cpu(r2->alloc.ar_startblock);
+ } else {
+ return be32_to_cpu(r1->alloc.ar_blockcount) <
+ be32_to_cpu(r2->alloc.ar_blockcount) ||
+ (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
+ be32_to_cpu(r1->alloc.ar_startblock) <
+ be32_to_cpu(r2->alloc.ar_startblock));
}
- *stat = 1;
- return 0;
}
+#endif /* DEBUG */
+
+static const struct xfs_btree_ops xfs_allocbt_ops = {
+ .rec_len = sizeof(xfs_alloc_rec_t),
+ .key_len = sizeof(xfs_alloc_key_t),
+
+ .dup_cursor = xfs_allocbt_dup_cursor,
+ .set_root = xfs_allocbt_set_root,
+ .alloc_block = xfs_allocbt_alloc_block,
+ .free_block = xfs_allocbt_free_block,
+ .update_lastrec = xfs_allocbt_update_lastrec,
+ .get_minrecs = xfs_allocbt_get_minrecs,
+ .get_maxrecs = xfs_allocbt_get_maxrecs,
+ .init_key_from_rec = xfs_allocbt_init_key_from_rec,
+ .init_rec_from_key = xfs_allocbt_init_rec_from_key,
+ .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
+ .key_diff = xfs_allocbt_key_diff,
+ .buf_ops = &xfs_allocbt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+ .keys_inorder = xfs_allocbt_keys_inorder,
+ .recs_inorder = xfs_allocbt_recs_inorder,
+#endif
+};
/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
+ * Allocate a new allocation btree cursor.
*/
-int /* error */
-xfs_alloc_insert(
- xfs_btree_cur_t *cur, /* btree cursor */
- int *stat) /* success/failure */
+struct xfs_btree_cur * /* new alloc btree cursor */
+xfs_allocbt_init_cursor(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_buf *agbp, /* buffer for agf structure */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_btnum_t btnum) /* btree identifier */
{
- int error; /* error return value */
- int i; /* result value, 0 for failure */
- int level; /* current level number in btree */
- xfs_agblock_t nbno; /* new block number (split result) */
- xfs_btree_cur_t *ncur; /* new cursor (split result) */
- xfs_alloc_rec_t nrec; /* record being inserted this level */
- xfs_btree_cur_t *pcur; /* previous level's cursor */
-
- level = 0;
- nbno = NULLAGBLOCK;
- nrec.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
- nrec.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
- ncur = (xfs_btree_cur_t *)0;
- pcur = cur;
- /*
- * Loop going up the tree, starting at the leaf level.
- * Stop when we don't get a split block, that must mean that
- * the insert is finished with this level.
- */
- do {
- /*
- * Insert nrec/nbno into this level of the tree.
- * Note if we fail, nbno will be null.
- */
- if ((error = xfs_alloc_insrec(pcur, level++, &nbno, &nrec, &ncur,
- &i))) {
- if (pcur != cur)
- xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
- return error;
- }
- /*
- * See if the cursor we just used is trash.
- * Can't trash the caller's cursor, but otherwise we should
- * if ncur is a new cursor or we're about to be done.
- */
- if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
- cur->bc_nlevels = pcur->bc_nlevels;
- xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
- }
- /*
- * If we got a new cursor, switch to it.
- */
- if (ncur) {
- pcur = ncur;
- ncur = (xfs_btree_cur_t *)0;
- }
- } while (nbno != NULLAGBLOCK);
- *stat = i;
- return 0;
-}
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_btree_cur *cur;
-/*
- * Lookup the record equal to [bno, len] in the btree given by cur.
- */
-int /* error */
-xfs_alloc_lookup_eq(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agblock_t bno, /* starting block of extent */
- xfs_extlen_t len, /* length of extent */
- int *stat) /* success/failure */
-{
- cur->bc_rec.a.ar_startblock = bno;
- cur->bc_rec.a.ar_blockcount = len;
- return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
+ ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
-/*
- * Lookup the first record greater than or equal to [bno, len]
- * in the btree given by cur.
- */
-int /* error */
-xfs_alloc_lookup_ge(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agblock_t bno, /* starting block of extent */
- xfs_extlen_t len, /* length of extent */
- int *stat) /* success/failure */
-{
- cur->bc_rec.a.ar_startblock = bno;
- cur->bc_rec.a.ar_blockcount = len;
- return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, stat);
-}
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
-/*
- * Lookup the first record less than or equal to [bno, len]
- * in the btree given by cur.
- */
-int /* error */
-xfs_alloc_lookup_le(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agblock_t bno, /* starting block of extent */
- xfs_extlen_t len, /* length of extent */
- int *stat) /* success/failure */
-{
- cur->bc_rec.a.ar_startblock = bno;
- cur->bc_rec.a.ar_blockcount = len;
- return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, stat);
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_btnum = btnum;
+ cur->bc_blocklog = mp->m_sb.sb_blocklog;
+ cur->bc_ops = &xfs_allocbt_ops;
+
+ if (btnum == XFS_BTNUM_CNT) {
+ cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+ cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
+ } else {
+ cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+ }
+
+ cur->bc_private.a.agbp = agbp;
+ cur->bc_private.a.agno = agno;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+ return cur;
}
/*
- * Update the record referred to by cur, to the value given by [bno, len].
- * This either works (return 0) or gets an EFSCORRUPTED error.
+ * Calculate number of records in an alloc btree block.
*/
-int /* error */
-xfs_alloc_update(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agblock_t bno, /* starting block of extent */
- xfs_extlen_t len) /* length of extent */
+int
+xfs_allocbt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
{
- xfs_alloc_block_t *block; /* btree block to update */
- int error; /* error return value */
- int ptr; /* current record number (updating) */
+ blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
- ASSERT(len > 0);
- /*
- * Pick up the a.g. freelist struct and the current block.
- */
- block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
- return error;
-#endif
- /*
- * Get the address of the rec to be updated.
- */
- ptr = cur->bc_ptrs[0];
- {
- xfs_alloc_rec_t *rp; /* pointer to updated record */
-
- rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
- /*
- * Fill in the new contents and log them.
- */
- rp->ar_startblock = cpu_to_be32(bno);
- rp->ar_blockcount = cpu_to_be32(len);
- xfs_alloc_log_recs(cur, cur->bc_bufs[0], ptr, ptr);
- }
- /*
- * If it's the by-size btree and it's the last leaf block and
- * it's the last record... then update the size of the longest
- * extent in the a.g., which we cache in the a.g. freelist header.
- */
- if (cur->bc_btnum == XFS_BTNUM_CNT &&
- be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
- ptr == be16_to_cpu(block->bb_numrecs)) {
- xfs_agf_t *agf; /* a.g. freespace header */
- xfs_agnumber_t seqno;
-
- agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
- seqno = be32_to_cpu(agf->agf_seqno);
- cur->bc_mp->m_perag[seqno].pagf_longest = len;
- agf->agf_longest = cpu_to_be32(len);
- xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
- XFS_AGF_LONGEST);
- }
- /*
- * Updating first record in leaf. Pass new key value up to our parent.
- */
- if (ptr == 1) {
- xfs_alloc_key_t key; /* key containing [bno, len] */
-
- key.ar_startblock = cpu_to_be32(bno);
- key.ar_blockcount = cpu_to_be32(len);
- if ((error = xfs_alloc_updkey(cur, &key, 1)))
- return error;
- }
- return 0;
+ if (leaf)
+ return blocklen / sizeof(xfs_alloc_rec_t);
+ return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
}
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index bce81c7a4fd..45e189e7e81 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -24,136 +24,42 @@
struct xfs_buf;
struct xfs_btree_cur;
-struct xfs_btree_sblock;
struct xfs_mount;
/*
- * There are two on-disk btrees, one sorted by blockno and one sorted
- * by blockcount and blockno. All blocks look the same to make the code
- * simpler; if we have time later, we'll make the optimizations.
+ * Btree block header size depends on a superblock flag.
*/
-#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */
-#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */
-
-/*
- * Data record/key structure
- */
-typedef struct xfs_alloc_rec {
- __be32 ar_startblock; /* starting block number */
- __be32 ar_blockcount; /* count of free blocks */
-} xfs_alloc_rec_t, xfs_alloc_key_t;
-
-typedef struct xfs_alloc_rec_incore {
- xfs_agblock_t ar_startblock; /* starting block number */
- xfs_extlen_t ar_blockcount; /* count of free blocks */
-} xfs_alloc_rec_incore_t;
-
-/* btree pointer type */
-typedef __be32 xfs_alloc_ptr_t;
-/* btree block header type */
-typedef struct xfs_btree_sblock xfs_alloc_block_t;
-
-#define XFS_BUF_TO_ALLOC_BLOCK(bp) ((xfs_alloc_block_t *)XFS_BUF_PTR(bp))
-
-/*
- * Real block structures have a size equal to the disk block size.
- */
-#define XFS_ALLOC_BLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog)
-#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_alloc_mxr[lev != 0])
-#define XFS_ALLOC_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_alloc_mnr[lev != 0])
-
-/*
- * Minimum and maximum blocksize and sectorsize.
- * The blocksize upper limit is pretty much arbitrary.
- * The sectorsize upper limit is due to sizeof(sb_sectsize).
- */
-#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */
-#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */
-#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG)
-#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG)
-#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */
-#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */
-#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG)
-#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG)
-
-/*
- * Block numbers in the AG:
- * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
- */
-#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
-#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
+#define XFS_ALLOC_BLOCK_LEN(mp) \
+ (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
/*
* Record, key, and pointer address macros for btree blocks.
- */
-#define XFS_ALLOC_REC_ADDR(bb,i,cur) \
- XFS_BTREE_REC_ADDR(XFS_ALLOC_BLOCK_SIZE(0,cur), xfs_alloc, \
- bb, i, XFS_ALLOC_BLOCK_MAXRECS(0, cur))
-
-#define XFS_ALLOC_KEY_ADDR(bb,i,cur) \
- XFS_BTREE_KEY_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, \
- bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur))
-
-#define XFS_ALLOC_PTR_ADDR(bb,i,cur) \
- XFS_BTREE_PTR_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, \
- bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur))
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_alloc_decrement(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-extern int xfs_alloc_delete(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Get the data from the pointed-to record.
- */
-extern int xfs_alloc_get_rec(struct xfs_btree_cur *cur, xfs_agblock_t *bno,
- xfs_extlen_t *len, int *stat);
-
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_alloc_increment(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
- */
-extern int xfs_alloc_insert(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Lookup the record equal to [bno, len] in the btree given by cur.
- */
-extern int xfs_alloc_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len, int *stat);
-
-/*
- * Lookup the first record greater than or equal to [bno, len]
- * in the btree given by cur.
- */
-extern int xfs_alloc_lookup_ge(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len, int *stat);
-
-/*
- * Lookup the first record less than or equal to [bno, len]
- * in the btree given by cur.
- */
-extern int xfs_alloc_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len, int *stat);
-
-/*
- * Update the record referred to by cur, to the value given by [bno, len].
- * This either works (return 0) or gets an EFSCORRUPTED error.
- */
-extern int xfs_alloc_update(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len);
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_ALLOC_REC_ADDR(mp, block, index) \
+ ((xfs_alloc_rec_t *) \
+ ((char *)(block) + \
+ XFS_ALLOC_BLOCK_LEN(mp) + \
+ (((index) - 1) * sizeof(xfs_alloc_rec_t))))
+
+#define XFS_ALLOC_KEY_ADDR(mp, block, index) \
+ ((xfs_alloc_key_t *) \
+ ((char *)(block) + \
+ XFS_ALLOC_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_alloc_key_t)))
+
+#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \
+ ((xfs_alloc_ptr_t *) \
+ ((char *)(block) + \
+ XFS_ALLOC_BLOCK_LEN(mp) + \
+ (maxrecs) * sizeof(xfs_alloc_key_t) + \
+ ((index) - 1) * sizeof(xfs_alloc_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
+ struct xfs_trans *, struct xfs_buf *,
+ xfs_agnumber_t, xfs_btnum_t);
+extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
new file mode 100644
index 00000000000..faaf716e208
--- /dev/null
+++ b/fs/xfs/xfs_aops.c
@@ -0,0 +1,1770 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_iomap.h"
+#include "xfs_trace.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include <linux/aio.h>
+#include <linux/gfp.h>
+#include <linux/mpage.h>
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
+
+void
+xfs_count_page_state(
+ struct page *page,
+ int *delalloc,
+ int *unwritten)
+{
+ struct buffer_head *bh, *head;
+
+ *delalloc = *unwritten = 0;
+
+ bh = head = page_buffers(page);
+ do {
+ if (buffer_unwritten(bh))
+ (*unwritten) = 1;
+ else if (buffer_delay(bh))
+ (*delalloc) = 1;
+ } while ((bh = bh->b_this_page) != head);
+}
+
+STATIC struct block_device *
+xfs_find_bdev_for_inode(
+ struct inode *inode)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (XFS_IS_REALTIME_INODE(ip))
+ return mp->m_rtdev_targp->bt_bdev;
+ else
+ return mp->m_ddev_targp->bt_bdev;
+}
+
+/*
+ * We're now finished for good with this ioend structure.
+ * Update the page state via the associated buffer_heads,
+ * release holds on the inode and bio, and finally free
+ * up memory. Do not use the ioend after this.
+ */
+STATIC void
+xfs_destroy_ioend(
+ xfs_ioend_t *ioend)
+{
+ struct buffer_head *bh, *next;
+
+ for (bh = ioend->io_buffer_head; bh; bh = next) {
+ next = bh->b_private;
+ bh->b_end_io(bh, !ioend->io_error);
+ }
+
+ mempool_free(ioend, xfs_ioend_pool);
+}
+
+/*
+ * Fast and loose check if this write could update the on-disk inode size.
+ */
+static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
+{
+ return ioend->io_offset + ioend->io_size >
+ XFS_I(ioend->io_inode)->i_d.di_size;
+}
+
+STATIC int
+xfs_setfilesize_trans_alloc(
+ struct xfs_ioend *ioend)
+{
+ struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+
+ ioend->io_append_trans = tp;
+
+ /*
+ * We may pass freeze protection with a transaction. So tell lockdep
+ * we released it.
+ */
+ rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 1, _THIS_IP_);
+ /*
+ * We hand off the transaction to the completion thread now, so
+ * clear the flag here.
+ */
+ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ return 0;
+}
+
+/*
+ * Update on-disk file size now that data has been written to disk.
+ */
+STATIC int
+xfs_setfilesize(
+ struct xfs_ioend *ioend)
+{
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_trans *tp = ioend->io_append_trans;
+ xfs_fsize_t isize;
+
+ /*
+ * The transaction may have been allocated in the I/O submission thread,
+ * thus we need to mark ourselves as beeing in a transaction manually.
+ * Similarly for freeze protection.
+ */
+ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 0, 1, _THIS_IP_);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
+ if (!isize) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_cancel(tp, 0);
+ return 0;
+ }
+
+ trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
+
+ ip->i_d.di_size = isize;
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ return xfs_trans_commit(tp, 0);
+}
+
+/*
+ * Schedule IO completion handling on the final put of an ioend.
+ *
+ * If there is no work to do we might as well call it a day and free the
+ * ioend right now.
+ */
+STATIC void
+xfs_finish_ioend(
+ struct xfs_ioend *ioend)
+{
+ if (atomic_dec_and_test(&ioend->io_remaining)) {
+ struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
+
+ if (ioend->io_type == XFS_IO_UNWRITTEN)
+ queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
+ else if (ioend->io_append_trans ||
+ (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
+ queue_work(mp->m_data_workqueue, &ioend->io_work);
+ else
+ xfs_destroy_ioend(ioend);
+ }
+}
+
+/*
+ * IO write completion.
+ */
+STATIC void
+xfs_end_io(
+ struct work_struct *work)
+{
+ xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work);
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ int error = 0;
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ ioend->io_error = -EIO;
+ goto done;
+ }
+ if (ioend->io_error)
+ goto done;
+
+ /*
+ * For unwritten extents we need to issue transactions to convert a
+ * range to normal written extens after the data I/O has finished.
+ */
+ if (ioend->io_type == XFS_IO_UNWRITTEN) {
+ error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+ ioend->io_size);
+ } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
+ /*
+ * For direct I/O we do not know if we need to allocate blocks
+ * or not so we can't preallocate an append transaction as that
+ * results in nested reservations and log space deadlocks. Hence
+ * allocate the transaction here. While this is sub-optimal and
+ * can block IO completion for some time, we're stuck with doing
+ * it this way until we can pass the ioend to the direct IO
+ * allocation callbacks and avoid nesting that way.
+ */
+ error = xfs_setfilesize_trans_alloc(ioend);
+ if (error)
+ goto done;
+ error = xfs_setfilesize(ioend);
+ } else if (ioend->io_append_trans) {
+ error = xfs_setfilesize(ioend);
+ } else {
+ ASSERT(!xfs_ioend_is_append(ioend));
+ }
+
+done:
+ if (error)
+ ioend->io_error = -error;
+ xfs_destroy_ioend(ioend);
+}
+
+/*
+ * Call IO completion handling in caller context on the final put of an ioend.
+ */
+STATIC void
+xfs_finish_ioend_sync(
+ struct xfs_ioend *ioend)
+{
+ if (atomic_dec_and_test(&ioend->io_remaining))
+ xfs_end_io(&ioend->io_work);
+}
+
+/*
+ * Allocate and initialise an IO completion structure.
+ * We need to track unwritten extent write completion here initially.
+ * We'll need to extend this for updating the ondisk inode size later
+ * (vs. incore size).
+ */
+STATIC xfs_ioend_t *
+xfs_alloc_ioend(
+ struct inode *inode,
+ unsigned int type)
+{
+ xfs_ioend_t *ioend;
+
+ ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
+
+ /*
+ * Set the count to 1 initially, which will prevent an I/O
+ * completion callback from happening before we have started
+ * all the I/O from calling the completion routine too early.
+ */
+ atomic_set(&ioend->io_remaining, 1);
+ ioend->io_isdirect = 0;
+ ioend->io_error = 0;
+ ioend->io_list = NULL;
+ ioend->io_type = type;
+ ioend->io_inode = inode;
+ ioend->io_buffer_head = NULL;
+ ioend->io_buffer_tail = NULL;
+ ioend->io_offset = 0;
+ ioend->io_size = 0;
+ ioend->io_append_trans = NULL;
+
+ INIT_WORK(&ioend->io_work, xfs_end_io);
+ return ioend;
+}
+
+STATIC int
+xfs_map_blocks(
+ struct inode *inode,
+ loff_t offset,
+ struct xfs_bmbt_irec *imap,
+ int type,
+ int nonblocking)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ ssize_t count = 1 << inode->i_blkbits;
+ xfs_fileoff_t offset_fsb, end_fsb;
+ int error = 0;
+ int bmapi_flags = XFS_BMAPI_ENTIRE;
+ int nimaps = 1;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -XFS_ERROR(EIO);
+
+ if (type == XFS_IO_UNWRITTEN)
+ bmapi_flags |= XFS_BMAPI_IGSTATE;
+
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+ if (nonblocking)
+ return -XFS_ERROR(EAGAIN);
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ }
+
+ ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+ (ip->i_df.if_flags & XFS_IFEXTENTS));
+ ASSERT(offset <= mp->m_super->s_maxbytes);
+
+ if (offset + count > mp->m_super->s_maxbytes)
+ count = mp->m_super->s_maxbytes - offset;
+ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+ imap, &nimaps, bmapi_flags);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (error)
+ return -XFS_ERROR(error);
+
+ if (type == XFS_IO_DELALLOC &&
+ (!nimaps || isnullstartblock(imap->br_startblock))) {
+ error = xfs_iomap_write_allocate(ip, offset, imap);
+ if (!error)
+ trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
+ return -XFS_ERROR(error);
+ }
+
+#ifdef DEBUG
+ if (type == XFS_IO_UNWRITTEN) {
+ ASSERT(nimaps);
+ ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+ ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+ }
+#endif
+ if (nimaps)
+ trace_xfs_map_blocks_found(ip, offset, count, type, imap);
+ return 0;
+}
+
+STATIC int
+xfs_imap_valid(
+ struct inode *inode,
+ struct xfs_bmbt_irec *imap,
+ xfs_off_t offset)
+{
+ offset >>= inode->i_blkbits;
+
+ return offset >= imap->br_startoff &&
+ offset < imap->br_startoff + imap->br_blockcount;
+}
+
+/*
+ * BIO completion handler for buffered IO.
+ */
+STATIC void
+xfs_end_bio(
+ struct bio *bio,
+ int error)
+{
+ xfs_ioend_t *ioend = bio->bi_private;
+
+ ASSERT(atomic_read(&bio->bi_cnt) >= 1);
+ ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
+
+ /* Toss bio and pass work off to an xfsdatad thread */
+ bio->bi_private = NULL;
+ bio->bi_end_io = NULL;
+ bio_put(bio);
+
+ xfs_finish_ioend(ioend);
+}
+
+STATIC void
+xfs_submit_ioend_bio(
+ struct writeback_control *wbc,
+ xfs_ioend_t *ioend,
+ struct bio *bio)
+{
+ atomic_inc(&ioend->io_remaining);
+ bio->bi_private = ioend;
+ bio->bi_end_io = xfs_end_bio;
+ submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
+}
+
+STATIC struct bio *
+xfs_alloc_ioend_bio(
+ struct buffer_head *bh)
+{
+ int nvecs = bio_get_nr_vecs(bh->b_bdev);
+ struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
+
+ ASSERT(bio->bi_private == NULL);
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_bdev = bh->b_bdev;
+ return bio;
+}
+
+STATIC void
+xfs_start_buffer_writeback(
+ struct buffer_head *bh)
+{
+ ASSERT(buffer_mapped(bh));
+ ASSERT(buffer_locked(bh));
+ ASSERT(!buffer_delay(bh));
+ ASSERT(!buffer_unwritten(bh));
+
+ mark_buffer_async_write(bh);
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
+}
+
+STATIC void
+xfs_start_page_writeback(
+ struct page *page,
+ int clear_dirty,
+ int buffers)
+{
+ ASSERT(PageLocked(page));
+ ASSERT(!PageWriteback(page));
+ if (clear_dirty)
+ clear_page_dirty_for_io(page);
+ set_page_writeback(page);
+ unlock_page(page);
+ /* If no buffers on the page are to be written, finish it here */
+ if (!buffers)
+ end_page_writeback(page);
+}
+
+static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
+{
+ return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+}
+
+/*
+ * Submit all of the bios for all of the ioends we have saved up, covering the
+ * initial writepage page and also any probed pages.
+ *
+ * Because we may have multiple ioends spanning a page, we need to start
+ * writeback on all the buffers before we submit them for I/O. If we mark the
+ * buffers as we got, then we can end up with a page that only has buffers
+ * marked async write and I/O complete on can occur before we mark the other
+ * buffers async write.
+ *
+ * The end result of this is that we trip a bug in end_page_writeback() because
+ * we call it twice for the one page as the code in end_buffer_async_write()
+ * assumes that all buffers on the page are started at the same time.
+ *
+ * The fix is two passes across the ioend list - one to start writeback on the
+ * buffer_heads, and then submit them for I/O on the second pass.
+ *
+ * If @fail is non-zero, it means that we have a situation where some part of
+ * the submission process has failed after we have marked paged for writeback
+ * and unlocked them. In this situation, we need to fail the ioend chain rather
+ * than submit it to IO. This typically only happens on a filesystem shutdown.
+ */
+STATIC void
+xfs_submit_ioend(
+ struct writeback_control *wbc,
+ xfs_ioend_t *ioend,
+ int fail)
+{
+ xfs_ioend_t *head = ioend;
+ xfs_ioend_t *next;
+ struct buffer_head *bh;
+ struct bio *bio;
+ sector_t lastblock = 0;
+
+ /* Pass 1 - start writeback */
+ do {
+ next = ioend->io_list;
+ for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
+ xfs_start_buffer_writeback(bh);
+ } while ((ioend = next) != NULL);
+
+ /* Pass 2 - submit I/O */
+ ioend = head;
+ do {
+ next = ioend->io_list;
+ bio = NULL;
+
+ /*
+ * If we are failing the IO now, just mark the ioend with an
+ * error and finish it. This will run IO completion immediately
+ * as there is only one reference to the ioend at this point in
+ * time.
+ */
+ if (fail) {
+ ioend->io_error = -fail;
+ xfs_finish_ioend(ioend);
+ continue;
+ }
+
+ for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+
+ if (!bio) {
+ retry:
+ bio = xfs_alloc_ioend_bio(bh);
+ } else if (bh->b_blocknr != lastblock + 1) {
+ xfs_submit_ioend_bio(wbc, ioend, bio);
+ goto retry;
+ }
+
+ if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
+ xfs_submit_ioend_bio(wbc, ioend, bio);
+ goto retry;
+ }
+
+ lastblock = bh->b_blocknr;
+ }
+ if (bio)
+ xfs_submit_ioend_bio(wbc, ioend, bio);
+ xfs_finish_ioend(ioend);
+ } while ((ioend = next) != NULL);
+}
+
+/*
+ * Cancel submission of all buffer_heads so far in this endio.
+ * Toss the endio too. Only ever called for the initial page
+ * in a writepage request, so only ever one page.
+ */
+STATIC void
+xfs_cancel_ioend(
+ xfs_ioend_t *ioend)
+{
+ xfs_ioend_t *next;
+ struct buffer_head *bh, *next_bh;
+
+ do {
+ next = ioend->io_list;
+ bh = ioend->io_buffer_head;
+ do {
+ next_bh = bh->b_private;
+ clear_buffer_async_write(bh);
+ unlock_buffer(bh);
+ } while ((bh = next_bh) != NULL);
+
+ mempool_free(ioend, xfs_ioend_pool);
+ } while ((ioend = next) != NULL);
+}
+
+/*
+ * Test to see if we've been building up a completion structure for
+ * earlier buffers -- if so, we try to append to this ioend if we
+ * can, otherwise we finish off any current ioend and start another.
+ * Return true if we've finished the given ioend.
+ */
+STATIC void
+xfs_add_to_ioend(
+ struct inode *inode,
+ struct buffer_head *bh,
+ xfs_off_t offset,
+ unsigned int type,
+ xfs_ioend_t **result,
+ int need_ioend)
+{
+ xfs_ioend_t *ioend = *result;
+
+ if (!ioend || need_ioend || type != ioend->io_type) {
+ xfs_ioend_t *previous = *result;
+
+ ioend = xfs_alloc_ioend(inode, type);
+ ioend->io_offset = offset;
+ ioend->io_buffer_head = bh;
+ ioend->io_buffer_tail = bh;
+ if (previous)
+ previous->io_list = ioend;
+ *result = ioend;
+ } else {
+ ioend->io_buffer_tail->b_private = bh;
+ ioend->io_buffer_tail = bh;
+ }
+
+ bh->b_private = NULL;
+ ioend->io_size += bh->b_size;
+}
+
+STATIC void
+xfs_map_buffer(
+ struct inode *inode,
+ struct buffer_head *bh,
+ struct xfs_bmbt_irec *imap,
+ xfs_off_t offset)
+{
+ sector_t bn;
+ struct xfs_mount *m = XFS_I(inode)->i_mount;
+ xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
+ xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
+
+ ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+ ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+
+ bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
+ ((offset - iomap_offset) >> inode->i_blkbits);
+
+ ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
+
+ bh->b_blocknr = bn;
+ set_buffer_mapped(bh);
+}
+
+STATIC void
+xfs_map_at_offset(
+ struct inode *inode,
+ struct buffer_head *bh,
+ struct xfs_bmbt_irec *imap,
+ xfs_off_t offset)
+{
+ ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+ ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+
+ xfs_map_buffer(inode, bh, imap, offset);
+ set_buffer_mapped(bh);
+ clear_buffer_delay(bh);
+ clear_buffer_unwritten(bh);
+}
+
+/*
+ * Test if a given page contains at least one buffer of a given @type.
+ * If @check_all_buffers is true, then we walk all the buffers in the page to
+ * try to find one of the type passed in. If it is not set, then the caller only
+ * needs to check the first buffer on the page for a match.
+ */
+STATIC bool
+xfs_check_page_type(
+ struct page *page,
+ unsigned int type,
+ bool check_all_buffers)
+{
+ struct buffer_head *bh;
+ struct buffer_head *head;
+
+ if (PageWriteback(page))
+ return false;
+ if (!page->mapping)
+ return false;
+ if (!page_has_buffers(page))
+ return false;
+
+ bh = head = page_buffers(page);
+ do {
+ if (buffer_unwritten(bh)) {
+ if (type == XFS_IO_UNWRITTEN)
+ return true;
+ } else if (buffer_delay(bh)) {
+ if (type == XFS_IO_DELALLOC)
+ return true;
+ } else if (buffer_dirty(bh) && buffer_mapped(bh)) {
+ if (type == XFS_IO_OVERWRITE)
+ return true;
+ }
+
+ /* If we are only checking the first buffer, we are done now. */
+ if (!check_all_buffers)
+ break;
+ } while ((bh = bh->b_this_page) != head);
+
+ return false;
+}
+
+/*
+ * Allocate & map buffers for page given the extent map. Write it out.
+ * except for the original page of a writepage, this is called on
+ * delalloc/unwritten pages only, for the original page it is possible
+ * that the page has no mapping at all.
+ */
+STATIC int
+xfs_convert_page(
+ struct inode *inode,
+ struct page *page,
+ loff_t tindex,
+ struct xfs_bmbt_irec *imap,
+ xfs_ioend_t **ioendp,
+ struct writeback_control *wbc)
+{
+ struct buffer_head *bh, *head;
+ xfs_off_t end_offset;
+ unsigned long p_offset;
+ unsigned int type;
+ int len, page_dirty;
+ int count = 0, done = 0, uptodate = 1;
+ xfs_off_t offset = page_offset(page);
+
+ if (page->index != tindex)
+ goto fail;
+ if (!trylock_page(page))
+ goto fail;
+ if (PageWriteback(page))
+ goto fail_unlock_page;
+ if (page->mapping != inode->i_mapping)
+ goto fail_unlock_page;
+ if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
+ goto fail_unlock_page;
+
+ /*
+ * page_dirty is initially a count of buffers on the page before
+ * EOF and is decremented as we move each into a cleanable state.
+ *
+ * Derivation:
+ *
+ * End offset is the highest offset that this page should represent.
+ * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
+ * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
+ * hence give us the correct page_dirty count. On any other page,
+ * it will be zero and in that case we need page_dirty to be the
+ * count of buffers on the page.
+ */
+ end_offset = min_t(unsigned long long,
+ (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
+ i_size_read(inode));
+
+ /*
+ * If the current map does not span the entire page we are about to try
+ * to write, then give up. The only way we can write a page that spans
+ * multiple mappings in a single writeback iteration is via the
+ * xfs_vm_writepage() function. Data integrity writeback requires the
+ * entire page to be written in a single attempt, otherwise the part of
+ * the page we don't write here doesn't get written as part of the data
+ * integrity sync.
+ *
+ * For normal writeback, we also don't attempt to write partial pages
+ * here as it simply means that write_cache_pages() will see it under
+ * writeback and ignore the page until some point in the future, at
+ * which time this will be the only page in the file that needs
+ * writeback. Hence for more optimal IO patterns, we should always
+ * avoid partial page writeback due to multiple mappings on a page here.
+ */
+ if (!xfs_imap_valid(inode, imap, end_offset))
+ goto fail_unlock_page;
+
+ len = 1 << inode->i_blkbits;
+ p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
+ PAGE_CACHE_SIZE);
+ p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
+ page_dirty = p_offset / len;
+
+ /*
+ * The moment we find a buffer that doesn't match our current type
+ * specification or can't be written, abort the loop and start
+ * writeback. As per the above xfs_imap_valid() check, only
+ * xfs_vm_writepage() can handle partial page writeback fully - we are
+ * limited here to the buffers that are contiguous with the current
+ * ioend, and hence a buffer we can't write breaks that contiguity and
+ * we have to defer the rest of the IO to xfs_vm_writepage().
+ */
+ bh = head = page_buffers(page);
+ do {
+ if (offset >= end_offset)
+ break;
+ if (!buffer_uptodate(bh))
+ uptodate = 0;
+ if (!(PageUptodate(page) || buffer_uptodate(bh))) {
+ done = 1;
+ break;
+ }
+
+ if (buffer_unwritten(bh) || buffer_delay(bh) ||
+ buffer_mapped(bh)) {
+ if (buffer_unwritten(bh))
+ type = XFS_IO_UNWRITTEN;
+ else if (buffer_delay(bh))
+ type = XFS_IO_DELALLOC;
+ else
+ type = XFS_IO_OVERWRITE;
+
+ /*
+ * imap should always be valid because of the above
+ * partial page end_offset check on the imap.
+ */
+ ASSERT(xfs_imap_valid(inode, imap, offset));
+
+ lock_buffer(bh);
+ if (type != XFS_IO_OVERWRITE)
+ xfs_map_at_offset(inode, bh, imap, offset);
+ xfs_add_to_ioend(inode, bh, offset, type,
+ ioendp, done);
+
+ page_dirty--;
+ count++;
+ } else {
+ done = 1;
+ break;
+ }
+ } while (offset += len, (bh = bh->b_this_page) != head);
+
+ if (uptodate && bh == head)
+ SetPageUptodate(page);
+
+ if (count) {
+ if (--wbc->nr_to_write <= 0 &&
+ wbc->sync_mode == WB_SYNC_NONE)
+ done = 1;
+ }
+ xfs_start_page_writeback(page, !page_dirty, count);
+
+ return done;
+ fail_unlock_page:
+ unlock_page(page);
+ fail:
+ return 1;
+}
+
+/*
+ * Convert & write out a cluster of pages in the same extent as defined
+ * by mp and following the start page.
+ */
+STATIC void
+xfs_cluster_write(
+ struct inode *inode,
+ pgoff_t tindex,
+ struct xfs_bmbt_irec *imap,
+ xfs_ioend_t **ioendp,
+ struct writeback_control *wbc,
+ pgoff_t tlast)
+{
+ struct pagevec pvec;
+ int done = 0, i;
+
+ pagevec_init(&pvec, 0);
+ while (!done && tindex <= tlast) {
+ unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
+
+ if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
+ break;
+
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ done = xfs_convert_page(inode, pvec.pages[i], tindex++,
+ imap, ioendp, wbc);
+ if (done)
+ break;
+ }
+
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+}
+
+STATIC void
+xfs_vm_invalidatepage(
+ struct page *page,
+ unsigned int offset,
+ unsigned int length)
+{
+ trace_xfs_invalidatepage(page->mapping->host, page, offset,
+ length);
+ block_invalidatepage(page, offset, length);
+}
+
+/*
+ * If the page has delalloc buffers on it, we need to punch them out before we
+ * invalidate the page. If we don't, we leave a stale delalloc mapping on the
+ * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
+ * is done on that same region - the delalloc extent is returned when none is
+ * supposed to be there.
+ *
+ * We prevent this by truncating away the delalloc regions on the page before
+ * invalidating it. Because they are delalloc, we can do this without needing a
+ * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
+ * truncation without a transaction as there is no space left for block
+ * reservation (typically why we see a ENOSPC in writeback).
+ *
+ * This is not a performance critical path, so for now just do the punching a
+ * buffer head at a time.
+ */
+STATIC void
+xfs_aops_discard_page(
+ struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct buffer_head *bh, *head;
+ loff_t offset = page_offset(page);
+
+ if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
+ goto out_invalidate;
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ goto out_invalidate;
+
+ xfs_alert(ip->i_mount,
+ "page discard on page %p, inode 0x%llx, offset %llu.",
+ page, ip->i_ino, offset);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ bh = head = page_buffers(page);
+ do {
+ int error;
+ xfs_fileoff_t start_fsb;
+
+ if (!buffer_delay(bh))
+ goto next_buffer;
+
+ start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+ error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
+ if (error) {
+ /* something screwed, just bail */
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ xfs_alert(ip->i_mount,
+ "page discard unable to remove delalloc mapping.");
+ }
+ break;
+ }
+next_buffer:
+ offset += 1 << inode->i_blkbits;
+
+ } while ((bh = bh->b_this_page) != head);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out_invalidate:
+ xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ return;
+}
+
+/*
+ * Write out a dirty page.
+ *
+ * For delalloc space on the page we need to allocate space and flush it.
+ * For unwritten space on the page we need to start the conversion to
+ * regular allocated space.
+ * For any other dirty buffer heads on the page we should flush them.
+ */
+STATIC int
+xfs_vm_writepage(
+ struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ struct buffer_head *bh, *head;
+ struct xfs_bmbt_irec imap;
+ xfs_ioend_t *ioend = NULL, *iohead = NULL;
+ loff_t offset;
+ unsigned int type;
+ __uint64_t end_offset;
+ pgoff_t end_index, last_index;
+ ssize_t len;
+ int err, imap_valid = 0, uptodate = 1;
+ int count = 0;
+ int nonblocking = 0;
+
+ trace_xfs_writepage(inode, page, 0, 0);
+
+ ASSERT(page_has_buffers(page));
+
+ /*
+ * Refuse to write the page out if we are called from reclaim context.
+ *
+ * This avoids stack overflows when called from deeply used stacks in
+ * random callers for direct reclaim or memcg reclaim. We explicitly
+ * allow reclaim from kswapd as the stack usage there is relatively low.
+ *
+ * This should never happen except in the case of a VM regression so
+ * warn about it.
+ */
+ if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
+ PF_MEMALLOC))
+ goto redirty;
+
+ /*
+ * Given that we do not allow direct reclaim to call us, we should
+ * never be called while in a filesystem transaction.
+ */
+ if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
+ goto redirty;
+
+ /* Is this page beyond the end of the file? */
+ offset = i_size_read(inode);
+ end_index = offset >> PAGE_CACHE_SHIFT;
+ last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
+
+ /*
+ * The page index is less than the end_index, adjust the end_offset
+ * to the highest offset that this page should represent.
+ * -----------------------------------------------------
+ * | file mapping | <EOF> |
+ * -----------------------------------------------------
+ * | Page ... | Page N-2 | Page N-1 | Page N | |
+ * ^--------------------------------^----------|--------
+ * | desired writeback range | see else |
+ * ---------------------------------^------------------|
+ */
+ if (page->index < end_index)
+ end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
+ else {
+ /*
+ * Check whether the page to write out is beyond or straddles
+ * i_size or not.
+ * -------------------------------------------------------
+ * | file mapping | <EOF> |
+ * -------------------------------------------------------
+ * | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
+ * ^--------------------------------^-----------|---------
+ * | | Straddles |
+ * ---------------------------------^-----------|--------|
+ */
+ unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
+
+ /*
+ * Skip the page if it is fully outside i_size, e.g. due to a
+ * truncate operation that is in progress. We must redirty the
+ * page so that reclaim stops reclaiming it. Otherwise
+ * xfs_vm_releasepage() is called on it and gets confused.
+ *
+ * Note that the end_index is unsigned long, it would overflow
+ * if the given offset is greater than 16TB on 32-bit system
+ * and if we do check the page is fully outside i_size or not
+ * via "if (page->index >= end_index + 1)" as "end_index + 1"
+ * will be evaluated to 0. Hence this page will be redirtied
+ * and be written out repeatedly which would result in an
+ * infinite loop, the user program that perform this operation
+ * will hang. Instead, we can verify this situation by checking
+ * if the page to write is totally beyond the i_size or if it's
+ * offset is just equal to the EOF.
+ */
+ if (page->index > end_index ||
+ (page->index == end_index && offset_into_page == 0))
+ goto redirty;
+
+ /*
+ * The page straddles i_size. It must be zeroed out on each
+ * and every writepage invocation because it may be mmapped.
+ * "A file is mapped in multiples of the page size. For a file
+ * that is not a multiple of the page size, the remaining
+ * memory is zeroed when mapped, and writes to that region are
+ * not written out to the file."
+ */
+ zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
+
+ /* Adjust the end_offset to the end of file */
+ end_offset = offset;
+ }
+
+ len = 1 << inode->i_blkbits;
+
+ bh = head = page_buffers(page);
+ offset = page_offset(page);
+ type = XFS_IO_OVERWRITE;
+
+ if (wbc->sync_mode == WB_SYNC_NONE)
+ nonblocking = 1;
+
+ do {
+ int new_ioend = 0;
+
+ if (offset >= end_offset)
+ break;
+ if (!buffer_uptodate(bh))
+ uptodate = 0;
+
+ /*
+ * set_page_dirty dirties all buffers in a page, independent
+ * of their state. The dirty state however is entirely
+ * meaningless for holes (!mapped && uptodate), so skip
+ * buffers covering holes here.
+ */
+ if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
+ imap_valid = 0;
+ continue;
+ }
+
+ if (buffer_unwritten(bh)) {
+ if (type != XFS_IO_UNWRITTEN) {
+ type = XFS_IO_UNWRITTEN;
+ imap_valid = 0;
+ }
+ } else if (buffer_delay(bh)) {
+ if (type != XFS_IO_DELALLOC) {
+ type = XFS_IO_DELALLOC;
+ imap_valid = 0;
+ }
+ } else if (buffer_uptodate(bh)) {
+ if (type != XFS_IO_OVERWRITE) {
+ type = XFS_IO_OVERWRITE;
+ imap_valid = 0;
+ }
+ } else {
+ if (PageUptodate(page))
+ ASSERT(buffer_mapped(bh));
+ /*
+ * This buffer is not uptodate and will not be
+ * written to disk. Ensure that we will put any
+ * subsequent writeable buffers into a new
+ * ioend.
+ */
+ imap_valid = 0;
+ continue;
+ }
+
+ if (imap_valid)
+ imap_valid = xfs_imap_valid(inode, &imap, offset);
+ if (!imap_valid) {
+ /*
+ * If we didn't have a valid mapping then we need to
+ * put the new mapping into a separate ioend structure.
+ * This ensures non-contiguous extents always have
+ * separate ioends, which is particularly important
+ * for unwritten extent conversion at I/O completion
+ * time.
+ */
+ new_ioend = 1;
+ err = xfs_map_blocks(inode, offset, &imap, type,
+ nonblocking);
+ if (err)
+ goto error;
+ imap_valid = xfs_imap_valid(inode, &imap, offset);
+ }
+ if (imap_valid) {
+ lock_buffer(bh);
+ if (type != XFS_IO_OVERWRITE)
+ xfs_map_at_offset(inode, bh, &imap, offset);
+ xfs_add_to_ioend(inode, bh, offset, type, &ioend,
+ new_ioend);
+ count++;
+ }
+
+ if (!iohead)
+ iohead = ioend;
+
+ } while (offset += len, ((bh = bh->b_this_page) != head));
+
+ if (uptodate && bh == head)
+ SetPageUptodate(page);
+
+ xfs_start_page_writeback(page, 1, count);
+
+ /* if there is no IO to be submitted for this page, we are done */
+ if (!ioend)
+ return 0;
+
+ ASSERT(iohead);
+
+ /*
+ * Any errors from this point onwards need tobe reported through the IO
+ * completion path as we have marked the initial page as under writeback
+ * and unlocked it.
+ */
+ if (imap_valid) {
+ xfs_off_t end_index;
+
+ end_index = imap.br_startoff + imap.br_blockcount;
+
+ /* to bytes */
+ end_index <<= inode->i_blkbits;
+
+ /* to pages */
+ end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
+
+ /* check against file size */
+ if (end_index > last_index)
+ end_index = last_index;
+
+ xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
+ wbc, end_index);
+ }
+
+
+ /*
+ * Reserve log space if we might write beyond the on-disk inode size.
+ */
+ err = 0;
+ if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+ err = xfs_setfilesize_trans_alloc(ioend);
+
+ xfs_submit_ioend(wbc, iohead, err);
+
+ return 0;
+
+error:
+ if (iohead)
+ xfs_cancel_ioend(iohead);
+
+ if (err == -EAGAIN)
+ goto redirty;
+
+ xfs_aops_discard_page(page);
+ ClearPageUptodate(page);
+ unlock_page(page);
+ return err;
+
+redirty:
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+}
+
+STATIC int
+xfs_vm_writepages(
+ struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+ return generic_writepages(mapping, wbc);
+}
+
+/*
+ * Called to move a page into cleanable state - and from there
+ * to be released. The page should already be clean. We always
+ * have buffer heads in this call.
+ *
+ * Returns 1 if the page is ok to release, 0 otherwise.
+ */
+STATIC int
+xfs_vm_releasepage(
+ struct page *page,
+ gfp_t gfp_mask)
+{
+ int delalloc, unwritten;
+
+ trace_xfs_releasepage(page->mapping->host, page, 0, 0);
+
+ xfs_count_page_state(page, &delalloc, &unwritten);
+
+ if (WARN_ON_ONCE(delalloc))
+ return 0;
+ if (WARN_ON_ONCE(unwritten))
+ return 0;
+
+ return try_to_free_buffers(page);
+}
+
+STATIC int
+__xfs_get_blocks(
+ struct inode *inode,
+ sector_t iblock,
+ struct buffer_head *bh_result,
+ int create,
+ int direct)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb, end_fsb;
+ int error = 0;
+ int lockmode = 0;
+ struct xfs_bmbt_irec imap;
+ int nimaps = 1;
+ xfs_off_t offset;
+ ssize_t size;
+ int new = 0;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -XFS_ERROR(EIO);
+
+ offset = (xfs_off_t)iblock << inode->i_blkbits;
+ ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
+ size = bh_result->b_size;
+
+ if (!create && direct && offset >= i_size_read(inode))
+ return 0;
+
+ /*
+ * Direct I/O is usually done on preallocated files, so try getting
+ * a block mapping without an exclusive lock first. For buffered
+ * writes we already have the exclusive iolock anyway, so avoiding
+ * a lock roundtrip here by taking the ilock exclusive from the
+ * beginning is a useful micro optimization.
+ */
+ if (create && !direct) {
+ lockmode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, lockmode);
+ } else {
+ lockmode = xfs_ilock_data_map_shared(ip);
+ }
+
+ ASSERT(offset <= mp->m_super->s_maxbytes);
+ if (offset + size > mp->m_super->s_maxbytes)
+ size = mp->m_super->s_maxbytes - offset;
+ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+ &imap, &nimaps, XFS_BMAPI_ENTIRE);
+ if (error)
+ goto out_unlock;
+
+ if (create &&
+ (!nimaps ||
+ (imap.br_startblock == HOLESTARTBLOCK ||
+ imap.br_startblock == DELAYSTARTBLOCK))) {
+ if (direct || xfs_get_extsz_hint(ip)) {
+ /*
+ * Drop the ilock in preparation for starting the block
+ * allocation transaction. It will be retaken
+ * exclusively inside xfs_iomap_write_direct for the
+ * actual allocation.
+ */
+ xfs_iunlock(ip, lockmode);
+ error = xfs_iomap_write_direct(ip, offset, size,
+ &imap, nimaps);
+ if (error)
+ return -error;
+ new = 1;
+ } else {
+ /*
+ * Delalloc reservations do not require a transaction,
+ * we can go on without dropping the lock here. If we
+ * are allocating a new delalloc block, make sure that
+ * we set the new flag so that we mark the buffer new so
+ * that we know that it is newly allocated if the write
+ * fails.
+ */
+ if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
+ new = 1;
+ error = xfs_iomap_write_delay(ip, offset, size, &imap);
+ if (error)
+ goto out_unlock;
+
+ xfs_iunlock(ip, lockmode);
+ }
+
+ trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
+ } else if (nimaps) {
+ trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+ xfs_iunlock(ip, lockmode);
+ } else {
+ trace_xfs_get_blocks_notfound(ip, offset, size);
+ goto out_unlock;
+ }
+
+ if (imap.br_startblock != HOLESTARTBLOCK &&
+ imap.br_startblock != DELAYSTARTBLOCK) {
+ /*
+ * For unwritten extents do not report a disk address on
+ * the read case (treat as if we're reading into a hole).
+ */
+ if (create || !ISUNWRITTEN(&imap))
+ xfs_map_buffer(inode, bh_result, &imap, offset);
+ if (create && ISUNWRITTEN(&imap)) {
+ if (direct) {
+ bh_result->b_private = inode;
+ set_buffer_defer_completion(bh_result);
+ }
+ set_buffer_unwritten(bh_result);
+ }
+ }
+
+ /*
+ * If this is a realtime file, data may be on a different device.
+ * to that pointed to from the buffer_head b_bdev currently.
+ */
+ bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
+
+ /*
+ * If we previously allocated a block out beyond eof and we are now
+ * coming back to use it then we will need to flag it as new even if it
+ * has a disk address.
+ *
+ * With sub-block writes into unwritten extents we also need to mark
+ * the buffer as new so that the unwritten parts of the buffer gets
+ * correctly zeroed.
+ */
+ if (create &&
+ ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
+ (offset >= i_size_read(inode)) ||
+ (new || ISUNWRITTEN(&imap))))
+ set_buffer_new(bh_result);
+
+ if (imap.br_startblock == DELAYSTARTBLOCK) {
+ BUG_ON(direct);
+ if (create) {
+ set_buffer_uptodate(bh_result);
+ set_buffer_mapped(bh_result);
+ set_buffer_delay(bh_result);
+ }
+ }
+
+ /*
+ * If this is O_DIRECT or the mpage code calling tell them how large
+ * the mapping is, so that we can avoid repeated get_blocks calls.
+ *
+ * If the mapping spans EOF, then we have to break the mapping up as the
+ * mapping for blocks beyond EOF must be marked new so that sub block
+ * regions can be correctly zeroed. We can't do this for mappings within
+ * EOF unless the mapping was just allocated or is unwritten, otherwise
+ * the callers would overwrite existing data with zeros. Hence we have
+ * to split the mapping into a range up to and including EOF, and a
+ * second mapping for beyond EOF.
+ */
+ if (direct || size > (1 << inode->i_blkbits)) {
+ xfs_off_t mapping_size;
+
+ mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
+ mapping_size <<= inode->i_blkbits;
+
+ ASSERT(mapping_size > 0);
+ if (mapping_size > size)
+ mapping_size = size;
+ if (offset < i_size_read(inode) &&
+ offset + mapping_size >= i_size_read(inode)) {
+ /* limit mapping to block that spans EOF */
+ mapping_size = roundup_64(i_size_read(inode) - offset,
+ 1 << inode->i_blkbits);
+ }
+ if (mapping_size > LONG_MAX)
+ mapping_size = LONG_MAX;
+
+ bh_result->b_size = mapping_size;
+ }
+
+ return 0;
+
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+ return -error;
+}
+
+int
+xfs_get_blocks(
+ struct inode *inode,
+ sector_t iblock,
+ struct buffer_head *bh_result,
+ int create)
+{
+ return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
+}
+
+STATIC int
+xfs_get_blocks_direct(
+ struct inode *inode,
+ sector_t iblock,
+ struct buffer_head *bh_result,
+ int create)
+{
+ return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
+}
+
+/*
+ * Complete a direct I/O write request.
+ *
+ * If the private argument is non-NULL __xfs_get_blocks signals us that we
+ * need to issue a transaction to convert the range from unwritten to written
+ * extents. In case this is regular synchronous I/O we just call xfs_end_io
+ * to do this and we are done. But in case this was a successful AIO
+ * request this handler is called from interrupt context, from which we
+ * can't start transactions. In that case offload the I/O completion to
+ * the workqueues we also use for buffered I/O completion.
+ */
+STATIC void
+xfs_end_io_direct_write(
+ struct kiocb *iocb,
+ loff_t offset,
+ ssize_t size,
+ void *private)
+{
+ struct xfs_ioend *ioend = iocb->private;
+
+ /*
+ * While the generic direct I/O code updates the inode size, it does
+ * so only after the end_io handler is called, which means our
+ * end_io handler thinks the on-disk size is outside the in-core
+ * size. To prevent this just update it a little bit earlier here.
+ */
+ if (offset + size > i_size_read(ioend->io_inode))
+ i_size_write(ioend->io_inode, offset + size);
+
+ /*
+ * blockdev_direct_IO can return an error even after the I/O
+ * completion handler was called. Thus we need to protect
+ * against double-freeing.
+ */
+ iocb->private = NULL;
+
+ ioend->io_offset = offset;
+ ioend->io_size = size;
+ if (private && size > 0)
+ ioend->io_type = XFS_IO_UNWRITTEN;
+
+ xfs_finish_ioend_sync(ioend);
+}
+
+STATIC ssize_t
+xfs_vm_direct_IO(
+ int rw,
+ struct kiocb *iocb,
+ struct iov_iter *iter,
+ loff_t offset)
+{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ struct block_device *bdev = xfs_find_bdev_for_inode(inode);
+ struct xfs_ioend *ioend = NULL;
+ ssize_t ret;
+
+ if (rw & WRITE) {
+ size_t size = iov_iter_count(iter);
+
+ /*
+ * We cannot preallocate a size update transaction here as we
+ * don't know whether allocation is necessary or not. Hence we
+ * can only tell IO completion that one is necessary if we are
+ * not doing unwritten extent conversion.
+ */
+ iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
+ if (offset + size > XFS_I(inode)->i_d.di_size)
+ ioend->io_isdirect = 1;
+
+ ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
+ offset, xfs_get_blocks_direct,
+ xfs_end_io_direct_write, NULL,
+ DIO_ASYNC_EXTEND);
+ if (ret != -EIOCBQUEUED && iocb->private)
+ goto out_destroy_ioend;
+ } else {
+ ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
+ offset, xfs_get_blocks_direct,
+ NULL, NULL, 0);
+ }
+
+ return ret;
+
+out_destroy_ioend:
+ xfs_destroy_ioend(ioend);
+ return ret;
+}
+
+/*
+ * Punch out the delalloc blocks we have already allocated.
+ *
+ * Don't bother with xfs_setattr given that nothing can have made it to disk yet
+ * as the page is still locked at this point.
+ */
+STATIC void
+xfs_vm_kill_delalloc_range(
+ struct inode *inode,
+ loff_t start,
+ loff_t end)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ xfs_fileoff_t start_fsb;
+ xfs_fileoff_t end_fsb;
+ int error;
+
+ start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
+ end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
+ if (end_fsb <= start_fsb)
+ return;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+ end_fsb - start_fsb);
+ if (error) {
+ /* something screwed, just bail */
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ xfs_alert(ip->i_mount,
+ "xfs_vm_write_failed: unable to clean up ino %lld",
+ ip->i_ino);
+ }
+ }
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+}
+
+STATIC void
+xfs_vm_write_failed(
+ struct inode *inode,
+ struct page *page,
+ loff_t pos,
+ unsigned len)
+{
+ loff_t block_offset;
+ loff_t block_start;
+ loff_t block_end;
+ loff_t from = pos & (PAGE_CACHE_SIZE - 1);
+ loff_t to = from + len;
+ struct buffer_head *bh, *head;
+
+ /*
+ * The request pos offset might be 32 or 64 bit, this is all fine
+ * on 64-bit platform. However, for 64-bit pos request on 32-bit
+ * platform, the high 32-bit will be masked off if we evaluate the
+ * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
+ * 0xfffff000 as an unsigned long, hence the result is incorrect
+ * which could cause the following ASSERT failed in most cases.
+ * In order to avoid this, we can evaluate the block_offset of the
+ * start of the page by using shifts rather than masks the mismatch
+ * problem.
+ */
+ block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
+
+ ASSERT(block_offset + from == pos);
+
+ head = page_buffers(page);
+ block_start = 0;
+ for (bh = head; bh != head || !block_start;
+ bh = bh->b_this_page, block_start = block_end,
+ block_offset += bh->b_size) {
+ block_end = block_start + bh->b_size;
+
+ /* skip buffers before the write */
+ if (block_end <= from)
+ continue;
+
+ /* if the buffer is after the write, we're done */
+ if (block_start >= to)
+ break;
+
+ if (!buffer_delay(bh))
+ continue;
+
+ if (!buffer_new(bh) && block_offset < i_size_read(inode))
+ continue;
+
+ xfs_vm_kill_delalloc_range(inode, block_offset,
+ block_offset + bh->b_size);
+
+ /*
+ * This buffer does not contain data anymore. make sure anyone
+ * who finds it knows that for certain.
+ */
+ clear_buffer_delay(bh);
+ clear_buffer_uptodate(bh);
+ clear_buffer_mapped(bh);
+ clear_buffer_new(bh);
+ clear_buffer_dirty(bh);
+ }
+
+}
+
+/*
+ * This used to call block_write_begin(), but it unlocks and releases the page
+ * on error, and we need that page to be able to punch stale delalloc blocks out
+ * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
+ * the appropriate point.
+ */
+STATIC int
+xfs_vm_write_begin(
+ struct file *file,
+ struct address_space *mapping,
+ loff_t pos,
+ unsigned len,
+ unsigned flags,
+ struct page **pagep,
+ void **fsdata)
+{
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ struct page *page;
+ int status;
+
+ ASSERT(len <= PAGE_CACHE_SIZE);
+
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+
+ status = __block_write_begin(page, pos, len, xfs_get_blocks);
+ if (unlikely(status)) {
+ struct inode *inode = mapping->host;
+ size_t isize = i_size_read(inode);
+
+ xfs_vm_write_failed(inode, page, pos, len);
+ unlock_page(page);
+
+ /*
+ * If the write is beyond EOF, we only want to kill blocks
+ * allocated in this write, not blocks that were previously
+ * written successfully.
+ */
+ if (pos + len > isize) {
+ ssize_t start = max_t(ssize_t, pos, isize);
+
+ truncate_pagecache_range(inode, start, pos + len);
+ }
+
+ page_cache_release(page);
+ page = NULL;
+ }
+
+ *pagep = page;
+ return status;
+}
+
+/*
+ * On failure, we only need to kill delalloc blocks beyond EOF in the range of
+ * this specific write because they will never be written. Previous writes
+ * beyond EOF where block allocation succeeded do not need to be trashed, so
+ * only new blocks from this write should be trashed. For blocks within
+ * EOF, generic_write_end() zeros them so they are safe to leave alone and be
+ * written with all the other valid data.
+ */
+STATIC int
+xfs_vm_write_end(
+ struct file *file,
+ struct address_space *mapping,
+ loff_t pos,
+ unsigned len,
+ unsigned copied,
+ struct page *page,
+ void *fsdata)
+{
+ int ret;
+
+ ASSERT(len <= PAGE_CACHE_SIZE);
+
+ ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+ if (unlikely(ret < len)) {
+ struct inode *inode = mapping->host;
+ size_t isize = i_size_read(inode);
+ loff_t to = pos + len;
+
+ if (to > isize) {
+ /* only kill blocks in this write beyond EOF */
+ if (pos > isize)
+ isize = pos;
+ xfs_vm_kill_delalloc_range(inode, isize, to);
+ truncate_pagecache_range(inode, isize, to);
+ }
+ }
+ return ret;
+}
+
+STATIC sector_t
+xfs_vm_bmap(
+ struct address_space *mapping,
+ sector_t block)
+{
+ struct inode *inode = (struct inode *)mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+
+ trace_xfs_vm_bmap(XFS_I(inode));
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ filemap_write_and_wait(mapping);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return generic_block_bmap(mapping, block, xfs_get_blocks);
+}
+
+STATIC int
+xfs_vm_readpage(
+ struct file *unused,
+ struct page *page)
+{
+ return mpage_readpage(page, xfs_get_blocks);
+}
+
+STATIC int
+xfs_vm_readpages(
+ struct file *unused,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned nr_pages)
+{
+ return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
+}
+
+const struct address_space_operations xfs_address_space_operations = {
+ .readpage = xfs_vm_readpage,
+ .readpages = xfs_vm_readpages,
+ .writepage = xfs_vm_writepage,
+ .writepages = xfs_vm_writepages,
+ .releasepage = xfs_vm_releasepage,
+ .invalidatepage = xfs_vm_invalidatepage,
+ .write_begin = xfs_vm_write_begin,
+ .write_end = xfs_vm_write_end,
+ .bmap = xfs_vm_bmap,
+ .direct_IO = xfs_vm_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
+};
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/xfs_aops.h
index 55339dd5a30..f94dd459dff 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005 Silicon Graphics, Inc.
+ * Copyright (c) 2005-2006 Silicon Graphics, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -18,10 +18,23 @@
#ifndef __XFS_AOPS_H__
#define __XFS_AOPS_H__
-extern struct workqueue_struct *xfsdatad_workqueue;
extern mempool_t *xfs_ioend_pool;
-typedef void (*xfs_ioend_func_t)(void *);
+/*
+ * Types of I/O for bmap clustering and I/O completion tracking.
+ */
+enum {
+ XFS_IO_DIRECT = 0, /* special case for direct I/O ioends */
+ XFS_IO_DELALLOC, /* covers delalloc region */
+ XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */
+ XFS_IO_OVERWRITE, /* covers already allocated extent */
+};
+
+#define XFS_IO_TYPES \
+ { 0, "" }, \
+ { XFS_IO_DELALLOC, "delalloc" }, \
+ { XFS_IO_UNWRITTEN, "unwritten" }, \
+ { XFS_IO_OVERWRITE, "overwrite" }
/*
* xfs_ioend struct manages large extent writes for XFS.
@@ -30,17 +43,21 @@ typedef void (*xfs_ioend_func_t)(void *);
typedef struct xfs_ioend {
struct xfs_ioend *io_list; /* next ioend in chain */
unsigned int io_type; /* delalloc / unwritten */
- unsigned int io_uptodate; /* I/O status register */
+ int io_error; /* I/O error code */
atomic_t io_remaining; /* hold count */
- struct vnode *io_vnode; /* file being written to */
+ unsigned int io_isdirect : 1;/* direct I/O */
+ struct inode *io_inode; /* file being written to */
struct buffer_head *io_buffer_head;/* buffer linked list head */
struct buffer_head *io_buffer_tail;/* buffer linked list tail */
size_t io_size; /* size of the extent */
xfs_off_t io_offset; /* offset in the file */
struct work_struct io_work; /* xfsdatad work queue */
+ struct xfs_trans *io_append_trans;/* xact. for size update */
} xfs_ioend_t;
-extern struct address_space_operations linvfs_aops;
-extern int linvfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
+extern const struct address_space_operations xfs_address_space_operations;
+extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
+
+extern void xfs_count_page_state(struct page *, int *, int *);
-#endif /* __XFS_IOPS_H__ */
+#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
deleted file mode 100644
index c4836890b72..00000000000
--- a/fs/xfs/xfs_arch.h
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_ARCH_H__
-#define __XFS_ARCH_H__
-
-#ifndef XFS_BIG_INUMS
-# error XFS_BIG_INUMS must be defined true or false
-#endif
-
-#ifdef __KERNEL__
-
-#include <asm/byteorder.h>
-
-#ifdef __BIG_ENDIAN
-#define XFS_NATIVE_HOST 1
-#else
-#undef XFS_NATIVE_HOST
-#endif
-
-#else /* __KERNEL__ */
-
-#if __BYTE_ORDER == __BIG_ENDIAN
-#define XFS_NATIVE_HOST 1
-#else
-#undef XFS_NATIVE_HOST
-#endif
-
-#ifdef XFS_NATIVE_HOST
-#define cpu_to_be16(val) ((__be16)(val))
-#define cpu_to_be32(val) ((__be32)(val))
-#define cpu_to_be64(val) ((__be64)(val))
-#define be16_to_cpu(val) ((__uint16_t)(val))
-#define be32_to_cpu(val) ((__uint32_t)(val))
-#define be64_to_cpu(val) ((__uint64_t)(val))
-#else
-#define cpu_to_be16(val) (__swab16((__uint16_t)(val)))
-#define cpu_to_be32(val) (__swab32((__uint32_t)(val)))
-#define cpu_to_be64(val) (__swab64((__uint64_t)(val)))
-#define be16_to_cpu(val) (__swab16((__be16)(val)))
-#define be32_to_cpu(val) (__swab32((__be32)(val)))
-#define be64_to_cpu(val) (__swab64((__be64)(val)))
-#endif
-
-#endif /* __KERNEL__ */
-
-/* do we need conversion? */
-#define ARCH_NOCONVERT 1
-#ifdef XFS_NATIVE_HOST
-# define ARCH_CONVERT ARCH_NOCONVERT
-#else
-# define ARCH_CONVERT 0
-#endif
-
-/* generic swapping macros */
-
-#ifndef HAVE_SWABMACROS
-#define INT_SWAP16(type,var) ((typeof(type))(__swab16((__u16)(var))))
-#define INT_SWAP32(type,var) ((typeof(type))(__swab32((__u32)(var))))
-#define INT_SWAP64(type,var) ((typeof(type))(__swab64((__u64)(var))))
-#endif
-
-#define INT_SWAP(type, var) \
- ((sizeof(type) == 8) ? INT_SWAP64(type,var) : \
- ((sizeof(type) == 4) ? INT_SWAP32(type,var) : \
- ((sizeof(type) == 2) ? INT_SWAP16(type,var) : \
- (var))))
-
-/*
- * get and set integers from potentially unaligned locations
- */
-
-#define INT_GET_UNALIGNED_16_BE(pointer) \
- ((__u16)((((__u8*)(pointer))[0] << 8) | (((__u8*)(pointer))[1])))
-#define INT_SET_UNALIGNED_16_BE(pointer,value) \
- { \
- ((__u8*)(pointer))[0] = (((value) >> 8) & 0xff); \
- ((__u8*)(pointer))[1] = (((value) ) & 0xff); \
- }
-
-/* define generic INT_ macros */
-
-#define INT_GET(reference,arch) \
- (((arch) == ARCH_NOCONVERT) \
- ? \
- (reference) \
- : \
- INT_SWAP((reference),(reference)) \
- )
-
-/* does not return a value */
-#define INT_SET(reference,arch,valueref) \
- (__builtin_constant_p(valueref) ? \
- (void)( (reference) = ( ((arch) != ARCH_NOCONVERT) ? (INT_SWAP((reference),(valueref))) : (valueref)) ) : \
- (void)( \
- ((reference) = (valueref)), \
- ( ((arch) != ARCH_NOCONVERT) ? (reference) = INT_SWAP((reference),(reference)) : 0 ) \
- ) \
- )
-
-/* does not return a value */
-#define INT_MOD_EXPR(reference,arch,code) \
- (((arch) == ARCH_NOCONVERT) \
- ? \
- (void)((reference) code) \
- : \
- (void)( \
- (reference) = INT_GET((reference),arch) , \
- ((reference) code), \
- INT_SET(reference, arch, reference) \
- ) \
- )
-
-/* does not return a value */
-#define INT_MOD(reference,arch,delta) \
- (void)( \
- INT_MOD_EXPR(reference,arch,+=(delta)) \
- )
-
-/*
- * INT_COPY - copy a value between two locations with the
- * _same architecture_ but _potentially different sizes_
- *
- * if the types of the two parameters are equal or they are
- * in native architecture, a simple copy is done
- *
- * otherwise, architecture conversions are done
- *
- */
-
-/* does not return a value */
-#define INT_COPY(dst,src,arch) \
- ( \
- ((sizeof(dst) == sizeof(src)) || ((arch) == ARCH_NOCONVERT)) \
- ? \
- (void)((dst) = (src)) \
- : \
- INT_SET(dst, arch, INT_GET(src, arch)) \
- )
-
-/*
- * INT_XLATE - copy a value in either direction between two locations
- * with different architectures
- *
- * dir < 0 - copy from memory to buffer (native to arch)
- * dir > 0 - copy from buffer to memory (arch to native)
- */
-
-/* does not return a value */
-#define INT_XLATE(buf,mem,dir,arch) {\
- ASSERT(dir); \
- if (dir>0) { \
- (mem)=INT_GET(buf, arch); \
- } else { \
- INT_SET(buf, arch, mem); \
- } \
-}
-
-static inline void be16_add(__be16 *a, __s16 b)
-{
- *a = cpu_to_be16(be16_to_cpu(*a) + b);
-}
-
-static inline void be32_add(__be32 *a, __s32 b)
-{
- *a = cpu_to_be32(be32_to_cpu(*a) + b);
-}
-
-static inline void be64_add(__be64 *a, __s64 b)
-{
- *a = cpu_to_be64(be64_to_cpu(*a) + b);
-}
-
-/*
- * In directories inode numbers are stored as unaligned arrays of unsigned
- * 8bit integers on disk.
- *
- * For v1 directories or v2 directories that contain inode numbers that
- * do not fit into 32bit the array has eight members, but the first member
- * is always zero:
- *
- * |unused|48-55|40-47|32-39|24-31|16-23| 8-15| 0- 7|
- *
- * For v2 directories that only contain entries with inode numbers that fit
- * into 32bits a four-member array is used:
- *
- * |24-31|16-23| 8-15| 0- 7|
- */
-
-#define XFS_GET_DIR_INO4(di) \
- (((__u32)(di).i[0] << 24) | ((di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3]))
-
-#define XFS_PUT_DIR_INO4(from, di) \
-do { \
- (di).i[0] = (((from) & 0xff000000ULL) >> 24); \
- (di).i[1] = (((from) & 0x00ff0000ULL) >> 16); \
- (di).i[2] = (((from) & 0x0000ff00ULL) >> 8); \
- (di).i[3] = ((from) & 0x000000ffULL); \
-} while (0)
-
-#define XFS_DI_HI(di) \
- (((__u32)(di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3]))
-#define XFS_DI_LO(di) \
- (((__u32)(di).i[4] << 24) | ((di).i[5] << 16) | ((di).i[6] << 8) | ((di).i[7]))
-
-#define XFS_GET_DIR_INO8(di) \
- (((xfs_ino_t)XFS_DI_LO(di) & 0xffffffffULL) | \
- ((xfs_ino_t)XFS_DI_HI(di) << 32))
-
-#define XFS_PUT_DIR_INO8(from, di) \
-do { \
- (di).i[0] = 0; \
- (di).i[1] = (((from) & 0x00ff000000000000ULL) >> 48); \
- (di).i[2] = (((from) & 0x0000ff0000000000ULL) >> 40); \
- (di).i[3] = (((from) & 0x000000ff00000000ULL) >> 32); \
- (di).i[4] = (((from) & 0x00000000ff000000ULL) >> 24); \
- (di).i[5] = (((from) & 0x0000000000ff0000ULL) >> 16); \
- (di).i[6] = (((from) & 0x000000000000ff00ULL) >> 8); \
- (di).i[7] = ((from) & 0x00000000000000ffULL); \
-} while (0)
-
-#endif /* __XFS_ARCH_H__ */
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index e5e91e9c7e8..bfe36fc2cdc 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -15,42 +15,34 @@
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
-
-#include <linux/capability.h>
-
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
-#include "xfs_btree.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
-#include "xfs_acl.h"
-#include "xfs_rw.h"
+#include "xfs_trace.h"
+#include "xfs_dinode.h"
/*
* xfs_attr.c
@@ -58,11 +50,6 @@
* Provide the external interfaces to manage attribute lists.
*/
-#define ATTR_SYSCOUNT 2
-STATIC struct attrnames posix_acl_access;
-STATIC struct attrnames posix_acl_default;
-STATIC struct attrnames *attr_system_names[ATTR_SYSCOUNT];
-
/*========================================================================
* Function prototypes for the kernel.
*========================================================================*/
@@ -78,7 +65,6 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
-STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context);
/*
* Internal routines when attribute list is more than one block.
@@ -86,172 +72,177 @@ STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context);
STATIC int xfs_attr_node_get(xfs_da_args_t *args);
STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
-STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context);
STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
-/*
- * Routines to manipulate out-of-line attribute values.
- */
-STATIC int xfs_attr_rmtval_get(xfs_da_args_t *args);
-STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args);
-STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
-#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
+STATIC int
+xfs_attr_args_init(
+ struct xfs_da_args *args,
+ struct xfs_inode *dp,
+ const unsigned char *name,
+ int flags)
+{
-#if defined(XFS_ATTR_TRACE)
-ktrace_t *xfs_attr_trace_buf;
-#endif
+ if (!name)
+ return EINVAL;
+
+ memset(args, 0, sizeof(*args));
+ args->geo = dp->i_mount->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->dp = dp;
+ args->flags = flags;
+ args->name = name;
+ args->namelen = strlen((const char *)name);
+ if (args->namelen >= MAXNAMELEN)
+ return EFAULT; /* match IRIX behaviour */
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
+ return 0;
+}
+
+int
+xfs_inode_hasattr(
+ struct xfs_inode *ip)
+{
+ if (!XFS_IFORK_Q(ip) ||
+ (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_d.di_anextents == 0))
+ return 0;
+ return 1;
+}
/*========================================================================
* Overall external interface routines.
*========================================================================*/
int
-xfs_attr_fetch(xfs_inode_t *ip, const char *name, int namelen,
- char *value, int *valuelenp, int flags, struct cred *cred)
+xfs_attr_get(
+ struct xfs_inode *ip,
+ const unsigned char *name,
+ unsigned char *value,
+ int *valuelenp,
+ int flags)
{
- xfs_da_args_t args;
- int error;
+ struct xfs_da_args args;
+ uint lock_mode;
+ int error;
- if ((XFS_IFORK_Q(ip) == 0) ||
- (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_anextents == 0))
- return(ENOATTR);
+ XFS_STATS_INC(xs_attr_get);
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return EIO;
+
+ if (!xfs_inode_hasattr(ip))
+ return ENOATTR;
+
+ error = xfs_attr_args_init(&args, ip, name, flags);
+ if (error)
+ return error;
- /*
- * Fill in the arg structure for this request.
- */
- memset((char *)&args, 0, sizeof(args));
- args.name = name;
- args.namelen = namelen;
args.value = value;
args.valuelen = *valuelenp;
- args.flags = flags;
- args.hashval = xfs_da_hashname(args.name, args.namelen);
- args.dp = ip;
- args.whichfork = XFS_ATTR_FORK;
- /*
- * Decide on what work routines to call based on the inode size.
- */
- if (XFS_IFORK_Q(ip) == 0 ||
- (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_anextents == 0)) {
- error = XFS_ERROR(ENOATTR);
- } else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ lock_mode = xfs_ilock_attr_map_shared(ip);
+ if (!xfs_inode_hasattr(ip))
+ error = ENOATTR;
+ else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
error = xfs_attr_shortform_getvalue(&args);
- } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) {
+ else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
error = xfs_attr_leaf_get(&args);
- } else {
+ else
error = xfs_attr_node_get(&args);
- }
+ xfs_iunlock(ip, lock_mode);
+
+ *valuelenp = args.valuelen;
+ return error == EEXIST ? 0 : error;
+}
+
+/*
+ * Calculate how many blocks we need for the new attribute,
+ */
+STATIC int
+xfs_attr_calc_size(
+ struct xfs_da_args *args,
+ int *local)
+{
+ struct xfs_mount *mp = args->dp->i_mount;
+ int size;
+ int nblks;
/*
- * Return the number of bytes in the value to the caller.
+ * Determine space new attribute will use, and if it would be
+ * "local" or "remote" (note: local != inline).
*/
- *valuelenp = args.valuelen;
+ size = xfs_attr_leaf_newentsize(args, local);
+ nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+ if (*local) {
+ if (size > (args->geo->blksize / 2)) {
+ /* Double split possible */
+ nblks *= 2;
+ }
+ } else {
+ /*
+ * Out of line attribute, cannot double split, but
+ * make room for the attribute value itself.
+ */
+ uint dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen);
+ nblks += dblocks;
+ nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
+ }
- if (error == EEXIST)
- error = 0;
- return(error);
+ return nblks;
}
int
-xfs_attr_get(bhv_desc_t *bdp, const char *name, char *value, int *valuelenp,
- int flags, struct cred *cred)
+xfs_attr_set(
+ struct xfs_inode *dp,
+ const unsigned char *name,
+ unsigned char *value,
+ int valuelen,
+ int flags)
{
- xfs_inode_t *ip = XFS_BHVTOI(bdp);
- int error, namelen;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_args args;
+ struct xfs_bmap_free flist;
+ struct xfs_trans_res tres;
+ xfs_fsblock_t firstblock;
+ int rsvd = (flags & ATTR_ROOT) != 0;
+ int error, err2, committed, local;
- XFS_STATS_INC(xs_attr_get);
-
- if (!name)
- return(EINVAL);
- namelen = strlen(name);
- if (namelen >= MAXNAMELEN)
- return(EFAULT); /* match IRIX behaviour */
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return(EIO);
+ XFS_STATS_INC(xs_attr_set);
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- error = xfs_attr_fetch(ip, name, namelen, value, valuelenp, flags, cred);
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return(error);
-}
+ if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ return EIO;
-STATIC int
-xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen,
- char *value, int valuelen, int flags)
-{
- xfs_da_args_t args;
- xfs_fsblock_t firstblock;
- xfs_bmap_free_t flist;
- int error, err2, committed;
- int local, size;
- uint nblks;
- xfs_mount_t *mp = dp->i_mount;
- int rsvd = (flags & ATTR_ROOT) != 0;
+ error = xfs_attr_args_init(&args, dp, name, flags);
+ if (error)
+ return error;
- /*
- * Attach the dquots to the inode.
- */
- if ((error = XFS_QM_DQATTACH(mp, dp, 0)))
- return (error);
+ args.value = value;
+ args.valuelen = valuelen;
+ args.firstblock = &firstblock;
+ args.flist = &flist;
+ args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+ args.total = xfs_attr_calc_size(&args, &local);
- /*
- * Determine space new attribute will use, and if it would be
- * "local" or "remote" (note: local != inline).
- */
- size = xfs_attr_leaf_newentsize(namelen, valuelen,
- mp->m_sb.sb_blocksize, &local);
+ error = xfs_qm_dqattach(dp, 0);
+ if (error)
+ return error;
/*
* If the inode doesn't have an attribute fork, add one.
* (inode must not be locked when we call this routine)
*/
if (XFS_IFORK_Q(dp) == 0) {
- if ((error = xfs_bmap_add_attrfork(dp, size, rsvd)))
- return(error);
- }
+ int sf_size = sizeof(xfs_attr_sf_hdr_t) +
+ XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen);
- /*
- * Fill in the arg structure for this request.
- */
- memset((char *)&args, 0, sizeof(args));
- args.name = name;
- args.namelen = namelen;
- args.value = value;
- args.valuelen = valuelen;
- args.flags = flags;
- args.hashval = xfs_da_hashname(args.name, args.namelen);
- args.dp = dp;
- args.firstblock = &firstblock;
- args.flist = &flist;
- args.whichfork = XFS_ATTR_FORK;
- args.addname = 1;
- args.oknoent = 1;
-
- nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
- if (local) {
- if (size > (mp->m_sb.sb_blocksize >> 1)) {
- /* Double split possible */
- nblks <<= 1;
- }
- } else {
- uint dblocks = XFS_B_TO_FSB(mp, valuelen);
- /* Out of line attribute, cannot double split, but make
- * room for the attribute value itself.
- */
- nblks += dblocks;
- nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
+ error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
+ if (error)
+ return error;
}
- /* Size is now blocks for attribute data */
- args.total = nblks;
-
/*
* Start our first transaction of the day.
*
@@ -272,34 +263,35 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen,
if (rsvd)
args.trans->t_flags |= XFS_TRANS_RESERVE;
- if ((error = xfs_trans_reserve(args.trans, (uint) nblks,
- XFS_ATTRSET_LOG_RES(mp, nblks),
- 0, XFS_TRANS_PERM_LOG_RES,
- XFS_ATTRSET_LOG_COUNT))) {
+ tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+ M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
+ tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
+ if (error) {
xfs_trans_cancel(args.trans, 0);
- return(error);
+ return error;
}
xfs_ilock(dp, XFS_ILOCK_EXCL);
- error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, nblks, 0,
- rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
- XFS_QMOPT_RES_REGBLKS);
+ error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
+ rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+ XFS_QMOPT_RES_REGBLKS);
if (error) {
xfs_iunlock(dp, XFS_ILOCK_EXCL);
xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
- return (error);
+ return error;
}
- xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args.trans, dp);
+ xfs_trans_ijoin(args.trans, dp, 0);
/*
- * If the attribute list is non-existant or a shortform list,
+ * If the attribute list is non-existent or a shortform list,
* upgrade it to a single-leaf-block attribute list.
*/
- if ((dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
- ((dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) &&
- (dp->i_d.di_anextents == 0))) {
+ if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
+ (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+ dp->i_d.di_anextents == 0)) {
/*
* Build initial attribute list (if required).
@@ -324,32 +316,29 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen,
* the transaction goes to disk before returning
* to the user.
*/
- if (mp->m_flags & XFS_MOUNT_WSYNC) {
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(args.trans);
+
+ if (!error && (flags & ATTR_KERNOTIME) == 0) {
+ xfs_trans_ichgtime(args.trans, dp,
+ XFS_ICHGTIME_CHG);
}
err2 = xfs_trans_commit(args.trans,
- XFS_TRANS_RELEASE_LOG_RES,
- NULL);
+ XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- /*
- * Hit the inode change time.
- */
- if (!error && (flags & ATTR_KERNOTIME) == 0) {
- xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
- }
- return(error == 0 ? err2 : error);
+ return error ? error : err2;
}
/*
* It won't fit in the shortform, transform to a leaf block.
* GROT: another possible req'mt for a double-split btree op.
*/
- XFS_BMAP_INIT(args.flist, args.firstblock);
+ xfs_bmap_init(args.flist, args.firstblock);
error = xfs_attr_shortform_to_leaf(&args);
if (!error) {
error = xfs_bmap_finish(&args.trans, args.flist,
- *args.firstblock, &committed);
+ &committed);
}
if (error) {
ASSERT(committed);
@@ -362,114 +351,96 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen,
* bmap_finish() may have committed the last trans and started
* a new one. We need the inode to be in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args.trans, dp);
- }
+ if (committed)
+ xfs_trans_ijoin(args.trans, dp, 0);
/*
* Commit the leaf transformation. We'll need another (linked)
* transaction to add the new attribute to the leaf.
*/
- if ((error = xfs_attr_rolltrans(&args.trans, dp)))
+
+ error = xfs_trans_roll(&args.trans, dp);
+ if (error)
goto out;
}
- if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+ if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
error = xfs_attr_leaf_addname(&args);
- } else {
+ else
error = xfs_attr_node_addname(&args);
- }
- if (error) {
+ if (error)
goto out;
- }
/*
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.
*/
- if (mp->m_flags & XFS_MOUNT_WSYNC) {
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(args.trans);
- }
+
+ if ((flags & ATTR_KERNOTIME) == 0)
+ xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
/*
* Commit the last in the sequence of transactions.
*/
xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
- error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES,
- NULL);
+ error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- /*
- * Hit the inode change time.
- */
- if (!error && (flags & ATTR_KERNOTIME) == 0) {
- xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
- }
-
- return(error);
+ return error;
out:
- if (args.trans)
+ if (args.trans) {
xfs_trans_cancel(args.trans,
XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ }
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error);
+ return error;
}
+/*
+ * Generic handler routine to remove a name from an attribute list.
+ * Transitions attribute list from Btree to shortform as necessary.
+ */
int
-xfs_attr_set(bhv_desc_t *bdp, const char *name, char *value, int valuelen, int flags,
- struct cred *cred)
+xfs_attr_remove(
+ struct xfs_inode *dp,
+ const unsigned char *name,
+ int flags)
{
- xfs_inode_t *dp;
- int namelen;
-
- namelen = strlen(name);
- if (namelen >= MAXNAMELEN)
- return EFAULT; /* match IRIX behaviour */
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_args args;
+ struct xfs_bmap_free flist;
+ xfs_fsblock_t firstblock;
+ int error;
- XFS_STATS_INC(xs_attr_set);
+ XFS_STATS_INC(xs_attr_remove);
- dp = XFS_BHVTOI(bdp);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return (EIO);
+ return EIO;
- return xfs_attr_set_int(dp, name, namelen, value, valuelen, flags);
-}
+ if (!xfs_inode_hasattr(dp))
+ return ENOATTR;
-/*
- * Generic handler routine to remove a name from an attribute list.
- * Transitions attribute list from Btree to shortform as necessary.
- */
-STATIC int
-xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags)
-{
- xfs_da_args_t args;
- xfs_fsblock_t firstblock;
- xfs_bmap_free_t flist;
- int error;
- xfs_mount_t *mp = dp->i_mount;
+ error = xfs_attr_args_init(&args, dp, name, flags);
+ if (error)
+ return error;
- /*
- * Fill in the arg structure for this request.
- */
- memset((char *)&args, 0, sizeof(args));
- args.name = name;
- args.namelen = namelen;
- args.flags = flags;
- args.hashval = xfs_da_hashname(args.name, args.namelen);
- args.dp = dp;
args.firstblock = &firstblock;
args.flist = &flist;
- args.total = 0;
- args.whichfork = XFS_ATTR_FORK;
/*
- * Attach the dquots to the inode.
+ * we have no control over the attribute names that userspace passes us
+ * to remove, so we have to allow the name lookup prior to attribute
+ * removal to fail.
*/
- if ((error = XFS_QM_DQATTACH(mp, dp, 0)))
- return (error);
+ args.op_flags = XFS_DA_OP_OKNOENT;
+
+ error = xfs_qm_dqattach(dp, 0);
+ if (error)
+ return error;
/*
* Start our first transaction of the day.
@@ -491,13 +462,11 @@ xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags)
if (flags & ATTR_ROOT)
args.trans->t_flags |= XFS_TRANS_RESERVE;
- if ((error = xfs_trans_reserve(args.trans,
- XFS_ATTRRM_SPACE_RES(mp),
- XFS_ATTRRM_LOG_RES(mp),
- 0, XFS_TRANS_PERM_LOG_RES,
- XFS_ATTRRM_LOG_COUNT))) {
+ error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
+ XFS_ATTRRM_SPACE_RES(mp), 0);
+ if (error) {
xfs_trans_cancel(args.trans, 0);
- return(error);
+ return error;
}
xfs_ilock(dp, XFS_ILOCK_EXCL);
@@ -505,276 +474,50 @@ xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags)
* No need to make quota reservations here. We expect to release some
* blocks not allocate in the common case.
*/
- xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args.trans, dp);
+ xfs_trans_ijoin(args.trans, dp, 0);
- /*
- * Decide on what work routines to call based on the inode size.
- */
- if (XFS_IFORK_Q(dp) == 0 ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- dp->i_d.di_anextents == 0)) {
+ if (!xfs_inode_hasattr(dp)) {
error = XFS_ERROR(ENOATTR);
- goto out;
- }
- if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
error = xfs_attr_shortform_remove(&args);
- if (error) {
- goto out;
- }
} else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
error = xfs_attr_leaf_removename(&args);
} else {
error = xfs_attr_node_removename(&args);
}
- if (error) {
+
+ if (error)
goto out;
- }
/*
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.
*/
- if (mp->m_flags & XFS_MOUNT_WSYNC) {
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(args.trans);
- }
+
+ if ((flags & ATTR_KERNOTIME) == 0)
+ xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
/*
* Commit the last in the sequence of transactions.
*/
xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
- error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES,
- NULL);
+ error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- /*
- * Hit the inode change time.
- */
- if (!error && (flags & ATTR_KERNOTIME) == 0) {
- xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
- }
-
- return(error);
+ return error;
out:
- if (args.trans)
+ if (args.trans) {
xfs_trans_cancel(args.trans,
XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error);
-}
-
-int
-xfs_attr_remove(bhv_desc_t *bdp, const char *name, int flags, struct cred *cred)
-{
- xfs_inode_t *dp;
- int namelen;
-
- namelen = strlen(name);
- if (namelen >= MAXNAMELEN)
- return EFAULT; /* match IRIX behaviour */
-
- XFS_STATS_INC(xs_attr_remove);
-
- dp = XFS_BHVTOI(bdp);
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return (EIO);
-
- xfs_ilock(dp, XFS_ILOCK_SHARED);
- if (XFS_IFORK_Q(dp) == 0 ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- dp->i_d.di_anextents == 0)) {
- xfs_iunlock(dp, XFS_ILOCK_SHARED);
- return(XFS_ERROR(ENOATTR));
- }
- xfs_iunlock(dp, XFS_ILOCK_SHARED);
-
- return xfs_attr_remove_int(dp, name, namelen, flags);
-}
-
-/*
- * Generate a list of extended attribute names and optionally
- * also value lengths. Positive return value follows the XFS
- * convention of being an error, zero or negative return code
- * is the length of the buffer returned (negated), indicating
- * success.
- */
-int
-xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags,
- attrlist_cursor_kern_t *cursor, struct cred *cred)
-{
- xfs_attr_list_context_t context;
- xfs_inode_t *dp;
- int error;
-
- XFS_STATS_INC(xs_attr_list);
-
- /*
- * Validate the cursor.
- */
- if (cursor->pad1 || cursor->pad2)
- return(XFS_ERROR(EINVAL));
- if ((cursor->initted == 0) &&
- (cursor->hashval || cursor->blkno || cursor->offset))
- return(XFS_ERROR(EINVAL));
-
- /*
- * Check for a properly aligned buffer.
- */
- if (((long)buffer) & (sizeof(int)-1))
- return(XFS_ERROR(EFAULT));
- if (flags & ATTR_KERNOVAL)
- bufsize = 0;
-
- /*
- * Initialize the output buffer.
- */
- context.dp = dp = XFS_BHVTOI(bdp);
- context.cursor = cursor;
- context.count = 0;
- context.dupcnt = 0;
- context.resynch = 1;
- context.flags = flags;
- if (!(flags & ATTR_KERNAMELS)) {
- context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */
- context.firstu = context.bufsize;
- context.alist = (attrlist_t *)buffer;
- context.alist->al_count = 0;
- context.alist->al_more = 0;
- context.alist->al_offset[0] = context.bufsize;
- }
- else {
- context.bufsize = bufsize;
- context.firstu = context.bufsize;
- context.alist = (attrlist_t *)buffer;
- }
-
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return (EIO);
-
- xfs_ilock(dp, XFS_ILOCK_SHARED);
- /*
- * Decide on what work routines to call based on the inode size.
- */
- xfs_attr_trace_l_c("syscall start", &context);
- if (XFS_IFORK_Q(dp) == 0 ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- dp->i_d.di_anextents == 0)) {
- error = 0;
- } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
- error = xfs_attr_shortform_list(&context);
- } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
- error = xfs_attr_leaf_list(&context);
- } else {
- error = xfs_attr_node_list(&context);
}
- xfs_iunlock(dp, XFS_ILOCK_SHARED);
- xfs_attr_trace_l_c("syscall end", &context);
-
- if (!(context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS))) {
- ASSERT(error >= 0);
- }
- else { /* must return negated buffer size or the error */
- if (context.count < 0)
- error = XFS_ERROR(ERANGE);
- else
- error = -context.count;
- }
-
- return(error);
-}
-
-int /* error */
-xfs_attr_inactive(xfs_inode_t *dp)
-{
- xfs_trans_t *trans;
- xfs_mount_t *mp;
- int error;
-
- mp = dp->i_mount;
- ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
-
- xfs_ilock(dp, XFS_ILOCK_SHARED);
- if ((XFS_IFORK_Q(dp) == 0) ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- dp->i_d.di_anextents == 0)) {
- xfs_iunlock(dp, XFS_ILOCK_SHARED);
- return(0);
- }
- xfs_iunlock(dp, XFS_ILOCK_SHARED);
-
- /*
- * Start our first transaction of the day.
- *
- * All future transactions during this code must be "chained" off
- * this one via the trans_dup() call. All transactions will contain
- * the inode, and the inode will always be marked with trans_ihold().
- * Since the inode will be locked in all transactions, we must log
- * the inode in every transaction to let it float upward through
- * the log.
- */
- trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
- if ((error = xfs_trans_reserve(trans, 0, XFS_ATTRINVAL_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_ATTRINVAL_LOG_COUNT))) {
- xfs_trans_cancel(trans, 0);
- return(error);
- }
- xfs_ilock(dp, XFS_ILOCK_EXCL);
-
- /*
- * No need to make quota reservations here. We expect to release some
- * blocks, not allocate, in the common case.
- */
- xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(trans, dp);
-
- /*
- * Decide on what work routines to call based on the inode size.
- */
- if ((XFS_IFORK_Q(dp) == 0) ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- dp->i_d.di_anextents == 0)) {
- error = 0;
- goto out;
- }
- error = xfs_attr_root_inactive(&trans, dp);
- if (error)
- goto out;
- /*
- * signal synchronous inactive transactions unless this
- * is a synchronous mount filesystem in which case we
- * know that we're here because we've been called out of
- * xfs_inactive which means that the last reference is gone
- * and the unlink transaction has already hit the disk so
- * async inactive transactions are safe.
- */
- if ((error = xfs_itruncate_finish(&trans, dp, 0LL, XFS_ATTR_FORK,
- (!(mp->m_flags & XFS_MOUNT_WSYNC)
- ? 1 : 0))))
- goto out;
-
- /*
- * Commit the last in the sequence of transactions.
- */
- xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
- error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES,
- NULL);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
- return(error);
-
-out:
- xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error);
+ return error;
}
-
-
/*========================================================================
* External routines when attribute list is inside the inode
*========================================================================*/
@@ -788,6 +531,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
{
int newsize, forkoff, retval;
+ trace_xfs_attr_sf_addname(args);
+
retval = xfs_attr_shortform_lookup(args);
if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
return(retval);
@@ -824,61 +569,74 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
* This leaf block cannot have a "remote" value, we only call this routine
* if bmap_one_block() says there is only one block (ie: no remote blks).
*/
-int
+STATIC int
xfs_attr_leaf_addname(xfs_da_args_t *args)
{
xfs_inode_t *dp;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
int retval, error, committed, forkoff;
+ trace_xfs_attr_leaf_addname(args);
+
/*
* Read the (only) block in the attribute list in.
*/
dp = args->dp;
args->blkno = 0;
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
- XFS_ATTR_FORK);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
if (error)
- return(error);
- ASSERT(bp != NULL);
+ return error;
/*
* Look up the given attribute in the leaf block. Figure out if
* the given flags produce an error or call for an atomic rename.
*/
- retval = xfs_attr_leaf_lookup_int(bp, args);
+ retval = xfs_attr3_leaf_lookup_int(bp, args);
if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
- xfs_da_brelse(args->trans, bp);
- return(retval);
+ xfs_trans_brelse(args->trans, bp);
+ return retval;
} else if (retval == EEXIST) {
if (args->flags & ATTR_CREATE) { /* pure create op */
- xfs_da_brelse(args->trans, bp);
- return(retval);
+ xfs_trans_brelse(args->trans, bp);
+ return retval;
}
- args->rename = 1; /* an atomic rename */
+
+ trace_xfs_attr_leaf_replace(args);
+
+ /* save the attribute state for later removal*/
+ args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */
args->blkno2 = args->blkno; /* set 2nd entry info*/
args->index2 = args->index;
args->rmtblkno2 = args->rmtblkno;
args->rmtblkcnt2 = args->rmtblkcnt;
+ args->rmtvaluelen2 = args->rmtvaluelen;
+
+ /*
+ * clear the remote attr state now that it is saved so that the
+ * values reflect the state of the attribute we are about to
+ * add, not the attribute we just found and will remove later.
+ */
+ args->rmtblkno = 0;
+ args->rmtblkcnt = 0;
+ args->rmtvaluelen = 0;
}
/*
* Add the attribute to the leaf block, transitioning to a Btree
* if required.
*/
- retval = xfs_attr_leaf_add(bp, args);
- xfs_da_buf_done(bp);
+ retval = xfs_attr3_leaf_add(bp, args);
if (retval == ENOSPC) {
/*
* Promote the attribute list to the Btree format, then
* Commit that transaction so that the node_addname() call
* can manage its own transactions.
*/
- XFS_BMAP_INIT(args->flist, args->firstblock);
- error = xfs_attr_leaf_to_node(args);
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_attr3_leaf_to_node(args);
if (!error) {
error = xfs_bmap_finish(&args->trans, args->flist,
- *args->firstblock, &committed);
+ &committed);
}
if (error) {
ASSERT(committed);
@@ -891,16 +649,15 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* bmap_finish() may have committed the last trans and started
* a new one. We need the inode to be in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, dp);
- }
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp, 0);
/*
* Commit the current trans (including the inode) and start
* a new one.
*/
- if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
return (error);
/*
@@ -914,7 +671,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* Commit the transaction that added the attr name so that
* later routines can manage their own transactions.
*/
- if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
return (error);
/*
@@ -935,12 +693,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* so that one disappears and one appears atomically. Then we
* must remove the "old" attribute/value pair.
*/
- if (args->rename) {
+ if (args->op_flags & XFS_DA_OP_RENAME) {
/*
* In a separate transaction, set the incomplete flag on the
* "old" attr and clear the incomplete flag on the "new" attr.
*/
- error = xfs_attr_leaf_flipflags(args);
+ error = xfs_attr3_leaf_flipflags(args);
if (error)
return(error);
@@ -952,6 +710,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
args->blkno = args->blkno2;
args->rmtblkno = args->rmtblkno2;
args->rmtblkcnt = args->rmtblkcnt2;
+ args->rmtvaluelen = args->rmtvaluelen2;
if (args->rmtblkno) {
error = xfs_attr_rmtval_remove(args);
if (error)
@@ -962,24 +721,23 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* Read in the block containing the "old" attr, then
* remove the "old" attr from that block (neat, huh!)
*/
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1,
- &bp, XFS_ATTR_FORK);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+ -1, &bp);
if (error)
- return(error);
- ASSERT(bp != NULL);
- (void)xfs_attr_leaf_remove(bp, args);
+ return error;
+
+ xfs_attr3_leaf_remove(bp, args);
/*
* If the result is small enough, shrink it all into the inode.
*/
if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
- XFS_BMAP_INIT(args->flist, args->firstblock);
- error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (!error) {
error = xfs_bmap_finish(&args->trans,
args->flist,
- *args->firstblock,
&committed);
}
if (error) {
@@ -994,25 +752,22 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* and started a new one. We need the inode to be
* in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, dp);
- }
- } else
- xfs_da_buf_done(bp);
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp, 0);
+ }
/*
* Commit the remove and start the next trans in series.
*/
- error = xfs_attr_rolltrans(&args->trans, dp);
+ error = xfs_trans_roll(&args->trans, dp);
} else if (args->rmtblkno > 0) {
/*
* Added a "remote" value, just clear the incomplete flag.
*/
- error = xfs_attr_leaf_clearflag(args);
+ error = xfs_attr3_leaf_clearflag(args);
}
- return(error);
+ return error;
}
/*
@@ -1025,58 +780,54 @@ STATIC int
xfs_attr_leaf_removename(xfs_da_args_t *args)
{
xfs_inode_t *dp;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
int error, committed, forkoff;
+ trace_xfs_attr_leaf_removename(args);
+
/*
* Remove the attribute.
*/
dp = args->dp;
args->blkno = 0;
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
- XFS_ATTR_FORK);
- if (error) {
- return(error);
- }
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ if (error)
+ return error;
- ASSERT(bp != NULL);
- error = xfs_attr_leaf_lookup_int(bp, args);
+ error = xfs_attr3_leaf_lookup_int(bp, args);
if (error == ENOATTR) {
- xfs_da_brelse(args->trans, bp);
- return(error);
+ xfs_trans_brelse(args->trans, bp);
+ return error;
}
- (void)xfs_attr_leaf_remove(bp, args);
+ xfs_attr3_leaf_remove(bp, args);
/*
* If the result is small enough, shrink it all into the inode.
*/
if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
- XFS_BMAP_INIT(args->flist, args->firstblock);
- error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (!error) {
error = xfs_bmap_finish(&args->trans, args->flist,
- *args->firstblock, &committed);
+ &committed);
}
if (error) {
ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
- return(error);
+ return error;
}
/*
* bmap_finish() may have committed the last trans and started
* a new one. We need the inode to be in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, dp);
- }
- } else
- xfs_da_buf_done(bp);
- return(0);
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp, 0);
+ }
+ return 0;
}
/*
@@ -1088,61 +839,31 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
STATIC int
xfs_attr_leaf_get(xfs_da_args_t *args)
{
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
int error;
+ trace_xfs_attr_leaf_get(args);
+
args->blkno = 0;
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
- XFS_ATTR_FORK);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
if (error)
- return(error);
- ASSERT(bp != NULL);
+ return error;
- error = xfs_attr_leaf_lookup_int(bp, args);
+ error = xfs_attr3_leaf_lookup_int(bp, args);
if (error != EEXIST) {
- xfs_da_brelse(args->trans, bp);
- return(error);
+ xfs_trans_brelse(args->trans, bp);
+ return error;
}
- error = xfs_attr_leaf_getvalue(bp, args);
- xfs_da_brelse(args->trans, bp);
+ error = xfs_attr3_leaf_getvalue(bp, args);
+ xfs_trans_brelse(args->trans, bp);
if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
error = xfs_attr_rmtval_get(args);
}
- return(error);
-}
-
-/*
- * Copy out attribute entries for attr_list(), for leaf attribute lists.
- */
-STATIC int
-xfs_attr_leaf_list(xfs_attr_list_context_t *context)
-{
- xfs_attr_leafblock_t *leaf;
- int error;
- xfs_dabuf_t *bp;
-
- context->cursor->blkno = 0;
- error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK);
- if (error)
- return(error);
- ASSERT(bp != NULL);
- leaf = bp->data;
- if (unlikely(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
- != XFS_ATTR_LEAF_MAGIC)) {
- XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW,
- context->dp->i_mount, leaf);
- xfs_da_brelse(NULL, bp);
- return(XFS_ERROR(EFSCORRUPTED));
- }
-
- (void)xfs_attr_leaf_list_int(bp, context);
- xfs_da_brelse(NULL, bp);
- return(0);
+ return error;
}
-
/*========================================================================
- * External routines when attribute list size > XFS_LBSIZE(mp).
+ * External routines when attribute list size > geo->blksize
*========================================================================*/
/*
@@ -1164,6 +885,8 @@ xfs_attr_node_addname(xfs_da_args_t *args)
xfs_mount_t *mp;
int committed, retval, error;
+ trace_xfs_attr_node_addname(args);
+
/*
* Fill in bucket of arguments/results/context to carry around.
*/
@@ -1173,14 +896,12 @@ restart:
state = xfs_da_state_alloc();
state->args = args;
state->mp = mp;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_attr_node_ents;
/*
* Search to see if name already exists, and get back a pointer
* to where it should go.
*/
- error = xfs_da_node_lookup_int(state, &retval);
+ error = xfs_da3_node_lookup_int(state, &retval);
if (error)
goto out;
blk = &state->path.blk[ state->path.active-1 ];
@@ -1190,16 +911,28 @@ restart:
} else if (retval == EEXIST) {
if (args->flags & ATTR_CREATE)
goto out;
- args->rename = 1; /* atomic rename op */
+
+ trace_xfs_attr_node_replace(args);
+
+ /* save the attribute state for later removal*/
+ args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */
args->blkno2 = args->blkno; /* set 2nd entry info*/
args->index2 = args->index;
args->rmtblkno2 = args->rmtblkno;
args->rmtblkcnt2 = args->rmtblkcnt;
+ args->rmtvaluelen2 = args->rmtvaluelen;
+
+ /*
+ * clear the remote attr state now that it is saved so that the
+ * values reflect the state of the attribute we are about to
+ * add, not the attribute we just found and will remove later.
+ */
args->rmtblkno = 0;
args->rmtblkcnt = 0;
+ args->rmtvaluelen = 0;
}
- retval = xfs_attr_leaf_add(blk->bp, state->args);
+ retval = xfs_attr3_leaf_add(blk->bp, state->args);
if (retval == ENOSPC) {
if (state->path.active == 1) {
/*
@@ -1208,12 +941,12 @@ restart:
* have been a b-tree.
*/
xfs_da_state_free(state);
- XFS_BMAP_INIT(args->flist, args->firstblock);
- error = xfs_attr_leaf_to_node(args);
+ state = NULL;
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_attr3_leaf_to_node(args);
if (!error) {
error = xfs_bmap_finish(&args->trans,
args->flist,
- *args->firstblock,
&committed);
}
if (error) {
@@ -1228,16 +961,15 @@ restart:
* and started a new one. We need the inode to be
* in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, dp);
- }
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp, 0);
/*
* Commit the node conversion and start the next
* trans in the chain.
*/
- if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
goto out;
goto restart;
@@ -1249,11 +981,11 @@ restart:
* in the index/blkno/rmtblkno/rmtblkcnt fields and
* in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
*/
- XFS_BMAP_INIT(args->flist, args->firstblock);
- error = xfs_da_split(state);
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_da3_split(state);
if (!error) {
error = xfs_bmap_finish(&args->trans, args->flist,
- *args->firstblock, &committed);
+ &committed);
}
if (error) {
ASSERT(committed);
@@ -1266,15 +998,13 @@ restart:
* bmap_finish() may have committed the last trans and started
* a new one. We need the inode to be in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, dp);
- }
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp, 0);
} else {
/*
* Addition succeeded, update Btree hashvals.
*/
- xfs_da_fixhashpath(state, &state->path);
+ xfs_da3_fixhashpath(state, &state->path);
}
/*
@@ -1288,7 +1018,8 @@ restart:
* Commit the leaf addition or btree split and start the next
* trans in the chain.
*/
- if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
goto out;
/*
@@ -1309,12 +1040,12 @@ restart:
* so that one disappears and one appears atomically. Then we
* must remove the "old" attribute/value pair.
*/
- if (args->rename) {
+ if (args->op_flags & XFS_DA_OP_RENAME) {
/*
* In a separate transaction, set the incomplete flag on the
* "old" attr and clear the incomplete flag on the "new" attr.
*/
- error = xfs_attr_leaf_flipflags(args);
+ error = xfs_attr3_leaf_flipflags(args);
if (error)
goto out;
@@ -1326,6 +1057,7 @@ restart:
args->blkno = args->blkno2;
args->rmtblkno = args->rmtblkno2;
args->rmtblkcnt = args->rmtblkcnt2;
+ args->rmtvaluelen = args->rmtvaluelen2;
if (args->rmtblkno) {
error = xfs_attr_rmtval_remove(args);
if (error)
@@ -1341,10 +1073,8 @@ restart:
state = xfs_da_state_alloc();
state->args = args;
state->mp = mp;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_attr_node_ents;
state->inleaf = 0;
- error = xfs_da_node_lookup_int(state, &retval);
+ error = xfs_da3_node_lookup_int(state, &retval);
if (error)
goto out;
@@ -1353,19 +1083,18 @@ restart:
*/
blk = &state->path.blk[ state->path.active-1 ];
ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- error = xfs_attr_leaf_remove(blk->bp, args);
- xfs_da_fixhashpath(state, &state->path);
+ error = xfs_attr3_leaf_remove(blk->bp, args);
+ xfs_da3_fixhashpath(state, &state->path);
/*
* Check to see if the tree needs to be collapsed.
*/
if (retval && (state->path.active > 1)) {
- XFS_BMAP_INIT(args->flist, args->firstblock);
- error = xfs_da_join(state);
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_da3_join(state);
if (!error) {
error = xfs_bmap_finish(&args->trans,
args->flist,
- *args->firstblock,
&committed);
}
if (error) {
@@ -1380,23 +1109,22 @@ restart:
* and started a new one. We need the inode to be
* in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, dp);
- }
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp, 0);
}
/*
* Commit and start the next trans in the chain.
*/
- if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
goto out;
} else if (args->rmtblkno > 0) {
/*
* Added a "remote" value, just clear the incomplete flag.
*/
- error = xfs_attr_leaf_clearflag(args);
+ error = xfs_attr3_leaf_clearflag(args);
if (error)
goto out;
}
@@ -1423,9 +1151,11 @@ xfs_attr_node_removename(xfs_da_args_t *args)
xfs_da_state_t *state;
xfs_da_state_blk_t *blk;
xfs_inode_t *dp;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
int retval, error, committed, forkoff;
+ trace_xfs_attr_node_removename(args);
+
/*
* Tie a string around our finger to remind us where we are.
*/
@@ -1433,13 +1163,11 @@ xfs_attr_node_removename(xfs_da_args_t *args)
state = xfs_da_state_alloc();
state->args = args;
state->mp = dp->i_mount;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_attr_node_ents;
/*
* Search to see if name exists, and get back a pointer to it.
*/
- error = xfs_da_node_lookup_int(state, &retval);
+ error = xfs_da3_node_lookup_int(state, &retval);
if (error || (retval != EEXIST)) {
if (error == 0)
error = retval;
@@ -1468,7 +1196,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
* Mark the attribute as INCOMPLETE, then bunmapi() the
* remote value.
*/
- error = xfs_attr_leaf_setflag(args);
+ error = xfs_attr3_leaf_setflag(args);
if (error)
goto out;
error = xfs_attr_rmtval_remove(args);
@@ -1489,18 +1217,18 @@ xfs_attr_node_removename(xfs_da_args_t *args)
*/
blk = &state->path.blk[ state->path.active-1 ];
ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- retval = xfs_attr_leaf_remove(blk->bp, args);
- xfs_da_fixhashpath(state, &state->path);
+ retval = xfs_attr3_leaf_remove(blk->bp, args);
+ xfs_da3_fixhashpath(state, &state->path);
/*
* Check to see if the tree needs to be collapsed.
*/
if (retval && (state->path.active > 1)) {
- XFS_BMAP_INIT(args->flist, args->firstblock);
- error = xfs_da_join(state);
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_da3_join(state);
if (!error) {
error = xfs_bmap_finish(&args->trans, args->flist,
- *args->firstblock, &committed);
+ &committed);
}
if (error) {
ASSERT(committed);
@@ -1513,15 +1241,14 @@ xfs_attr_node_removename(xfs_da_args_t *args)
* bmap_finish() may have committed the last trans and started
* a new one. We need the inode to be in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, dp);
- }
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp, 0);
/*
* Commit the Btree join operation and start a new trans.
*/
- if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
goto out;
}
@@ -1534,25 +1261,19 @@ xfs_attr_node_removename(xfs_da_args_t *args)
*/
ASSERT(state->path.active == 1);
ASSERT(state->path.blk[0].bp);
- xfs_da_buf_done(state->path.blk[0].bp);
state->path.blk[0].bp = NULL;
- error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
- XFS_ATTR_FORK);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp);
if (error)
goto out;
- ASSERT(INT_GET(((xfs_attr_leafblock_t *)
- bp->data)->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
- XFS_BMAP_INIT(args->flist, args->firstblock);
- error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (!error) {
error = xfs_bmap_finish(&args->trans,
args->flist,
- *args->firstblock,
&committed);
}
if (error) {
@@ -1567,12 +1288,10 @@ xfs_attr_node_removename(xfs_da_args_t *args)
* and started a new one. We need the inode to be
* in all transactions.
*/
- if (committed) {
- xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, dp);
- }
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp, 0);
} else
- xfs_da_brelse(args->trans, bp);
+ xfs_trans_brelse(args->trans, bp);
}
error = 0;
@@ -1585,7 +1304,7 @@ out:
* Fill in the disk block numbers in the state structure for the buffers
* that are attached to the state structure.
* This is done so that we can quickly reattach ourselves to those buffers
- * after some set of transaction commit's has released these buffers.
+ * after some set of transaction commits have released these buffers.
*/
STATIC int
xfs_attr_fillstate(xfs_da_state_t *state)
@@ -1594,6 +1313,8 @@ xfs_attr_fillstate(xfs_da_state_t *state)
xfs_da_state_blk_t *blk;
int level;
+ trace_xfs_attr_fillstate(state->args);
+
/*
* Roll down the "path" in the state structure, storing the on-disk
* block number for those buffers in the "path".
@@ -1602,8 +1323,7 @@ xfs_attr_fillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->bp) {
- blk->disk_blkno = xfs_da_blkno(blk->bp);
- xfs_da_buf_done(blk->bp);
+ blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
blk->bp = NULL;
} else {
blk->disk_blkno = 0;
@@ -1618,8 +1338,7 @@ xfs_attr_fillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->bp) {
- blk->disk_blkno = xfs_da_blkno(blk->bp);
- xfs_da_buf_done(blk->bp);
+ blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
blk->bp = NULL;
} else {
blk->disk_blkno = 0;
@@ -1632,7 +1351,7 @@ xfs_attr_fillstate(xfs_da_state_t *state)
/*
* Reattach the buffers to the state structure based on the disk block
* numbers stored in the state structure.
- * This is done after some set of transaction commit's has released those
+ * This is done after some set of transaction commits have released those
* buffers from our grip.
*/
STATIC int
@@ -1642,6 +1361,8 @@ xfs_attr_refillstate(xfs_da_state_t *state)
xfs_da_state_blk_t *blk;
int level, error;
+ trace_xfs_attr_refillstate(state->args);
+
/*
* Roll down the "path" in the state structure, storing the on-disk
* block number for those buffers in the "path".
@@ -1650,7 +1371,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->disk_blkno) {
- error = xfs_da_read_buf(state->args->trans,
+ error = xfs_da3_node_read(state->args->trans,
state->args->dp,
blk->blkno, blk->disk_blkno,
&blk->bp, XFS_ATTR_FORK);
@@ -1669,7 +1390,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->disk_blkno) {
- error = xfs_da_read_buf(state->args->trans,
+ error = xfs_da3_node_read(state->args->trans,
state->args->dp,
blk->blkno, blk->disk_blkno,
&blk->bp, XFS_ATTR_FORK);
@@ -1698,16 +1419,16 @@ xfs_attr_node_get(xfs_da_args_t *args)
int error, retval;
int i;
+ trace_xfs_attr_node_get(args);
+
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_attr_node_ents;
/*
* Search to see if name exists, and get back a pointer to it.
*/
- error = xfs_da_node_lookup_int(state, &retval);
+ error = xfs_da3_node_lookup_int(state, &retval);
if (error) {
retval = error;
} else if (retval == EEXIST) {
@@ -1718,7 +1439,7 @@ xfs_attr_node_get(xfs_da_args_t *args)
/*
* Get the value, local or "remote"
*/
- retval = xfs_attr_leaf_getvalue(blk->bp, args);
+ retval = xfs_attr3_leaf_getvalue(blk->bp, args);
if (!retval && (args->rmtblkno > 0)
&& !(args->flags & ATTR_KERNOVAL)) {
retval = xfs_attr_rmtval_get(args);
@@ -1729,915 +1450,10 @@ xfs_attr_node_get(xfs_da_args_t *args)
* If not in a transaction, we have to release all the buffers.
*/
for (i = 0; i < state->path.active; i++) {
- xfs_da_brelse(args->trans, state->path.blk[i].bp);
+ xfs_trans_brelse(args->trans, state->path.blk[i].bp);
state->path.blk[i].bp = NULL;
}
xfs_da_state_free(state);
return(retval);
}
-
-STATIC int /* error */
-xfs_attr_node_list(xfs_attr_list_context_t *context)
-{
- attrlist_cursor_kern_t *cursor;
- xfs_attr_leafblock_t *leaf;
- xfs_da_intnode_t *node;
- xfs_da_node_entry_t *btree;
- int error, i;
- xfs_dabuf_t *bp;
-
- cursor = context->cursor;
- cursor->initted = 1;
-
- /*
- * Do all sorts of validation on the passed-in cursor structure.
- * If anything is amiss, ignore the cursor and look up the hashval
- * starting from the btree root.
- */
- bp = NULL;
- if (cursor->blkno > 0) {
- error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
- &bp, XFS_ATTR_FORK);
- if ((error != 0) && (error != EFSCORRUPTED))
- return(error);
- if (bp) {
- node = bp->data;
- switch (INT_GET(node->hdr.info.magic, ARCH_CONVERT)) {
- case XFS_DA_NODE_MAGIC:
- xfs_attr_trace_l_cn("wrong blk", context, node);
- xfs_da_brelse(NULL, bp);
- bp = NULL;
- break;
- case XFS_ATTR_LEAF_MAGIC:
- leaf = bp->data;
- if (cursor->hashval >
- INT_GET(leaf->entries[
- INT_GET(leaf->hdr.count,
- ARCH_CONVERT)-1].hashval,
- ARCH_CONVERT)) {
- xfs_attr_trace_l_cl("wrong blk",
- context, leaf);
- xfs_da_brelse(NULL, bp);
- bp = NULL;
- } else if (cursor->hashval <=
- INT_GET(leaf->entries[0].hashval,
- ARCH_CONVERT)) {
- xfs_attr_trace_l_cl("maybe wrong blk",
- context, leaf);
- xfs_da_brelse(NULL, bp);
- bp = NULL;
- }
- break;
- default:
- xfs_attr_trace_l_c("wrong blk - ??", context);
- xfs_da_brelse(NULL, bp);
- bp = NULL;
- }
- }
- }
-
- /*
- * We did not find what we expected given the cursor's contents,
- * so we start from the top and work down based on the hash value.
- * Note that start of node block is same as start of leaf block.
- */
- if (bp == NULL) {
- cursor->blkno = 0;
- for (;;) {
- error = xfs_da_read_buf(NULL, context->dp,
- cursor->blkno, -1, &bp,
- XFS_ATTR_FORK);
- if (error)
- return(error);
- if (unlikely(bp == NULL)) {
- XFS_ERROR_REPORT("xfs_attr_node_list(2)",
- XFS_ERRLEVEL_LOW,
- context->dp->i_mount);
- return(XFS_ERROR(EFSCORRUPTED));
- }
- node = bp->data;
- if (INT_GET(node->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC)
- break;
- if (unlikely(INT_GET(node->hdr.info.magic, ARCH_CONVERT)
- != XFS_DA_NODE_MAGIC)) {
- XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
- XFS_ERRLEVEL_LOW,
- context->dp->i_mount,
- node);
- xfs_da_brelse(NULL, bp);
- return(XFS_ERROR(EFSCORRUPTED));
- }
- btree = node->btree;
- for (i = 0;
- i < INT_GET(node->hdr.count, ARCH_CONVERT);
- btree++, i++) {
- if (cursor->hashval
- <= INT_GET(btree->hashval,
- ARCH_CONVERT)) {
- cursor->blkno = INT_GET(btree->before, ARCH_CONVERT);
- xfs_attr_trace_l_cb("descending",
- context, btree);
- break;
- }
- }
- if (i == INT_GET(node->hdr.count, ARCH_CONVERT)) {
- xfs_da_brelse(NULL, bp);
- return(0);
- }
- xfs_da_brelse(NULL, bp);
- }
- }
- ASSERT(bp != NULL);
-
- /*
- * Roll upward through the blocks, processing each leaf block in
- * order. As long as there is space in the result buffer, keep
- * adding the information.
- */
- for (;;) {
- leaf = bp->data;
- if (unlikely(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
- != XFS_ATTR_LEAF_MAGIC)) {
- XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)",
- XFS_ERRLEVEL_LOW,
- context->dp->i_mount, leaf);
- xfs_da_brelse(NULL, bp);
- return(XFS_ERROR(EFSCORRUPTED));
- }
- error = xfs_attr_leaf_list_int(bp, context);
- if (error || !leaf->hdr.info.forw)
- break; /* not really an error, buffer full or EOF */
- cursor->blkno = INT_GET(leaf->hdr.info.forw, ARCH_CONVERT);
- xfs_da_brelse(NULL, bp);
- error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
- &bp, XFS_ATTR_FORK);
- if (error)
- return(error);
- if (unlikely((bp == NULL))) {
- XFS_ERROR_REPORT("xfs_attr_node_list(5)",
- XFS_ERRLEVEL_LOW,
- context->dp->i_mount);
- return(XFS_ERROR(EFSCORRUPTED));
- }
- }
- xfs_da_brelse(NULL, bp);
- return(0);
-}
-
-
-/*========================================================================
- * External routines for manipulating out-of-line attribute values.
- *========================================================================*/
-
-/*
- * Read the value associated with an attribute from the out-of-line buffer
- * that we stored it in.
- */
-STATIC int
-xfs_attr_rmtval_get(xfs_da_args_t *args)
-{
- xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE];
- xfs_mount_t *mp;
- xfs_daddr_t dblkno;
- xfs_caddr_t dst;
- xfs_buf_t *bp;
- int nmap, error, tmp, valuelen, blkcnt, i;
- xfs_dablk_t lblkno;
-
- ASSERT(!(args->flags & ATTR_KERNOVAL));
-
- mp = args->dp->i_mount;
- dst = args->value;
- valuelen = args->valuelen;
- lblkno = args->rmtblkno;
- while (valuelen > 0) {
- nmap = ATTR_RMTVALUE_MAPSIZE;
- error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno,
- args->rmtblkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- NULL, 0, map, &nmap, NULL);
- if (error)
- return(error);
- ASSERT(nmap >= 1);
-
- for (i = 0; (i < nmap) && (valuelen > 0); i++) {
- ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
- (map[i].br_startblock != HOLESTARTBLOCK));
- dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
- blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
- error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
- blkcnt, XFS_BUF_LOCK, &bp);
- if (error)
- return(error);
-
- tmp = (valuelen < XFS_BUF_SIZE(bp))
- ? valuelen : XFS_BUF_SIZE(bp);
- xfs_biomove(bp, 0, tmp, dst, XFS_B_READ);
- xfs_buf_relse(bp);
- dst += tmp;
- valuelen -= tmp;
-
- lblkno += map[i].br_blockcount;
- }
- }
- ASSERT(valuelen == 0);
- return(0);
-}
-
-/*
- * Write the value associated with an attribute into the out-of-line buffer
- * that we have defined for it.
- */
-STATIC int
-xfs_attr_rmtval_set(xfs_da_args_t *args)
-{
- xfs_mount_t *mp;
- xfs_fileoff_t lfileoff;
- xfs_inode_t *dp;
- xfs_bmbt_irec_t map;
- xfs_daddr_t dblkno;
- xfs_caddr_t src;
- xfs_buf_t *bp;
- xfs_dablk_t lblkno;
- int blkcnt, valuelen, nmap, error, tmp, committed;
-
- dp = args->dp;
- mp = dp->i_mount;
- src = args->value;
-
- /*
- * Find a "hole" in the attribute address space large enough for
- * us to drop the new attribute's value into.
- */
- blkcnt = XFS_B_TO_FSB(mp, args->valuelen);
- lfileoff = 0;
- error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
- XFS_ATTR_FORK);
- if (error) {
- return(error);
- }
- args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
- args->rmtblkcnt = blkcnt;
-
- /*
- * Roll through the "value", allocating blocks on disk as required.
- */
- while (blkcnt > 0) {
- /*
- * Allocate a single extent, up to the size of the value.
- */
- XFS_BMAP_INIT(args->flist, args->firstblock);
- nmap = 1;
- error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno,
- blkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA |
- XFS_BMAPI_WRITE,
- args->firstblock, args->total, &map, &nmap,
- args->flist);
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- *args->firstblock, &committed);
- }
- if (error) {
- ASSERT(committed);
- args->trans = NULL;
- xfs_bmap_cancel(args->flist);
- return(error);
- }
-
- /*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed) {
- xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, dp);
- }
-
- ASSERT(nmap == 1);
- ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
- (map.br_startblock != HOLESTARTBLOCK));
- lblkno += map.br_blockcount;
- blkcnt -= map.br_blockcount;
-
- /*
- * Start the next trans in the chain.
- */
- if ((error = xfs_attr_rolltrans(&args->trans, dp)))
- return (error);
- }
-
- /*
- * Roll through the "value", copying the attribute value to the
- * already-allocated blocks. Blocks are written synchronously
- * so that we can know they are all on disk before we turn off
- * the INCOMPLETE flag.
- */
- lblkno = args->rmtblkno;
- valuelen = args->valuelen;
- while (valuelen > 0) {
- /*
- * Try to remember where we decided to put the value.
- */
- XFS_BMAP_INIT(args->flist, args->firstblock);
- nmap = 1;
- error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno,
- args->rmtblkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- args->firstblock, 0, &map, &nmap, NULL);
- if (error) {
- return(error);
- }
- ASSERT(nmap == 1);
- ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
- (map.br_startblock != HOLESTARTBLOCK));
-
- dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
- blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
-
- bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno,
- blkcnt, XFS_BUF_LOCK);
- ASSERT(bp);
- ASSERT(!XFS_BUF_GETERROR(bp));
-
- tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
- XFS_BUF_SIZE(bp);
- xfs_biomove(bp, 0, tmp, src, XFS_B_WRITE);
- if (tmp < XFS_BUF_SIZE(bp))
- xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
- if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
- return (error);
- }
- src += tmp;
- valuelen -= tmp;
-
- lblkno += map.br_blockcount;
- }
- ASSERT(valuelen == 0);
- return(0);
-}
-
-/*
- * Remove the value associated with an attribute by deleting the
- * out-of-line buffer that it is stored on.
- */
-STATIC int
-xfs_attr_rmtval_remove(xfs_da_args_t *args)
-{
- xfs_mount_t *mp;
- xfs_bmbt_irec_t map;
- xfs_buf_t *bp;
- xfs_daddr_t dblkno;
- xfs_dablk_t lblkno;
- int valuelen, blkcnt, nmap, error, done, committed;
-
- mp = args->dp->i_mount;
-
- /*
- * Roll through the "value", invalidating the attribute value's
- * blocks.
- */
- lblkno = args->rmtblkno;
- valuelen = args->rmtblkcnt;
- while (valuelen > 0) {
- /*
- * Try to remember where we decided to put the value.
- */
- XFS_BMAP_INIT(args->flist, args->firstblock);
- nmap = 1;
- error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno,
- args->rmtblkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- args->firstblock, 0, &map, &nmap,
- args->flist);
- if (error) {
- return(error);
- }
- ASSERT(nmap == 1);
- ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
- (map.br_startblock != HOLESTARTBLOCK));
-
- dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
- blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
-
- /*
- * If the "remote" value is in the cache, remove it.
- */
- bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt,
- XFS_INCORE_TRYLOCK);
- if (bp) {
- XFS_BUF_STALE(bp);
- XFS_BUF_UNDELAYWRITE(bp);
- xfs_buf_relse(bp);
- bp = NULL;
- }
-
- valuelen -= map.br_blockcount;
-
- lblkno += map.br_blockcount;
- }
-
- /*
- * Keep de-allocating extents until the remote-value region is gone.
- */
- lblkno = args->rmtblkno;
- blkcnt = args->rmtblkcnt;
- done = 0;
- while (!done) {
- XFS_BMAP_INIT(args->flist, args->firstblock);
- error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- 1, args->firstblock, args->flist, &done);
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- *args->firstblock, &committed);
- }
- if (error) {
- ASSERT(committed);
- args->trans = NULL;
- xfs_bmap_cancel(args->flist);
- return(error);
- }
-
- /*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed) {
- xfs_trans_ijoin(args->trans, args->dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(args->trans, args->dp);
- }
-
- /*
- * Close out trans and start the next one in the chain.
- */
- if ((error = xfs_attr_rolltrans(&args->trans, args->dp)))
- return (error);
- }
- return(0);
-}
-
-#if defined(XFS_ATTR_TRACE)
-/*
- * Add a trace buffer entry for an attr_list context structure.
- */
-void
-xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context)
-{
- xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where,
- (__psunsigned_t)context->dp,
- (__psunsigned_t)context->cursor->hashval,
- (__psunsigned_t)context->cursor->blkno,
- (__psunsigned_t)context->cursor->offset,
- (__psunsigned_t)context->alist,
- (__psunsigned_t)context->bufsize,
- (__psunsigned_t)context->count,
- (__psunsigned_t)context->firstu,
- (__psunsigned_t)
- ((context->count > 0) &&
- !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
- ? (ATTR_ENTRY(context->alist,
- context->count-1)->a_valuelen)
- : 0,
- (__psunsigned_t)context->dupcnt,
- (__psunsigned_t)context->flags,
- (__psunsigned_t)NULL,
- (__psunsigned_t)NULL,
- (__psunsigned_t)NULL);
-}
-
-/*
- * Add a trace buffer entry for a context structure and a Btree node.
- */
-void
-xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
- struct xfs_da_intnode *node)
-{
- xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where,
- (__psunsigned_t)context->dp,
- (__psunsigned_t)context->cursor->hashval,
- (__psunsigned_t)context->cursor->blkno,
- (__psunsigned_t)context->cursor->offset,
- (__psunsigned_t)context->alist,
- (__psunsigned_t)context->bufsize,
- (__psunsigned_t)context->count,
- (__psunsigned_t)context->firstu,
- (__psunsigned_t)
- ((context->count > 0) &&
- !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
- ? (ATTR_ENTRY(context->alist,
- context->count-1)->a_valuelen)
- : 0,
- (__psunsigned_t)context->dupcnt,
- (__psunsigned_t)context->flags,
- (__psunsigned_t)INT_GET(node->hdr.count, ARCH_CONVERT),
- (__psunsigned_t)INT_GET(node->btree[0].hashval, ARCH_CONVERT),
- (__psunsigned_t)INT_GET(node->btree[INT_GET(node->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT));
-}
-
-/*
- * Add a trace buffer entry for a context structure and a Btree element.
- */
-void
-xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
- struct xfs_da_node_entry *btree)
-{
- xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where,
- (__psunsigned_t)context->dp,
- (__psunsigned_t)context->cursor->hashval,
- (__psunsigned_t)context->cursor->blkno,
- (__psunsigned_t)context->cursor->offset,
- (__psunsigned_t)context->alist,
- (__psunsigned_t)context->bufsize,
- (__psunsigned_t)context->count,
- (__psunsigned_t)context->firstu,
- (__psunsigned_t)
- ((context->count > 0) &&
- !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
- ? (ATTR_ENTRY(context->alist,
- context->count-1)->a_valuelen)
- : 0,
- (__psunsigned_t)context->dupcnt,
- (__psunsigned_t)context->flags,
- (__psunsigned_t)INT_GET(btree->hashval, ARCH_CONVERT),
- (__psunsigned_t)INT_GET(btree->before, ARCH_CONVERT),
- (__psunsigned_t)NULL);
-}
-
-/*
- * Add a trace buffer entry for a context structure and a leaf block.
- */
-void
-xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
- struct xfs_attr_leafblock *leaf)
-{
- xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where,
- (__psunsigned_t)context->dp,
- (__psunsigned_t)context->cursor->hashval,
- (__psunsigned_t)context->cursor->blkno,
- (__psunsigned_t)context->cursor->offset,
- (__psunsigned_t)context->alist,
- (__psunsigned_t)context->bufsize,
- (__psunsigned_t)context->count,
- (__psunsigned_t)context->firstu,
- (__psunsigned_t)
- ((context->count > 0) &&
- !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
- ? (ATTR_ENTRY(context->alist,
- context->count-1)->a_valuelen)
- : 0,
- (__psunsigned_t)context->dupcnt,
- (__psunsigned_t)context->flags,
- (__psunsigned_t)INT_GET(leaf->hdr.count, ARCH_CONVERT),
- (__psunsigned_t)INT_GET(leaf->entries[0].hashval, ARCH_CONVERT),
- (__psunsigned_t)INT_GET(leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT));
-}
-
-/*
- * Add a trace buffer entry for the arguments given to the routine,
- * generic form.
- */
-void
-xfs_attr_trace_enter(int type, char *where,
- __psunsigned_t a2, __psunsigned_t a3,
- __psunsigned_t a4, __psunsigned_t a5,
- __psunsigned_t a6, __psunsigned_t a7,
- __psunsigned_t a8, __psunsigned_t a9,
- __psunsigned_t a10, __psunsigned_t a11,
- __psunsigned_t a12, __psunsigned_t a13,
- __psunsigned_t a14, __psunsigned_t a15)
-{
- ASSERT(xfs_attr_trace_buf);
- ktrace_enter(xfs_attr_trace_buf, (void *)((__psunsigned_t)type),
- (void *)where,
- (void *)a2, (void *)a3, (void *)a4,
- (void *)a5, (void *)a6, (void *)a7,
- (void *)a8, (void *)a9, (void *)a10,
- (void *)a11, (void *)a12, (void *)a13,
- (void *)a14, (void *)a15);
-}
-#endif /* XFS_ATTR_TRACE */
-
-
-/*========================================================================
- * System (pseudo) namespace attribute interface routines.
- *========================================================================*/
-
-STATIC int
-posix_acl_access_set(
- vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
- return xfs_acl_vset(vp, data, size, _ACL_TYPE_ACCESS);
-}
-
-STATIC int
-posix_acl_access_remove(
- struct vnode *vp, char *name, int xflags)
-{
- return xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
-}
-
-STATIC int
-posix_acl_access_get(
- vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
- return xfs_acl_vget(vp, data, size, _ACL_TYPE_ACCESS);
-}
-
-STATIC int
-posix_acl_access_exists(
- vnode_t *vp)
-{
- return xfs_acl_vhasacl_access(vp);
-}
-
-STATIC int
-posix_acl_default_set(
- vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
- return xfs_acl_vset(vp, data, size, _ACL_TYPE_DEFAULT);
-}
-
-STATIC int
-posix_acl_default_get(
- vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
- return xfs_acl_vget(vp, data, size, _ACL_TYPE_DEFAULT);
-}
-
-STATIC int
-posix_acl_default_remove(
- struct vnode *vp, char *name, int xflags)
-{
- return xfs_acl_vremove(vp, _ACL_TYPE_DEFAULT);
-}
-
-STATIC int
-posix_acl_default_exists(
- vnode_t *vp)
-{
- return xfs_acl_vhasacl_default(vp);
-}
-
-STATIC struct attrnames posix_acl_access = {
- .attr_name = "posix_acl_access",
- .attr_namelen = sizeof("posix_acl_access") - 1,
- .attr_get = posix_acl_access_get,
- .attr_set = posix_acl_access_set,
- .attr_remove = posix_acl_access_remove,
- .attr_exists = posix_acl_access_exists,
-};
-
-STATIC struct attrnames posix_acl_default = {
- .attr_name = "posix_acl_default",
- .attr_namelen = sizeof("posix_acl_default") - 1,
- .attr_get = posix_acl_default_get,
- .attr_set = posix_acl_default_set,
- .attr_remove = posix_acl_default_remove,
- .attr_exists = posix_acl_default_exists,
-};
-
-STATIC struct attrnames *attr_system_names[] =
- { &posix_acl_access, &posix_acl_default };
-
-
-/*========================================================================
- * Namespace-prefix-style attribute name interface routines.
- *========================================================================*/
-
-STATIC int
-attr_generic_set(
- struct vnode *vp, char *name, void *data, size_t size, int xflags)
-{
- int error;
-
- VOP_ATTR_SET(vp, name, data, size, xflags, NULL, error);
- return -error;
-}
-
-STATIC int
-attr_generic_get(
- struct vnode *vp, char *name, void *data, size_t size, int xflags)
-{
- int error, asize = size;
-
- VOP_ATTR_GET(vp, name, data, &asize, xflags, NULL, error);
- if (!error)
- return asize;
- return -error;
-}
-
-STATIC int
-attr_generic_remove(
- struct vnode *vp, char *name, int xflags)
-{
- int error;
-
- VOP_ATTR_REMOVE(vp, name, xflags, NULL, error);
- return -error;
-}
-
-STATIC int
-attr_generic_listadd(
- attrnames_t *prefix,
- attrnames_t *namesp,
- void *data,
- size_t size,
- ssize_t *result)
-{
- char *p = data + *result;
-
- *result += prefix->attr_namelen;
- *result += namesp->attr_namelen + 1;
- if (!size)
- return 0;
- if (*result > size)
- return -ERANGE;
- strcpy(p, prefix->attr_name);
- p += prefix->attr_namelen;
- strcpy(p, namesp->attr_name);
- p += namesp->attr_namelen + 1;
- return 0;
-}
-
-STATIC int
-attr_system_list(
- struct vnode *vp,
- void *data,
- size_t size,
- ssize_t *result)
-{
- attrnames_t *namesp;
- int i, error = 0;
-
- for (i = 0; i < ATTR_SYSCOUNT; i++) {
- namesp = attr_system_names[i];
- if (!namesp->attr_exists || !namesp->attr_exists(vp))
- continue;
- error = attr_generic_listadd(&attr_system, namesp,
- data, size, result);
- if (error)
- break;
- }
- return error;
-}
-
-int
-attr_generic_list(
- struct vnode *vp, void *data, size_t size, int xflags, ssize_t *result)
-{
- attrlist_cursor_kern_t cursor = { 0 };
- int error;
-
- VOP_ATTR_LIST(vp, data, size, xflags, &cursor, NULL, error);
- if (error > 0)
- return -error;
- *result = -error;
- return attr_system_list(vp, data, size, result);
-}
-
-attrnames_t *
-attr_lookup_namespace(
- char *name,
- struct attrnames **names,
- int nnames)
-{
- int i;
-
- for (i = 0; i < nnames; i++)
- if (!strncmp(name, names[i]->attr_name, names[i]->attr_namelen))
- return names[i];
- return NULL;
-}
-
-/*
- * Some checks to prevent people abusing EAs to get over quota:
- * - Don't allow modifying user EAs on devices/symlinks;
- * - Don't allow modifying user EAs if sticky bit set;
- */
-STATIC int
-attr_user_capable(
- struct vnode *vp,
- cred_t *cred)
-{
- struct inode *inode = LINVFS_GET_IP(vp);
-
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- return -EPERM;
- if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
- if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
- (current_fsuid(cred) != inode->i_uid) && !capable(CAP_FOWNER))
- return -EPERM;
- return 0;
-}
-
-STATIC int
-attr_trusted_capable(
- struct vnode *vp,
- cred_t *cred)
-{
- struct inode *inode = LINVFS_GET_IP(vp);
-
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- return -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- return 0;
-}
-
-STATIC int
-attr_secure_capable(
- struct vnode *vp,
- cred_t *cred)
-{
- return -ENOSECURITY;
-}
-
-STATIC int
-attr_system_set(
- struct vnode *vp, char *name, void *data, size_t size, int xflags)
-{
- attrnames_t *namesp;
- int error;
-
- if (xflags & ATTR_CREATE)
- return -EINVAL;
-
- namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT);
- if (!namesp)
- return -EOPNOTSUPP;
- error = namesp->attr_set(vp, name, data, size, xflags);
- if (!error)
- error = vn_revalidate(vp);
- return error;
-}
-
-STATIC int
-attr_system_get(
- struct vnode *vp, char *name, void *data, size_t size, int xflags)
-{
- attrnames_t *namesp;
-
- namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT);
- if (!namesp)
- return -EOPNOTSUPP;
- return namesp->attr_get(vp, name, data, size, xflags);
-}
-
-STATIC int
-attr_system_remove(
- struct vnode *vp, char *name, int xflags)
-{
- attrnames_t *namesp;
-
- namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT);
- if (!namesp)
- return -EOPNOTSUPP;
- return namesp->attr_remove(vp, name, xflags);
-}
-
-struct attrnames attr_system = {
- .attr_name = "system.",
- .attr_namelen = sizeof("system.") - 1,
- .attr_flag = ATTR_SYSTEM,
- .attr_get = attr_system_get,
- .attr_set = attr_system_set,
- .attr_remove = attr_system_remove,
- .attr_capable = (attrcapable_t)fs_noerr,
-};
-
-struct attrnames attr_trusted = {
- .attr_name = "trusted.",
- .attr_namelen = sizeof("trusted.") - 1,
- .attr_flag = ATTR_ROOT,
- .attr_get = attr_generic_get,
- .attr_set = attr_generic_set,
- .attr_remove = attr_generic_remove,
- .attr_capable = attr_trusted_capable,
-};
-
-struct attrnames attr_secure = {
- .attr_name = "security.",
- .attr_namelen = sizeof("security.") - 1,
- .attr_flag = ATTR_SECURE,
- .attr_get = attr_generic_get,
- .attr_set = attr_generic_set,
- .attr_remove = attr_generic_remove,
- .attr_capable = attr_secure_capable,
-};
-
-struct attrnames attr_user = {
- .attr_name = "user.",
- .attr_namelen = sizeof("user.") - 1,
- .attr_get = attr_generic_get,
- .attr_set = attr_generic_set,
- .attr_remove = attr_generic_remove,
- .attr_capable = attr_user_capable,
-};
-
-struct attrnames *attr_namespaces[] =
- { &attr_system, &attr_trusted, &attr_secure, &attr_user };
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index b2c7b9fcded..dd482458947 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -18,9 +18,11 @@
#ifndef __XFS_ATTR_H__
#define __XFS_ATTR_H__
+struct xfs_inode;
+struct xfs_da_args;
+struct xfs_attr_list_context;
+
/*
- * xfs_attr.h
- *
* Large attribute lists are structured around Btrees where all the data
* elements are in the leaf nodes. Attribute names are hashed into an int,
* then that int is used as the index into the Btree. Since the hashval
@@ -35,35 +37,6 @@
* External interfaces
*========================================================================*/
-struct cred;
-struct vnode;
-
-typedef int (*attrset_t)(struct vnode *, char *, void *, size_t, int);
-typedef int (*attrget_t)(struct vnode *, char *, void *, size_t, int);
-typedef int (*attrremove_t)(struct vnode *, char *, int);
-typedef int (*attrexists_t)(struct vnode *);
-typedef int (*attrcapable_t)(struct vnode *, struct cred *);
-
-typedef struct attrnames {
- char * attr_name;
- unsigned int attr_namelen;
- unsigned int attr_flag;
- attrget_t attr_get;
- attrset_t attr_set;
- attrremove_t attr_remove;
- attrexists_t attr_exists;
- attrcapable_t attr_capable;
-} attrnames_t;
-
-#define ATTR_NAMECOUNT 4
-extern struct attrnames attr_user;
-extern struct attrnames attr_secure;
-extern struct attrnames attr_system;
-extern struct attrnames attr_trusted;
-extern struct attrnames *attr_namespaces[ATTR_NAMECOUNT];
-
-extern attrnames_t *attr_lookup_namespace(char *, attrnames_t **, int);
-extern int attr_generic_list(struct vnode *, void *, size_t, int, ssize_t *);
#define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */
#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */
@@ -71,16 +44,19 @@ extern int attr_generic_list(struct vnode *, void *, size_t, int, ssize_t *);
#define ATTR_SECURE 0x0008 /* use attrs in security namespace */
#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */
#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */
-#define ATTR_SYSTEM 0x0100 /* use attrs in system (pseudo) namespace */
-#define ATTR_KERNACCESS 0x0400 /* [kernel] iaccess, inode held io-locked */
#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */
#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */
-#define ATTR_KERNAMELS 0x4000 /* [kernel] list attr names (simple list) */
-#define ATTR_KERNORMALS 0x0800 /* [kernel] normal attr list: user+secure */
-#define ATTR_KERNROOTLS 0x8000 /* [kernel] include root in the attr list */
-#define ATTR_KERNFULLS (ATTR_KERNORMALS|ATTR_KERNROOTLS)
+#define XFS_ATTR_FLAGS \
+ { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \
+ { ATTR_ROOT, "ROOT" }, \
+ { ATTR_TRUST, "TRUST" }, \
+ { ATTR_SECURE, "SECURE" }, \
+ { ATTR_CREATE, "CREATE" }, \
+ { ATTR_REPLACE, "REPLACE" }, \
+ { ATTR_KERNOTIME, "KERNOTIME" }, \
+ { ATTR_KERNOVAL, "KERNOVAL" }
/*
* The maximum size (into the kernel or returned from the kernel) of an
@@ -119,22 +95,6 @@ typedef struct attrlist_ent { /* data from attr_list() */
&((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ])
/*
- * Multi-attribute operation vector.
- */
-typedef struct attr_multiop {
- int am_opcode; /* operation to perform (ATTR_OP_GET, etc.) */
- int am_error; /* [out arg] result of this sub-op (an errno) */
- char *am_attrname; /* attribute name to work with */
- char *am_attrvalue; /* [in/out arg] attribute value (raw bytes) */
- int am_length; /* [in/out arg] length of value */
- int am_flags; /* bitwise OR of attr API flags defined above */
-} attr_multiop_t;
-
-#define ATTR_OP_GET 1 /* return the indicated attr's value */
-#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */
-#define ATTR_OP_REMOVE 3 /* remove the indicated attr */
-
-/*
* Kernel-internal version of the attrlist cursor.
*/
typedef struct attrlist_cursor_kern {
@@ -148,25 +108,47 @@ typedef struct attrlist_cursor_kern {
/*========================================================================
- * Function prototypes for the kernel.
+ * Structure used to pass context around among the routines.
*========================================================================*/
-struct xfs_inode;
-struct attrlist_cursor_kern;
-struct xfs_da_args;
+
+typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
+ unsigned char *, int, int, unsigned char *);
+
+typedef struct xfs_attr_list_context {
+ struct xfs_inode *dp; /* inode */
+ struct attrlist_cursor_kern *cursor; /* position in list */
+ char *alist; /* output buffer */
+ int seen_enough; /* T/F: seen enough of list? */
+ ssize_t count; /* num used entries */
+ int dupcnt; /* count dup hashvals seen */
+ int bufsize; /* total buffer size */
+ int firstu; /* first used byte in buffer */
+ int flags; /* from VOP call */
+ int resynch; /* T/F: resynch with cursor */
+ int put_value; /* T/F: need value for listent */
+ put_listent_func_t put_listent; /* list output fmt function */
+ int index; /* index into output buffer */
+} xfs_attr_list_context_t;
+
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
/*
* Overall external interface routines.
*/
-int xfs_attr_get(bhv_desc_t *, const char *, char *, int *, int, struct cred *);
-int xfs_attr_set(bhv_desc_t *, const char *, char *, int, int, struct cred *);
-int xfs_attr_remove(bhv_desc_t *, const char *, int, struct cred *);
-int xfs_attr_list(bhv_desc_t *, char *, int, int,
- struct attrlist_cursor_kern *, struct cred *);
int xfs_attr_inactive(struct xfs_inode *dp);
+int xfs_attr_list_int(struct xfs_attr_list_context *);
+int xfs_inode_hasattr(struct xfs_inode *ip);
+int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
+ unsigned char *value, int *valuelenp, int flags);
+int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
+ unsigned char *value, int valuelen, int flags);
+int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
+int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
+ int flags, struct attrlist_cursor_kern *cursor);
-int xfs_attr_shortform_getvalue(struct xfs_da_args *);
-int xfs_attr_fetch(struct xfs_inode *, const char *, int,
- char *, int *, int, struct cred *);
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
new file mode 100644
index 00000000000..09480c57f06
--- /dev/null
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_attr_remote.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_dinode.h"
+#include "xfs_dir2.h"
+
+/*
+ * Look at all the extents for this logical region,
+ * invalidate any buffers that are incore/in transactions.
+ */
+STATIC int
+xfs_attr3_leaf_freextent(
+ struct xfs_trans **trans,
+ struct xfs_inode *dp,
+ xfs_dablk_t blkno,
+ int blkcnt)
+{
+ struct xfs_bmbt_irec map;
+ struct xfs_buf *bp;
+ xfs_dablk_t tblkno;
+ xfs_daddr_t dblkno;
+ int tblkcnt;
+ int dblkcnt;
+ int nmap;
+ int error;
+
+ /*
+ * Roll through the "value", invalidating the attribute value's
+ * blocks.
+ */
+ tblkno = blkno;
+ tblkcnt = blkcnt;
+ while (tblkcnt > 0) {
+ /*
+ * Try to remember where we decided to put the value.
+ */
+ nmap = 1;
+ error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
+ &map, &nmap, XFS_BMAPI_ATTRFORK);
+ if (error) {
+ return(error);
+ }
+ ASSERT(nmap == 1);
+ ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+
+ /*
+ * If it's a hole, these are already unmapped
+ * so there's nothing to invalidate.
+ */
+ if (map.br_startblock != HOLESTARTBLOCK) {
+
+ dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
+ map.br_startblock);
+ dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
+ map.br_blockcount);
+ bp = xfs_trans_get_buf(*trans,
+ dp->i_mount->m_ddev_targp,
+ dblkno, dblkcnt, 0);
+ if (!bp)
+ return ENOMEM;
+ xfs_trans_binval(*trans, bp);
+ /*
+ * Roll to next transaction.
+ */
+ error = xfs_trans_roll(trans, dp);
+ if (error)
+ return (error);
+ }
+
+ tblkno += map.br_blockcount;
+ tblkcnt -= map.br_blockcount;
+ }
+
+ return(0);
+}
+
+/*
+ * Invalidate all of the "remote" value regions pointed to by a particular
+ * leaf block.
+ * Note that we must release the lock on the buffer so that we are not
+ * caught holding something that the logging code wants to flush to disk.
+ */
+STATIC int
+xfs_attr3_leaf_inactive(
+ struct xfs_trans **trans,
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_attr_inactive_list *list;
+ struct xfs_attr_inactive_list *lp;
+ int error;
+ int count;
+ int size;
+ int tmp;
+ int i;
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+
+ /*
+ * Count the number of "remote" value extents.
+ */
+ count = 0;
+ entry = xfs_attr3_leaf_entryp(leaf);
+ for (i = 0; i < ichdr.count; entry++, i++) {
+ if (be16_to_cpu(entry->nameidx) &&
+ ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+ if (name_rmt->valueblk)
+ count++;
+ }
+ }
+
+ /*
+ * If there are no "remote" values, we're done.
+ */
+ if (count == 0) {
+ xfs_trans_brelse(*trans, bp);
+ return 0;
+ }
+
+ /*
+ * Allocate storage for a list of all the "remote" value extents.
+ */
+ size = count * sizeof(xfs_attr_inactive_list_t);
+ list = kmem_alloc(size, KM_SLEEP);
+
+ /*
+ * Identify each of the "remote" value extents.
+ */
+ lp = list;
+ entry = xfs_attr3_leaf_entryp(leaf);
+ for (i = 0; i < ichdr.count; entry++, i++) {
+ if (be16_to_cpu(entry->nameidx) &&
+ ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+ if (name_rmt->valueblk) {
+ lp->valueblk = be32_to_cpu(name_rmt->valueblk);
+ lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
+ be32_to_cpu(name_rmt->valuelen));
+ lp++;
+ }
+ }
+ }
+ xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */
+
+ /*
+ * Invalidate each of the "remote" value extents.
+ */
+ error = 0;
+ for (lp = list, i = 0; i < count; i++, lp++) {
+ tmp = xfs_attr3_leaf_freextent(trans, dp,
+ lp->valueblk, lp->valuelen);
+
+ if (error == 0)
+ error = tmp; /* save only the 1st errno */
+ }
+
+ kmem_free(list);
+ return error;
+}
+
+/*
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ */
+STATIC int
+xfs_attr3_node_inactive(
+ struct xfs_trans **trans,
+ struct xfs_inode *dp,
+ struct xfs_buf *bp,
+ int level)
+{
+ xfs_da_blkinfo_t *info;
+ xfs_da_intnode_t *node;
+ xfs_dablk_t child_fsb;
+ xfs_daddr_t parent_blkno, child_blkno;
+ int error, i;
+ struct xfs_buf *child_bp;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr ichdr;
+
+ /*
+ * Since this code is recursive (gasp!) we must protect ourselves.
+ */
+ if (level > XFS_DA_NODE_MAXDEPTH) {
+ xfs_trans_brelse(*trans, bp); /* no locks for later trans */
+ return XFS_ERROR(EIO);
+ }
+
+ node = bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&ichdr, node);
+ parent_blkno = bp->b_bn;
+ if (!ichdr.count) {
+ xfs_trans_brelse(*trans, bp);
+ return 0;
+ }
+ btree = dp->d_ops->node_tree_p(node);
+ child_fsb = be32_to_cpu(btree[0].before);
+ xfs_trans_brelse(*trans, bp); /* no locks for later trans */
+
+ /*
+ * If this is the node level just above the leaves, simply loop
+ * over the leaves removing all of them. If this is higher up
+ * in the tree, recurse downward.
+ */
+ for (i = 0; i < ichdr.count; i++) {
+ /*
+ * Read the subsidiary block to see what we have to work with.
+ * Don't do this in a transaction. This is a depth-first
+ * traversal of the tree so we may deal with many blocks
+ * before we come back to this one.
+ */
+ error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
+ XFS_ATTR_FORK);
+ if (error)
+ return(error);
+ if (child_bp) {
+ /* save for re-read later */
+ child_blkno = XFS_BUF_ADDR(child_bp);
+
+ /*
+ * Invalidate the subtree, however we have to.
+ */
+ info = child_bp->b_addr;
+ switch (info->magic) {
+ case cpu_to_be16(XFS_DA_NODE_MAGIC):
+ case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+ error = xfs_attr3_node_inactive(trans, dp,
+ child_bp, level + 1);
+ break;
+ case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
+ case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+ error = xfs_attr3_leaf_inactive(trans, dp,
+ child_bp);
+ break;
+ default:
+ error = XFS_ERROR(EIO);
+ xfs_trans_brelse(*trans, child_bp);
+ break;
+ }
+ if (error)
+ return error;
+
+ /*
+ * Remove the subsidiary block from the cache
+ * and from the log.
+ */
+ error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
+ &child_bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ xfs_trans_binval(*trans, child_bp);
+ }
+
+ /*
+ * If we're not done, re-read the parent to get the next
+ * child block number.
+ */
+ if (i + 1 < ichdr.count) {
+ error = xfs_da3_node_read(*trans, dp, 0, parent_blkno,
+ &bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ child_fsb = be32_to_cpu(btree[i + 1].before);
+ xfs_trans_brelse(*trans, bp);
+ }
+ /*
+ * Atomically commit the whole invalidate stuff.
+ */
+ error = xfs_trans_roll(trans, dp);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Indiscriminately delete the entire attribute fork
+ *
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ */
+int
+xfs_attr3_root_inactive(
+ struct xfs_trans **trans,
+ struct xfs_inode *dp)
+{
+ struct xfs_da_blkinfo *info;
+ struct xfs_buf *bp;
+ xfs_daddr_t blkno;
+ int error;
+
+ /*
+ * Read block 0 to see what we have to work with.
+ * We only get here if we have extents, since we remove
+ * the extents in reverse order the extent containing
+ * block 0 must still be there.
+ */
+ error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ blkno = bp->b_bn;
+
+ /*
+ * Invalidate the tree, even if the "tree" is only a single leaf block.
+ * This is a depth-first traversal!
+ */
+ info = bp->b_addr;
+ switch (info->magic) {
+ case cpu_to_be16(XFS_DA_NODE_MAGIC):
+ case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+ error = xfs_attr3_node_inactive(trans, dp, bp, 1);
+ break;
+ case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
+ case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+ error = xfs_attr3_leaf_inactive(trans, dp, bp);
+ break;
+ default:
+ error = XFS_ERROR(EIO);
+ xfs_trans_brelse(*trans, bp);
+ break;
+ }
+ if (error)
+ return error;
+
+ /*
+ * Invalidate the incore copy of the root block.
+ */
+ error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ xfs_trans_binval(*trans, bp); /* remove from cache */
+ /*
+ * Commit the invalidate and start the next transaction.
+ */
+ error = xfs_trans_roll(trans, dp);
+
+ return error;
+}
+
+int
+xfs_attr_inactive(xfs_inode_t *dp)
+{
+ xfs_trans_t *trans;
+ xfs_mount_t *mp;
+ int error;
+
+ mp = dp->i_mount;
+ ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
+
+ xfs_ilock(dp, XFS_ILOCK_SHARED);
+ if (!xfs_inode_hasattr(dp) ||
+ dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ xfs_iunlock(dp, XFS_ILOCK_SHARED);
+ return 0;
+ }
+ xfs_iunlock(dp, XFS_ILOCK_SHARED);
+
+ /*
+ * Start our first transaction of the day.
+ *
+ * All future transactions during this code must be "chained" off
+ * this one via the trans_dup() call. All transactions will contain
+ * the inode, and the inode will always be marked with trans_ihold().
+ * Since the inode will be locked in all transactions, we must log
+ * the inode in every transaction to let it float upward through
+ * the log.
+ */
+ trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
+ error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
+ if (error) {
+ xfs_trans_cancel(trans, 0);
+ return(error);
+ }
+ xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+ /*
+ * No need to make quota reservations here. We expect to release some
+ * blocks, not allocate, in the common case.
+ */
+ xfs_trans_ijoin(trans, dp, 0);
+
+ /*
+ * Decide on what work routines to call based on the inode size.
+ */
+ if (!xfs_inode_hasattr(dp) ||
+ dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ error = 0;
+ goto out;
+ }
+ error = xfs_attr3_root_inactive(&trans, dp);
+ if (error)
+ goto out;
+
+ error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
+ if (error)
+ goto out;
+
+ error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+ return(error);
+
+out:
+ xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ return(error);
+}
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index fe91eac4e2a..28712d29e43 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -17,33 +18,32 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
+#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+#include "xfs_dinode.h"
+#include "xfs_dir2.h"
+
/*
* xfs_attr_leaf.c
@@ -58,42 +58,233 @@
/*
* Routines used for growing the Btree.
*/
-STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block,
- xfs_dabuf_t **bpp);
-STATIC int xfs_attr_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args,
- int freemap_index);
-STATIC void xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer);
-STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state,
+STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args,
+ xfs_dablk_t which_block, struct xfs_buf **bpp);
+STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer,
+ struct xfs_attr3_icleaf_hdr *ichdr,
+ struct xfs_da_args *args, int freemap_index);
+STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args,
+ struct xfs_attr3_icleaf_hdr *ichdr,
+ struct xfs_buf *leaf_buffer);
+STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state,
xfs_da_state_blk_t *blk1,
xfs_da_state_blk_t *blk2);
-STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
- xfs_da_state_blk_t *leaf_blk_1,
- xfs_da_state_blk_t *leaf_blk_2,
- int *number_entries_in_blk1,
- int *number_usedbytes_in_blk1);
+STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *leaf_blk_1,
+ struct xfs_attr3_icleaf_hdr *ichdr1,
+ xfs_da_state_blk_t *leaf_blk_2,
+ struct xfs_attr3_icleaf_hdr *ichdr2,
+ int *number_entries_in_blk1,
+ int *number_usedbytes_in_blk1);
/*
- * Routines used for shrinking the Btree.
+ * Utility routines.
*/
-STATIC int xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
- xfs_dabuf_t *bp, int level);
-STATIC int xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
- xfs_dabuf_t *bp);
-STATIC int xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
- xfs_dablk_t blkno, int blkcnt);
+STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
+ struct xfs_attr_leafblock *src_leaf,
+ struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start,
+ struct xfs_attr_leafblock *dst_leaf,
+ struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start,
+ int move_count);
+STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
+
+void
+xfs_attr3_leaf_hdr_from_disk(
+ struct xfs_attr3_icleaf_hdr *to,
+ struct xfs_attr_leafblock *from)
+{
+ int i;
+
+ ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+ from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+ if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
+ struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from;
+
+ to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+ to->back = be32_to_cpu(hdr3->info.hdr.back);
+ to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+ to->count = be16_to_cpu(hdr3->count);
+ to->usedbytes = be16_to_cpu(hdr3->usedbytes);
+ to->firstused = be16_to_cpu(hdr3->firstused);
+ to->holes = hdr3->holes;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base);
+ to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size);
+ }
+ return;
+ }
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.count);
+ to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
+ to->firstused = be16_to_cpu(from->hdr.firstused);
+ to->holes = from->hdr.holes;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base);
+ to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size);
+ }
+}
+
+void
+xfs_attr3_leaf_hdr_to_disk(
+ struct xfs_attr_leafblock *to,
+ struct xfs_attr3_icleaf_hdr *from)
+{
+ int i;
+
+ ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
+ from->magic == XFS_ATTR3_LEAF_MAGIC);
+
+ if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
+ struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to;
+
+ hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+ hdr3->info.hdr.back = cpu_to_be32(from->back);
+ hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+ hdr3->count = cpu_to_be16(from->count);
+ hdr3->usedbytes = cpu_to_be16(from->usedbytes);
+ hdr3->firstused = cpu_to_be16(from->firstused);
+ hdr3->holes = from->holes;
+ hdr3->pad1 = 0;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base);
+ hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size);
+ }
+ return;
+ }
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.count = cpu_to_be16(from->count);
+ to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
+ to->hdr.firstused = cpu_to_be16(from->firstused);
+ to->hdr.holes = from->holes;
+ to->hdr.pad1 = 0;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base);
+ to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size);
+ }
+}
+
+static bool
+xfs_attr3_leaf_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_attr_leafblock *leaf = bp->b_addr;
+ struct xfs_attr3_icleaf_hdr ichdr;
+
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+ if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
+ return false;
+
+ if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
+ return false;
+ } else {
+ if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
+ return false;
+ }
+ if (ichdr.count == 0)
+ return false;
+
+ /* XXX: need to range check rest of attr header values */
+ /* XXX: hash order check? */
+
+ return true;
+}
+
+static void
+xfs_attr3_leaf_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_attr3_leaf_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
+}
/*
- * Utility routines.
+ * leaf/node format detection on trees is sketchy, so a node read can be done on
+ * leaf level blocks when detection identifies the tree as a node format tree
+ * incorrectly. In this case, we need to swap the verifier to match the correct
+ * format of the block being read.
*/
-STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
- int src_start,
- xfs_attr_leafblock_t *dst_leaf,
- int dst_start, int move_count,
- xfs_mount_t *mp);
-STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
-STATIC int xfs_attr_put_listent(xfs_attr_list_context_t *context,
- attrnames_t *, char *name, int namelen,
- int valuelen);
+static void
+xfs_attr3_leaf_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_attr3_leaf_verify(bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
+}
+
+const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
+ .verify_read = xfs_attr3_leaf_read_verify,
+ .verify_write = xfs_attr3_leaf_write_verify,
+};
+
+int
+xfs_attr3_leaf_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp)
+{
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+ XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
+ if (!err && tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
+ return err;
+}
+
+/*========================================================================
+ * Namespace helper routines
+ *========================================================================*/
+
+/*
+ * If namespace bits don't match return 0.
+ * If all match then return 1.
+ */
+STATIC int
+xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
+{
+ return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
+}
/*========================================================================
@@ -103,6 +294,7 @@ STATIC int xfs_attr_put_listent(xfs_attr_list_context_t *context,
/*
* Query whether the requested number of additional bytes of extended
* attribute space will be able to fit inline.
+ *
* Returns zero if not, else the di_forkoff fork offset to be used in the
* literal area for attribute data once the new bytes have been added.
*
@@ -115,9 +307,11 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
int offset;
int minforkoff; /* lower limit on valid forkoff locations */
int maxforkoff; /* upper limit on valid forkoff locations */
+ int dsize;
xfs_mount_t *mp = dp->i_mount;
- offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */
+ /* rounded down */
+ offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
switch (dp->i_d.di_format) {
case XFS_DINODE_FMT_DEV:
@@ -128,24 +322,74 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
return (offset >= minforkoff) ? minforkoff : 0;
}
- if (!(mp->m_flags & XFS_MOUNT_ATTR2)) {
- if (bytes <= XFS_IFORK_ASIZE(dp))
- return mp->m_attroffset >> 3;
+ /*
+ * If the requested numbers of bytes is smaller or equal to the
+ * current attribute fork size we can always proceed.
+ *
+ * Note that if_bytes in the data fork might actually be larger than
+ * the current data fork size is due to delalloc extents. In that
+ * case either the extent count will go down when they are converted
+ * to real extents, or the delalloc conversion will take care of the
+ * literal area rebalancing.
+ */
+ if (bytes <= XFS_IFORK_ASIZE(dp))
+ return dp->i_d.di_forkoff;
+
+ /*
+ * For attr2 we can try to move the forkoff if there is space in the
+ * literal area, but for the old format we are done if there is no
+ * space in the fixed attribute fork.
+ */
+ if (!(mp->m_flags & XFS_MOUNT_ATTR2))
return 0;
+
+ dsize = dp->i_df.if_bytes;
+
+ switch (dp->i_d.di_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ /*
+ * If there is no attr fork and the data fork is extents,
+ * determine if creating the default attr fork will result
+ * in the extents form migrating to btree. If so, the
+ * minimum offset only needs to be the space required for
+ * the btree root.
+ */
+ if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
+ xfs_default_attroffset(dp))
+ dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ /*
+ * If we have a data btree then keep forkoff if we have one,
+ * otherwise we are adding a new attr, so then we set
+ * minforkoff to where the btree root can finish so we have
+ * plenty of room for attrs
+ */
+ if (dp->i_d.di_forkoff) {
+ if (offset < dp->i_d.di_forkoff)
+ return 0;
+ return dp->i_d.di_forkoff;
+ }
+ dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
+ break;
}
- /* data fork btree root can have at least this many key/ptr pairs */
- minforkoff = MAX(dp->i_df.if_bytes, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
+ /*
+ * A data fork btree root must have space for at least
+ * MINDBTPTRS key/ptr pairs if the data fork is small or empty.
+ */
+ minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
minforkoff = roundup(minforkoff, 8) >> 3;
/* attr fork btree root can have at least this many key/ptr pairs */
- maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) -
+ XFS_BMDR_SPACE_CALC(MINABTPTRS);
maxforkoff = maxforkoff >> 3; /* rounded down */
- if (offset >= minforkoff && offset < maxforkoff)
- return offset;
if (offset >= maxforkoff)
return maxforkoff;
+ if (offset >= minforkoff)
+ return offset;
return 0;
}
@@ -155,17 +399,15 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
STATIC void
xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
{
- unsigned long s;
-
if ((mp->m_flags & XFS_MOUNT_ATTR2) &&
- !(XFS_SB_VERSION_HASATTR2(&mp->m_sb))) {
- s = XFS_SB_LOCK(mp);
- if (!XFS_SB_VERSION_HASATTR2(&mp->m_sb)) {
- XFS_SB_VERSION_ADDATTR2(&mp->m_sb);
- XFS_SB_UNLOCK(mp, s);
+ !(xfs_sb_version_hasattr2(&mp->m_sb))) {
+ spin_lock(&mp->m_sb_lock);
+ if (!xfs_sb_version_hasattr2(&mp->m_sb)) {
+ xfs_sb_version_addattr2(&mp->m_sb);
+ spin_unlock(&mp->m_sb_lock);
xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
} else
- XFS_SB_UNLOCK(mp, s);
+ spin_unlock(&mp->m_sb_lock);
}
}
@@ -179,6 +421,8 @@ xfs_attr_shortform_create(xfs_da_args_t *args)
xfs_inode_t *dp;
xfs_ifork_t *ifp;
+ trace_xfs_attr_sf_create(args);
+
dp = args->dp;
ASSERT(dp != NULL);
ifp = dp->i_afp;
@@ -194,7 +438,7 @@ xfs_attr_shortform_create(xfs_da_args_t *args)
xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data;
hdr->count = 0;
- INT_SET(hdr->totsize, ARCH_CONVERT, sizeof(*hdr));
+ hdr->totsize = cpu_to_be16(sizeof(*hdr));
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
}
@@ -212,30 +456,23 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
xfs_inode_t *dp;
xfs_ifork_t *ifp;
+ trace_xfs_attr_sf_add(args);
+
dp = args->dp;
mp = dp->i_mount;
dp->i_d.di_forkoff = forkoff;
- dp->i_df.if_ext_max =
- XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
- dp->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
ifp = dp->i_afp;
ASSERT(ifp->if_flags & XFS_IFINLINE);
sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
sfe = &sf->list[0];
- for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+ for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
#ifdef DEBUG
if (sfe->namelen != args->namelen)
continue;
if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
continue;
- if (((args->flags & ATTR_SECURE) != 0) !=
- ((sfe->flags & XFS_ATTR_SECURE) != 0))
- continue;
- if (((args->flags & ATTR_ROOT) != 0) !=
- ((sfe->flags & XFS_ATTR_ROOT) != 0))
+ if (!xfs_attr_namesp_match(args->flags, sfe->flags))
continue;
ASSERT(0);
#endif
@@ -248,19 +485,37 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset);
sfe->namelen = args->namelen;
- INT_SET(sfe->valuelen, ARCH_CONVERT, args->valuelen);
- sfe->flags = (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE :
- ((args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0);
+ sfe->valuelen = args->valuelen;
+ sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
memcpy(sfe->nameval, args->name, args->namelen);
memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
- INT_MOD(sf->hdr.count, ARCH_CONVERT, 1);
- INT_MOD(sf->hdr.totsize, ARCH_CONVERT, size);
+ sf->hdr.count++;
+ be16_add_cpu(&sf->hdr.totsize, size);
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
xfs_sbversion_add_attr2(mp, args->trans);
}
/*
+ * After the last attribute is removed revert to original inode format,
+ * making all literal area available to the data fork once more.
+ */
+STATIC void
+xfs_attr_fork_reset(
+ struct xfs_inode *ip,
+ struct xfs_trans *tp)
+{
+ xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+ ip->i_d.di_forkoff = 0;
+ ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+
+ ASSERT(ip->i_d.di_anextents == 0);
+ ASSERT(ip->i_afp == NULL);
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/*
* Remove an attribute from the shortform attribute list structure.
*/
int
@@ -272,12 +527,14 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
xfs_mount_t *mp;
xfs_inode_t *dp;
+ trace_xfs_attr_sf_remove(args);
+
dp = args->dp;
mp = dp->i_mount;
base = sizeof(xfs_attr_sf_hdr_t);
sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
sfe = &sf->list[0];
- end = INT_GET(sf->hdr.count, ARCH_CONVERT);
+ end = sf->hdr.count;
for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
base += size, i++) {
size = XFS_ATTR_SF_ENTSIZE(sfe);
@@ -285,11 +542,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
continue;
if (memcmp(sfe->nameval, args->name, args->namelen) != 0)
continue;
- if (((args->flags & ATTR_SECURE) != 0) !=
- ((sfe->flags & XFS_ATTR_SECURE) != 0))
- continue;
- if (((args->flags & ATTR_ROOT) != 0) !=
- ((sfe->flags & XFS_ATTR_ROOT) != 0))
+ if (!xfs_attr_namesp_match(args->flags, sfe->flags))
continue;
break;
}
@@ -300,41 +553,29 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
* Fix up the attribute fork data, covering the hole
*/
end = base + size;
- totsize = INT_GET(sf->hdr.totsize, ARCH_CONVERT);
+ totsize = be16_to_cpu(sf->hdr.totsize);
if (end != totsize)
memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end);
- INT_MOD(sf->hdr.count, ARCH_CONVERT, -1);
- INT_MOD(sf->hdr.totsize, ARCH_CONVERT, -size);
+ sf->hdr.count--;
+ be16_add_cpu(&sf->hdr.totsize, -size);
/*
* Fix up the start offset of the attribute fork
*/
totsize -= size;
- if (totsize == sizeof(xfs_attr_sf_hdr_t) && !args->addname &&
- (mp->m_flags & XFS_MOUNT_ATTR2)) {
- /*
- * Last attribute now removed, revert to original
- * inode format making all literal area available
- * to the data fork once more.
- */
- xfs_idestroy_fork(dp, XFS_ATTR_FORK);
- dp->i_d.di_forkoff = 0;
- dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
- ASSERT(dp->i_d.di_anextents == 0);
- ASSERT(dp->i_afp == NULL);
- dp->i_df.if_ext_max =
- XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
- xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+ if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
+ (mp->m_flags & XFS_MOUNT_ATTR2) &&
+ (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+ !(args->op_flags & XFS_DA_OP_ADDNAME)) {
+ xfs_attr_fork_reset(dp, args->trans);
} else {
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
ASSERT(dp->i_d.di_forkoff);
- ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || args->addname ||
- !(mp->m_flags & XFS_MOUNT_ATTR2));
- dp->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
- dp->i_df.if_ext_max =
- XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
+ ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
+ (args->op_flags & XFS_DA_OP_ADDNAME) ||
+ !(mp->m_flags & XFS_MOUNT_ATTR2) ||
+ dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
xfs_trans_log_inode(args->trans, dp,
XFS_ILOG_CORE | XFS_ILOG_ADATA);
}
@@ -356,21 +597,19 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
int i;
xfs_ifork_t *ifp;
+ trace_xfs_attr_sf_lookup(args);
+
ifp = args->dp->i_afp;
ASSERT(ifp->if_flags & XFS_IFINLINE);
sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
sfe = &sf->list[0];
- for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
+ for (i = 0; i < sf->hdr.count;
sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
if (sfe->namelen != args->namelen)
continue;
if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
continue;
- if (((args->flags & ATTR_SECURE) != 0) !=
- ((sfe->flags & XFS_ATTR_SECURE) != 0))
- continue;
- if (((args->flags & ATTR_ROOT) != 0) !=
- ((sfe->flags & XFS_ATTR_ROOT) != 0))
+ if (!xfs_attr_namesp_match(args->flags, sfe->flags))
continue;
return(XFS_ERROR(EEXIST));
}
@@ -388,30 +627,26 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
xfs_attr_sf_entry_t *sfe;
int i;
- ASSERT(args->dp->i_d.di_aformat == XFS_IFINLINE);
+ ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
sfe = &sf->list[0];
- for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
+ for (i = 0; i < sf->hdr.count;
sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
if (sfe->namelen != args->namelen)
continue;
if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
continue;
- if (((args->flags & ATTR_SECURE) != 0) !=
- ((sfe->flags & XFS_ATTR_SECURE) != 0))
- continue;
- if (((args->flags & ATTR_ROOT) != 0) !=
- ((sfe->flags & XFS_ATTR_ROOT) != 0))
+ if (!xfs_attr_namesp_match(args->flags, sfe->flags))
continue;
if (args->flags & ATTR_KERNOVAL) {
- args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
+ args->valuelen = sfe->valuelen;
return(XFS_ERROR(EEXIST));
}
- if (args->valuelen < INT_GET(sfe->valuelen, ARCH_CONVERT)) {
- args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
+ if (args->valuelen < sfe->valuelen) {
+ args->valuelen = sfe->valuelen;
return(XFS_ERROR(ERANGE));
}
- args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
+ args->valuelen = sfe->valuelen;
memcpy(args->value, &sfe->nameval[args->namelen],
args->valuelen);
return(XFS_ERROR(EEXIST));
@@ -432,19 +667,23 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
char *tmpbuffer;
int error, i, size;
xfs_dablk_t blkno;
- xfs_dabuf_t *bp;
+ struct xfs_buf *bp;
xfs_ifork_t *ifp;
+ trace_xfs_attr_sf_to_leaf(args);
+
dp = args->dp;
ifp = dp->i_afp;
sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
- size = INT_GET(sf->hdr.totsize, ARCH_CONVERT);
+ size = be16_to_cpu(sf->hdr.totsize);
tmpbuffer = kmem_alloc(size, KM_SLEEP);
ASSERT(tmpbuffer != NULL);
memcpy(tmpbuffer, ifp->if_u1.if_data, size);
sf = (xfs_attr_shortform_t *)tmpbuffer;
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+ xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
+
bp = NULL;
error = xfs_da_grow_inode(args, &blkno);
if (error) {
@@ -460,7 +699,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
}
ASSERT(blkno == 0);
- error = xfs_attr_leaf_create(args, blkno, &bp);
+ error = xfs_attr3_leaf_create(args, blkno, &bp);
if (error) {
error = xfs_da_shrink_inode(args, 0, bp);
bp = NULL;
@@ -473,26 +712,26 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
memset((char *)&nargs, 0, sizeof(nargs));
nargs.dp = dp;
+ nargs.geo = args->geo;
nargs.firstblock = args->firstblock;
nargs.flist = args->flist;
nargs.total = args->total;
nargs.whichfork = XFS_ATTR_FORK;
nargs.trans = args->trans;
- nargs.oknoent = 1;
+ nargs.op_flags = XFS_DA_OP_OKNOENT;
sfe = &sf->list[0];
- for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
- nargs.name = (char *)sfe->nameval;
+ for (i = 0; i < sf->hdr.count; i++) {
+ nargs.name = sfe->nameval;
nargs.namelen = sfe->namelen;
- nargs.value = (char *)&sfe->nameval[nargs.namelen];
- nargs.valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
- nargs.hashval = xfs_da_hashname((char *)sfe->nameval,
+ nargs.value = &sfe->nameval[nargs.namelen];
+ nargs.valuelen = sfe->valuelen;
+ nargs.hashval = xfs_da_hashname(sfe->nameval,
sfe->namelen);
- nargs.flags = (sfe->flags & XFS_ATTR_SECURE) ? ATTR_SECURE :
- ((sfe->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0);
- error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */
+ nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
+ error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
ASSERT(error == ENOATTR);
- error = xfs_attr_leaf_add(bp, &nargs);
+ error = xfs_attr3_leaf_add(bp, &nargs);
ASSERT(error != ENOSPC);
if (error)
goto out;
@@ -501,274 +740,85 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
error = 0;
out:
- if(bp)
- xfs_da_buf_done(bp);
- kmem_free(tmpbuffer, size);
+ kmem_free(tmpbuffer);
return(error);
}
-STATIC int
-xfs_attr_shortform_compare(const void *a, const void *b)
-{
- xfs_attr_sf_sort_t *sa, *sb;
-
- sa = (xfs_attr_sf_sort_t *)a;
- sb = (xfs_attr_sf_sort_t *)b;
- if (INT_GET(sa->hash, ARCH_CONVERT)
- < INT_GET(sb->hash, ARCH_CONVERT)) {
- return(-1);
- } else if (INT_GET(sa->hash, ARCH_CONVERT)
- > INT_GET(sb->hash, ARCH_CONVERT)) {
- return(1);
- } else {
- return(sa->entno - sb->entno);
- }
-}
-
-/*
- * Copy out entries of shortform attribute lists for attr_list().
- * Shortform atrtribute lists are not stored in hashval sorted order.
- * If the output buffer is not large enough to hold them all, then we
- * we have to calculate each entries' hashvalue and sort them before
- * we can begin returning them to the user.
- */
-/*ARGSUSED*/
-int
-xfs_attr_shortform_list(xfs_attr_list_context_t *context)
-{
- attrlist_cursor_kern_t *cursor;
- xfs_attr_sf_sort_t *sbuf, *sbp;
- xfs_attr_shortform_t *sf;
- xfs_attr_sf_entry_t *sfe;
- xfs_inode_t *dp;
- int sbsize, nsbuf, count, i;
-
- ASSERT(context != NULL);
- dp = context->dp;
- ASSERT(dp != NULL);
- ASSERT(dp->i_afp != NULL);
- sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
- ASSERT(sf != NULL);
- if (!sf->hdr.count)
- return(0);
- cursor = context->cursor;
- ASSERT(cursor != NULL);
-
- xfs_attr_trace_l_c("sf start", context);
-
- /*
- * If the buffer is large enough, do not bother with sorting.
- * Note the generous fudge factor of 16 overhead bytes per entry.
- */
- if ((dp->i_afp->if_bytes + INT_GET(sf->hdr.count, ARCH_CONVERT) * 16)
- < context->bufsize) {
- for (i = 0, sfe = &sf->list[0];
- i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
- attrnames_t *namesp;
-
- if (((context->flags & ATTR_SECURE) != 0) !=
- ((sfe->flags & XFS_ATTR_SECURE) != 0) &&
- !(context->flags & ATTR_KERNORMALS)) {
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
- continue;
- }
- if (((context->flags & ATTR_ROOT) != 0) !=
- ((sfe->flags & XFS_ATTR_ROOT) != 0) &&
- !(context->flags & ATTR_KERNROOTLS)) {
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
- continue;
- }
- namesp = (sfe->flags & XFS_ATTR_SECURE) ? &attr_secure:
- ((sfe->flags & XFS_ATTR_ROOT) ? &attr_trusted :
- &attr_user);
- if (context->flags & ATTR_KERNOVAL) {
- ASSERT(context->flags & ATTR_KERNAMELS);
- context->count += namesp->attr_namelen +
- INT_GET(sfe->namelen, ARCH_CONVERT) + 1;
- }
- else {
- if (xfs_attr_put_listent(context, namesp,
- (char *)sfe->nameval,
- (int)sfe->namelen,
- (int)INT_GET(sfe->valuelen,
- ARCH_CONVERT)))
- break;
- }
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
- }
- xfs_attr_trace_l_c("sf big-gulp", context);
- return(0);
- }
-
- /*
- * It didn't all fit, so we have to sort everything on hashval.
- */
- sbsize = INT_GET(sf->hdr.count, ARCH_CONVERT) * sizeof(*sbuf);
- sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP);
-
- /*
- * Scan the attribute list for the rest of the entries, storing
- * the relevant info from only those that match into a buffer.
- */
- nsbuf = 0;
- for (i = 0, sfe = &sf->list[0];
- i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
- if (unlikely(
- ((char *)sfe < (char *)sf) ||
- ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
- XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
- XFS_ERRLEVEL_LOW,
- context->dp->i_mount, sfe);
- xfs_attr_trace_l_c("sf corrupted", context);
- kmem_free(sbuf, sbsize);
- return XFS_ERROR(EFSCORRUPTED);
- }
- if (((context->flags & ATTR_SECURE) != 0) !=
- ((sfe->flags & XFS_ATTR_SECURE) != 0) &&
- !(context->flags & ATTR_KERNORMALS)) {
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
- continue;
- }
- if (((context->flags & ATTR_ROOT) != 0) !=
- ((sfe->flags & XFS_ATTR_ROOT) != 0) &&
- !(context->flags & ATTR_KERNROOTLS)) {
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
- continue;
- }
- sbp->entno = i;
- INT_SET(sbp->hash, ARCH_CONVERT,
- xfs_da_hashname((char *)sfe->nameval, sfe->namelen));
- sbp->name = (char *)sfe->nameval;
- sbp->namelen = sfe->namelen;
- /* These are bytes, and both on-disk, don't endian-flip */
- sbp->valuelen = sfe->valuelen;
- sbp->flags = sfe->flags;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
- sbp++;
- nsbuf++;
- }
-
- /*
- * Sort the entries on hash then entno.
- */
- xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
-
- /*
- * Re-find our place IN THE SORTED LIST.
- */
- count = 0;
- cursor->initted = 1;
- cursor->blkno = 0;
- for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
- if (INT_GET(sbp->hash, ARCH_CONVERT) == cursor->hashval) {
- if (cursor->offset == count) {
- break;
- }
- count++;
- } else if (INT_GET(sbp->hash, ARCH_CONVERT) > cursor->hashval) {
- break;
- }
- }
- if (i == nsbuf) {
- kmem_free(sbuf, sbsize);
- xfs_attr_trace_l_c("blk end", context);
- return(0);
- }
-
- /*
- * Loop putting entries into the user buffer.
- */
- for ( ; i < nsbuf; i++, sbp++) {
- attrnames_t *namesp;
-
- namesp = (sbp->flags & XFS_ATTR_SECURE) ? &attr_secure :
- ((sbp->flags & XFS_ATTR_ROOT) ? &attr_trusted :
- &attr_user);
-
- if (cursor->hashval != INT_GET(sbp->hash, ARCH_CONVERT)) {
- cursor->hashval = INT_GET(sbp->hash, ARCH_CONVERT);
- cursor->offset = 0;
- }
- if (context->flags & ATTR_KERNOVAL) {
- ASSERT(context->flags & ATTR_KERNAMELS);
- context->count += namesp->attr_namelen +
- sbp->namelen + 1;
- } else {
- if (xfs_attr_put_listent(context, namesp,
- sbp->name, sbp->namelen,
- INT_GET(sbp->valuelen, ARCH_CONVERT)))
- break;
- }
- cursor->offset++;
- }
-
- kmem_free(sbuf, sbsize);
- xfs_attr_trace_l_c("sf E-O-F", context);
- return(0);
-}
-
/*
* Check a leaf attribute block to see if all the entries would fit into
* a shortform attribute list.
*/
int
-xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp)
+xfs_attr_shortform_allfit(
+ struct xfs_buf *bp,
+ struct xfs_inode *dp)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
xfs_attr_leaf_name_local_t *name_loc;
- int bytes, i;
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ int bytes;
+ int i;
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+ entry = xfs_attr3_leaf_entryp(leaf);
- entry = &leaf->entries[0];
bytes = sizeof(struct xfs_attr_sf_hdr);
- for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
+ for (i = 0; i < leafhdr.count; entry++, i++) {
if (entry->flags & XFS_ATTR_INCOMPLETE)
continue; /* don't copy partial entries */
if (!(entry->flags & XFS_ATTR_LOCAL))
return(0);
- name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
+ name_loc = xfs_attr3_leaf_name_local(leaf, i);
if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
return(0);
- if (INT_GET(name_loc->valuelen, ARCH_CONVERT) >= XFS_ATTR_SF_ENTSIZE_MAX)
+ if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
return(0);
- bytes += sizeof(struct xfs_attr_sf_entry)-1
+ bytes += sizeof(struct xfs_attr_sf_entry) - 1
+ name_loc->namelen
- + INT_GET(name_loc->valuelen, ARCH_CONVERT);
+ + be16_to_cpu(name_loc->valuelen);
}
if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
+ (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
(bytes == sizeof(struct xfs_attr_sf_hdr)))
- return(-1);
- return(xfs_attr_shortform_bytesfit(dp, bytes));
+ return -1;
+ return xfs_attr_shortform_bytesfit(dp, bytes);
}
/*
* Convert a leaf attribute list to shortform attribute list
*/
int
-xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
+xfs_attr3_leaf_to_shortform(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args,
+ int forkoff)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_local_t *name_loc;
- xfs_da_args_t nargs;
- xfs_inode_t *dp;
- char *tmpbuffer;
- int error, i;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_da_args nargs;
+ struct xfs_inode *dp = args->dp;
+ char *tmpbuffer;
+ int error;
+ int i;
- dp = args->dp;
- tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
- ASSERT(tmpbuffer != NULL);
+ trace_xfs_attr_leaf_to_sf(args);
+
+ tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+ if (!tmpbuffer)
+ return ENOMEM;
+
+ memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
- ASSERT(bp != NULL);
- memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount));
leaf = (xfs_attr_leafblock_t *)tmpbuffer;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
- memset(bp->data, 0, XFS_LBSIZE(dp->i_mount));
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ entry = xfs_attr3_leaf_entryp(leaf);
+
+ /* XXX (dgc): buffer is about to be marked stale - why zero it? */
+ memset(bp->b_addr, 0, args->geo->blksize);
/*
* Clean out the prior contents of the attribute list.
@@ -779,20 +829,8 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
if (forkoff == -1) {
ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
-
- /*
- * Last attribute was removed, revert to original
- * inode format making all literal area available
- * to the data fork once more.
- */
- xfs_idestroy_fork(dp, XFS_ATTR_FORK);
- dp->i_d.di_forkoff = 0;
- dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
- ASSERT(dp->i_d.di_anextents == 0);
- ASSERT(dp->i_afp == NULL);
- dp->i_df.if_ext_max =
- XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
- xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+ ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
+ xfs_attr_fork_reset(dp, args->trans);
goto out;
}
@@ -802,97 +840,105 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
* Copy the attributes
*/
memset((char *)&nargs, 0, sizeof(nargs));
+ nargs.geo = args->geo;
nargs.dp = dp;
nargs.firstblock = args->firstblock;
nargs.flist = args->flist;
nargs.total = args->total;
nargs.whichfork = XFS_ATTR_FORK;
nargs.trans = args->trans;
- nargs.oknoent = 1;
- entry = &leaf->entries[0];
- for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
+ nargs.op_flags = XFS_DA_OP_OKNOENT;
+
+ for (i = 0; i < ichdr.count; entry++, i++) {
if (entry->flags & XFS_ATTR_INCOMPLETE)
continue; /* don't copy partial entries */
if (!entry->nameidx)
continue;
ASSERT(entry->flags & XFS_ATTR_LOCAL);
- name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
- nargs.name = (char *)name_loc->nameval;
+ name_loc = xfs_attr3_leaf_name_local(leaf, i);
+ nargs.name = name_loc->nameval;
nargs.namelen = name_loc->namelen;
- nargs.value = (char *)&name_loc->nameval[nargs.namelen];
- nargs.valuelen = INT_GET(name_loc->valuelen, ARCH_CONVERT);
- nargs.hashval = INT_GET(entry->hashval, ARCH_CONVERT);
- nargs.flags = (entry->flags & XFS_ATTR_SECURE) ? ATTR_SECURE :
- ((entry->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0);
+ nargs.value = &name_loc->nameval[nargs.namelen];
+ nargs.valuelen = be16_to_cpu(name_loc->valuelen);
+ nargs.hashval = be32_to_cpu(entry->hashval);
+ nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
xfs_attr_shortform_add(&nargs, forkoff);
}
error = 0;
out:
- kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount));
- return(error);
+ kmem_free(tmpbuffer);
+ return error;
}
/*
* Convert from using a single leaf to a root node and a leaf.
*/
int
-xfs_attr_leaf_to_node(xfs_da_args_t *args)
+xfs_attr3_leaf_to_node(
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf;
- xfs_da_intnode_t *node;
- xfs_inode_t *dp;
- xfs_dabuf_t *bp1, *bp2;
- xfs_dablk_t blkno;
- int error;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr icleafhdr;
+ struct xfs_attr_leaf_entry *entries;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr icnodehdr;
+ struct xfs_da_intnode *node;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp1 = NULL;
+ struct xfs_buf *bp2 = NULL;
+ xfs_dablk_t blkno;
+ int error;
+
+ trace_xfs_attr_leaf_to_node(args);
- dp = args->dp;
- bp1 = bp2 = NULL;
error = xfs_da_grow_inode(args, &blkno);
if (error)
goto out;
- error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
- XFS_ATTR_FORK);
+ error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1);
if (error)
goto out;
- ASSERT(bp1 != NULL);
- bp2 = NULL;
- error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
- XFS_ATTR_FORK);
+
+ error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK);
if (error)
goto out;
- ASSERT(bp2 != NULL);
- memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount));
- xfs_da_buf_done(bp1);
- bp1 = NULL;
- xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
+
+ /* copy leaf to new buffer, update identifiers */
+ xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
+ bp2->b_ops = bp1->b_ops;
+ memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
+ hdr3->blkno = cpu_to_be64(bp2->b_bn);
+ }
+ xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1);
/*
* Set up the new root node.
*/
- error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
+ error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
if (error)
goto out;
- node = bp1->data;
- leaf = bp2->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
+ node = bp1->b_addr;
+ dp->d_ops->node_hdr_from_disk(&icnodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
+
+ leaf = bp2->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+
/* both on-disk, don't endian-flip twice */
- node->btree[0].hashval =
- leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval;
- INT_SET(node->btree[0].before, ARCH_CONVERT, blkno);
- INT_SET(node->hdr.count, ARCH_CONVERT, 1);
- xfs_da_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1);
+ btree[0].hashval = entries[icleafhdr.count - 1].hashval;
+ btree[0].before = cpu_to_be32(blkno);
+ icnodehdr.count = 1;
+ dp->d_ops->node_hdr_to_disk(node, &icnodehdr);
+ xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1);
error = 0;
out:
- if (bp1)
- xfs_da_buf_done(bp1);
- if (bp2)
- xfs_da_buf_done(bp2);
- return(error);
+ return error;
}
-
/*========================================================================
* Routines used for growing the Btree.
*========================================================================*/
@@ -902,54 +948,69 @@ out:
* or a leaf in a node attribute list.
*/
STATIC int
-xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
+xfs_attr3_leaf_create(
+ struct xfs_da_args *args,
+ xfs_dablk_t blkno,
+ struct xfs_buf **bpp)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_hdr_t *hdr;
- xfs_inode_t *dp;
- xfs_dabuf_t *bp;
- int error;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ int error;
+
+ trace_xfs_attr_leaf_create(args);
- dp = args->dp;
- ASSERT(dp != NULL);
error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
XFS_ATTR_FORK);
if (error)
- return(error);
- ASSERT(bp != NULL);
- leaf = bp->data;
- memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
- hdr = &leaf->hdr;
- INT_SET(hdr->info.magic, ARCH_CONVERT, XFS_ATTR_LEAF_MAGIC);
- INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount));
- if (!hdr->firstused) {
- INT_SET(hdr->firstused, ARCH_CONVERT,
- XFS_LBSIZE(dp->i_mount) - XFS_ATTR_LEAF_NAME_ALIGN);
- }
-
- INT_SET(hdr->freemap[0].base, ARCH_CONVERT,
- sizeof(xfs_attr_leaf_hdr_t));
- INT_SET(hdr->freemap[0].size, ARCH_CONVERT,
- INT_GET(hdr->firstused, ARCH_CONVERT)
- - INT_GET(hdr->freemap[0].base,
- ARCH_CONVERT));
-
- xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
+ return error;
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
+ xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF);
+ leaf = bp->b_addr;
+ memset(leaf, 0, args->geo->blksize);
+
+ memset(&ichdr, 0, sizeof(ichdr));
+ ichdr.firstused = args->geo->blksize;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
+
+ ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
+
+ hdr3->blkno = cpu_to_be64(bp->b_bn);
+ hdr3->owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+
+ ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
+ } else {
+ ichdr.magic = XFS_ATTR_LEAF_MAGIC;
+ ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr);
+ }
+ ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
+
+ xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+ xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
*bpp = bp;
- return(0);
+ return 0;
}
/*
* Split the leaf node, rebalance, then add the new entry.
*/
int
-xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
- xfs_da_state_blk_t *newblk)
+xfs_attr3_leaf_split(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk)
{
xfs_dablk_t blkno;
int error;
+ trace_xfs_attr_leaf_split(state->args);
+
/*
* Allocate space for a new leaf node.
*/
@@ -957,7 +1018,7 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
error = xfs_da_grow_inode(state->args, &blkno);
if (error)
return(error);
- error = xfs_attr_leaf_create(state->args, blkno, &newblk->bp);
+ error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp);
if (error)
return(error);
newblk->blkno = blkno;
@@ -967,8 +1028,8 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* Rebalance the entries across the two leaves.
* NOTE: rebalance() currently depends on the 2nd block being empty.
*/
- xfs_attr_leaf_rebalance(state, oldblk, newblk);
- error = xfs_da_blk_link(state, oldblk, newblk);
+ xfs_attr3_leaf_rebalance(state, oldblk, newblk);
+ error = xfs_da3_blk_link(state, oldblk, newblk);
if (error)
return(error);
@@ -979,10 +1040,13 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
*
* Insert the "new" entry in the correct block.
*/
- if (state->inleaf)
- error = xfs_attr_leaf_add(oldblk->bp, state->args);
- else
- error = xfs_attr_leaf_add(newblk->bp, state->args);
+ if (state->inleaf) {
+ trace_xfs_attr_leaf_add_old(state->args);
+ error = xfs_attr3_leaf_add(oldblk->bp, state->args);
+ } else {
+ trace_xfs_attr_leaf_add_new(state->args);
+ error = xfs_attr3_leaf_add(newblk->bp, state->args);
+ }
/*
* Update last hashval in each block since we added the name.
@@ -996,46 +1060,46 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* Add a name to the leaf attribute list structure.
*/
int
-xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args)
+xfs_attr3_leaf_add(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_hdr_t *hdr;
- xfs_attr_leaf_map_t *map;
- int tablesize, entsize, sum, tmp, i;
-
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
- ASSERT((args->index >= 0)
- && (args->index <= INT_GET(leaf->hdr.count, ARCH_CONVERT)));
- hdr = &leaf->hdr;
- entsize = xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
- args->trans->t_mountp->m_sb.sb_blocksize, NULL);
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ int tablesize;
+ int entsize;
+ int sum;
+ int tmp;
+ int i;
+
+ trace_xfs_attr_leaf_add(args);
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ ASSERT(args->index >= 0 && args->index <= ichdr.count);
+ entsize = xfs_attr_leaf_newentsize(args, NULL);
/*
* Search through freemap for first-fit on new name length.
* (may need to figure in size of entry struct too)
*/
- tablesize = (INT_GET(hdr->count, ARCH_CONVERT) + 1)
- * sizeof(xfs_attr_leaf_entry_t)
- + sizeof(xfs_attr_leaf_hdr_t);
- map = &hdr->freemap[XFS_ATTR_LEAF_MAPSIZE-1];
- for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
- if (tablesize > INT_GET(hdr->firstused, ARCH_CONVERT)) {
- sum += INT_GET(map->size, ARCH_CONVERT);
+ tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf);
+ for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) {
+ if (tablesize > ichdr.firstused) {
+ sum += ichdr.freemap[i].size;
continue;
}
- if (!map->size)
+ if (!ichdr.freemap[i].size)
continue; /* no space in this map */
tmp = entsize;
- if (INT_GET(map->base, ARCH_CONVERT)
- < INT_GET(hdr->firstused, ARCH_CONVERT))
+ if (ichdr.freemap[i].base < ichdr.firstused)
tmp += sizeof(xfs_attr_leaf_entry_t);
- if (INT_GET(map->size, ARCH_CONVERT) >= tmp) {
- tmp = xfs_attr_leaf_add_work(bp, args, i);
- return(tmp);
+ if (ichdr.freemap[i].size >= tmp) {
+ tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i);
+ goto out_log_hdr;
}
- sum += INT_GET(map->size, ARCH_CONVERT);
+ sum += ichdr.freemap[i].size;
}
/*
@@ -1043,103 +1107,104 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args)
* and we don't have enough freespace, then compaction will do us
* no good and we should just give up.
*/
- if (!hdr->holes && (sum < entsize))
- return(XFS_ERROR(ENOSPC));
+ if (!ichdr.holes && sum < entsize)
+ return XFS_ERROR(ENOSPC);
/*
* Compact the entries to coalesce free space.
* This may change the hdr->count via dropping INCOMPLETE entries.
*/
- xfs_attr_leaf_compact(args->trans, bp);
+ xfs_attr3_leaf_compact(args, &ichdr, bp);
/*
* After compaction, the block is guaranteed to have only one
* free region, in freemap[0]. If it is not big enough, give up.
*/
- if (INT_GET(hdr->freemap[0].size, ARCH_CONVERT)
- < (entsize + sizeof(xfs_attr_leaf_entry_t)))
- return(XFS_ERROR(ENOSPC));
+ if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) {
+ tmp = ENOSPC;
+ goto out_log_hdr;
+ }
- return(xfs_attr_leaf_add_work(bp, args, 0));
+ tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
+
+out_log_hdr:
+ xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+ xfs_attr3_leaf_hdr_size(leaf)));
+ return tmp;
}
/*
* Add a name to a leaf attribute list structure.
*/
STATIC int
-xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
+xfs_attr3_leaf_add_work(
+ struct xfs_buf *bp,
+ struct xfs_attr3_icleaf_hdr *ichdr,
+ struct xfs_da_args *args,
+ int mapindex)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_hdr_t *hdr;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_local_t *name_loc;
- xfs_attr_leaf_name_remote_t *name_rmt;
- xfs_attr_leaf_map_t *map;
- xfs_mount_t *mp;
- int tmp, i;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_mount *mp;
+ int tmp;
+ int i;
+
+ trace_xfs_attr_leaf_add_work(args);
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
- hdr = &leaf->hdr;
- ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE));
- ASSERT((args->index >= 0)
- && (args->index <= INT_GET(hdr->count, ARCH_CONVERT)));
+ leaf = bp->b_addr;
+ ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE);
+ ASSERT(args->index >= 0 && args->index <= ichdr->count);
/*
* Force open some space in the entry array and fill it in.
*/
- entry = &leaf->entries[args->index];
- if (args->index < INT_GET(hdr->count, ARCH_CONVERT)) {
- tmp = INT_GET(hdr->count, ARCH_CONVERT) - args->index;
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+ if (args->index < ichdr->count) {
+ tmp = ichdr->count - args->index;
tmp *= sizeof(xfs_attr_leaf_entry_t);
- memmove((char *)(entry+1), (char *)entry, tmp);
- xfs_da_log_buf(args->trans, bp,
+ memmove(entry + 1, entry, tmp);
+ xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
}
- INT_MOD(hdr->count, ARCH_CONVERT, 1);
+ ichdr->count++;
/*
* Allocate space for the new string (at the end of the run).
*/
- map = &hdr->freemap[mapindex];
mp = args->trans->t_mountp;
- ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
- ASSERT((INT_GET(map->base, ARCH_CONVERT) & 0x3) == 0);
- ASSERT(INT_GET(map->size, ARCH_CONVERT) >=
- xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
- mp->m_sb.sb_blocksize, NULL));
- ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
- ASSERT((INT_GET(map->size, ARCH_CONVERT) & 0x3) == 0);
- INT_MOD(map->size, ARCH_CONVERT,
- -xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
- mp->m_sb.sb_blocksize, &tmp));
- INT_SET(entry->nameidx, ARCH_CONVERT,
- INT_GET(map->base, ARCH_CONVERT)
- + INT_GET(map->size, ARCH_CONVERT));
- INT_SET(entry->hashval, ARCH_CONVERT, args->hashval);
+ ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize);
+ ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0);
+ ASSERT(ichdr->freemap[mapindex].size >=
+ xfs_attr_leaf_newentsize(args, NULL));
+ ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize);
+ ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0);
+
+ ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp);
+
+ entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
+ ichdr->freemap[mapindex].size);
+ entry->hashval = cpu_to_be32(args->hashval);
entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
- entry->flags |= (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE :
- ((args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0);
- if (args->rename) {
+ entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
+ if (args->op_flags & XFS_DA_OP_RENAME) {
entry->flags |= XFS_ATTR_INCOMPLETE;
if ((args->blkno2 == args->blkno) &&
(args->index2 <= args->index)) {
args->index2++;
}
}
- xfs_da_log_buf(args->trans, bp,
+ xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
- ASSERT((args->index == 0) || (INT_GET(entry->hashval, ARCH_CONVERT)
- >= INT_GET((entry-1)->hashval,
- ARCH_CONVERT)));
- ASSERT((args->index == INT_GET(hdr->count, ARCH_CONVERT)-1) ||
- (INT_GET(entry->hashval, ARCH_CONVERT)
- <= (INT_GET((entry+1)->hashval, ARCH_CONVERT))));
+ ASSERT((args->index == 0) ||
+ (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval)));
+ ASSERT((args->index == ichdr->count - 1) ||
+ (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));
/*
- * Copy the attribute name and value into the new space.
- *
* For "remote" attribute values, simply note that we need to
* allocate space for the "remote" value. We can't actually
* allocate the extents in this transaction, and we can't decide
@@ -1147,14 +1212,14 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
* as part of this transaction (a split operation for example).
*/
if (entry->flags & XFS_ATTR_LOCAL) {
- name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+ name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
name_loc->namelen = args->namelen;
- INT_SET(name_loc->valuelen, ARCH_CONVERT, args->valuelen);
+ name_loc->valuelen = cpu_to_be16(args->valuelen);
memcpy((char *)name_loc->nameval, args->name, args->namelen);
memcpy((char *)&name_loc->nameval[args->namelen], args->value,
- INT_GET(name_loc->valuelen, ARCH_CONVERT));
+ be16_to_cpu(name_loc->valuelen));
} else {
- name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
name_rmt->namelen = args->namelen;
memcpy((char *)name_rmt->name, args->name, args->namelen);
entry->flags |= XFS_ATTR_INCOMPLETE;
@@ -1162,92 +1227,129 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
name_rmt->valuelen = 0;
name_rmt->valueblk = 0;
args->rmtblkno = 1;
- args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen);
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+ args->rmtvaluelen = args->valuelen;
}
- xfs_da_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index),
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
xfs_attr_leaf_entsize(leaf, args->index)));
/*
* Update the control info for this leaf node
*/
- if (INT_GET(entry->nameidx, ARCH_CONVERT)
- < INT_GET(hdr->firstused, ARCH_CONVERT)) {
- /* both on-disk, don't endian-flip twice */
- hdr->firstused = entry->nameidx;
- }
- ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT)
- >= ((INT_GET(hdr->count, ARCH_CONVERT)
- * sizeof(*entry))+sizeof(*hdr)));
- tmp = (INT_GET(hdr->count, ARCH_CONVERT)-1)
- * sizeof(xfs_attr_leaf_entry_t)
- + sizeof(xfs_attr_leaf_hdr_t);
- map = &hdr->freemap[0];
- for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
- if (INT_GET(map->base, ARCH_CONVERT) == tmp) {
- INT_MOD(map->base, ARCH_CONVERT,
- sizeof(xfs_attr_leaf_entry_t));
- INT_MOD(map->size, ARCH_CONVERT,
- -sizeof(xfs_attr_leaf_entry_t));
+ if (be16_to_cpu(entry->nameidx) < ichdr->firstused)
+ ichdr->firstused = be16_to_cpu(entry->nameidx);
+
+ ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf));
+ tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf);
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ if (ichdr->freemap[i].base == tmp) {
+ ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t);
+ ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t);
}
}
- INT_MOD(hdr->usedbytes, ARCH_CONVERT,
- xfs_attr_leaf_entsize(leaf, args->index));
- xfs_da_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
- return(0);
+ ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index);
+ return 0;
}
/*
* Garbage collect a leaf attribute list block by copying it to a new buffer.
*/
STATIC void
-xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp)
+xfs_attr3_leaf_compact(
+ struct xfs_da_args *args,
+ struct xfs_attr3_icleaf_hdr *ichdr_dst,
+ struct xfs_buf *bp)
{
- xfs_attr_leafblock_t *leaf_s, *leaf_d;
- xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
- xfs_mount_t *mp;
- char *tmpbuffer;
+ struct xfs_attr_leafblock *leaf_src;
+ struct xfs_attr_leafblock *leaf_dst;
+ struct xfs_attr3_icleaf_hdr ichdr_src;
+ struct xfs_trans *trans = args->trans;
+ char *tmpbuffer;
- mp = trans->t_mountp;
- tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
- ASSERT(tmpbuffer != NULL);
- memcpy(tmpbuffer, bp->data, XFS_LBSIZE(mp));
- memset(bp->data, 0, XFS_LBSIZE(mp));
+ trace_xfs_attr_leaf_compact(args);
+
+ tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+ memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
+ memset(bp->b_addr, 0, args->geo->blksize);
+ leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
+ leaf_dst = bp->b_addr;
/*
- * Copy basic information
+ * Copy the on-disk header back into the destination buffer to ensure
+ * all the information in the header that is not part of the incore
+ * header structure is preserved.
*/
- leaf_s = (xfs_attr_leafblock_t *)tmpbuffer;
- leaf_d = bp->data;
- hdr_s = &leaf_s->hdr;
- hdr_d = &leaf_d->hdr;
- hdr_d->info = hdr_s->info; /* struct copy */
- INT_SET(hdr_d->firstused, ARCH_CONVERT, XFS_LBSIZE(mp));
- /* handle truncation gracefully */
- if (!hdr_d->firstused) {
- INT_SET(hdr_d->firstused, ARCH_CONVERT,
- XFS_LBSIZE(mp) - XFS_ATTR_LEAF_NAME_ALIGN);
- }
- hdr_d->usedbytes = 0;
- hdr_d->count = 0;
- hdr_d->holes = 0;
- INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT,
- sizeof(xfs_attr_leaf_hdr_t));
- INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT,
- INT_GET(hdr_d->firstused, ARCH_CONVERT)
- - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
+ memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src));
+
+ /* Initialise the incore headers */
+ ichdr_src = *ichdr_dst; /* struct copy */
+ ichdr_dst->firstused = args->geo->blksize;
+ ichdr_dst->usedbytes = 0;
+ ichdr_dst->count = 0;
+ ichdr_dst->holes = 0;
+ ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src);
+ ichdr_dst->freemap[0].size = ichdr_dst->firstused -
+ ichdr_dst->freemap[0].base;
+
+ /* write the header back to initialise the underlying buffer */
+ xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
/*
* Copy all entry's in the same (sorted) order,
* but allocate name/value pairs packed and in sequence.
*/
- xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0,
- (int)INT_GET(hdr_s->count, ARCH_CONVERT), mp);
+ xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0,
+ leaf_dst, ichdr_dst, 0, ichdr_src.count);
+ /*
+ * this logs the entire buffer, but the caller must write the header
+ * back to the buffer when it is finished modifying it.
+ */
+ xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
- xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1);
+ kmem_free(tmpbuffer);
+}
- kmem_free(tmpbuffer, XFS_LBSIZE(mp));
+/*
+ * Compare two leaf blocks "order".
+ * Return 0 unless leaf2 should go before leaf1.
+ */
+static int
+xfs_attr3_leaf_order(
+ struct xfs_buf *leaf1_bp,
+ struct xfs_attr3_icleaf_hdr *leaf1hdr,
+ struct xfs_buf *leaf2_bp,
+ struct xfs_attr3_icleaf_hdr *leaf2hdr)
+{
+ struct xfs_attr_leaf_entry *entries1;
+ struct xfs_attr_leaf_entry *entries2;
+
+ entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr);
+ entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr);
+ if (leaf1hdr->count > 0 && leaf2hdr->count > 0 &&
+ ((be32_to_cpu(entries2[0].hashval) <
+ be32_to_cpu(entries1[0].hashval)) ||
+ (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) <
+ be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) {
+ return 1;
+ }
+ return 0;
+}
+
+int
+xfs_attr_leaf_order(
+ struct xfs_buf *leaf1_bp,
+ struct xfs_buf *leaf2_bp)
+{
+ struct xfs_attr3_icleaf_hdr ichdr1;
+ struct xfs_attr3_icleaf_hdr ichdr2;
+
+ xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr);
+ xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr);
+ return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2);
}
/*
@@ -1263,28 +1365,38 @@ xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp)
* the "new" and "old" values can end up in different blocks.
*/
STATIC void
-xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
- xfs_da_state_blk_t *blk2)
+xfs_attr3_leaf_rebalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_da_state_blk *blk2)
{
- xfs_da_args_t *args;
- xfs_da_state_blk_t *tmp_blk;
- xfs_attr_leafblock_t *leaf1, *leaf2;
- xfs_attr_leaf_hdr_t *hdr1, *hdr2;
- int count, totallen, max, space, swap;
+ struct xfs_da_args *args;
+ struct xfs_attr_leafblock *leaf1;
+ struct xfs_attr_leafblock *leaf2;
+ struct xfs_attr3_icleaf_hdr ichdr1;
+ struct xfs_attr3_icleaf_hdr ichdr2;
+ struct xfs_attr_leaf_entry *entries1;
+ struct xfs_attr_leaf_entry *entries2;
+ int count;
+ int totallen;
+ int max;
+ int space;
+ int swap;
/*
* Set up environment.
*/
ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC);
ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
- leaf1 = blk1->bp->data;
- leaf2 = blk2->bp->data;
- ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
- ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
+ leaf1 = blk1->bp->b_addr;
+ leaf2 = blk2->bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
+ xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+ ASSERT(ichdr2.count == 0);
args = state->args;
+ trace_xfs_attr_leaf_rebalance(args);
+
/*
* Check ordering of blocks, reverse if it makes things simpler.
*
@@ -1292,16 +1404,23 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* second block, this code should never set "swap".
*/
swap = 0;
- if (xfs_attr_leaf_order(blk1->bp, blk2->bp)) {
+ if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) {
+ struct xfs_da_state_blk *tmp_blk;
+ struct xfs_attr3_icleaf_hdr tmp_ichdr;
+
tmp_blk = blk1;
blk1 = blk2;
blk2 = tmp_blk;
- leaf1 = blk1->bp->data;
- leaf2 = blk2->bp->data;
+
+ /* struct copies to swap them rather than reconverting */
+ tmp_ichdr = ichdr1;
+ ichdr1 = ichdr2;
+ ichdr2 = tmp_ichdr;
+
+ leaf1 = blk1->bp->b_addr;
+ leaf2 = blk2->bp->b_addr;
swap = 1;
}
- hdr1 = &leaf1->hdr;
- hdr2 = &leaf2->hdr;
/*
* Examine entries until we reduce the absolute difference in
@@ -1311,88 +1430,80 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* "inleaf" is true if the new entry should be inserted into blk1.
* If "swap" is also true, then reverse the sense of "inleaf".
*/
- state->inleaf = xfs_attr_leaf_figure_balance(state, blk1, blk2,
- &count, &totallen);
+ state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1,
+ blk2, &ichdr2,
+ &count, &totallen);
if (swap)
state->inleaf = !state->inleaf;
/*
* Move any entries required from leaf to leaf:
*/
- if (count < INT_GET(hdr1->count, ARCH_CONVERT)) {
+ if (count < ichdr1.count) {
/*
* Figure the total bytes to be added to the destination leaf.
*/
/* number entries being moved */
- count = INT_GET(hdr1->count, ARCH_CONVERT) - count;
- space = INT_GET(hdr1->usedbytes, ARCH_CONVERT) - totallen;
+ count = ichdr1.count - count;
+ space = ichdr1.usedbytes - totallen;
space += count * sizeof(xfs_attr_leaf_entry_t);
/*
* leaf2 is the destination, compact it if it looks tight.
*/
- max = INT_GET(hdr2->firstused, ARCH_CONVERT)
- - sizeof(xfs_attr_leaf_hdr_t);
- max -= INT_GET(hdr2->count, ARCH_CONVERT)
- * sizeof(xfs_attr_leaf_entry_t);
- if (space > max) {
- xfs_attr_leaf_compact(args->trans, blk2->bp);
- }
+ max = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+ max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t);
+ if (space > max)
+ xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp);
/*
* Move high entries from leaf1 to low end of leaf2.
*/
- xfs_attr_leaf_moveents(leaf1,
- INT_GET(hdr1->count, ARCH_CONVERT)-count,
- leaf2, 0, count, state->mp);
+ xfs_attr3_leaf_moveents(args, leaf1, &ichdr1,
+ ichdr1.count - count, leaf2, &ichdr2, 0, count);
- xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
- xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
- } else if (count > INT_GET(hdr1->count, ARCH_CONVERT)) {
+ } else if (count > ichdr1.count) {
/*
* I assert that since all callers pass in an empty
* second buffer, this code should never execute.
*/
+ ASSERT(0);
/*
* Figure the total bytes to be added to the destination leaf.
*/
/* number entries being moved */
- count -= INT_GET(hdr1->count, ARCH_CONVERT);
- space = totallen - INT_GET(hdr1->usedbytes, ARCH_CONVERT);
+ count -= ichdr1.count;
+ space = totallen - ichdr1.usedbytes;
space += count * sizeof(xfs_attr_leaf_entry_t);
/*
* leaf1 is the destination, compact it if it looks tight.
*/
- max = INT_GET(hdr1->firstused, ARCH_CONVERT)
- - sizeof(xfs_attr_leaf_hdr_t);
- max -= INT_GET(hdr1->count, ARCH_CONVERT)
- * sizeof(xfs_attr_leaf_entry_t);
- if (space > max) {
- xfs_attr_leaf_compact(args->trans, blk1->bp);
- }
+ max = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+ max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t);
+ if (space > max)
+ xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp);
/*
* Move low entries from leaf2 to high end of leaf1.
*/
- xfs_attr_leaf_moveents(leaf2, 0, leaf1,
- (int)INT_GET(hdr1->count, ARCH_CONVERT), count,
- state->mp);
-
- xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
- xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
+ xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1,
+ ichdr1.count, count);
}
+ xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1);
+ xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2);
+ xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
+ xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);
+
/*
* Copy out last hashval in each block for B-tree code.
*/
- blk1->hashval =
- INT_GET(leaf1->entries[INT_GET(leaf1->hdr.count,
- ARCH_CONVERT)-1].hashval, ARCH_CONVERT);
- blk2->hashval =
- INT_GET(leaf2->entries[INT_GET(leaf2->hdr.count,
- ARCH_CONVERT)-1].hashval, ARCH_CONVERT);
+ entries1 = xfs_attr3_leaf_entryp(leaf1);
+ entries2 = xfs_attr3_leaf_entryp(leaf2);
+ blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval);
+ blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval);
/*
* Adjust the expected index for insertion.
@@ -1406,23 +1517,35 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* inserting. The index/blkno fields refer to the "old" entry,
* while the index2/blkno2 fields refer to the "new" entry.
*/
- if (blk1->index > INT_GET(leaf1->hdr.count, ARCH_CONVERT)) {
+ if (blk1->index > ichdr1.count) {
ASSERT(state->inleaf == 0);
- blk2->index = blk1->index
- - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
+ blk2->index = blk1->index - ichdr1.count;
args->index = args->index2 = blk2->index;
args->blkno = args->blkno2 = blk2->blkno;
- } else if (blk1->index == INT_GET(leaf1->hdr.count, ARCH_CONVERT)) {
+ } else if (blk1->index == ichdr1.count) {
if (state->inleaf) {
args->index = blk1->index;
args->blkno = blk1->blkno;
args->index2 = 0;
args->blkno2 = blk2->blkno;
} else {
- blk2->index = blk1->index
- - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
- args->index = args->index2 = blk2->index;
- args->blkno = args->blkno2 = blk2->blkno;
+ /*
+ * On a double leaf split, the original attr location
+ * is already stored in blkno2/index2, so don't
+ * overwrite it overwise we corrupt the tree.
+ */
+ blk2->index = blk1->index - ichdr1.count;
+ args->index = blk2->index;
+ args->blkno = blk2->blkno;
+ if (!state->extravalid) {
+ /*
+ * set the new attr location to match the old
+ * one and let the higher level split code
+ * decide where in the leaf to place it.
+ */
+ args->index2 = blk2->index;
+ args->blkno2 = blk2->blkno;
+ }
}
} else {
ASSERT(state->inleaf == 1);
@@ -1439,43 +1562,38 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* GROT: Do a double-split for this case?
*/
STATIC int
-xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
- xfs_da_state_blk_t *blk1,
- xfs_da_state_blk_t *blk2,
- int *countarg, int *usedbytesarg)
+xfs_attr3_leaf_figure_balance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_attr3_icleaf_hdr *ichdr1,
+ struct xfs_da_state_blk *blk2,
+ struct xfs_attr3_icleaf_hdr *ichdr2,
+ int *countarg,
+ int *usedbytesarg)
{
- xfs_attr_leafblock_t *leaf1, *leaf2;
- xfs_attr_leaf_hdr_t *hdr1, *hdr2;
- xfs_attr_leaf_entry_t *entry;
- int count, max, index, totallen, half;
- int lastdelta, foundit, tmp;
-
- /*
- * Set up environment.
- */
- leaf1 = blk1->bp->data;
- leaf2 = blk2->bp->data;
- hdr1 = &leaf1->hdr;
- hdr2 = &leaf2->hdr;
- foundit = 0;
- totallen = 0;
+ struct xfs_attr_leafblock *leaf1 = blk1->bp->b_addr;
+ struct xfs_attr_leafblock *leaf2 = blk2->bp->b_addr;
+ struct xfs_attr_leaf_entry *entry;
+ int count;
+ int max;
+ int index;
+ int totallen = 0;
+ int half;
+ int lastdelta;
+ int foundit = 0;
+ int tmp;
/*
* Examine entries until we reduce the absolute difference in
* byte usage between the two blocks to a minimum.
*/
- max = INT_GET(hdr1->count, ARCH_CONVERT)
- + INT_GET(hdr2->count, ARCH_CONVERT);
- half = (max+1) * sizeof(*entry);
- half += INT_GET(hdr1->usedbytes, ARCH_CONVERT)
- + INT_GET(hdr2->usedbytes, ARCH_CONVERT)
- + xfs_attr_leaf_newentsize(
- state->args->namelen,
- state->args->valuelen,
- state->blocksize, NULL);
+ max = ichdr1->count + ichdr2->count;
+ half = (max + 1) * sizeof(*entry);
+ half += ichdr1->usedbytes + ichdr2->usedbytes +
+ xfs_attr_leaf_newentsize(state->args, NULL);
half /= 2;
- lastdelta = state->blocksize;
- entry = &leaf1->entries[0];
+ lastdelta = state->args->geo->blksize;
+ entry = xfs_attr3_leaf_entryp(leaf1);
for (count = index = 0; count < max; entry++, index++, count++) {
#define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A))
@@ -1484,10 +1602,7 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
*/
if (count == blk1->index) {
tmp = totallen + sizeof(*entry) +
- xfs_attr_leaf_newentsize(
- state->args->namelen,
- state->args->valuelen,
- state->blocksize, NULL);
+ xfs_attr_leaf_newentsize(state->args, NULL);
if (XFS_ATTR_ABS(half - tmp) > lastdelta)
break;
lastdelta = XFS_ATTR_ABS(half - tmp);
@@ -1498,9 +1613,9 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
/*
* Wrap around into the second block if necessary.
*/
- if (count == INT_GET(hdr1->count, ARCH_CONVERT)) {
+ if (count == ichdr1->count) {
leaf1 = leaf2;
- entry = &leaf1->entries[0];
+ entry = xfs_attr3_leaf_entryp(leaf1);
index = 0;
}
@@ -1523,15 +1638,12 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
totallen -= count * sizeof(*entry);
if (foundit) {
totallen -= sizeof(*entry) +
- xfs_attr_leaf_newentsize(
- state->args->namelen,
- state->args->valuelen,
- state->blocksize, NULL);
+ xfs_attr_leaf_newentsize(state->args, NULL);
}
*countarg = count;
*usedbytesarg = totallen;
- return(foundit);
+ return foundit;
}
/*========================================================================
@@ -1550,14 +1662,22 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
* GROT: allow for INCOMPLETE entries in calculation.
*/
int
-xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
+xfs_attr3_leaf_toosmall(
+ struct xfs_da_state *state,
+ int *action)
{
- xfs_attr_leafblock_t *leaf;
- xfs_da_state_blk_t *blk;
- xfs_da_blkinfo_t *info;
- int count, bytes, forward, error, retval, i;
- xfs_dablk_t blkno;
- xfs_dabuf_t *bp;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_da_state_blk *blk;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_buf *bp;
+ xfs_dablk_t blkno;
+ int bytes;
+ int forward;
+ int error;
+ int retval;
+ int i;
+
+ trace_xfs_attr_leaf_toosmall(state->args);
/*
* Check for the degenerate case of the block being over 50% full.
@@ -1565,14 +1685,12 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
* to coalesce with a sibling.
*/
blk = &state->path.blk[ state->path.active-1 ];
- info = blk->bp->data;
- ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
- leaf = (xfs_attr_leafblock_t *)info;
- count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
- bytes = sizeof(xfs_attr_leaf_hdr_t) +
- count * sizeof(xfs_attr_leaf_entry_t) +
- INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
- if (bytes > (state->blocksize >> 1)) {
+ leaf = blk->bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ bytes = xfs_attr3_leaf_hdr_size(leaf) +
+ ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
+ ichdr.usedbytes;
+ if (bytes > (state->args->geo->blksize >> 1)) {
*action = 0; /* blk over 50%, don't try to join */
return(0);
}
@@ -1580,17 +1698,17 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
/*
* Check for the degenerate case of the block being empty.
* If the block is empty, we'll simply delete it, no need to
- * coalesce it with a sibling block. We choose (aribtrarily)
+ * coalesce it with a sibling block. We choose (arbitrarily)
* to merge with the forward block unless it is NULL.
*/
- if (count == 0) {
+ if (ichdr.count == 0) {
/*
* Make altpath point to the block we want to keep and
* path point to the block we want to drop (this one).
*/
- forward = info->forw;
+ forward = (ichdr.forw != 0);
memcpy(&state->altpath, &state->path, sizeof(state->path));
- error = xfs_da_path_shift(state, &state->altpath, forward,
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
0, &retval);
if (error)
return(error);
@@ -1599,7 +1717,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
} else {
*action = 2;
}
- return(0);
+ return 0;
}
/*
@@ -1610,33 +1728,30 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
* to shrink an attribute list over time.
*/
/* start with smaller blk num */
- forward = (INT_GET(info->forw, ARCH_CONVERT)
- < INT_GET(info->back, ARCH_CONVERT));
+ forward = ichdr.forw < ichdr.back;
for (i = 0; i < 2; forward = !forward, i++) {
+ struct xfs_attr3_icleaf_hdr ichdr2;
if (forward)
- blkno = INT_GET(info->forw, ARCH_CONVERT);
+ blkno = ichdr.forw;
else
- blkno = INT_GET(info->back, ARCH_CONVERT);
+ blkno = ichdr.back;
if (blkno == 0)
continue;
- error = xfs_da_read_buf(state->args->trans, state->args->dp,
- blkno, -1, &bp, XFS_ATTR_FORK);
+ error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
+ blkno, -1, &bp);
if (error)
return(error);
- ASSERT(bp != NULL);
-
- leaf = (xfs_attr_leafblock_t *)info;
- count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
- bytes = state->blocksize - (state->blocksize>>2);
- bytes -= INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
- count += INT_GET(leaf->hdr.count, ARCH_CONVERT);
- bytes -= INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
- bytes -= count * sizeof(xfs_attr_leaf_entry_t);
- bytes -= sizeof(xfs_attr_leaf_hdr_t);
- xfs_da_brelse(state->args->trans, bp);
+
+ xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
+
+ bytes = state->args->geo->blksize -
+ (state->args->geo->blksize >> 2) -
+ ichdr.usedbytes - ichdr2.usedbytes -
+ ((ichdr.count + ichdr2.count) *
+ sizeof(xfs_attr_leaf_entry_t)) -
+ xfs_attr3_leaf_hdr_size(leaf);
+
+ xfs_trans_brelse(state->args->trans, bp);
if (bytes >= 0)
break; /* fits with at least 25% to spare */
}
@@ -1651,10 +1766,10 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
*/
memcpy(&state->altpath, &state->path, sizeof(state->path));
if (blkno < blk->blkno) {
- error = xfs_da_path_shift(state, &state->altpath, forward,
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
0, &retval);
} else {
- error = xfs_da_path_shift(state, &state->path, forward,
+ error = xfs_da3_path_shift(state, &state->path, forward,
0, &retval);
}
if (error)
@@ -1674,32 +1789,35 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
* If two leaves are 37% full, when combined they will leave 25% free.
*/
int
-xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
+xfs_attr3_leaf_remove(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_hdr_t *hdr;
- xfs_attr_leaf_map_t *map;
- xfs_attr_leaf_entry_t *entry;
- int before, after, smallest, entsize;
- int tablesize, tmp, i;
- xfs_mount_t *mp;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ int before;
+ int after;
+ int smallest;
+ int entsize;
+ int tablesize;
+ int tmp;
+ int i;
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
- hdr = &leaf->hdr;
- mp = args->trans->t_mountp;
- ASSERT((INT_GET(hdr->count, ARCH_CONVERT) > 0)
- && (INT_GET(hdr->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
- ASSERT((args->index >= 0)
- && (args->index < INT_GET(hdr->count, ARCH_CONVERT)));
- ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT)
- >= ((INT_GET(hdr->count, ARCH_CONVERT)
- * sizeof(*entry))+sizeof(*hdr)));
- entry = &leaf->entries[args->index];
- ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT)
- >= INT_GET(hdr->firstused, ARCH_CONVERT));
- ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
+ trace_xfs_attr_leaf_remove(args);
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+
+ ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
+ ASSERT(args->index >= 0 && args->index < ichdr.count);
+ ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) +
+ xfs_attr3_leaf_hdr_size(leaf));
+
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+
+ ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
+ ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
/*
* Scan through free region table:
@@ -1707,33 +1825,28 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
* find smallest free region in case we need to replace it,
* adjust any map that borders the entry table,
*/
- tablesize = INT_GET(hdr->count, ARCH_CONVERT)
- * sizeof(xfs_attr_leaf_entry_t)
- + sizeof(xfs_attr_leaf_hdr_t);
- map = &hdr->freemap[0];
- tmp = INT_GET(map->size, ARCH_CONVERT);
+ tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf);
+ tmp = ichdr.freemap[0].size;
before = after = -1;
smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
entsize = xfs_attr_leaf_entsize(leaf, args->index);
- for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
- ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
- ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
- if (INT_GET(map->base, ARCH_CONVERT) == tablesize) {
- INT_MOD(map->base, ARCH_CONVERT,
- -sizeof(xfs_attr_leaf_entry_t));
- INT_MOD(map->size, ARCH_CONVERT,
- sizeof(xfs_attr_leaf_entry_t));
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ ASSERT(ichdr.freemap[i].base < args->geo->blksize);
+ ASSERT(ichdr.freemap[i].size < args->geo->blksize);
+ if (ichdr.freemap[i].base == tablesize) {
+ ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t);
+ ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t);
}
- if ((INT_GET(map->base, ARCH_CONVERT)
- + INT_GET(map->size, ARCH_CONVERT))
- == INT_GET(entry->nameidx, ARCH_CONVERT)) {
+ if (ichdr.freemap[i].base + ichdr.freemap[i].size ==
+ be16_to_cpu(entry->nameidx)) {
before = i;
- } else if (INT_GET(map->base, ARCH_CONVERT)
- == (INT_GET(entry->nameidx, ARCH_CONVERT) + entsize)) {
+ } else if (ichdr.freemap[i].base ==
+ (be16_to_cpu(entry->nameidx) + entsize)) {
after = i;
- } else if (INT_GET(map->size, ARCH_CONVERT) < tmp) {
- tmp = INT_GET(map->size, ARCH_CONVERT);
+ } else if (ichdr.freemap[i].size < tmp) {
+ tmp = ichdr.freemap[i].size;
smallest = i;
}
}
@@ -1744,39 +1857,30 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
*/
if ((before >= 0) || (after >= 0)) {
if ((before >= 0) && (after >= 0)) {
- map = &hdr->freemap[before];
- INT_MOD(map->size, ARCH_CONVERT, entsize);
- INT_MOD(map->size, ARCH_CONVERT,
- INT_GET(hdr->freemap[after].size,
- ARCH_CONVERT));
- hdr->freemap[after].base = 0;
- hdr->freemap[after].size = 0;
+ ichdr.freemap[before].size += entsize;
+ ichdr.freemap[before].size += ichdr.freemap[after].size;
+ ichdr.freemap[after].base = 0;
+ ichdr.freemap[after].size = 0;
} else if (before >= 0) {
- map = &hdr->freemap[before];
- INT_MOD(map->size, ARCH_CONVERT, entsize);
+ ichdr.freemap[before].size += entsize;
} else {
- map = &hdr->freemap[after];
- /* both on-disk, don't endian flip twice */
- map->base = entry->nameidx;
- INT_MOD(map->size, ARCH_CONVERT, entsize);
+ ichdr.freemap[after].base = be16_to_cpu(entry->nameidx);
+ ichdr.freemap[after].size += entsize;
}
} else {
/*
* Replace smallest region (if it is smaller than free'd entry)
*/
- map = &hdr->freemap[smallest];
- if (INT_GET(map->size, ARCH_CONVERT) < entsize) {
- INT_SET(map->base, ARCH_CONVERT,
- INT_GET(entry->nameidx, ARCH_CONVERT));
- INT_SET(map->size, ARCH_CONVERT, entsize);
+ if (ichdr.freemap[smallest].size < entsize) {
+ ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx);
+ ichdr.freemap[smallest].size = entsize;
}
}
/*
* Did we remove the first entry?
*/
- if (INT_GET(entry->nameidx, ARCH_CONVERT)
- == INT_GET(hdr->firstused, ARCH_CONVERT))
+ if (be16_to_cpu(entry->nameidx) == ichdr.firstused)
smallest = 1;
else
smallest = 0;
@@ -1784,20 +1888,20 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
/*
* Compress the remaining entries and zero out the removed stuff.
*/
- memset(XFS_ATTR_LEAF_NAME(leaf, args->index), 0, entsize);
- INT_MOD(hdr->usedbytes, ARCH_CONVERT, -entsize);
- xfs_da_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index),
+ memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize);
+ ichdr.usedbytes -= entsize;
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
entsize));
- tmp = (INT_GET(hdr->count, ARCH_CONVERT) - args->index)
- * sizeof(xfs_attr_leaf_entry_t);
- memmove((char *)entry, (char *)(entry+1), tmp);
- INT_MOD(hdr->count, ARCH_CONVERT, -1);
- xfs_da_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
- entry = &leaf->entries[INT_GET(hdr->count, ARCH_CONVERT)];
- memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t));
+ tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t);
+ memmove(entry, entry + 1, tmp);
+ ichdr.count--;
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t)));
+
+ entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count];
+ memset(entry, 0, sizeof(xfs_attr_leaf_entry_t));
/*
* If we removed the first entry, re-find the first used byte
@@ -1806,142 +1910,146 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
* removing the name.
*/
if (smallest) {
- tmp = XFS_LBSIZE(mp);
- entry = &leaf->entries[0];
- for (i = INT_GET(hdr->count, ARCH_CONVERT)-1;
- i >= 0; entry++, i--) {
- ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT)
- >= INT_GET(hdr->firstused, ARCH_CONVERT));
- ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT)
- < XFS_LBSIZE(mp));
- if (INT_GET(entry->nameidx, ARCH_CONVERT) < tmp)
- tmp = INT_GET(entry->nameidx, ARCH_CONVERT);
- }
- INT_SET(hdr->firstused, ARCH_CONVERT, tmp);
- if (!hdr->firstused) {
- INT_SET(hdr->firstused, ARCH_CONVERT,
- tmp - XFS_ATTR_LEAF_NAME_ALIGN);
+ tmp = args->geo->blksize;
+ entry = xfs_attr3_leaf_entryp(leaf);
+ for (i = ichdr.count - 1; i >= 0; entry++, i--) {
+ ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
+ ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
+
+ if (be16_to_cpu(entry->nameidx) < tmp)
+ tmp = be16_to_cpu(entry->nameidx);
}
+ ichdr.firstused = tmp;
+ if (!ichdr.firstused)
+ ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN;
} else {
- hdr->holes = 1; /* mark as needing compaction */
+ ichdr.holes = 1; /* mark as needing compaction */
}
- xfs_da_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
+ xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+ xfs_attr3_leaf_hdr_size(leaf)));
/*
* Check if leaf is less than 50% full, caller may want to
* "join" the leaf with a sibling if so.
*/
- tmp = sizeof(xfs_attr_leaf_hdr_t);
- tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT)
- * sizeof(xfs_attr_leaf_entry_t);
- tmp += INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
- return(tmp < mp->m_attr_magicpct); /* leaf is < 37% full */
+ tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) +
+ ichdr.count * sizeof(xfs_attr_leaf_entry_t);
+
+ return tmp < args->geo->magicpct; /* leaf is < 37% full */
}
/*
* Move all the attribute list entries from drop_leaf into save_leaf.
*/
void
-xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
- xfs_da_state_blk_t *save_blk)
+xfs_attr3_leaf_unbalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk)
{
- xfs_attr_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf;
- xfs_attr_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr;
- xfs_mount_t *mp;
- char *tmpbuffer;
+ struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr;
+ struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr;
+ struct xfs_attr3_icleaf_hdr drophdr;
+ struct xfs_attr3_icleaf_hdr savehdr;
+ struct xfs_attr_leaf_entry *entry;
- /*
- * Set up environment.
- */
- mp = state->mp;
- ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC);
- ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC);
- drop_leaf = drop_blk->bp->data;
- save_leaf = save_blk->bp->data;
- ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
- ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
- drop_hdr = &drop_leaf->hdr;
- save_hdr = &save_leaf->hdr;
+ trace_xfs_attr_leaf_unbalance(state->args);
+
+ drop_leaf = drop_blk->bp->b_addr;
+ save_leaf = save_blk->bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf);
+ xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf);
+ entry = xfs_attr3_leaf_entryp(drop_leaf);
/*
* Save last hashval from dying block for later Btree fixup.
*/
- drop_blk->hashval =
- INT_GET(drop_leaf->entries[INT_GET(drop_leaf->hdr.count,
- ARCH_CONVERT)-1].hashval,
- ARCH_CONVERT);
+ drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval);
/*
* Check if we need a temp buffer, or can we do it in place.
* Note that we don't check "leaf" for holes because we will
* always be dropping it, toosmall() decided that for us already.
*/
- if (save_hdr->holes == 0) {
+ if (savehdr.holes == 0) {
/*
* dest leaf has no holes, so we add there. May need
* to make some room in the entry array.
*/
- if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) {
- xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, 0,
- (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
+ if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+ drop_blk->bp, &drophdr)) {
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
+ save_leaf, &savehdr, 0,
+ drophdr.count);
} else {
- xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf,
- INT_GET(save_hdr->count, ARCH_CONVERT),
- (int)INT_GET(drop_hdr->count, ARCH_CONVERT),
- mp);
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
+ save_leaf, &savehdr,
+ savehdr.count, drophdr.count);
}
} else {
/*
* Destination has holes, so we make a temporary copy
* of the leaf and add them both to that.
*/
- tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP);
- ASSERT(tmpbuffer != NULL);
- memset(tmpbuffer, 0, state->blocksize);
- tmp_leaf = (xfs_attr_leafblock_t *)tmpbuffer;
- tmp_hdr = &tmp_leaf->hdr;
- tmp_hdr->info = save_hdr->info; /* struct copy */
- tmp_hdr->count = 0;
- INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize);
- if (!tmp_hdr->firstused) {
- INT_SET(tmp_hdr->firstused, ARCH_CONVERT,
- state->blocksize - XFS_ATTR_LEAF_NAME_ALIGN);
- }
- tmp_hdr->usedbytes = 0;
- if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) {
- xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, 0,
- (int)INT_GET(drop_hdr->count, ARCH_CONVERT),
- mp);
- xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf,
- INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
- (int)INT_GET(save_hdr->count, ARCH_CONVERT),
- mp);
+ struct xfs_attr_leafblock *tmp_leaf;
+ struct xfs_attr3_icleaf_hdr tmphdr;
+
+ tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP);
+
+ /*
+ * Copy the header into the temp leaf so that all the stuff
+ * not in the incore header is present and gets copied back in
+ * once we've moved all the entries.
+ */
+ memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf));
+
+ memset(&tmphdr, 0, sizeof(tmphdr));
+ tmphdr.magic = savehdr.magic;
+ tmphdr.forw = savehdr.forw;
+ tmphdr.back = savehdr.back;
+ tmphdr.firstused = state->args->geo->blksize;
+
+ /* write the header to the temp buffer to initialise it */
+ xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr);
+
+ if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+ drop_blk->bp, &drophdr)) {
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
+ tmp_leaf, &tmphdr, 0,
+ drophdr.count);
+ xfs_attr3_leaf_moveents(state->args,
+ save_leaf, &savehdr, 0,
+ tmp_leaf, &tmphdr, tmphdr.count,
+ savehdr.count);
} else {
- xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, 0,
- (int)INT_GET(save_hdr->count, ARCH_CONVERT),
- mp);
- xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf,
- INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
- (int)INT_GET(drop_hdr->count, ARCH_CONVERT),
- mp);
+ xfs_attr3_leaf_moveents(state->args,
+ save_leaf, &savehdr, 0,
+ tmp_leaf, &tmphdr, 0,
+ savehdr.count);
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
+ tmp_leaf, &tmphdr, tmphdr.count,
+ drophdr.count);
}
- memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize);
- kmem_free(tmpbuffer, state->blocksize);
+ memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
+ savehdr = tmphdr; /* struct copy */
+ kmem_free(tmp_leaf);
}
- xfs_da_log_buf(state->args->trans, save_blk->bp, 0,
- state->blocksize - 1);
+ xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr);
+ xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
+ state->args->geo->blksize - 1);
/*
* Copy out last hashval in each block for B-tree code.
*/
- save_blk->hashval =
- INT_GET(save_leaf->entries[INT_GET(save_leaf->hdr.count,
- ARCH_CONVERT)-1].hashval,
- ARCH_CONVERT);
+ entry = xfs_attr3_leaf_entryp(save_leaf);
+ save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval);
}
/*========================================================================
@@ -1962,67 +2070,66 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
* Don't change the args->value unless we find the attribute.
*/
int
-xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
+xfs_attr3_leaf_lookup_int(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_local_t *name_loc;
- xfs_attr_leaf_name_remote_t *name_rmt;
- int probe, span;
- xfs_dahash_t hashval;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_entry *entries;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ xfs_dahash_t hashval;
+ int probe;
+ int span;
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
- ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT)
- < (XFS_LBSIZE(args->dp->i_mount)/8));
+ trace_xfs_attr_leaf_lookup(args);
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+ ASSERT(ichdr.count < args->geo->blksize / 8);
/*
* Binary search. (note: small blocks will skip this loop)
*/
hashval = args->hashval;
- probe = span = INT_GET(leaf->hdr.count, ARCH_CONVERT) / 2;
- for (entry = &leaf->entries[probe]; span > 4;
- entry = &leaf->entries[probe]) {
+ probe = span = ichdr.count / 2;
+ for (entry = &entries[probe]; span > 4; entry = &entries[probe]) {
span /= 2;
- if (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)
+ if (be32_to_cpu(entry->hashval) < hashval)
probe += span;
- else if (INT_GET(entry->hashval, ARCH_CONVERT) > hashval)
+ else if (be32_to_cpu(entry->hashval) > hashval)
probe -= span;
else
break;
}
- ASSERT((probe >= 0) &&
- (!leaf->hdr.count
- || (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))));
- ASSERT((span <= 4) || (INT_GET(entry->hashval, ARCH_CONVERT)
- == hashval));
+ ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count));
+ ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval);
/*
* Since we may have duplicate hashval's, find the first matching
* hashval in the leaf.
*/
- while ((probe > 0) && (INT_GET(entry->hashval, ARCH_CONVERT)
- >= hashval)) {
+ while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) {
entry--;
probe--;
}
- while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))
- && (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)) {
+ while (probe < ichdr.count &&
+ be32_to_cpu(entry->hashval) < hashval) {
entry++;
probe++;
}
- if ((probe == INT_GET(leaf->hdr.count, ARCH_CONVERT))
- || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) {
+ if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
args->index = probe;
- return(XFS_ERROR(ENOATTR));
+ return XFS_ERROR(ENOATTR);
}
/*
* Duplicate keys may be present, so search all of them for a match.
*/
- for ( ; (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))
- && (INT_GET(entry->hashval, ARCH_CONVERT) == hashval);
+ for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval);
entry++, probe++) {
/*
* GROT: Add code to remove incomplete entries.
@@ -2036,44 +2143,36 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
continue;
}
if (entry->flags & XFS_ATTR_LOCAL) {
- name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, probe);
+ name_loc = xfs_attr3_leaf_name_local(leaf, probe);
if (name_loc->namelen != args->namelen)
continue;
- if (memcmp(args->name, (char *)name_loc->nameval,
- args->namelen) != 0)
+ if (memcmp(args->name, name_loc->nameval,
+ args->namelen) != 0)
continue;
- if (((args->flags & ATTR_SECURE) != 0) !=
- ((entry->flags & XFS_ATTR_SECURE) != 0))
- continue;
- if (((args->flags & ATTR_ROOT) != 0) !=
- ((entry->flags & XFS_ATTR_ROOT) != 0))
+ if (!xfs_attr_namesp_match(args->flags, entry->flags))
continue;
args->index = probe;
- return(XFS_ERROR(EEXIST));
+ return XFS_ERROR(EEXIST);
} else {
- name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, probe);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
if (name_rmt->namelen != args->namelen)
continue;
- if (memcmp(args->name, (char *)name_rmt->name,
- args->namelen) != 0)
+ if (memcmp(args->name, name_rmt->name,
+ args->namelen) != 0)
continue;
- if (((args->flags & ATTR_SECURE) != 0) !=
- ((entry->flags & XFS_ATTR_SECURE) != 0))
- continue;
- if (((args->flags & ATTR_ROOT) != 0) !=
- ((entry->flags & XFS_ATTR_ROOT) != 0))
+ if (!xfs_attr_namesp_match(args->flags, entry->flags))
continue;
args->index = probe;
- args->rmtblkno
- = INT_GET(name_rmt->valueblk, ARCH_CONVERT);
- args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount,
- INT_GET(name_rmt->valuelen,
- ARCH_CONVERT));
- return(XFS_ERROR(EEXIST));
+ args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+ args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(
+ args->dp->i_mount,
+ args->rmtvaluelen);
+ return XFS_ERROR(EEXIST);
}
}
args->index = probe;
- return(XFS_ERROR(ENOATTR));
+ return XFS_ERROR(ENOATTR);
}
/*
@@ -2081,55 +2180,57 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
* list structure.
*/
int
-xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args)
+xfs_attr3_leaf_getvalue(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
{
- int valuelen;
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_local_t *name_loc;
- xfs_attr_leaf_name_remote_t *name_rmt;
-
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
- ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT)
- < (XFS_LBSIZE(args->dp->i_mount)/8));
- ASSERT(args->index < ((int)INT_GET(leaf->hdr.count, ARCH_CONVERT)));
-
- entry = &leaf->entries[args->index];
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ int valuelen;
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ ASSERT(ichdr.count < args->geo->blksize / 8);
+ ASSERT(args->index < ichdr.count);
+
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
if (entry->flags & XFS_ATTR_LOCAL) {
- name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+ name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
ASSERT(name_loc->namelen == args->namelen);
ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
- valuelen = INT_GET(name_loc->valuelen, ARCH_CONVERT);
+ valuelen = be16_to_cpu(name_loc->valuelen);
if (args->flags & ATTR_KERNOVAL) {
args->valuelen = valuelen;
- return(0);
+ return 0;
}
if (args->valuelen < valuelen) {
args->valuelen = valuelen;
- return(XFS_ERROR(ERANGE));
+ return XFS_ERROR(ERANGE);
}
args->valuelen = valuelen;
memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
} else {
- name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
ASSERT(name_rmt->namelen == args->namelen);
ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
- valuelen = INT_GET(name_rmt->valuelen, ARCH_CONVERT);
- args->rmtblkno = INT_GET(name_rmt->valueblk, ARCH_CONVERT);
- args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen);
+ args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+ args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
+ args->rmtvaluelen);
if (args->flags & ATTR_KERNOVAL) {
- args->valuelen = valuelen;
- return(0);
+ args->valuelen = args->rmtvaluelen;
+ return 0;
}
- if (args->valuelen < valuelen) {
- args->valuelen = valuelen;
- return(XFS_ERROR(ERANGE));
+ if (args->valuelen < args->rmtvaluelen) {
+ args->valuelen = args->rmtvaluelen;
+ return XFS_ERROR(ERANGE);
}
- args->valuelen = valuelen;
+ args->valuelen = args->rmtvaluelen;
}
- return(0);
+ return 0;
}
/*========================================================================
@@ -2142,13 +2243,21 @@ xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args)
*/
/*ARGSUSED*/
STATIC void
-xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
- xfs_attr_leafblock_t *leaf_d, int start_d,
- int count, xfs_mount_t *mp)
+xfs_attr3_leaf_moveents(
+ struct xfs_da_args *args,
+ struct xfs_attr_leafblock *leaf_s,
+ struct xfs_attr3_icleaf_hdr *ichdr_s,
+ int start_s,
+ struct xfs_attr_leafblock *leaf_d,
+ struct xfs_attr3_icleaf_hdr *ichdr_d,
+ int start_d,
+ int count)
{
- xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
- xfs_attr_leaf_entry_t *entry_s, *entry_d;
- int desti, tmp, i;
+ struct xfs_attr_leaf_entry *entry_s;
+ struct xfs_attr_leaf_entry *entry_d;
+ int desti;
+ int tmp;
+ int i;
/*
* Check for nothing to do.
@@ -2159,48 +2268,41 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
/*
* Set up environment.
*/
- ASSERT(INT_GET(leaf_s->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
- ASSERT(INT_GET(leaf_d->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
- hdr_s = &leaf_s->hdr;
- hdr_d = &leaf_d->hdr;
- ASSERT((INT_GET(hdr_s->count, ARCH_CONVERT) > 0)
- && (INT_GET(hdr_s->count, ARCH_CONVERT)
- < (XFS_LBSIZE(mp)/8)));
- ASSERT(INT_GET(hdr_s->firstused, ARCH_CONVERT) >=
- ((INT_GET(hdr_s->count, ARCH_CONVERT)
- * sizeof(*entry_s))+sizeof(*hdr_s)));
- ASSERT(INT_GET(hdr_d->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8));
- ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >=
- ((INT_GET(hdr_d->count, ARCH_CONVERT)
- * sizeof(*entry_d))+sizeof(*hdr_d)));
-
- ASSERT(start_s < INT_GET(hdr_s->count, ARCH_CONVERT));
- ASSERT(start_d <= INT_GET(hdr_d->count, ARCH_CONVERT));
- ASSERT(count <= INT_GET(hdr_s->count, ARCH_CONVERT));
+ ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC ||
+ ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC);
+ ASSERT(ichdr_s->magic == ichdr_d->magic);
+ ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8);
+ ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s))
+ + xfs_attr3_leaf_hdr_size(leaf_s));
+ ASSERT(ichdr_d->count < args->geo->blksize / 8);
+ ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d))
+ + xfs_attr3_leaf_hdr_size(leaf_d));
+
+ ASSERT(start_s < ichdr_s->count);
+ ASSERT(start_d <= ichdr_d->count);
+ ASSERT(count <= ichdr_s->count);
+
/*
* Move the entries in the destination leaf up to make a hole?
*/
- if (start_d < INT_GET(hdr_d->count, ARCH_CONVERT)) {
- tmp = INT_GET(hdr_d->count, ARCH_CONVERT) - start_d;
+ if (start_d < ichdr_d->count) {
+ tmp = ichdr_d->count - start_d;
tmp *= sizeof(xfs_attr_leaf_entry_t);
- entry_s = &leaf_d->entries[start_d];
- entry_d = &leaf_d->entries[start_d + count];
- memmove((char *)entry_d, (char *)entry_s, tmp);
+ entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
+ entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count];
+ memmove(entry_d, entry_s, tmp);
}
/*
* Copy all entry's in the same (sorted) order,
* but allocate attribute info packed and in sequence.
*/
- entry_s = &leaf_s->entries[start_s];
- entry_d = &leaf_d->entries[start_d];
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+ entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
desti = start_d;
for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
- ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT)
- >= INT_GET(hdr_s->firstused, ARCH_CONVERT));
+ ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused);
tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
#ifdef GROT
/*
@@ -2209,36 +2311,34 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
* off for 6.2, should be revisited later.
*/
if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
- memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp);
- INT_MOD(hdr_s->usedbytes, ARCH_CONVERT, -tmp);
- INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
+ memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+ ichdr_s->usedbytes -= tmp;
+ ichdr_s->count -= 1;
entry_d--; /* to compensate for ++ in loop hdr */
desti--;
if ((start_s + i) < offset)
result++; /* insertion index adjustment */
} else {
#endif /* GROT */
- INT_MOD(hdr_d->firstused, ARCH_CONVERT, -tmp);
+ ichdr_d->firstused -= tmp;
/* both on-disk, don't endian flip twice */
entry_d->hashval = entry_s->hashval;
- /* both on-disk, don't endian flip twice */
- entry_d->nameidx = hdr_d->firstused;
+ entry_d->nameidx = cpu_to_be16(ichdr_d->firstused);
entry_d->flags = entry_s->flags;
- ASSERT(INT_GET(entry_d->nameidx, ARCH_CONVERT) + tmp
- <= XFS_LBSIZE(mp));
- memmove(XFS_ATTR_LEAF_NAME(leaf_d, desti),
- XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), tmp);
- ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) + tmp
- <= XFS_LBSIZE(mp));
- memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp);
- INT_MOD(hdr_s->usedbytes, ARCH_CONVERT, -tmp);
- INT_MOD(hdr_d->usedbytes, ARCH_CONVERT, tmp);
- INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
- INT_MOD(hdr_d->count, ARCH_CONVERT, 1);
- tmp = INT_GET(hdr_d->count, ARCH_CONVERT)
- * sizeof(xfs_attr_leaf_entry_t)
- + sizeof(xfs_attr_leaf_hdr_t);
- ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >= tmp);
+ ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
+ <= args->geo->blksize);
+ memmove(xfs_attr3_leaf_name(leaf_d, desti),
+ xfs_attr3_leaf_name(leaf_s, start_s + i), tmp);
+ ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
+ <= args->geo->blksize);
+ memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+ ichdr_s->usedbytes -= tmp;
+ ichdr_d->usedbytes += tmp;
+ ichdr_s->count -= 1;
+ ichdr_d->count += 1;
+ tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf_d);
+ ASSERT(ichdr_d->firstused >= tmp);
#ifdef GROT
}
#endif /* GROT */
@@ -2247,94 +2347,60 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
/*
* Zero out the entries we just copied.
*/
- if (start_s == INT_GET(hdr_s->count, ARCH_CONVERT)) {
+ if (start_s == ichdr_s->count) {
tmp = count * sizeof(xfs_attr_leaf_entry_t);
- entry_s = &leaf_s->entries[start_s];
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
ASSERT(((char *)entry_s + tmp) <=
- ((char *)leaf_s + XFS_LBSIZE(mp)));
- memset((char *)entry_s, 0, tmp);
+ ((char *)leaf_s + args->geo->blksize));
+ memset(entry_s, 0, tmp);
} else {
/*
* Move the remaining entries down to fill the hole,
* then zero the entries at the top.
*/
- tmp = INT_GET(hdr_s->count, ARCH_CONVERT) - count;
- tmp *= sizeof(xfs_attr_leaf_entry_t);
- entry_s = &leaf_s->entries[start_s + count];
- entry_d = &leaf_s->entries[start_s];
- memmove((char *)entry_d, (char *)entry_s, tmp);
+ tmp = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t);
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count];
+ entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+ memmove(entry_d, entry_s, tmp);
tmp = count * sizeof(xfs_attr_leaf_entry_t);
- entry_s = &leaf_s->entries[INT_GET(hdr_s->count,
- ARCH_CONVERT)];
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count];
ASSERT(((char *)entry_s + tmp) <=
- ((char *)leaf_s + XFS_LBSIZE(mp)));
- memset((char *)entry_s, 0, tmp);
+ ((char *)leaf_s + args->geo->blksize));
+ memset(entry_s, 0, tmp);
}
/*
* Fill in the freemap information
*/
- INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT,
- sizeof(xfs_attr_leaf_hdr_t));
- INT_MOD(hdr_d->freemap[0].base, ARCH_CONVERT,
- INT_GET(hdr_d->count, ARCH_CONVERT)
- * sizeof(xfs_attr_leaf_entry_t));
- INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT,
- INT_GET(hdr_d->firstused, ARCH_CONVERT)
- - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
- hdr_d->freemap[1].base = 0;
- hdr_d->freemap[2].base = 0;
- hdr_d->freemap[1].size = 0;
- hdr_d->freemap[2].size = 0;
- hdr_s->holes = 1; /* leaf may not be compact */
-}
-
-/*
- * Compare two leaf blocks "order".
- * Return 0 unless leaf2 should go before leaf1.
- */
-int
-xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
-{
- xfs_attr_leafblock_t *leaf1, *leaf2;
-
- leaf1 = leaf1_bp->data;
- leaf2 = leaf2_bp->data;
- ASSERT((INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC) &&
- (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC));
- if ( (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0)
- && (INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0)
- && ( (INT_GET(leaf2->entries[ 0 ].hashval, ARCH_CONVERT) <
- INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT))
- || (INT_GET(leaf2->entries[INT_GET(leaf2->hdr.count,
- ARCH_CONVERT)-1].hashval, ARCH_CONVERT) <
- INT_GET(leaf1->entries[INT_GET(leaf1->hdr.count,
- ARCH_CONVERT)-1].hashval, ARCH_CONVERT))) ) {
- return(1);
- }
- return(0);
+ ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d);
+ ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t);
+ ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base;
+ ichdr_d->freemap[1].base = 0;
+ ichdr_d->freemap[2].base = 0;
+ ichdr_d->freemap[1].size = 0;
+ ichdr_d->freemap[2].size = 0;
+ ichdr_s->holes = 1; /* leaf may not be compact */
}
/*
* Pick up the last hashvalue from a leaf block.
*/
xfs_dahash_t
-xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count)
+xfs_attr_leaf_lasthash(
+ struct xfs_buf *bp,
+ int *count)
{
- xfs_attr_leafblock_t *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entries;
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr);
+ entries = xfs_attr3_leaf_entryp(bp->b_addr);
if (count)
- *count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
- if (!leaf->hdr.count)
- return(0);
- return(INT_GET(leaf->entries[INT_GET(leaf->hdr.count,
- ARCH_CONVERT)-1].hashval, ARCH_CONVERT));
+ *count = ichdr.count;
+ if (!ichdr.count)
+ return 0;
+ return be32_to_cpu(entries[ichdr.count - 1].hashval);
}
/*
@@ -2344,22 +2410,21 @@ xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count)
STATIC int
xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
{
+ struct xfs_attr_leaf_entry *entries;
xfs_attr_leaf_name_local_t *name_loc;
xfs_attr_leaf_name_remote_t *name_rmt;
int size;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
- if (leaf->entries[index].flags & XFS_ATTR_LOCAL) {
- name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, index);
- size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(name_loc->namelen,
- INT_GET(name_loc->valuelen,
- ARCH_CONVERT));
+ entries = xfs_attr3_leaf_entryp(leaf);
+ if (entries[index].flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr3_leaf_name_local(leaf, index);
+ size = xfs_attr_leaf_entsize_local(name_loc->namelen,
+ be16_to_cpu(name_loc->valuelen));
} else {
- name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, index);
- size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(name_rmt->namelen);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, index);
+ size = xfs_attr_leaf_entsize_remote(name_rmt->namelen);
}
- return(size);
+ return size;
}
/*
@@ -2369,200 +2434,23 @@ xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
* a "local" or a "remote" attribute.
*/
int
-xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local)
+xfs_attr_leaf_newentsize(
+ struct xfs_da_args *args,
+ int *local)
{
- int size;
+ int size;
- size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(namelen, valuelen);
- if (size < XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(blocksize)) {
- if (local) {
+ size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen);
+ if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) {
+ if (local)
*local = 1;
- }
- } else {
- size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(namelen);
- if (local) {
- *local = 0;
- }
- }
- return(size);
-}
-
-/*
- * Copy out attribute list entries for attr_list(), for leaf attribute lists.
- */
-int
-xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
-{
- attrlist_cursor_kern_t *cursor;
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_local_t *name_loc;
- xfs_attr_leaf_name_remote_t *name_rmt;
- int retval, i;
-
- ASSERT(bp != NULL);
- leaf = bp->data;
- cursor = context->cursor;
- cursor->initted = 1;
-
- xfs_attr_trace_l_cl("blk start", context, leaf);
-
- /*
- * Re-find our place in the leaf block if this is a new syscall.
- */
- if (context->resynch) {
- entry = &leaf->entries[0];
- for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
- entry++, i++) {
- if (INT_GET(entry->hashval, ARCH_CONVERT)
- == cursor->hashval) {
- if (cursor->offset == context->dupcnt) {
- context->dupcnt = 0;
- break;
- }
- context->dupcnt++;
- } else if (INT_GET(entry->hashval, ARCH_CONVERT)
- > cursor->hashval) {
- context->dupcnt = 0;
- break;
- }
- }
- if (i == INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
- xfs_attr_trace_l_c("not found", context);
- return(0);
- }
- } else {
- entry = &leaf->entries[0];
- i = 0;
+ return size;
}
- context->resynch = 0;
-
- /*
- * We have found our place, start copying out the new attributes.
- */
- retval = 0;
- for ( ; (i < INT_GET(leaf->hdr.count, ARCH_CONVERT))
- && (retval == 0); entry++, i++) {
- attrnames_t *namesp;
-
- if (INT_GET(entry->hashval, ARCH_CONVERT) != cursor->hashval) {
- cursor->hashval = INT_GET(entry->hashval, ARCH_CONVERT);
- cursor->offset = 0;
- }
-
- if (entry->flags & XFS_ATTR_INCOMPLETE)
- continue; /* skip incomplete entries */
- if (((context->flags & ATTR_SECURE) != 0) !=
- ((entry->flags & XFS_ATTR_SECURE) != 0) &&
- !(context->flags & ATTR_KERNORMALS))
- continue; /* skip non-matching entries */
- if (((context->flags & ATTR_ROOT) != 0) !=
- ((entry->flags & XFS_ATTR_ROOT) != 0) &&
- !(context->flags & ATTR_KERNROOTLS))
- continue; /* skip non-matching entries */
-
- namesp = (entry->flags & XFS_ATTR_SECURE) ? &attr_secure :
- ((entry->flags & XFS_ATTR_ROOT) ? &attr_trusted :
- &attr_user);
-
- if (entry->flags & XFS_ATTR_LOCAL) {
- name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
- if (context->flags & ATTR_KERNOVAL) {
- ASSERT(context->flags & ATTR_KERNAMELS);
- context->count += namesp->attr_namelen +
- (int)name_loc->namelen + 1;
- } else {
- retval = xfs_attr_put_listent(context, namesp,
- (char *)name_loc->nameval,
- (int)name_loc->namelen,
- (int)INT_GET(name_loc->valuelen,
- ARCH_CONVERT));
- }
- } else {
- name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
- if (context->flags & ATTR_KERNOVAL) {
- ASSERT(context->flags & ATTR_KERNAMELS);
- context->count += namesp->attr_namelen +
- (int)name_rmt->namelen + 1;
- } else {
- retval = xfs_attr_put_listent(context, namesp,
- (char *)name_rmt->name,
- (int)name_rmt->namelen,
- (int)INT_GET(name_rmt->valuelen,
- ARCH_CONVERT));
- }
- }
- if (retval == 0) {
- cursor->offset++;
- }
- }
- xfs_attr_trace_l_cl("blk end", context, leaf);
- return(retval);
+ if (local)
+ *local = 0;
+ return xfs_attr_leaf_entsize_remote(args->namelen);
}
-#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \
- (((struct attrlist_ent *) 0)->a_name - (char *) 0)
-#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \
- ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
- & ~(sizeof(u_int32_t)-1))
-
-/*
- * Format an attribute and copy it out to the user's buffer.
- * Take care to check values and protect against them changing later,
- * we may be reading them directly out of a user buffer.
- */
-/*ARGSUSED*/
-STATIC int
-xfs_attr_put_listent(xfs_attr_list_context_t *context,
- attrnames_t *namesp, char *name, int namelen, int valuelen)
-{
- attrlist_ent_t *aep;
- int arraytop;
-
- ASSERT(!(context->flags & ATTR_KERNOVAL));
- if (context->flags & ATTR_KERNAMELS) {
- char *offset;
-
- ASSERT(context->count >= 0);
-
- arraytop = context->count + namesp->attr_namelen + namelen + 1;
- if (arraytop > context->firstu) {
- context->count = -1; /* insufficient space */
- return(1);
- }
- offset = (char *)context->alist + context->count;
- strncpy(offset, namesp->attr_name, namesp->attr_namelen);
- offset += namesp->attr_namelen;
- strncpy(offset, name, namelen); /* real name */
- offset += namelen;
- *offset = '\0';
- context->count += namesp->attr_namelen + namelen + 1;
- return(0);
- }
-
- ASSERT(context->count >= 0);
- ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
- ASSERT(context->firstu >= sizeof(*context->alist));
- ASSERT(context->firstu <= context->bufsize);
-
- arraytop = sizeof(*context->alist) +
- context->count * sizeof(context->alist->al_offset[0]);
- context->firstu -= ATTR_ENTSIZE(namelen);
- if (context->firstu < arraytop) {
- xfs_attr_trace_l_c("buffer full", context);
- context->alist->al_more = 1;
- return(1);
- }
-
- aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]);
- aep->a_valuelen = valuelen;
- memcpy(aep->a_name, name, namelen);
- aep->a_name[ namelen ] = 0;
- context->alist->al_offset[ context->count++ ] = context->firstu;
- context->alist->al_count = context->count;
- xfs_attr_trace_l_c("add", context);
- return(0);
-}
/*========================================================================
* Manage the INCOMPLETE flag in a leaf entry
@@ -2572,122 +2460,120 @@ xfs_attr_put_listent(xfs_attr_list_context_t *context,
* Clear the INCOMPLETE flag on an entry in a leaf block.
*/
int
-xfs_attr_leaf_clearflag(xfs_da_args_t *args)
+xfs_attr3_leaf_clearflag(
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_remote_t *name_rmt;
- xfs_dabuf_t *bp;
- int error;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_buf *bp;
+ int error;
#ifdef DEBUG
+ struct xfs_attr3_icleaf_hdr ichdr;
xfs_attr_leaf_name_local_t *name_loc;
int namelen;
char *name;
#endif /* DEBUG */
+ trace_xfs_attr_leaf_clearflag(args);
/*
* Set up the operation.
*/
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
- XFS_ATTR_FORK);
- if (error) {
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ if (error)
return(error);
- }
- ASSERT(bp != NULL);
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
- ASSERT(args->index < INT_GET(leaf->hdr.count, ARCH_CONVERT));
- ASSERT(args->index >= 0);
- entry = &leaf->entries[ args->index ];
+ leaf = bp->b_addr;
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
#ifdef DEBUG
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ ASSERT(args->index < ichdr.count);
+ ASSERT(args->index >= 0);
+
if (entry->flags & XFS_ATTR_LOCAL) {
- name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+ name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
namelen = name_loc->namelen;
name = (char *)name_loc->nameval;
} else {
- name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
namelen = name_rmt->namelen;
name = (char *)name_rmt->name;
}
- ASSERT(INT_GET(entry->hashval, ARCH_CONVERT) == args->hashval);
+ ASSERT(be32_to_cpu(entry->hashval) == args->hashval);
ASSERT(namelen == args->namelen);
ASSERT(memcmp(name, args->name, namelen) == 0);
#endif /* DEBUG */
entry->flags &= ~XFS_ATTR_INCOMPLETE;
- xfs_da_log_buf(args->trans, bp,
+ xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
if (args->rmtblkno) {
ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
- name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
- INT_SET(name_rmt->valueblk, ARCH_CONVERT, args->rmtblkno);
- INT_SET(name_rmt->valuelen, ARCH_CONVERT, args->valuelen);
- xfs_da_log_buf(args->trans, bp,
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+ name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
+ name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
+ xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
}
- xfs_da_buf_done(bp);
/*
* Commit the flag value change and start the next trans in series.
*/
- error = xfs_attr_rolltrans(&args->trans, args->dp);
-
- return(error);
+ return xfs_trans_roll(&args->trans, args->dp);
}
/*
* Set the INCOMPLETE flag on an entry in a leaf block.
*/
int
-xfs_attr_leaf_setflag(xfs_da_args_t *args)
+xfs_attr3_leaf_setflag(
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_remote_t *name_rmt;
- xfs_dabuf_t *bp;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_buf *bp;
int error;
+#ifdef DEBUG
+ struct xfs_attr3_icleaf_hdr ichdr;
+#endif
+
+ trace_xfs_attr_leaf_setflag(args);
/*
* Set up the operation.
*/
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
- XFS_ATTR_FORK);
- if (error) {
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ if (error)
return(error);
- }
- ASSERT(bp != NULL);
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
- ASSERT(args->index < INT_GET(leaf->hdr.count, ARCH_CONVERT));
+ leaf = bp->b_addr;
+#ifdef DEBUG
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ ASSERT(args->index < ichdr.count);
ASSERT(args->index >= 0);
- entry = &leaf->entries[ args->index ];
+#endif
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
entry->flags |= XFS_ATTR_INCOMPLETE;
- xfs_da_log_buf(args->trans, bp,
+ xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
- name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
name_rmt->valueblk = 0;
name_rmt->valuelen = 0;
- xfs_da_log_buf(args->trans, bp,
+ xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
}
- xfs_da_buf_done(bp);
/*
* Commit the flag value change and start the next trans in series.
*/
- error = xfs_attr_rolltrans(&args->trans, args->dp);
-
- return(error);
+ return xfs_trans_roll(&args->trans, args->dp);
}
/*
@@ -2698,77 +2584,80 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
* Note that they could be in different blocks, or in the same block.
*/
int
-xfs_attr_leaf_flipflags(xfs_da_args_t *args)
+xfs_attr3_leaf_flipflags(
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf1, *leaf2;
- xfs_attr_leaf_entry_t *entry1, *entry2;
- xfs_attr_leaf_name_remote_t *name_rmt;
- xfs_dabuf_t *bp1, *bp2;
+ struct xfs_attr_leafblock *leaf1;
+ struct xfs_attr_leafblock *leaf2;
+ struct xfs_attr_leaf_entry *entry1;
+ struct xfs_attr_leaf_entry *entry2;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_buf *bp1;
+ struct xfs_buf *bp2;
int error;
#ifdef DEBUG
+ struct xfs_attr3_icleaf_hdr ichdr1;
+ struct xfs_attr3_icleaf_hdr ichdr2;
xfs_attr_leaf_name_local_t *name_loc;
int namelen1, namelen2;
char *name1, *name2;
#endif /* DEBUG */
+ trace_xfs_attr_leaf_flipflags(args);
+
/*
* Read the block containing the "old" attr
*/
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1,
- XFS_ATTR_FORK);
- if (error) {
- return(error);
- }
- ASSERT(bp1 != NULL);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
+ if (error)
+ return error;
/*
* Read the block containing the "new" attr, if it is different
*/
if (args->blkno2 != args->blkno) {
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno2,
- -1, &bp2, XFS_ATTR_FORK);
- if (error) {
- return(error);
- }
- ASSERT(bp2 != NULL);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
+ -1, &bp2);
+ if (error)
+ return error;
} else {
bp2 = bp1;
}
- leaf1 = bp1->data;
- ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
- ASSERT(args->index < INT_GET(leaf1->hdr.count, ARCH_CONVERT));
+ leaf1 = bp1->b_addr;
+ entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index];
+
+ leaf2 = bp2->b_addr;
+ entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
+
+#ifdef DEBUG
+ xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
+ ASSERT(args->index < ichdr1.count);
ASSERT(args->index >= 0);
- entry1 = &leaf1->entries[ args->index ];
- leaf2 = bp2->data;
- ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
- ASSERT(args->index2 < INT_GET(leaf2->hdr.count, ARCH_CONVERT));
+ xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+ ASSERT(args->index2 < ichdr2.count);
ASSERT(args->index2 >= 0);
- entry2 = &leaf2->entries[ args->index2 ];
-#ifdef DEBUG
if (entry1->flags & XFS_ATTR_LOCAL) {
- name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf1, args->index);
+ name_loc = xfs_attr3_leaf_name_local(leaf1, args->index);
namelen1 = name_loc->namelen;
name1 = (char *)name_loc->nameval;
} else {
- name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
namelen1 = name_rmt->namelen;
name1 = (char *)name_rmt->name;
}
if (entry2->flags & XFS_ATTR_LOCAL) {
- name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf2, args->index2);
+ name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2);
namelen2 = name_loc->namelen;
name2 = (char *)name_loc->nameval;
} else {
- name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
namelen2 = name_rmt->namelen;
name2 = (char *)name_rmt->name;
}
- ASSERT(INT_GET(entry1->hashval, ARCH_CONVERT) == INT_GET(entry2->hashval, ARCH_CONVERT));
+ ASSERT(be32_to_cpu(entry1->hashval) == be32_to_cpu(entry2->hashval));
ASSERT(namelen1 == namelen2);
ASSERT(memcmp(name1, name2, namelen1) == 0);
#endif /* DEBUG */
@@ -2777,406 +2666,32 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);
entry1->flags &= ~XFS_ATTR_INCOMPLETE;
- xfs_da_log_buf(args->trans, bp1,
+ xfs_trans_log_buf(args->trans, bp1,
XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
if (args->rmtblkno) {
ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
- name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index);
- INT_SET(name_rmt->valueblk, ARCH_CONVERT, args->rmtblkno);
- INT_SET(name_rmt->valuelen, ARCH_CONVERT, args->valuelen);
- xfs_da_log_buf(args->trans, bp1,
+ name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
+ name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
+ name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
+ xfs_trans_log_buf(args->trans, bp1,
XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
}
entry2->flags |= XFS_ATTR_INCOMPLETE;
- xfs_da_log_buf(args->trans, bp2,
+ xfs_trans_log_buf(args->trans, bp2,
XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
- name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
name_rmt->valueblk = 0;
name_rmt->valuelen = 0;
- xfs_da_log_buf(args->trans, bp2,
+ xfs_trans_log_buf(args->trans, bp2,
XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
}
- xfs_da_buf_done(bp1);
- if (bp1 != bp2)
- xfs_da_buf_done(bp2);
/*
* Commit the flag value change and start the next trans in series.
*/
- error = xfs_attr_rolltrans(&args->trans, args->dp);
-
- return(error);
-}
-
-/*========================================================================
- * Indiscriminately delete the entire attribute fork
- *========================================================================*/
-
-/*
- * Recurse (gasp!) through the attribute nodes until we find leaves.
- * We're doing a depth-first traversal in order to invalidate everything.
- */
-int
-xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
-{
- xfs_da_blkinfo_t *info;
- xfs_daddr_t blkno;
- xfs_dabuf_t *bp;
- int error;
-
- /*
- * Read block 0 to see what we have to work with.
- * We only get here if we have extents, since we remove
- * the extents in reverse order the extent containing
- * block 0 must still be there.
- */
- error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
- if (error)
- return(error);
- blkno = xfs_da_blkno(bp);
-
- /*
- * Invalidate the tree, even if the "tree" is only a single leaf block.
- * This is a depth-first traversal!
- */
- info = bp->data;
- if (INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
- error = xfs_attr_node_inactive(trans, dp, bp, 1);
- } else if (INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC) {
- error = xfs_attr_leaf_inactive(trans, dp, bp);
- } else {
- error = XFS_ERROR(EIO);
- xfs_da_brelse(*trans, bp);
- }
- if (error)
- return(error);
-
- /*
- * Invalidate the incore copy of the root block.
- */
- error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
- if (error)
- return(error);
- xfs_da_binval(*trans, bp); /* remove from cache */
- /*
- * Commit the invalidate and start the next transaction.
- */
- error = xfs_attr_rolltrans(trans, dp);
-
- return (error);
-}
-
-/*
- * Recurse (gasp!) through the attribute nodes until we find leaves.
- * We're doing a depth-first traversal in order to invalidate everything.
- */
-STATIC int
-xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp,
- int level)
-{
- xfs_da_blkinfo_t *info;
- xfs_da_intnode_t *node;
- xfs_dablk_t child_fsb;
- xfs_daddr_t parent_blkno, child_blkno;
- int error, count, i;
- xfs_dabuf_t *child_bp;
-
- /*
- * Since this code is recursive (gasp!) we must protect ourselves.
- */
- if (level > XFS_DA_NODE_MAXDEPTH) {
- xfs_da_brelse(*trans, bp); /* no locks for later trans */
- return(XFS_ERROR(EIO));
- }
-
- node = bp->data;
- ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT)
- == XFS_DA_NODE_MAGIC);
- parent_blkno = xfs_da_blkno(bp); /* save for re-read later */
- count = INT_GET(node->hdr.count, ARCH_CONVERT);
- if (!count) {
- xfs_da_brelse(*trans, bp);
- return(0);
- }
- child_fsb = INT_GET(node->btree[0].before, ARCH_CONVERT);
- xfs_da_brelse(*trans, bp); /* no locks for later trans */
-
- /*
- * If this is the node level just above the leaves, simply loop
- * over the leaves removing all of them. If this is higher up
- * in the tree, recurse downward.
- */
- for (i = 0; i < count; i++) {
- /*
- * Read the subsidiary block to see what we have to work with.
- * Don't do this in a transaction. This is a depth-first
- * traversal of the tree so we may deal with many blocks
- * before we come back to this one.
- */
- error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp,
- XFS_ATTR_FORK);
- if (error)
- return(error);
- if (child_bp) {
- /* save for re-read later */
- child_blkno = xfs_da_blkno(child_bp);
-
- /*
- * Invalidate the subtree, however we have to.
- */
- info = child_bp->data;
- if (INT_GET(info->magic, ARCH_CONVERT)
- == XFS_DA_NODE_MAGIC) {
- error = xfs_attr_node_inactive(trans, dp,
- child_bp, level+1);
- } else if (INT_GET(info->magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC) {
- error = xfs_attr_leaf_inactive(trans, dp,
- child_bp);
- } else {
- error = XFS_ERROR(EIO);
- xfs_da_brelse(*trans, child_bp);
- }
- if (error)
- return(error);
-
- /*
- * Remove the subsidiary block from the cache
- * and from the log.
- */
- error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
- &child_bp, XFS_ATTR_FORK);
- if (error)
- return(error);
- xfs_da_binval(*trans, child_bp);
- }
-
- /*
- * If we're not done, re-read the parent to get the next
- * child block number.
- */
- if ((i+1) < count) {
- error = xfs_da_read_buf(*trans, dp, 0, parent_blkno,
- &bp, XFS_ATTR_FORK);
- if (error)
- return(error);
- child_fsb = INT_GET(node->btree[i+1].before, ARCH_CONVERT);
- xfs_da_brelse(*trans, bp);
- }
- /*
- * Atomically commit the whole invalidate stuff.
- */
- if ((error = xfs_attr_rolltrans(trans, dp)))
- return (error);
- }
-
- return(0);
-}
-
-/*
- * Invalidate all of the "remote" value regions pointed to by a particular
- * leaf block.
- * Note that we must release the lock on the buffer so that we are not
- * caught holding something that the logging code wants to flush to disk.
- */
-STATIC int
-xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
-{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_remote_t *name_rmt;
- xfs_attr_inactive_list_t *list, *lp;
- int error, count, size, tmp, i;
-
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
- == XFS_ATTR_LEAF_MAGIC);
-
- /*
- * Count the number of "remote" value extents.
- */
- count = 0;
- entry = &leaf->entries[0];
- for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
- if ( INT_GET(entry->nameidx, ARCH_CONVERT)
- && ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
- name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
- if (name_rmt->valueblk)
- count++;
- }
- }
-
- /*
- * If there are no "remote" values, we're done.
- */
- if (count == 0) {
- xfs_da_brelse(*trans, bp);
- return(0);
- }
-
- /*
- * Allocate storage for a list of all the "remote" value extents.
- */
- size = count * sizeof(xfs_attr_inactive_list_t);
- list = (xfs_attr_inactive_list_t *)kmem_alloc(size, KM_SLEEP);
-
- /*
- * Identify each of the "remote" value extents.
- */
- lp = list;
- entry = &leaf->entries[0];
- for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
- if ( INT_GET(entry->nameidx, ARCH_CONVERT)
- && ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
- name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
- if (name_rmt->valueblk) {
- /* both on-disk, don't endian flip twice */
- lp->valueblk = name_rmt->valueblk;
- INT_SET(lp->valuelen, ARCH_CONVERT,
- XFS_B_TO_FSB(dp->i_mount,
- INT_GET(name_rmt->valuelen,
- ARCH_CONVERT)));
- lp++;
- }
- }
- }
- xfs_da_brelse(*trans, bp); /* unlock for trans. in freextent() */
-
- /*
- * Invalidate each of the "remote" value extents.
- */
- error = 0;
- for (lp = list, i = 0; i < count; i++, lp++) {
- tmp = xfs_attr_leaf_freextent(trans, dp,
- INT_GET(lp->valueblk,
- ARCH_CONVERT),
- INT_GET(lp->valuelen,
- ARCH_CONVERT));
- if (error == 0)
- error = tmp; /* save only the 1st errno */
- }
-
- kmem_free((xfs_caddr_t)list, size);
- return(error);
-}
-
-/*
- * Look at all the extents for this logical region,
- * invalidate any buffers that are incore/in transactions.
- */
-STATIC int
-xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
- xfs_dablk_t blkno, int blkcnt)
-{
- xfs_bmbt_irec_t map;
- xfs_dablk_t tblkno;
- int tblkcnt, dblkcnt, nmap, error;
- xfs_daddr_t dblkno;
- xfs_buf_t *bp;
-
- /*
- * Roll through the "value", invalidating the attribute value's
- * blocks.
- */
- tblkno = blkno;
- tblkcnt = blkcnt;
- while (tblkcnt > 0) {
- /*
- * Try to remember where we decided to put the value.
- */
- nmap = 1;
- error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- NULL, 0, &map, &nmap, NULL);
- if (error) {
- return(error);
- }
- ASSERT(nmap == 1);
- ASSERT(map.br_startblock != DELAYSTARTBLOCK);
-
- /*
- * If it's a hole, these are already unmapped
- * so there's nothing to invalidate.
- */
- if (map.br_startblock != HOLESTARTBLOCK) {
-
- dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
- map.br_startblock);
- dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
- map.br_blockcount);
- bp = xfs_trans_get_buf(*trans,
- dp->i_mount->m_ddev_targp,
- dblkno, dblkcnt, XFS_BUF_LOCK);
- xfs_trans_binval(*trans, bp);
- /*
- * Roll to next transaction.
- */
- if ((error = xfs_attr_rolltrans(trans, dp)))
- return (error);
- }
-
- tblkno += map.br_blockcount;
- tblkcnt -= map.br_blockcount;
- }
-
- return(0);
-}
-
-
-/*
- * Roll from one trans in the sequence of PERMANENT transactions to the next.
- */
-int
-xfs_attr_rolltrans(xfs_trans_t **transp, xfs_inode_t *dp)
-{
- xfs_trans_t *trans;
- unsigned int logres, count;
- int error;
-
- /*
- * Ensure that the inode is always logged.
- */
- trans = *transp;
- xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
-
- /*
- * Copy the critical parameters from one trans to the next.
- */
- logres = trans->t_log_res;
- count = trans->t_log_count;
- *transp = xfs_trans_dup(trans);
-
- /*
- * Commit the current transaction.
- * If this commit failed, then it'd just unlock those items that
- * are not marked ihold. That also means that a filesystem shutdown
- * is in progress. The caller takes the responsibility to cancel
- * the duplicate transaction that gets returned.
- */
- if ((error = xfs_trans_commit(trans, 0, NULL)))
- return (error);
-
- trans = *transp;
-
- /*
- * Reserve space in the log for th next transaction.
- * This also pushes items in the "AIL", the list of logged items,
- * out to disk if they are taking up space at the tail of the log
- * that we want to use. This requires that either nothing be locked
- * across this call, or that anything that is locked be logged in
- * the prior and the next transactions.
- */
- error = xfs_trans_reserve(trans, 0, logres, 0,
- XFS_TRANS_PERM_LOG_RES, count);
- /*
- * Ensure that the inode is in the new transaction and locked.
- */
- if (!error) {
- xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(trans, dp);
- }
- return (error);
+ error = xfs_trans_roll(&args->trans, args->dp);
+ return error;
}
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 541e34109bb..e2929da7c3b 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -18,197 +19,15 @@
#ifndef __XFS_ATTR_LEAF_H__
#define __XFS_ATTR_LEAF_H__
-/*
- * Attribute storage layout, internal structure, access macros, etc.
- *
- * Attribute lists are structured around Btrees where all the data
- * elements are in the leaf nodes. Attribute names are hashed into an int,
- * then that int is used as the index into the Btree. Since the hashval
- * of an attribute name may not be unique, we may have duplicate keys. The
- * internal links in the Btree are logical block offsets into the file.
- */
-
struct attrlist;
struct attrlist_cursor_kern;
-struct attrnames;
-struct xfs_dabuf;
+struct xfs_attr_list_context;
struct xfs_da_args;
struct xfs_da_state;
struct xfs_da_state_blk;
struct xfs_inode;
struct xfs_trans;
-/*========================================================================
- * Attribute structure when equal to XFS_LBSIZE(mp) bytes.
- *========================================================================*/
-
-/*
- * This is the structure of the leaf nodes in the Btree.
- *
- * Struct leaf_entry's are packed from the top. Name/values grow from the
- * bottom but are not packed. The freemap contains run-length-encoded entries
- * for the free bytes after the leaf_entry's, but only the N largest such,
- * smaller runs are dropped. When the freemap doesn't show enough space
- * for an allocation, we compact the name/value area and try again. If we
- * still don't have enough space, then we have to split the block. The
- * name/value structs (both local and remote versions) must be 32bit aligned.
- *
- * Since we have duplicate hash keys, for each key that matches, compare
- * the actual name string. The root and intermediate node search always
- * takes the first-in-the-block key match found, so we should only have
- * to work "forw"ard. If none matches, continue with the "forw"ard leaf
- * nodes until the hash key changes or the attribute name is found.
- *
- * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
- * the leaf_entry. The namespaces are independent only because we also look
- * at the namespace bit when we are looking for a matching attribute name.
- *
- * We also store an "incomplete" bit in the leaf_entry. It shows that an
- * attribute is in the middle of being created and should not be shown to
- * the user if we crash during the time that the bit is set. We clear the
- * bit when we have finished setting up the attribute. We do this because
- * we cannot create some large attributes inside a single transaction, and we
- * need some indication that we weren't finished if we crash in the middle.
- */
-#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
-
-typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
- __uint16_t base; /* base of free region */
- __uint16_t size; /* length of free region */
-} xfs_attr_leaf_map_t;
-
-typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */
- xfs_da_blkinfo_t info; /* block type, links, etc. */
- __uint16_t count; /* count of active leaf_entry's */
- __uint16_t usedbytes; /* num bytes of names/values stored */
- __uint16_t firstused; /* first used byte in name area */
- __uint8_t holes; /* != 0 if blk needs compaction */
- __uint8_t pad1;
- xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
- /* N largest free regions */
-} xfs_attr_leaf_hdr_t;
-
-typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */
- xfs_dahash_t hashval; /* hash value of name */
- __uint16_t nameidx; /* index into buffer of name/value */
- __uint8_t flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
- __uint8_t pad2; /* unused pad byte */
-} xfs_attr_leaf_entry_t;
-
-typedef struct xfs_attr_leaf_name_local {
- __uint16_t valuelen; /* number of bytes in value */
- __uint8_t namelen; /* length of name bytes */
- __uint8_t nameval[1]; /* name/value bytes */
-} xfs_attr_leaf_name_local_t;
-
-typedef struct xfs_attr_leaf_name_remote {
- xfs_dablk_t valueblk; /* block number of value bytes */
- __uint32_t valuelen; /* number of bytes in value */
- __uint8_t namelen; /* length of name bytes */
- __uint8_t name[1]; /* name bytes */
-} xfs_attr_leaf_name_remote_t;
-
-typedef struct xfs_attr_leafblock {
- xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */
- xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */
- xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */
- xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */
-} xfs_attr_leafblock_t;
-
-/*
- * Flags used in the leaf_entry[i].flags field.
- * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
- * on the system call, they are "or"ed together for various operations.
- */
-#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
-#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
-#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
-#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
-#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT)
-#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT)
-#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT)
-#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
-
-/*
- * Alignment for namelist and valuelist entries (since they are mixed
- * there can be only one alignment value)
- */
-#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t))
-
-/*
- * Cast typed pointers for "local" and "remote" name/value structs.
- */
-#define XFS_ATTR_LEAF_NAME_REMOTE(leafp,idx) \
- xfs_attr_leaf_name_remote(leafp,idx)
-static inline xfs_attr_leaf_name_remote_t *
-xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
-{
- return (xfs_attr_leaf_name_remote_t *) &((char *)
- (leafp))[INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT)];
-}
-
-#define XFS_ATTR_LEAF_NAME_LOCAL(leafp,idx) \
- xfs_attr_leaf_name_local(leafp,idx)
-static inline xfs_attr_leaf_name_local_t *
-xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
-{
- return (xfs_attr_leaf_name_local_t *) &((char *)
- (leafp))[INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT)];
-}
-
-#define XFS_ATTR_LEAF_NAME(leafp,idx) \
- xfs_attr_leaf_name(leafp,idx)
-static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
-{
- return (&((char *)
- (leafp))[INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT)]);
-}
-
-/*
- * Calculate total bytes used (including trailing pad for alignment) for
- * a "local" name/value structure, a "remote" name/value structure, and
- * a pointer which might be either.
- */
-#define XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen) \
- xfs_attr_leaf_entsize_remote(nlen)
-static inline int xfs_attr_leaf_entsize_remote(int nlen)
-{
- return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
- XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
-}
-
-#define XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen,vlen) \
- xfs_attr_leaf_entsize_local(nlen,vlen)
-static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
-{
- return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
- XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
-}
-
-#define XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize) \
- xfs_attr_leaf_entsize_local_max(bsize)
-static inline int xfs_attr_leaf_entsize_local_max(int bsize)
-{
- return (((bsize) >> 1) + ((bsize) >> 2));
-}
-
-
-/*========================================================================
- * Structure used to pass context around among the routines.
- *========================================================================*/
-
-typedef struct xfs_attr_list_context {
- struct xfs_inode *dp; /* inode */
- struct attrlist_cursor_kern *cursor;/* position in list */
- struct attrlist *alist; /* output buffer */
- int count; /* num used entries */
- int dupcnt; /* count dup hashvals seen */
- int bufsize;/* total buffer size */
- int firstu; /* first used byte in buffer */
- int flags; /* from VOP call */
- int resynch;/* T/F: resynch with cursor */
-} xfs_attr_list_context_t;
-
/*
* Used to keep a list of "remote value" extents when unlinking an inode.
*/
@@ -232,53 +51,58 @@ int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
int xfs_attr_shortform_remove(struct xfs_da_args *args);
int xfs_attr_shortform_list(struct xfs_attr_list_context *context);
-int xfs_attr_shortform_allfit(struct xfs_dabuf *bp, struct xfs_inode *dp);
+int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
/*
* Internal routines when attribute fork size == XFS_LBSIZE(mp).
*/
-int xfs_attr_leaf_to_node(struct xfs_da_args *args);
-int xfs_attr_leaf_to_shortform(struct xfs_dabuf *bp,
+int xfs_attr3_leaf_to_node(struct xfs_da_args *args);
+int xfs_attr3_leaf_to_shortform(struct xfs_buf *bp,
struct xfs_da_args *args, int forkoff);
-int xfs_attr_leaf_clearflag(struct xfs_da_args *args);
-int xfs_attr_leaf_setflag(struct xfs_da_args *args);
-int xfs_attr_leaf_flipflags(xfs_da_args_t *args);
+int xfs_attr3_leaf_clearflag(struct xfs_da_args *args);
+int xfs_attr3_leaf_setflag(struct xfs_da_args *args);
+int xfs_attr3_leaf_flipflags(struct xfs_da_args *args);
/*
* Routines used for growing the Btree.
*/
-int xfs_attr_leaf_split(struct xfs_da_state *state,
+int xfs_attr3_leaf_split(struct xfs_da_state *state,
struct xfs_da_state_blk *oldblk,
struct xfs_da_state_blk *newblk);
-int xfs_attr_leaf_lookup_int(struct xfs_dabuf *leaf,
+int xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf,
struct xfs_da_args *args);
-int xfs_attr_leaf_getvalue(struct xfs_dabuf *bp, struct xfs_da_args *args);
-int xfs_attr_leaf_add(struct xfs_dabuf *leaf_buffer,
+int xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args);
+int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer,
struct xfs_da_args *args);
-int xfs_attr_leaf_remove(struct xfs_dabuf *leaf_buffer,
+int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer,
struct xfs_da_args *args);
-int xfs_attr_leaf_list_int(struct xfs_dabuf *bp,
+int xfs_attr3_leaf_list_int(struct xfs_buf *bp,
struct xfs_attr_list_context *context);
/*
* Routines used for shrinking the Btree.
*/
-int xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval);
-void xfs_attr_leaf_unbalance(struct xfs_da_state *state,
+int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
+void xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
struct xfs_da_state_blk *drop_blk,
struct xfs_da_state_blk *save_blk);
-int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
+int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
/*
* Utility routines.
*/
-xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_dabuf *bp, int *count);
-int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp,
- struct xfs_dabuf *leaf2_bp);
-int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
- int *local);
-int xfs_attr_rolltrans(struct xfs_trans **transp, struct xfs_inode *dp);
+xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count);
+int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
+ struct xfs_buf *leaf2_bp);
+int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
+int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp);
+void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to,
+ struct xfs_attr_leafblock *from);
+void xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to,
+ struct xfs_attr3_icleaf_hdr *from);
#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
new file mode 100644
index 00000000000..90e2eeb2120
--- /dev/null
+++ b/fs/xfs/xfs_attr_list.c
@@ -0,0 +1,653 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+#include "xfs_dinode.h"
+#include "xfs_dir2.h"
+
+STATIC int
+xfs_attr_shortform_compare(const void *a, const void *b)
+{
+ xfs_attr_sf_sort_t *sa, *sb;
+
+ sa = (xfs_attr_sf_sort_t *)a;
+ sb = (xfs_attr_sf_sort_t *)b;
+ if (sa->hash < sb->hash) {
+ return(-1);
+ } else if (sa->hash > sb->hash) {
+ return(1);
+ } else {
+ return(sa->entno - sb->entno);
+ }
+}
+
+#define XFS_ISRESET_CURSOR(cursor) \
+ (!((cursor)->initted) && !((cursor)->hashval) && \
+ !((cursor)->blkno) && !((cursor)->offset))
+/*
+ * Copy out entries of shortform attribute lists for attr_list().
+ * Shortform attribute lists are not stored in hashval sorted order.
+ * If the output buffer is not large enough to hold them all, then we
+ * we have to calculate each entries' hashvalue and sort them before
+ * we can begin returning them to the user.
+ */
+int
+xfs_attr_shortform_list(xfs_attr_list_context_t *context)
+{
+ attrlist_cursor_kern_t *cursor;
+ xfs_attr_sf_sort_t *sbuf, *sbp;
+ xfs_attr_shortform_t *sf;
+ xfs_attr_sf_entry_t *sfe;
+ xfs_inode_t *dp;
+ int sbsize, nsbuf, count, i;
+ int error;
+
+ ASSERT(context != NULL);
+ dp = context->dp;
+ ASSERT(dp != NULL);
+ ASSERT(dp->i_afp != NULL);
+ sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+ ASSERT(sf != NULL);
+ if (!sf->hdr.count)
+ return(0);
+ cursor = context->cursor;
+ ASSERT(cursor != NULL);
+
+ trace_xfs_attr_list_sf(context);
+
+ /*
+ * If the buffer is large enough and the cursor is at the start,
+ * do not bother with sorting since we will return everything in
+ * one buffer and another call using the cursor won't need to be
+ * made.
+ * Note the generous fudge factor of 16 overhead bytes per entry.
+ * If bufsize is zero then put_listent must be a search function
+ * and can just scan through what we have.
+ */
+ if (context->bufsize == 0 ||
+ (XFS_ISRESET_CURSOR(cursor) &&
+ (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
+ for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+ error = context->put_listent(context,
+ sfe->flags,
+ sfe->nameval,
+ (int)sfe->namelen,
+ (int)sfe->valuelen,
+ &sfe->nameval[sfe->namelen]);
+
+ /*
+ * Either search callback finished early or
+ * didn't fit it all in the buffer after all.
+ */
+ if (context->seen_enough)
+ break;
+
+ if (error)
+ return error;
+ sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ }
+ trace_xfs_attr_list_sf_all(context);
+ return(0);
+ }
+
+ /* do no more for a search callback */
+ if (context->bufsize == 0)
+ return 0;
+
+ /*
+ * It didn't all fit, so we have to sort everything on hashval.
+ */
+ sbsize = sf->hdr.count * sizeof(*sbuf);
+ sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
+
+ /*
+ * Scan the attribute list for the rest of the entries, storing
+ * the relevant info from only those that match into a buffer.
+ */
+ nsbuf = 0;
+ for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+ if (unlikely(
+ ((char *)sfe < (char *)sf) ||
+ ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
+ XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
+ XFS_ERRLEVEL_LOW,
+ context->dp->i_mount, sfe);
+ kmem_free(sbuf);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ sbp->entno = i;
+ sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
+ sbp->name = sfe->nameval;
+ sbp->namelen = sfe->namelen;
+ /* These are bytes, and both on-disk, don't endian-flip */
+ sbp->valuelen = sfe->valuelen;
+ sbp->flags = sfe->flags;
+ sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ sbp++;
+ nsbuf++;
+ }
+
+ /*
+ * Sort the entries on hash then entno.
+ */
+ xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
+
+ /*
+ * Re-find our place IN THE SORTED LIST.
+ */
+ count = 0;
+ cursor->initted = 1;
+ cursor->blkno = 0;
+ for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
+ if (sbp->hash == cursor->hashval) {
+ if (cursor->offset == count) {
+ break;
+ }
+ count++;
+ } else if (sbp->hash > cursor->hashval) {
+ break;
+ }
+ }
+ if (i == nsbuf) {
+ kmem_free(sbuf);
+ return(0);
+ }
+
+ /*
+ * Loop putting entries into the user buffer.
+ */
+ for ( ; i < nsbuf; i++, sbp++) {
+ if (cursor->hashval != sbp->hash) {
+ cursor->hashval = sbp->hash;
+ cursor->offset = 0;
+ }
+ error = context->put_listent(context,
+ sbp->flags,
+ sbp->name,
+ sbp->namelen,
+ sbp->valuelen,
+ &sbp->name[sbp->namelen]);
+ if (error)
+ return error;
+ if (context->seen_enough)
+ break;
+ cursor->offset++;
+ }
+
+ kmem_free(sbuf);
+ return(0);
+}
+
+STATIC int
+xfs_attr_node_list(xfs_attr_list_context_t *context)
+{
+ attrlist_cursor_kern_t *cursor;
+ xfs_attr_leafblock_t *leaf;
+ xfs_da_intnode_t *node;
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_node_entry *btree;
+ int error, i;
+ struct xfs_buf *bp;
+ struct xfs_inode *dp = context->dp;
+
+ trace_xfs_attr_node_list(context);
+
+ cursor = context->cursor;
+ cursor->initted = 1;
+
+ /*
+ * Do all sorts of validation on the passed-in cursor structure.
+ * If anything is amiss, ignore the cursor and look up the hashval
+ * starting from the btree root.
+ */
+ bp = NULL;
+ if (cursor->blkno > 0) {
+ error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1,
+ &bp, XFS_ATTR_FORK);
+ if ((error != 0) && (error != EFSCORRUPTED))
+ return(error);
+ if (bp) {
+ struct xfs_attr_leaf_entry *entries;
+
+ node = bp->b_addr;
+ switch (be16_to_cpu(node->hdr.info.magic)) {
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_trans_brelse(NULL, bp);
+ bp = NULL;
+ break;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+ if (cursor->hashval > be32_to_cpu(
+ entries[leafhdr.count - 1].hashval)) {
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_trans_brelse(NULL, bp);
+ bp = NULL;
+ } else if (cursor->hashval <= be32_to_cpu(
+ entries[0].hashval)) {
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_trans_brelse(NULL, bp);
+ bp = NULL;
+ }
+ break;
+ default:
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_trans_brelse(NULL, bp);
+ bp = NULL;
+ }
+ }
+ }
+
+ /*
+ * We did not find what we expected given the cursor's contents,
+ * so we start from the top and work down based on the hash value.
+ * Note that start of node block is same as start of leaf block.
+ */
+ if (bp == NULL) {
+ cursor->blkno = 0;
+ for (;;) {
+ __uint16_t magic;
+
+ error = xfs_da3_node_read(NULL, dp,
+ cursor->blkno, -1, &bp,
+ XFS_ATTR_FORK);
+ if (error)
+ return(error);
+ node = bp->b_addr;
+ magic = be16_to_cpu(node->hdr.info.magic);
+ if (magic == XFS_ATTR_LEAF_MAGIC ||
+ magic == XFS_ATTR3_LEAF_MAGIC)
+ break;
+ if (magic != XFS_DA_NODE_MAGIC &&
+ magic != XFS_DA3_NODE_MAGIC) {
+ XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
+ XFS_ERRLEVEL_LOW,
+ context->dp->i_mount,
+ node);
+ xfs_trans_brelse(NULL, bp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
+ for (i = 0; i < nodehdr.count; btree++, i++) {
+ if (cursor->hashval
+ <= be32_to_cpu(btree->hashval)) {
+ cursor->blkno = be32_to_cpu(btree->before);
+ trace_xfs_attr_list_node_descend(context,
+ btree);
+ break;
+ }
+ }
+ if (i == nodehdr.count) {
+ xfs_trans_brelse(NULL, bp);
+ return 0;
+ }
+ xfs_trans_brelse(NULL, bp);
+ }
+ }
+ ASSERT(bp != NULL);
+
+ /*
+ * Roll upward through the blocks, processing each leaf block in
+ * order. As long as there is space in the result buffer, keep
+ * adding the information.
+ */
+ for (;;) {
+ leaf = bp->b_addr;
+ error = xfs_attr3_leaf_list_int(bp, context);
+ if (error) {
+ xfs_trans_brelse(NULL, bp);
+ return error;
+ }
+ xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+ if (context->seen_enough || leafhdr.forw == 0)
+ break;
+ cursor->blkno = leafhdr.forw;
+ xfs_trans_brelse(NULL, bp);
+ error = xfs_attr3_leaf_read(NULL, dp, cursor->blkno, -1, &bp);
+ if (error)
+ return error;
+ }
+ xfs_trans_brelse(NULL, bp);
+ return 0;
+}
+
+/*
+ * Copy out attribute list entries for attr_list(), for leaf attribute lists.
+ */
+int
+xfs_attr3_leaf_list_int(
+ struct xfs_buf *bp,
+ struct xfs_attr_list_context *context)
+{
+ struct attrlist_cursor_kern *cursor;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entries;
+ struct xfs_attr_leaf_entry *entry;
+ int retval;
+ int i;
+
+ trace_xfs_attr_list_leaf(context);
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+
+ cursor = context->cursor;
+ cursor->initted = 1;
+
+ /*
+ * Re-find our place in the leaf block if this is a new syscall.
+ */
+ if (context->resynch) {
+ entry = &entries[0];
+ for (i = 0; i < ichdr.count; entry++, i++) {
+ if (be32_to_cpu(entry->hashval) == cursor->hashval) {
+ if (cursor->offset == context->dupcnt) {
+ context->dupcnt = 0;
+ break;
+ }
+ context->dupcnt++;
+ } else if (be32_to_cpu(entry->hashval) >
+ cursor->hashval) {
+ context->dupcnt = 0;
+ break;
+ }
+ }
+ if (i == ichdr.count) {
+ trace_xfs_attr_list_notfound(context);
+ return 0;
+ }
+ } else {
+ entry = &entries[0];
+ i = 0;
+ }
+ context->resynch = 0;
+
+ /*
+ * We have found our place, start copying out the new attributes.
+ */
+ retval = 0;
+ for (; i < ichdr.count; entry++, i++) {
+ if (be32_to_cpu(entry->hashval) != cursor->hashval) {
+ cursor->hashval = be32_to_cpu(entry->hashval);
+ cursor->offset = 0;
+ }
+
+ if (entry->flags & XFS_ATTR_INCOMPLETE)
+ continue; /* skip incomplete entries */
+
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ xfs_attr_leaf_name_local_t *name_loc =
+ xfs_attr3_leaf_name_local(leaf, i);
+
+ retval = context->put_listent(context,
+ entry->flags,
+ name_loc->nameval,
+ (int)name_loc->namelen,
+ be16_to_cpu(name_loc->valuelen),
+ &name_loc->nameval[name_loc->namelen]);
+ if (retval)
+ return retval;
+ } else {
+ xfs_attr_leaf_name_remote_t *name_rmt =
+ xfs_attr3_leaf_name_remote(leaf, i);
+
+ int valuelen = be32_to_cpu(name_rmt->valuelen);
+
+ if (context->put_value) {
+ xfs_da_args_t args;
+
+ memset((char *)&args, 0, sizeof(args));
+ args.geo = context->dp->i_mount->m_attr_geo;
+ args.dp = context->dp;
+ args.whichfork = XFS_ATTR_FORK;
+ args.valuelen = valuelen;
+ args.rmtvaluelen = valuelen;
+ args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
+ args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
+ args.rmtblkcnt = xfs_attr3_rmt_blocks(
+ args.dp->i_mount, valuelen);
+ retval = xfs_attr_rmtval_get(&args);
+ if (retval)
+ return retval;
+ retval = context->put_listent(context,
+ entry->flags,
+ name_rmt->name,
+ (int)name_rmt->namelen,
+ valuelen,
+ args.value);
+ kmem_free(args.value);
+ } else {
+ retval = context->put_listent(context,
+ entry->flags,
+ name_rmt->name,
+ (int)name_rmt->namelen,
+ valuelen,
+ NULL);
+ }
+ if (retval)
+ return retval;
+ }
+ if (context->seen_enough)
+ break;
+ cursor->offset++;
+ }
+ trace_xfs_attr_list_leaf_end(context);
+ return retval;
+}
+
+/*
+ * Copy out attribute entries for attr_list(), for leaf attribute lists.
+ */
+STATIC int
+xfs_attr_leaf_list(xfs_attr_list_context_t *context)
+{
+ int error;
+ struct xfs_buf *bp;
+
+ trace_xfs_attr_leaf_list(context);
+
+ context->cursor->blkno = 0;
+ error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp);
+ if (error)
+ return XFS_ERROR(error);
+
+ error = xfs_attr3_leaf_list_int(bp, context);
+ xfs_trans_brelse(NULL, bp);
+ return XFS_ERROR(error);
+}
+
+int
+xfs_attr_list_int(
+ xfs_attr_list_context_t *context)
+{
+ int error;
+ xfs_inode_t *dp = context->dp;
+ uint lock_mode;
+
+ XFS_STATS_INC(xs_attr_list);
+
+ if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ return EIO;
+
+ /*
+ * Decide on what work routines to call based on the inode size.
+ */
+ lock_mode = xfs_ilock_attr_map_shared(dp);
+ if (!xfs_inode_hasattr(dp)) {
+ error = 0;
+ } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ error = xfs_attr_shortform_list(context);
+ } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+ error = xfs_attr_leaf_list(context);
+ } else {
+ error = xfs_attr_node_list(context);
+ }
+ xfs_iunlock(dp, lock_mode);
+ return error;
+}
+
+#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \
+ (((struct attrlist_ent *) 0)->a_name - (char *) 0)
+#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \
+ ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
+ & ~(sizeof(u_int32_t)-1))
+
+/*
+ * Format an attribute and copy it out to the user's buffer.
+ * Take care to check values and protect against them changing later,
+ * we may be reading them directly out of a user buffer.
+ */
+STATIC int
+xfs_attr_put_listent(
+ xfs_attr_list_context_t *context,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ int valuelen,
+ unsigned char *value)
+{
+ struct attrlist *alist = (struct attrlist *)context->alist;
+ attrlist_ent_t *aep;
+ int arraytop;
+
+ ASSERT(!(context->flags & ATTR_KERNOVAL));
+ ASSERT(context->count >= 0);
+ ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+ ASSERT(context->firstu >= sizeof(*alist));
+ ASSERT(context->firstu <= context->bufsize);
+
+ /*
+ * Only list entries in the right namespace.
+ */
+ if (((context->flags & ATTR_SECURE) == 0) !=
+ ((flags & XFS_ATTR_SECURE) == 0))
+ return 0;
+ if (((context->flags & ATTR_ROOT) == 0) !=
+ ((flags & XFS_ATTR_ROOT) == 0))
+ return 0;
+
+ arraytop = sizeof(*alist) +
+ context->count * sizeof(alist->al_offset[0]);
+ context->firstu -= ATTR_ENTSIZE(namelen);
+ if (context->firstu < arraytop) {
+ trace_xfs_attr_list_full(context);
+ alist->al_more = 1;
+ context->seen_enough = 1;
+ return 1;
+ }
+
+ aep = (attrlist_ent_t *)&context->alist[context->firstu];
+ aep->a_valuelen = valuelen;
+ memcpy(aep->a_name, name, namelen);
+ aep->a_name[namelen] = 0;
+ alist->al_offset[context->count++] = context->firstu;
+ alist->al_count = context->count;
+ trace_xfs_attr_list_add(context);
+ return 0;
+}
+
+/*
+ * Generate a list of extended attribute names and optionally
+ * also value lengths. Positive return value follows the XFS
+ * convention of being an error, zero or negative return code
+ * is the length of the buffer returned (negated), indicating
+ * success.
+ */
+int
+xfs_attr_list(
+ xfs_inode_t *dp,
+ char *buffer,
+ int bufsize,
+ int flags,
+ attrlist_cursor_kern_t *cursor)
+{
+ xfs_attr_list_context_t context;
+ struct attrlist *alist;
+ int error;
+
+ /*
+ * Validate the cursor.
+ */
+ if (cursor->pad1 || cursor->pad2)
+ return(XFS_ERROR(EINVAL));
+ if ((cursor->initted == 0) &&
+ (cursor->hashval || cursor->blkno || cursor->offset))
+ return XFS_ERROR(EINVAL);
+
+ /*
+ * Check for a properly aligned buffer.
+ */
+ if (((long)buffer) & (sizeof(int)-1))
+ return XFS_ERROR(EFAULT);
+ if (flags & ATTR_KERNOVAL)
+ bufsize = 0;
+
+ /*
+ * Initialize the output buffer.
+ */
+ memset(&context, 0, sizeof(context));
+ context.dp = dp;
+ context.cursor = cursor;
+ context.resynch = 1;
+ context.flags = flags;
+ context.alist = buffer;
+ context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */
+ context.firstu = context.bufsize;
+ context.put_listent = xfs_attr_put_listent;
+
+ alist = (struct attrlist *)context.alist;
+ alist->al_count = 0;
+ alist->al_more = 0;
+ alist->al_offset[0] = context.bufsize;
+
+ error = xfs_attr_list_int(&context);
+ ASSERT(error >= 0);
+ return error;
+}
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
new file mode 100644
index 00000000000..b5adfecbb8e
--- /dev/null
+++ b/fs/xfs/xfs_attr_remote.c
@@ -0,0 +1,628 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+#include "xfs_error.h"
+
+#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
+
+/*
+ * Each contiguous block has a header, so it is not just a simple attribute
+ * length to FSB conversion.
+ */
+int
+xfs_attr3_rmt_blocks(
+ struct xfs_mount *mp,
+ int attrlen)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+ return (attrlen + buflen - 1) / buflen;
+ }
+ return XFS_B_TO_FSB(mp, attrlen);
+}
+
+/*
+ * Checking of the remote attribute header is split into two parts. The verifier
+ * does CRC, location and bounds checking, the unpacking function checks the
+ * attribute parameters and owner.
+ */
+static bool
+xfs_attr3_rmt_hdr_ok(
+ void *ptr,
+ xfs_ino_t ino,
+ uint32_t offset,
+ uint32_t size,
+ xfs_daddr_t bno)
+{
+ struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+ if (bno != be64_to_cpu(rmt->rm_blkno))
+ return false;
+ if (offset != be32_to_cpu(rmt->rm_offset))
+ return false;
+ if (size != be32_to_cpu(rmt->rm_bytes))
+ return false;
+ if (ino != be64_to_cpu(rmt->rm_owner))
+ return false;
+
+ /* ok */
+ return true;
+}
+
+static bool
+xfs_attr3_rmt_verify(
+ struct xfs_mount *mp,
+ void *ptr,
+ int fsbsize,
+ xfs_daddr_t bno)
+{
+ struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
+ return false;
+ if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(rmt->rm_blkno) != bno)
+ return false;
+ if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
+ return false;
+ if (be32_to_cpu(rmt->rm_offset) +
+ be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
+ return false;
+ if (rmt->rm_owner == 0)
+ return false;
+
+ return true;
+}
+
+static void
+xfs_attr3_rmt_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ char *ptr;
+ int len;
+ xfs_daddr_t bno;
+ int blksize = mp->m_attr_geo->blksize;
+
+ /* no verification of non-crc buffers */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ ptr = bp->b_addr;
+ bno = bp->b_bn;
+ len = BBTOB(bp->b_length);
+ ASSERT(len >= blksize);
+
+ while (len > 0) {
+ if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) {
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ break;
+ }
+ if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ break;
+ }
+ len -= blksize;
+ ptr += blksize;
+ bno += BTOBB(blksize);
+ }
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
+ else
+ ASSERT(len == 0);
+}
+
+static void
+xfs_attr3_rmt_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ char *ptr;
+ int len;
+ xfs_daddr_t bno;
+ int blksize = mp->m_attr_geo->blksize;
+
+ /* no verification of non-crc buffers */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ ptr = bp->b_addr;
+ bno = bp->b_bn;
+ len = BBTOB(bp->b_length);
+ ASSERT(len >= blksize);
+
+ while (len > 0) {
+ if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+ if (bip) {
+ struct xfs_attr3_rmt_hdr *rmt;
+
+ rmt = (struct xfs_attr3_rmt_hdr *)ptr;
+ rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ }
+ xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF);
+
+ len -= blksize;
+ ptr += blksize;
+ bno += BTOBB(blksize);
+ }
+ ASSERT(len == 0);
+}
+
+const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
+ .verify_read = xfs_attr3_rmt_read_verify,
+ .verify_write = xfs_attr3_rmt_write_verify,
+};
+
+STATIC int
+xfs_attr3_rmt_hdr_set(
+ struct xfs_mount *mp,
+ void *ptr,
+ xfs_ino_t ino,
+ uint32_t offset,
+ uint32_t size,
+ xfs_daddr_t bno)
+{
+ struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return 0;
+
+ rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
+ rmt->rm_offset = cpu_to_be32(offset);
+ rmt->rm_bytes = cpu_to_be32(size);
+ uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid);
+ rmt->rm_owner = cpu_to_be64(ino);
+ rmt->rm_blkno = cpu_to_be64(bno);
+
+ return sizeof(struct xfs_attr3_rmt_hdr);
+}
+
+/*
+ * Helper functions to copy attribute data in and out of the one disk extents
+ */
+STATIC int
+xfs_attr_rmtval_copyout(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_ino_t ino,
+ int *offset,
+ int *valuelen,
+ __uint8_t **dst)
+{
+ char *src = bp->b_addr;
+ xfs_daddr_t bno = bp->b_bn;
+ int len = BBTOB(bp->b_length);
+ int blksize = mp->m_attr_geo->blksize;
+
+ ASSERT(len >= blksize);
+
+ while (len > 0 && *valuelen > 0) {
+ int hdr_size = 0;
+ int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+
+ byte_cnt = min(*valuelen, byte_cnt);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset,
+ byte_cnt, bno)) {
+ xfs_alert(mp,
+"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
+ bno, *offset, byte_cnt, ino);
+ return EFSCORRUPTED;
+ }
+ hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
+ }
+
+ memcpy(*dst, src + hdr_size, byte_cnt);
+
+ /* roll buffer forwards */
+ len -= blksize;
+ src += blksize;
+ bno += BTOBB(blksize);
+
+ /* roll attribute data forwards */
+ *valuelen -= byte_cnt;
+ *dst += byte_cnt;
+ *offset += byte_cnt;
+ }
+ return 0;
+}
+
+STATIC void
+xfs_attr_rmtval_copyin(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_ino_t ino,
+ int *offset,
+ int *valuelen,
+ __uint8_t **src)
+{
+ char *dst = bp->b_addr;
+ xfs_daddr_t bno = bp->b_bn;
+ int len = BBTOB(bp->b_length);
+ int blksize = mp->m_attr_geo->blksize;
+
+ ASSERT(len >= blksize);
+
+ while (len > 0 && *valuelen > 0) {
+ int hdr_size;
+ int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+
+ byte_cnt = min(*valuelen, byte_cnt);
+ hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
+ byte_cnt, bno);
+
+ memcpy(dst + hdr_size, *src, byte_cnt);
+
+ /*
+ * If this is the last block, zero the remainder of it.
+ * Check that we are actually the last block, too.
+ */
+ if (byte_cnt + hdr_size < blksize) {
+ ASSERT(*valuelen - byte_cnt == 0);
+ ASSERT(len == blksize);
+ memset(dst + hdr_size + byte_cnt, 0,
+ blksize - hdr_size - byte_cnt);
+ }
+
+ /* roll buffer forwards */
+ len -= blksize;
+ dst += blksize;
+ bno += BTOBB(blksize);
+
+ /* roll attribute data forwards */
+ *valuelen -= byte_cnt;
+ *src += byte_cnt;
+ *offset += byte_cnt;
+ }
+}
+
+/*
+ * Read the value associated with an attribute from the out-of-line buffer
+ * that we stored it in.
+ */
+int
+xfs_attr_rmtval_get(
+ struct xfs_da_args *args)
+{
+ struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE];
+ struct xfs_mount *mp = args->dp->i_mount;
+ struct xfs_buf *bp;
+ xfs_dablk_t lblkno = args->rmtblkno;
+ __uint8_t *dst = args->value;
+ int valuelen;
+ int nmap;
+ int error;
+ int blkcnt = args->rmtblkcnt;
+ int i;
+ int offset = 0;
+
+ trace_xfs_attr_rmtval_get(args);
+
+ ASSERT(!(args->flags & ATTR_KERNOVAL));
+ ASSERT(args->rmtvaluelen == args->valuelen);
+
+ valuelen = args->rmtvaluelen;
+ while (valuelen > 0) {
+ nmap = ATTR_RMTVALUE_MAPSIZE;
+ error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+ blkcnt, map, &nmap,
+ XFS_BMAPI_ATTRFORK);
+ if (error)
+ return error;
+ ASSERT(nmap >= 1);
+
+ for (i = 0; (i < nmap) && (valuelen > 0); i++) {
+ xfs_daddr_t dblkno;
+ int dblkcnt;
+
+ ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
+ (map[i].br_startblock != HOLESTARTBLOCK));
+ dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
+ dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+ dblkno, dblkcnt, 0, &bp,
+ &xfs_attr3_rmt_buf_ops);
+ if (error)
+ return error;
+
+ error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
+ &offset, &valuelen,
+ &dst);
+ xfs_buf_relse(bp);
+ if (error)
+ return error;
+
+ /* roll attribute extent map forwards */
+ lblkno += map[i].br_blockcount;
+ blkcnt -= map[i].br_blockcount;
+ }
+ }
+ ASSERT(valuelen == 0);
+ return 0;
+}
+
+/*
+ * Write the value associated with an attribute into the out-of-line buffer
+ * that we have defined for it.
+ */
+int
+xfs_attr_rmtval_set(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_bmbt_irec map;
+ xfs_dablk_t lblkno;
+ xfs_fileoff_t lfileoff = 0;
+ __uint8_t *src = args->value;
+ int blkcnt;
+ int valuelen;
+ int nmap;
+ int error;
+ int offset = 0;
+
+ trace_xfs_attr_rmtval_set(args);
+
+ /*
+ * Find a "hole" in the attribute address space large enough for
+ * us to drop the new attribute's value into. Because CRC enable
+ * attributes have headers, we can't just do a straight byte to FSB
+ * conversion and have to take the header space into account.
+ */
+ blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
+ error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
+ XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
+ args->rmtblkcnt = blkcnt;
+
+ /*
+ * Roll through the "value", allocating blocks on disk as required.
+ */
+ while (blkcnt > 0) {
+ int committed;
+
+ /*
+ * Allocate a single extent, up to the size of the value.
+ */
+ xfs_bmap_init(args->flist, args->firstblock);
+ nmap = 1;
+ error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
+ blkcnt,
+ XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+ args->firstblock, args->total, &map, &nmap,
+ args->flist);
+ if (!error) {
+ error = xfs_bmap_finish(&args->trans, args->flist,
+ &committed);
+ }
+ if (error) {
+ ASSERT(committed);
+ args->trans = NULL;
+ xfs_bmap_cancel(args->flist);
+ return(error);
+ }
+
+ /*
+ * bmap_finish() may have committed the last trans and started
+ * a new one. We need the inode to be in all transactions.
+ */
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp, 0);
+
+ ASSERT(nmap == 1);
+ ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+ (map.br_startblock != HOLESTARTBLOCK));
+ lblkno += map.br_blockcount;
+ blkcnt -= map.br_blockcount;
+
+ /*
+ * Start the next trans in the chain.
+ */
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
+ return (error);
+ }
+
+ /*
+ * Roll through the "value", copying the attribute value to the
+ * already-allocated blocks. Blocks are written synchronously
+ * so that we can know they are all on disk before we turn off
+ * the INCOMPLETE flag.
+ */
+ lblkno = args->rmtblkno;
+ blkcnt = args->rmtblkcnt;
+ valuelen = args->rmtvaluelen;
+ while (valuelen > 0) {
+ struct xfs_buf *bp;
+ xfs_daddr_t dblkno;
+ int dblkcnt;
+
+ ASSERT(blkcnt > 0);
+
+ xfs_bmap_init(args->flist, args->firstblock);
+ nmap = 1;
+ error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
+ blkcnt, &map, &nmap,
+ XFS_BMAPI_ATTRFORK);
+ if (error)
+ return(error);
+ ASSERT(nmap == 1);
+ ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+ (map.br_startblock != HOLESTARTBLOCK));
+
+ dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
+ dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+ bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
+ if (!bp)
+ return ENOMEM;
+ bp->b_ops = &xfs_attr3_rmt_buf_ops;
+
+ xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
+ &valuelen, &src);
+
+ error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
+ xfs_buf_relse(bp);
+ if (error)
+ return error;
+
+
+ /* roll attribute extent map forwards */
+ lblkno += map.br_blockcount;
+ blkcnt -= map.br_blockcount;
+ }
+ ASSERT(valuelen == 0);
+ return 0;
+}
+
+/*
+ * Remove the value associated with an attribute by deleting the
+ * out-of-line buffer that it is stored on.
+ */
+int
+xfs_attr_rmtval_remove(
+ struct xfs_da_args *args)
+{
+ struct xfs_mount *mp = args->dp->i_mount;
+ xfs_dablk_t lblkno;
+ int blkcnt;
+ int error;
+ int done;
+
+ trace_xfs_attr_rmtval_remove(args);
+
+ /*
+ * Roll through the "value", invalidating the attribute value's blocks.
+ */
+ lblkno = args->rmtblkno;
+ blkcnt = args->rmtblkcnt;
+ while (blkcnt > 0) {
+ struct xfs_bmbt_irec map;
+ struct xfs_buf *bp;
+ xfs_daddr_t dblkno;
+ int dblkcnt;
+ int nmap;
+
+ /*
+ * Try to remember where we decided to put the value.
+ */
+ nmap = 1;
+ error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+ blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
+ if (error)
+ return(error);
+ ASSERT(nmap == 1);
+ ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+ (map.br_startblock != HOLESTARTBLOCK));
+
+ dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
+ dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+ /*
+ * If the "remote" value is in the cache, remove it.
+ */
+ bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
+ if (bp) {
+ xfs_buf_stale(bp);
+ xfs_buf_relse(bp);
+ bp = NULL;
+ }
+
+ lblkno += map.br_blockcount;
+ blkcnt -= map.br_blockcount;
+ }
+
+ /*
+ * Keep de-allocating extents until the remote-value region is gone.
+ */
+ lblkno = args->rmtblkno;
+ blkcnt = args->rmtblkcnt;
+ done = 0;
+ while (!done) {
+ int committed;
+
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
+ XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+ 1, args->firstblock, args->flist,
+ &done);
+ if (!error) {
+ error = xfs_bmap_finish(&args->trans, args->flist,
+ &committed);
+ }
+ if (error) {
+ ASSERT(committed);
+ args->trans = NULL;
+ xfs_bmap_cancel(args->flist);
+ return error;
+ }
+
+ /*
+ * bmap_finish() may have committed the last trans and started
+ * a new one. We need the inode to be in all transactions.
+ */
+ if (committed)
+ xfs_trans_ijoin(args->trans, args->dp, 0);
+
+ /*
+ * Close out trans and start the next one in the chain.
+ */
+ error = xfs_trans_roll(&args->trans, args->dp);
+ if (error)
+ return (error);
+ }
+ return(0);
+}
diff --git a/fs/xfs/linux-2.6/mutex.h b/fs/xfs/xfs_attr_remote.h
index 2a88d56c4dc..5a9acfa156d 100644
--- a/fs/xfs/linux-2.6/mutex.h
+++ b/fs/xfs/xfs_attr_remote.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -15,11 +15,13 @@
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#ifndef __XFS_SUPPORT_MUTEX_H__
-#define __XFS_SUPPORT_MUTEX_H__
+#ifndef __XFS_ATTR_REMOTE_H__
+#define __XFS_ATTR_REMOTE_H__
-#include <linux/mutex.h>
+int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
-typedef struct mutex mutex_t;
+int xfs_attr_rmtval_get(struct xfs_da_args *args);
+int xfs_attr_rmtval_set(struct xfs_da_args *args);
+int xfs_attr_rmtval_remove(struct xfs_da_args *args);
-#endif /* __XFS_SUPPORT_MUTEX_H__ */
+#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h
index ffed6ca81a5..919756e3ba5 100644
--- a/fs/xfs/xfs_attr_sf.h
+++ b/fs/xfs/xfs_attr_sf.h
@@ -25,15 +25,13 @@
* to fit into the literal area of the inode.
*/
-struct xfs_inode;
-
/*
* Entries are packed toward the top as tight as possible.
*/
typedef struct xfs_attr_shortform {
struct xfs_attr_sf_hdr { /* constant-structure header block */
- __uint16_t totsize; /* total bytes in shortform list */
- __uint8_t count; /* count of active entries */
+ __be16 totsize; /* total bytes in shortform list */
+ __u8 count; /* count of active entries */
} hdr;
struct xfs_attr_sf_entry {
__uint8_t namelen; /* actual length of name (no NULL) */
@@ -54,7 +52,7 @@ typedef struct xfs_attr_sf_sort {
__uint8_t valuelen; /* length of value */
__uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
xfs_dahash_t hash; /* this entry's hash value */
- char *name; /* name value, pointer into buffer */
+ unsigned char *name; /* name value, pointer into buffer */
} xfs_attr_sf_sort_t;
#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \
@@ -66,49 +64,7 @@ typedef struct xfs_attr_sf_sort {
#define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \
((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep)))
#define XFS_ATTR_SF_TOTSIZE(dp) /* total space in use */ \
- (INT_GET(((xfs_attr_shortform_t *) \
- ((dp)->i_afp->if_u1.if_data))->hdr.totsize, ARCH_CONVERT))
-
-#if defined(XFS_ATTR_TRACE)
-/*
- * Kernel tracing support for attribute lists
- */
-struct xfs_attr_list_context;
-struct xfs_da_intnode;
-struct xfs_da_node_entry;
-struct xfs_attr_leafblock;
-
-#define XFS_ATTR_TRACE_SIZE 4096 /* size of global trace buffer */
-extern ktrace_t *xfs_attr_trace_buf;
-
-/*
- * Trace record types.
- */
-#define XFS_ATTR_KTRACE_L_C 1 /* context */
-#define XFS_ATTR_KTRACE_L_CN 2 /* context, node */
-#define XFS_ATTR_KTRACE_L_CB 3 /* context, btree */
-#define XFS_ATTR_KTRACE_L_CL 4 /* context, leaf */
-
-void xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context);
-void xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
- struct xfs_da_intnode *node);
-void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
- struct xfs_da_node_entry *btree);
-void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
- struct xfs_attr_leafblock *leaf);
-void xfs_attr_trace_enter(int type, char *where,
- __psunsigned_t a2, __psunsigned_t a3,
- __psunsigned_t a4, __psunsigned_t a5,
- __psunsigned_t a6, __psunsigned_t a7,
- __psunsigned_t a8, __psunsigned_t a9,
- __psunsigned_t a10, __psunsigned_t a11,
- __psunsigned_t a12, __psunsigned_t a13,
- __psunsigned_t a14, __psunsigned_t a15);
-#else
-#define xfs_attr_trace_l_c(w,c)
-#define xfs_attr_trace_l_cn(w,c,n)
-#define xfs_attr_trace_l_cb(w,c,b)
-#define xfs_attr_trace_l_cl(w,c,l)
-#endif /* XFS_ATTR_TRACE */
+ (be16_to_cpu(((xfs_attr_shortform_t *) \
+ ((dp)->i_afp->if_u1.if_data))->hdr.totsize))
#endif /* __XFS_ATTR_SF_H__ */
diff --git a/fs/xfs/xfs_behavior.c b/fs/xfs/xfs_behavior.c
deleted file mode 100644
index 9880adae393..00000000000
--- a/fs/xfs/xfs_behavior.c
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-
-/*
- * Source file used to associate/disassociate behaviors with virtualized
- * objects. See xfs_behavior.h for more information about behaviors, etc.
- *
- * The implementation is split between functions in this file and macros
- * in xfs_behavior.h.
- */
-
-/*
- * Insert a new behavior descriptor into a behavior chain.
- *
- * The behavior chain is ordered based on the 'position' number which
- * lives in the first field of the ops vector (higher numbers first).
- *
- * Attemps to insert duplicate ops result in an EINVAL return code.
- * Otherwise, return 0 to indicate success.
- */
-int
-bhv_insert(bhv_head_t *bhp, bhv_desc_t *bdp)
-{
- bhv_desc_t *curdesc, *prev;
- int position;
-
- /*
- * Validate the position value of the new behavior.
- */
- position = BHV_POSITION(bdp);
- ASSERT(position >= BHV_POSITION_BASE && position <= BHV_POSITION_TOP);
-
- /*
- * Find location to insert behavior. Check for duplicates.
- */
- prev = NULL;
- for (curdesc = bhp->bh_first;
- curdesc != NULL;
- curdesc = curdesc->bd_next) {
-
- /* Check for duplication. */
- if (curdesc->bd_ops == bdp->bd_ops) {
- ASSERT(0);
- return EINVAL;
- }
-
- /* Find correct position */
- if (position >= BHV_POSITION(curdesc)) {
- ASSERT(position != BHV_POSITION(curdesc));
- break; /* found it */
- }
-
- prev = curdesc;
- }
-
- if (prev == NULL) {
- /* insert at front of chain */
- bdp->bd_next = bhp->bh_first;
- bhp->bh_first = bdp;
- } else {
- /* insert after prev */
- bdp->bd_next = prev->bd_next;
- prev->bd_next = bdp;
- }
-
- return 0;
-}
-
-/*
- * Remove a behavior descriptor from a position in a behavior chain;
- * the postition is guaranteed not to be the first position.
- * Should only be called by the bhv_remove() macro.
- */
-void
-bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp)
-{
- bhv_desc_t *curdesc, *prev;
-
- ASSERT(bhp->bh_first != NULL);
- ASSERT(bhp->bh_first->bd_next != NULL);
-
- prev = bhp->bh_first;
- for (curdesc = bhp->bh_first->bd_next;
- curdesc != NULL;
- curdesc = curdesc->bd_next) {
-
- if (curdesc == bdp)
- break; /* found it */
- prev = curdesc;
- }
-
- ASSERT(curdesc == bdp);
- prev->bd_next = bdp->bd_next; /* remove from after prev */
-}
-
-/*
- * Look for a specific ops vector on the specified behavior chain.
- * Return the associated behavior descriptor. Or NULL, if not found.
- */
-bhv_desc_t *
-bhv_lookup(bhv_head_t *bhp, void *ops)
-{
- bhv_desc_t *curdesc;
-
- for (curdesc = bhp->bh_first;
- curdesc != NULL;
- curdesc = curdesc->bd_next) {
-
- if (curdesc->bd_ops == ops)
- return curdesc;
- }
-
- return NULL;
-}
-
-/*
- * Looks for the first behavior within a specified range of positions.
- * Return the associated behavior descriptor. Or NULL, if none found.
- */
-bhv_desc_t *
-bhv_lookup_range(bhv_head_t *bhp, int low, int high)
-{
- bhv_desc_t *curdesc;
-
- for (curdesc = bhp->bh_first;
- curdesc != NULL;
- curdesc = curdesc->bd_next) {
-
- int position = BHV_POSITION(curdesc);
-
- if (position <= high) {
- if (position >= low)
- return curdesc;
- return NULL;
- }
- }
-
- return NULL;
-}
-
-/*
- * Return the base behavior in the chain, or NULL if the chain
- * is empty.
- *
- * The caller has not read locked the behavior chain, so acquire the
- * lock before traversing the chain.
- */
-bhv_desc_t *
-bhv_base(bhv_head_t *bhp)
-{
- bhv_desc_t *curdesc;
-
- for (curdesc = bhp->bh_first;
- curdesc != NULL;
- curdesc = curdesc->bd_next) {
-
- if (curdesc->bd_next == NULL) {
- return curdesc;
- }
- }
-
- return NULL;
-}
-
-void
-bhv_head_init(
- bhv_head_t *bhp,
- char *name)
-{
- bhp->bh_first = NULL;
-}
-
-void
-bhv_insert_initial(
- bhv_head_t *bhp,
- bhv_desc_t *bdp)
-{
- ASSERT(bhp->bh_first == NULL);
- (bhp)->bh_first = bdp;
-}
-
-void
-bhv_head_destroy(
- bhv_head_t *bhp)
-{
- ASSERT(bhp->bh_first == NULL);
-}
diff --git a/fs/xfs/xfs_behavior.h b/fs/xfs/xfs_behavior.h
deleted file mode 100644
index 2cd89bb5ab1..00000000000
--- a/fs/xfs/xfs_behavior.h
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_BEHAVIOR_H__
-#define __XFS_BEHAVIOR_H__
-
-/*
- * Header file used to associate behaviors with virtualized objects.
- *
- * A virtualized object is an internal, virtualized representation of
- * OS entities such as persistent files, processes, or sockets. Examples
- * of virtualized objects include vnodes, vprocs, and vsockets. Often
- * a virtualized object is referred to simply as an "object."
- *
- * A behavior is essentially an implementation layer associated with
- * an object. Multiple behaviors for an object are chained together,
- * the order of chaining determining the order of invocation. Each
- * behavior of a given object implements the same set of interfaces
- * (e.g., the VOP interfaces).
- *
- * Behaviors may be dynamically inserted into an object's behavior chain,
- * such that the addition is transparent to consumers that already have
- * references to the object. Typically, a given behavior will be inserted
- * at a particular location in the behavior chain. Insertion of new
- * behaviors is synchronized with operations-in-progress (oip's) so that
- * the oip's always see a consistent view of the chain.
- *
- * The term "interpostion" is used to refer to the act of inserting
- * a behavior such that it interposes on (i.e., is inserted in front
- * of) a particular other behavior. A key example of this is when a
- * system implementing distributed single system image wishes to
- * interpose a distribution layer (providing distributed coherency)
- * in front of an object that is otherwise only accessed locally.
- *
- * Note that the traditional vnode/inode combination is simply a virtualized
- * object that has exactly one associated behavior.
- *
- * Behavior synchronization is logic which is necessary under certain
- * circumstances that there is no conflict between ongoing operations
- * traversing the behavior chain and those dunamically modifying the
- * behavior chain. Because behavior synchronization adds extra overhead
- * to virtual operation invocation, we want to restrict, as much as
- * we can, the requirement for this extra code, to those situations
- * in which it is truly necessary.
- *
- * Behavior synchronization is needed whenever there's at least one class
- * of object in the system for which:
- * 1) multiple behaviors for a given object are supported,
- * -- AND --
- * 2a) insertion of a new behavior can happen dynamically at any time during
- * the life of an active object,
- * -- AND --
- * 3a) insertion of a new behavior needs to synchronize with existing
- * ops-in-progress.
- * -- OR --
- * 3b) multiple different behaviors can be dynamically inserted at
- * any time during the life of an active object
- * -- OR --
- * 3c) removal of a behavior can occur at any time during the life of
- * an active object.
- * -- OR --
- * 2b) removal of a behavior can occur at any time during the life of an
- * active object
- *
- */
-
-struct bhv_head_lock;
-
-/*
- * Behavior head. Head of the chain of behaviors.
- * Contained within each virtualized object data structure.
- */
-typedef struct bhv_head {
- struct bhv_desc *bh_first; /* first behavior in chain */
- struct bhv_head_lock *bh_lockp; /* pointer to lock info struct */
-} bhv_head_t;
-
-/*
- * Behavior descriptor. Descriptor associated with each behavior.
- * Contained within the behavior's private data structure.
- */
-typedef struct bhv_desc {
- void *bd_pdata; /* private data for this behavior */
- void *bd_vobj; /* virtual object associated with */
- void *bd_ops; /* ops for this behavior */
- struct bhv_desc *bd_next; /* next behavior in chain */
-} bhv_desc_t;
-
-/*
- * Behavior identity field. A behavior's identity determines the position
- * where it lives within a behavior chain, and it's always the first field
- * of the behavior's ops vector. The optional id field further identifies the
- * subsystem responsible for the behavior.
- */
-typedef struct bhv_identity {
- __u16 bi_id; /* owning subsystem id */
- __u16 bi_position; /* position in chain */
-} bhv_identity_t;
-
-typedef bhv_identity_t bhv_position_t;
-
-#define BHV_IDENTITY_INIT(id,pos) {id, pos}
-#define BHV_IDENTITY_INIT_POSITION(pos) BHV_IDENTITY_INIT(0, pos)
-
-/*
- * Define boundaries of position values.
- */
-#define BHV_POSITION_INVALID 0 /* invalid position number */
-#define BHV_POSITION_BASE 1 /* base (last) implementation layer */
-#define BHV_POSITION_TOP 63 /* top (first) implementation layer */
-
-/*
- * Plumbing macros.
- */
-#define BHV_HEAD_FIRST(bhp) (ASSERT((bhp)->bh_first), (bhp)->bh_first)
-#define BHV_NEXT(bdp) (ASSERT((bdp)->bd_next), (bdp)->bd_next)
-#define BHV_NEXTNULL(bdp) ((bdp)->bd_next)
-#define BHV_VOBJ(bdp) (ASSERT((bdp)->bd_vobj), (bdp)->bd_vobj)
-#define BHV_VOBJNULL(bdp) ((bdp)->bd_vobj)
-#define BHV_PDATA(bdp) (bdp)->bd_pdata
-#define BHV_OPS(bdp) (bdp)->bd_ops
-#define BHV_IDENTITY(bdp) ((bhv_identity_t *)(bdp)->bd_ops)
-#define BHV_POSITION(bdp) (BHV_IDENTITY(bdp)->bi_position)
-
-extern void bhv_head_init(bhv_head_t *, char *);
-extern void bhv_head_destroy(bhv_head_t *);
-extern int bhv_insert(bhv_head_t *, bhv_desc_t *);
-extern void bhv_insert_initial(bhv_head_t *, bhv_desc_t *);
-
-/*
- * Initialize a new behavior descriptor.
- * Arguments:
- * bdp - pointer to behavior descriptor
- * pdata - pointer to behavior's private data
- * vobj - pointer to associated virtual object
- * ops - pointer to ops for this behavior
- */
-#define bhv_desc_init(bdp, pdata, vobj, ops) \
- { \
- (bdp)->bd_pdata = pdata; \
- (bdp)->bd_vobj = vobj; \
- (bdp)->bd_ops = ops; \
- (bdp)->bd_next = NULL; \
- }
-
-/*
- * Remove a behavior descriptor from a behavior chain.
- */
-#define bhv_remove(bhp, bdp) \
- { \
- if ((bhp)->bh_first == (bdp)) { \
- /* \
- * Remove from front of chain. \
- * Atomic wrt oip's. \
- */ \
- (bhp)->bh_first = (bdp)->bd_next; \
- } else { \
- /* remove from non-front of chain */ \
- bhv_remove_not_first(bhp, bdp); \
- } \
- (bdp)->bd_vobj = NULL; \
- }
-
-/*
- * Behavior module prototypes.
- */
-extern void bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp);
-extern bhv_desc_t * bhv_lookup(bhv_head_t *bhp, void *ops);
-extern bhv_desc_t * bhv_lookup_range(bhv_head_t *bhp, int low, int high);
-extern bhv_desc_t * bhv_base(bhv_head_t *bhp);
-
-/* No bhv locking on Linux */
-#define bhv_lookup_unlocked bhv_lookup
-#define bhv_base_unlocked bhv_base
-
-#endif /* __XFS_BEHAVIOR_H__ */
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c
index 43be6a7e47c..0e8885a5964 100644
--- a/fs/xfs/xfs_bit.c
+++ b/fs/xfs/xfs_bit.c
@@ -16,207 +16,29 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
+#include "xfs_log_format.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
/*
* XFS bit manipulation routines, used in non-realtime code.
*/
-#ifndef HAVE_ARCH_HIGHBIT
/*
- * Index of high bit number in byte, -1 for none set, 0..7 otherwise.
- */
-STATIC const char xfs_highbit[256] = {
- -1, 0, 1, 1, 2, 2, 2, 2, /* 00 .. 07 */
- 3, 3, 3, 3, 3, 3, 3, 3, /* 08 .. 0f */
- 4, 4, 4, 4, 4, 4, 4, 4, /* 10 .. 17 */
- 4, 4, 4, 4, 4, 4, 4, 4, /* 18 .. 1f */
- 5, 5, 5, 5, 5, 5, 5, 5, /* 20 .. 27 */
- 5, 5, 5, 5, 5, 5, 5, 5, /* 28 .. 2f */
- 5, 5, 5, 5, 5, 5, 5, 5, /* 30 .. 37 */
- 5, 5, 5, 5, 5, 5, 5, 5, /* 38 .. 3f */
- 6, 6, 6, 6, 6, 6, 6, 6, /* 40 .. 47 */
- 6, 6, 6, 6, 6, 6, 6, 6, /* 48 .. 4f */
- 6, 6, 6, 6, 6, 6, 6, 6, /* 50 .. 57 */
- 6, 6, 6, 6, 6, 6, 6, 6, /* 58 .. 5f */
- 6, 6, 6, 6, 6, 6, 6, 6, /* 60 .. 67 */
- 6, 6, 6, 6, 6, 6, 6, 6, /* 68 .. 6f */
- 6, 6, 6, 6, 6, 6, 6, 6, /* 70 .. 77 */
- 6, 6, 6, 6, 6, 6, 6, 6, /* 78 .. 7f */
- 7, 7, 7, 7, 7, 7, 7, 7, /* 80 .. 87 */
- 7, 7, 7, 7, 7, 7, 7, 7, /* 88 .. 8f */
- 7, 7, 7, 7, 7, 7, 7, 7, /* 90 .. 97 */
- 7, 7, 7, 7, 7, 7, 7, 7, /* 98 .. 9f */
- 7, 7, 7, 7, 7, 7, 7, 7, /* a0 .. a7 */
- 7, 7, 7, 7, 7, 7, 7, 7, /* a8 .. af */
- 7, 7, 7, 7, 7, 7, 7, 7, /* b0 .. b7 */
- 7, 7, 7, 7, 7, 7, 7, 7, /* b8 .. bf */
- 7, 7, 7, 7, 7, 7, 7, 7, /* c0 .. c7 */
- 7, 7, 7, 7, 7, 7, 7, 7, /* c8 .. cf */
- 7, 7, 7, 7, 7, 7, 7, 7, /* d0 .. d7 */
- 7, 7, 7, 7, 7, 7, 7, 7, /* d8 .. df */
- 7, 7, 7, 7, 7, 7, 7, 7, /* e0 .. e7 */
- 7, 7, 7, 7, 7, 7, 7, 7, /* e8 .. ef */
- 7, 7, 7, 7, 7, 7, 7, 7, /* f0 .. f7 */
- 7, 7, 7, 7, 7, 7, 7, 7, /* f8 .. ff */
-};
-#endif
-
-/*
- * Count of bits set in byte, 0..8.
- */
-static const char xfs_countbit[256] = {
- 0, 1, 1, 2, 1, 2, 2, 3, /* 00 .. 07 */
- 1, 2, 2, 3, 2, 3, 3, 4, /* 08 .. 0f */
- 1, 2, 2, 3, 2, 3, 3, 4, /* 10 .. 17 */
- 2, 3, 3, 4, 3, 4, 4, 5, /* 18 .. 1f */
- 1, 2, 2, 3, 2, 3, 3, 4, /* 20 .. 27 */
- 2, 3, 3, 4, 3, 4, 4, 5, /* 28 .. 2f */
- 2, 3, 3, 4, 3, 4, 4, 5, /* 30 .. 37 */
- 3, 4, 4, 5, 4, 5, 5, 6, /* 38 .. 3f */
- 1, 2, 2, 3, 2, 3, 3, 4, /* 40 .. 47 */
- 2, 3, 3, 4, 3, 4, 4, 5, /* 48 .. 4f */
- 2, 3, 3, 4, 3, 4, 4, 5, /* 50 .. 57 */
- 3, 4, 4, 5, 4, 5, 5, 6, /* 58 .. 5f */
- 2, 3, 3, 4, 3, 4, 4, 5, /* 60 .. 67 */
- 3, 4, 4, 5, 4, 5, 5, 6, /* 68 .. 6f */
- 3, 4, 4, 5, 4, 5, 5, 6, /* 70 .. 77 */
- 4, 5, 5, 6, 5, 6, 6, 7, /* 78 .. 7f */
- 1, 2, 2, 3, 2, 3, 3, 4, /* 80 .. 87 */
- 2, 3, 3, 4, 3, 4, 4, 5, /* 88 .. 8f */
- 2, 3, 3, 4, 3, 4, 4, 5, /* 90 .. 97 */
- 3, 4, 4, 5, 4, 5, 5, 6, /* 98 .. 9f */
- 2, 3, 3, 4, 3, 4, 4, 5, /* a0 .. a7 */
- 3, 4, 4, 5, 4, 5, 5, 6, /* a8 .. af */
- 3, 4, 4, 5, 4, 5, 5, 6, /* b0 .. b7 */
- 4, 5, 5, 6, 5, 6, 6, 7, /* b8 .. bf */
- 2, 3, 3, 4, 3, 4, 4, 5, /* c0 .. c7 */
- 3, 4, 4, 5, 4, 5, 5, 6, /* c8 .. cf */
- 3, 4, 4, 5, 4, 5, 5, 6, /* d0 .. d7 */
- 4, 5, 5, 6, 5, 6, 6, 7, /* d8 .. df */
- 3, 4, 4, 5, 4, 5, 5, 6, /* e0 .. e7 */
- 4, 5, 5, 6, 5, 6, 6, 7, /* e8 .. ef */
- 4, 5, 5, 6, 5, 6, 6, 7, /* f0 .. f7 */
- 5, 6, 6, 7, 6, 7, 7, 8, /* f8 .. ff */
-};
-
-/*
- * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set.
- */
-inline int
-xfs_highbit32(
- __uint32_t v)
-{
-#ifdef HAVE_ARCH_HIGHBIT
- return highbit32(v);
-#else
- int i;
-
- if (v & 0xffff0000)
- if (v & 0xff000000)
- i = 24;
- else
- i = 16;
- else if (v & 0x0000ffff)
- if (v & 0x0000ff00)
- i = 8;
- else
- i = 0;
- else
- return -1;
- return i + xfs_highbit[(v >> i) & 0xff];
-#endif
-}
-
-/*
- * xfs_lowbit64: get low bit set out of 64-bit argument, -1 if none set.
- */
-int
-xfs_lowbit64(
- __uint64_t v)
-{
- __uint32_t w = (__uint32_t)v;
- int n = 0;
-
- if (w) { /* lower bits */
- n = ffs(w);
- } else { /* upper bits */
- w = (__uint32_t)(v >> 32);
- if (w && (n = ffs(w)))
- n += 32;
- }
- return n - 1;
-}
-
-/*
- * xfs_highbit64: get high bit set out of 64-bit argument, -1 if none set.
- */
-int
-xfs_highbit64(
- __uint64_t v)
-{
- __uint32_t h = (__uint32_t)(v >> 32);
-
- if (h)
- return xfs_highbit32(h) + 32;
- return xfs_highbit32((__uint32_t)v);
-}
-
-
-/*
- * Count the number of bits set in the bitmap starting with bit
- * start_bit. Size is the size of the bitmap in words.
- *
- * Do the counting by mapping a byte value to the number of set
- * bits for that value using the xfs_countbit array, i.e.
- * xfs_countbit[0] == 0, xfs_countbit[1] == 1, xfs_countbit[2] == 1,
- * xfs_countbit[3] == 2, etc.
+ * Return whether bitmap is empty.
+ * Size is number of words in the bitmap, which is padded to word boundary
+ * Returns 1 for empty, 0 for non-empty.
*/
int
-xfs_count_bits(uint *map, uint size, uint start_bit)
+xfs_bitmap_empty(uint *map, uint size)
{
- register int bits;
- register unsigned char *bytep;
- register unsigned char *end_map;
- int byte_bit;
-
- bits = 0;
- end_map = (char*)(map + size);
- bytep = (char*)(map + (start_bit & ~0x7));
- byte_bit = start_bit & 0x7;
-
- /*
- * If the caller fell off the end of the map, return 0.
- */
- if (bytep >= end_map) {
- return (0);
- }
-
- /*
- * If start_bit is not byte aligned, then process the
- * first byte separately.
- */
- if (byte_bit != 0) {
- /*
- * Shift off the bits we don't want to look at,
- * before indexing into xfs_countbit.
- */
- bits += xfs_countbit[(*bytep >> byte_bit)];
- bytep++;
- }
+ uint i;
+ uint ret = 0;
- /*
- * Count the bits in each byte until the end of the bitmap.
- */
- while (bytep < end_map) {
- bits += xfs_countbit[*bytep];
- bytep++;
+ for (i = 0; i < size; i++) {
+ ret |= map[i];
}
- return (bits);
+ return (ret == 0);
}
/*
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index 0bbe5681754..e1649c0d3e0 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -23,40 +23,60 @@
*/
/*
- * masks with n high/low bits set, 32-bit values & 64-bit values
+ * masks with n high/low bits set, 64-bit values
*/
-#define XFS_MASK32HI(n) xfs_mask32hi(n)
-static inline __uint32_t xfs_mask32hi(int n)
-{
- return (__uint32_t)-1 << (32 - (n));
-}
-#define XFS_MASK64HI(n) xfs_mask64hi(n)
static inline __uint64_t xfs_mask64hi(int n)
{
return (__uint64_t)-1 << (64 - (n));
}
-#define XFS_MASK32LO(n) xfs_mask32lo(n)
static inline __uint32_t xfs_mask32lo(int n)
{
return ((__uint32_t)1 << (n)) - 1;
}
-#define XFS_MASK64LO(n) xfs_mask64lo(n)
static inline __uint64_t xfs_mask64lo(int n)
{
return ((__uint64_t)1 << (n)) - 1;
}
/* Get high bit set out of 32-bit argument, -1 if none set */
-extern int xfs_highbit32(__uint32_t v);
+static inline int xfs_highbit32(__uint32_t v)
+{
+ return fls(v) - 1;
+}
+
+/* Get high bit set out of 64-bit argument, -1 if none set */
+static inline int xfs_highbit64(__uint64_t v)
+{
+ return fls64(v) - 1;
+}
+
+/* Get low bit set out of 32-bit argument, -1 if none set */
+static inline int xfs_lowbit32(__uint32_t v)
+{
+ return ffs(v) - 1;
+}
/* Get low bit set out of 64-bit argument, -1 if none set */
-extern int xfs_lowbit64(__uint64_t v);
+static inline int xfs_lowbit64(__uint64_t v)
+{
+ __uint32_t w = (__uint32_t)v;
+ int n = 0;
-/* Get high bit set out of 64-bit argument, -1 if none set */
-extern int xfs_highbit64(__uint64_t);
+ if (w) { /* lower bits */
+ n = ffs(w);
+ } else { /* upper bits */
+ w = (__uint32_t)(v >> 32);
+ if (w) {
+ n = ffs(w);
+ if (n)
+ n += 32;
+ }
+ }
+ return n - 1;
+}
-/* Count set bits in map starting with start_bit */
-extern int xfs_count_bits(uint *map, uint size, uint start_bit);
+/* Return whether bitmap is empty (1 == empty) */
+extern int xfs_bitmap_empty(uint *map, uint size);
/* Count continuous one bits in map starting with start_bit */
extern int xfs_contig_bits(uint *map, uint size, uint start_bit);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 70625e577c7..75c3fe5f3d9 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -17,216 +17,727 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
+#include "xfs_dir2.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_extfree_item.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
-#include "xfs_dir_leaf.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_rw.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_buf_item.h"
+#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_dinode.h"
+#include "xfs_filestream.h"
-#ifdef DEBUG
-STATIC void
-xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork);
-#endif
-
kmem_zone_t *xfs_bmap_free_item_zone;
/*
- * Prototypes for internal bmap routines.
+ * Miscellaneous helper functions
*/
+/*
+ * Compute and fill in the value of the maximum depth of a bmap btree
+ * in this filesystem. Done once, during mount.
+ */
+void
+xfs_bmap_compute_maxlevels(
+ xfs_mount_t *mp, /* file system mount structure */
+ int whichfork) /* data or attr fork */
+{
+ int level; /* btree level */
+ uint maxblocks; /* max blocks at this level */
+ uint maxleafents; /* max leaf entries possible */
+ int maxrootrecs; /* max records in root block */
+ int minleafrecs; /* min records in leaf block */
+ int minnoderecs; /* min records in node block */
+ int sz; /* root block size */
+
+ /*
+ * The maximum number of extents in a file, hence the maximum
+ * number of leaf entries, is controlled by the type of di_nextents
+ * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
+ * (a signed 16-bit number, xfs_aextnum_t).
+ *
+ * Note that we can no longer assume that if we are in ATTR1 that
+ * the fork offset of all the inodes will be
+ * (xfs_default_attroffset(ip) >> 3) because we could have mounted
+ * with ATTR2 and then mounted back with ATTR1, keeping the
+ * di_forkoff's fixed but probably at various positions. Therefore,
+ * for both ATTR1 and ATTR2 we have to assume the worst case scenario
+ * of a minimum size available.
+ */
+ if (whichfork == XFS_DATA_FORK) {
+ maxleafents = MAXEXTNUM;
+ sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+ } else {
+ maxleafents = MAXAEXTNUM;
+ sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ }
+ maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
+ minleafrecs = mp->m_bmap_dmnr[0];
+ minnoderecs = mp->m_bmap_dmnr[1];
+ maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+ for (level = 1; maxblocks > 1; level++) {
+ if (maxblocks <= maxrootrecs)
+ maxblocks = 1;
+ else
+ maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+ }
+ mp->m_bm_maxlevels[whichfork] = level;
+}
+
+STATIC int /* error */
+xfs_bmbt_lookup_eq(
+ struct xfs_btree_cur *cur,
+ xfs_fileoff_t off,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.b.br_startoff = off;
+ cur->bc_rec.b.br_startblock = bno;
+ cur->bc_rec.b.br_blockcount = len;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+STATIC int /* error */
+xfs_bmbt_lookup_ge(
+ struct xfs_btree_cur *cur,
+ xfs_fileoff_t off,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.b.br_startoff = off;
+ cur->bc_rec.b.br_startblock = bno;
+ cur->bc_rec.b.br_blockcount = len;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
/*
- * Called from xfs_bmap_add_attrfork to handle extents format files.
+ * Check if the inode needs to be converted to btree format.
*/
-STATIC int /* error */
-xfs_bmap_add_attrfork_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fsblock_t *firstblock, /* first block allocated */
- xfs_bmap_free_t *flist, /* blocks to free at commit */
- int *flags); /* inode logging flags */
+static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
+{
+ return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) >
+ XFS_IFORK_MAXEXT(ip, whichfork);
+}
/*
- * Called from xfs_bmap_add_attrfork to handle local format files.
+ * Check if the inode should be converted to extent format.
*/
-STATIC int /* error */
-xfs_bmap_add_attrfork_local(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fsblock_t *firstblock, /* first block allocated */
- xfs_bmap_free_t *flist, /* blocks to free at commit */
- int *flags); /* inode logging flags */
+static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
+{
+ return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) <=
+ XFS_IFORK_MAXEXT(ip, whichfork);
+}
/*
- * Called by xfs_bmapi to update extent list structure and the btree
- * after allocating space (or doing a delayed allocation).
+ * Update the record referred to by cur to the value given
+ * by [off, bno, len, state].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
*/
-STATIC int /* error */
-xfs_bmap_add_extent(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
- xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to put in extent list */
- xfs_fsblock_t *first, /* pointer to firstblock variable */
- xfs_bmap_free_t *flist, /* list of extents to be freed */
- int *logflagsp, /* inode logging flags */
- int whichfork, /* data or attr fork */
- int rsvd); /* OK to allocate reserved blocks */
+STATIC int
+xfs_bmbt_update(
+ struct xfs_btree_cur *cur,
+ xfs_fileoff_t off,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ xfs_exntst_t state)
+{
+ union xfs_btree_rec rec;
+
+ xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state);
+ return xfs_btree_update(cur, &rec);
+}
/*
- * Called by xfs_bmap_add_extent to handle cases converting a delayed
- * allocation to a real allocation.
+ * Compute the worst-case number of indirect blocks that will be used
+ * for ip's delayed extent of length "len".
*/
-STATIC int /* error */
-xfs_bmap_add_extent_delay_real(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
- xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to put in extent list */
- xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
- xfs_fsblock_t *first, /* pointer to firstblock variable */
- xfs_bmap_free_t *flist, /* list of extents to be freed */
- int *logflagsp, /* inode logging flags */
- int rsvd); /* OK to allocate reserved blocks */
+STATIC xfs_filblks_t
+xfs_bmap_worst_indlen(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_filblks_t len) /* delayed extent length */
+{
+ int level; /* btree level number */
+ int maxrecs; /* maximum record count at this level */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_filblks_t rval; /* return value */
+
+ mp = ip->i_mount;
+ maxrecs = mp->m_bmap_dmxr[0];
+ for (level = 0, rval = 0;
+ level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
+ level++) {
+ len += maxrecs - 1;
+ do_div(len, maxrecs);
+ rval += len;
+ if (len == 1)
+ return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+ level - 1;
+ if (level == 0)
+ maxrecs = mp->m_bmap_dmxr[1];
+ }
+ return rval;
+}
/*
- * Called by xfs_bmap_add_extent to handle cases converting a hole
- * to a delayed allocation.
+ * Calculate the default attribute fork offset for newly created inodes.
*/
-STATIC int /* error */
-xfs_bmap_add_extent_hole_delay(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
- xfs_btree_cur_t *cur, /* if null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to put in extent list */
- int *logflagsp,/* inode logging flags */
- int rsvd); /* OK to allocate reserved blocks */
+uint
+xfs_default_attroffset(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ uint offset;
+
+ if (mp->m_sb.sb_inodesize == 256) {
+ offset = XFS_LITINO(mp, ip->i_d.di_version) -
+ XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ } else {
+ offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+ }
+
+ ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version));
+ return offset;
+}
/*
- * Called by xfs_bmap_add_extent to handle cases converting a hole
- * to a real allocation.
+ * Helper routine to reset inode di_forkoff field when switching
+ * attribute fork from local to extent format - we reset it where
+ * possible to make space available for inline data fork extents.
*/
-STATIC int /* error */
-xfs_bmap_add_extent_hole_real(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
- xfs_btree_cur_t *cur, /* if null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to put in extent list */
- int *logflagsp, /* inode logging flags */
- int whichfork); /* data or attr fork */
+STATIC void
+xfs_bmap_forkoff_reset(
+ xfs_inode_t *ip,
+ int whichfork)
+{
+ if (whichfork == XFS_ATTR_FORK &&
+ ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
+ ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
+ ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
+ uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
+
+ if (dfl_forkoff > ip->i_d.di_forkoff)
+ ip->i_d.di_forkoff = dfl_forkoff;
+ }
+}
/*
- * Called by xfs_bmap_add_extent to handle cases converting an unwritten
- * allocation to a real allocation or vice versa.
+ * Debug/sanity checking code
*/
-STATIC int /* error */
-xfs_bmap_add_extent_unwritten_real(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
- xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to put in extent list */
- int *logflagsp); /* inode logging flags */
+
+STATIC int
+xfs_bmap_sanity_check(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ int level)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+
+ if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) &&
+ block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC))
+ return 0;
+
+ if (be16_to_cpu(block->bb_level) != level ||
+ be16_to_cpu(block->bb_numrecs) == 0 ||
+ be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+ return 0;
+
+ return 1;
+}
+
+#ifdef DEBUG
+STATIC struct xfs_buf *
+xfs_bmap_get_bp(
+ struct xfs_btree_cur *cur,
+ xfs_fsblock_t bno)
+{
+ struct xfs_log_item_desc *lidp;
+ int i;
+
+ if (!cur)
+ return NULL;
+
+ for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
+ if (!cur->bc_bufs[i])
+ break;
+ if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
+ return cur->bc_bufs[i];
+ }
+
+ /* Chase down all the log items to see if the bp is there */
+ list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
+ struct xfs_buf_log_item *bip;
+ bip = (struct xfs_buf_log_item *)lidp->lid_item;
+ if (bip->bli_item.li_type == XFS_LI_BUF &&
+ XFS_BUF_ADDR(bip->bli_buf) == bno)
+ return bip->bli_buf;
+ }
+
+ return NULL;
+}
+
+STATIC void
+xfs_check_block(
+ struct xfs_btree_block *block,
+ xfs_mount_t *mp,
+ int root,
+ short sz)
+{
+ int i, j, dmxr;
+ __be64 *pp, *thispa; /* pointer to block address */
+ xfs_bmbt_key_t *prevp, *keyp;
+
+ ASSERT(be16_to_cpu(block->bb_level) > 0);
+
+ prevp = NULL;
+ for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
+ dmxr = mp->m_bmap_dmxr[0];
+ keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
+
+ if (prevp) {
+ ASSERT(be64_to_cpu(prevp->br_startoff) <
+ be64_to_cpu(keyp->br_startoff));
+ }
+ prevp = keyp;
+
+ /*
+ * Compare the block numbers to see if there are dups.
+ */
+ if (root)
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
+ else
+ pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
+
+ for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
+ if (root)
+ thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
+ else
+ thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
+ if (*thispa == *pp) {
+ xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
+ __func__, j, i,
+ (unsigned long long)be64_to_cpu(*thispa));
+ panic("%s: ptrs are equal in node\n",
+ __func__);
+ }
+ }
+ }
+}
/*
- * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
- * It figures out where to ask the underlying allocator to put the new extent.
+ * Check that the extents for the inode ip are in the right order in all
+ * btree leaves.
*/
-STATIC int /* error */
-xfs_bmap_alloc(
- xfs_bmalloca_t *ap); /* bmap alloc argument struct */
+
+STATIC void
+xfs_bmap_check_leaf_extents(
+ xfs_btree_cur_t *cur, /* btree cursor or null */
+ xfs_inode_t *ip, /* incore inode pointer */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_btree_block *block; /* current btree block */
+ xfs_fsblock_t bno; /* block # of "block" */
+ xfs_buf_t *bp; /* buffer for "block" */
+ int error; /* error return value */
+ xfs_extnum_t i=0, j; /* index into the extents list */
+ xfs_ifork_t *ifp; /* fork structure */
+ int level; /* btree level, for checking */
+ xfs_mount_t *mp; /* file system mount structure */
+ __be64 *pp; /* pointer to block address */
+ xfs_bmbt_rec_t *ep; /* pointer to current extent */
+ xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */
+ xfs_bmbt_rec_t *nextp; /* pointer to next extent */
+ int bp_release = 0;
+
+ if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
+ return;
+ }
+
+ bno = NULLFSBLOCK;
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ block = ifp->if_broot;
+ /*
+ * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+ */
+ level = be16_to_cpu(block->bb_level);
+ ASSERT(level > 0);
+ xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+ bno = be64_to_cpu(*pp);
+
+ ASSERT(bno != NULLDFSBNO);
+ ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+ /*
+ * Go down the tree until leaf level is reached, following the first
+ * pointer (leftmost) at each level.
+ */
+ while (level-- > 0) {
+ /* See if buf is in cur first */
+ bp_release = 0;
+ bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+ if (!bp) {
+ bp_release = 1;
+ error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ goto error_norelse;
+ }
+ block = XFS_BUF_TO_BLOCK(bp);
+ XFS_WANT_CORRUPTED_GOTO(
+ xfs_bmap_sanity_check(mp, bp, level),
+ error0);
+ if (level == 0)
+ break;
+
+ /*
+ * Check this block for basic sanity (increasing keys and
+ * no duplicate blocks).
+ */
+
+ xfs_check_block(block, mp, 0, 0);
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+ bno = be64_to_cpu(*pp);
+ XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+ if (bp_release) {
+ bp_release = 0;
+ xfs_trans_brelse(NULL, bp);
+ }
+ }
+
+ /*
+ * Here with bp and block set to the leftmost leaf node in the tree.
+ */
+ i = 0;
+
+ /*
+ * Loop over all leaf nodes checking that all extents are in the right order.
+ */
+ for (;;) {
+ xfs_fsblock_t nextbno;
+ xfs_extnum_t num_recs;
+
+
+ num_recs = xfs_btree_get_numrecs(block);
+
+ /*
+ * Read-ahead the next leaf block, if any.
+ */
+
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+ /*
+ * Check all the extents to make sure they are OK.
+ * If we had a previous block, the last entry should
+ * conform with the first entry in this one.
+ */
+
+ ep = XFS_BMBT_REC_ADDR(mp, block, 1);
+ if (i) {
+ ASSERT(xfs_bmbt_disk_get_startoff(&last) +
+ xfs_bmbt_disk_get_blockcount(&last) <=
+ xfs_bmbt_disk_get_startoff(ep));
+ }
+ for (j = 1; j < num_recs; j++) {
+ nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
+ ASSERT(xfs_bmbt_disk_get_startoff(ep) +
+ xfs_bmbt_disk_get_blockcount(ep) <=
+ xfs_bmbt_disk_get_startoff(nextp));
+ ep = nextp;
+ }
+
+ last = *ep;
+ i += num_recs;
+ if (bp_release) {
+ bp_release = 0;
+ xfs_trans_brelse(NULL, bp);
+ }
+ bno = nextbno;
+ /*
+ * If we've reached the end, stop.
+ */
+ if (bno == NULLFSBLOCK)
+ break;
+
+ bp_release = 0;
+ bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+ if (!bp) {
+ bp_release = 1;
+ error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ goto error_norelse;
+ }
+ block = XFS_BUF_TO_BLOCK(bp);
+ }
+ if (bp_release) {
+ bp_release = 0;
+ xfs_trans_brelse(NULL, bp);
+ }
+ return;
+
+error0:
+ xfs_warn(mp, "%s: at error0", __func__);
+ if (bp_release)
+ xfs_trans_brelse(NULL, bp);
+error_norelse:
+ xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
+ __func__, i);
+ panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
+ return;
+}
/*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the extent list is already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
+ * Add bmap trace insert entries for all the contents of the extent records.
*/
-STATIC int /* error */
-xfs_bmap_btree_to_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_btree_cur_t *cur, /* btree cursor */
- int *logflagsp, /* inode logging flags */
- int whichfork); /* data or attr fork */
+void
+xfs_bmap_trace_exlist(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extnum_t cnt, /* count of entries in the list */
+ int whichfork, /* data or attr fork */
+ unsigned long caller_ip)
+{
+ xfs_extnum_t idx; /* extent record index */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ int state = 0;
+
+ if (whichfork == XFS_ATTR_FORK)
+ state |= BMAP_ATTRFORK;
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+ for (idx = 0; idx < cnt; idx++)
+ trace_xfs_extlist(ip, idx, whichfork, caller_ip);
+}
-#ifdef DEBUG
/*
- * Check that the extents list for the inode ip is in the right order.
+ * Validate that the bmbt_irecs being returned from bmapi are valid
+ * given the caller's original parameters. Specifically check the
+ * ranges of the returned irecs to ensure that they only extend beyond
+ * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
*/
STATIC void
-xfs_bmap_check_extents(
- xfs_inode_t *ip, /* incore inode pointer */
- int whichfork); /* data or attr fork */
-#endif
+xfs_bmap_validate_ret(
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ int flags,
+ xfs_bmbt_irec_t *mval,
+ int nmap,
+ int ret_nmap)
+{
+ int i; /* index to map values */
+
+ ASSERT(ret_nmap <= nmap);
+
+ for (i = 0; i < ret_nmap; i++) {
+ ASSERT(mval[i].br_blockcount > 0);
+ if (!(flags & XFS_BMAPI_ENTIRE)) {
+ ASSERT(mval[i].br_startoff >= bno);
+ ASSERT(mval[i].br_blockcount <= len);
+ ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
+ bno + len);
+ } else {
+ ASSERT(mval[i].br_startoff < bno + len);
+ ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
+ bno);
+ }
+ ASSERT(i == 0 ||
+ mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
+ mval[i].br_startoff);
+ ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
+ mval[i].br_startblock != HOLESTARTBLOCK);
+ ASSERT(mval[i].br_state == XFS_EXT_NORM ||
+ mval[i].br_state == XFS_EXT_UNWRITTEN);
+ }
+}
+
+#else
+#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0)
+#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
+#endif /* DEBUG */
/*
- * Called by xfs_bmapi to update extent list structure and the btree
- * after removing space (or undoing a delayed allocation).
+ * bmap free list manipulation functions
*/
-STATIC int /* error */
-xfs_bmap_del_extent(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_trans_t *tp, /* current trans pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
- xfs_bmap_free_t *flist, /* list of extents to be freed */
- xfs_btree_cur_t *cur, /* if null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to put in extent list */
- int *logflagsp,/* inode logging flags */
- int whichfork, /* data or attr fork */
- int rsvd); /* OK to allocate reserved blocks */
+
+/*
+ * Add the extent to the list of extents to be free at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+void
+xfs_bmap_add_free(
+ xfs_fsblock_t bno, /* fs block number of extent */
+ xfs_filblks_t len, /* length of extent */
+ xfs_bmap_free_t *flist, /* list of extents */
+ xfs_mount_t *mp) /* mount point structure */
+{
+ xfs_bmap_free_item_t *cur; /* current (next) element */
+ xfs_bmap_free_item_t *new; /* new element */
+ xfs_bmap_free_item_t *prev; /* previous element */
+#ifdef DEBUG
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+
+ ASSERT(bno != NULLFSBLOCK);
+ ASSERT(len > 0);
+ ASSERT(len <= MAXEXTLEN);
+ ASSERT(!isnullstartblock(bno));
+ agno = XFS_FSB_TO_AGNO(mp, bno);
+ agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(agbno < mp->m_sb.sb_agblocks);
+ ASSERT(len < mp->m_sb.sb_agblocks);
+ ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
+#endif
+ ASSERT(xfs_bmap_free_item_zone != NULL);
+ new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+ new->xbfi_startblock = bno;
+ new->xbfi_blockcount = (xfs_extlen_t)len;
+ for (prev = NULL, cur = flist->xbf_first;
+ cur != NULL;
+ prev = cur, cur = cur->xbfi_next) {
+ if (cur->xbfi_startblock >= bno)
+ break;
+ }
+ if (prev)
+ prev->xbfi_next = new;
+ else
+ flist->xbf_first = new;
+ new->xbfi_next = cur;
+ flist->xbf_count++;
+}
/*
* Remove the entry "free" from the free item list. Prev points to the
* previous entry, unless "free" is the head of the list.
*/
-STATIC void
+void
xfs_bmap_del_free(
xfs_bmap_free_t *flist, /* free item list header */
xfs_bmap_free_item_t *prev, /* previous item on list, if any */
- xfs_bmap_free_item_t *free); /* list item to be freed */
+ xfs_bmap_free_item_t *free) /* list item to be freed */
+{
+ if (prev)
+ prev->xbfi_next = free->xbfi_next;
+ else
+ flist->xbf_first = free->xbfi_next;
+ flist->xbf_count--;
+ kmem_zone_free(xfs_bmap_free_item_zone, free);
+}
/*
- * Remove count entries from the extents array for inode "ip", starting
- * at index "idx". Copies the remaining items down over the deleted ones,
- * and gives back the excess memory.
+ * Free up any items left in the list.
*/
-STATIC void
-xfs_bmap_delete_exlist(
- xfs_inode_t *ip, /* incode inode pointer */
- xfs_extnum_t idx, /* starting delete index */
- xfs_extnum_t count, /* count of items to delete */
- int whichfork); /* data or attr fork */
+void
+xfs_bmap_cancel(
+ xfs_bmap_free_t *flist) /* list of bmap_free_items */
+{
+ xfs_bmap_free_item_t *free; /* free list item */
+ xfs_bmap_free_item_t *next;
+
+ if (flist->xbf_count == 0)
+ return;
+ ASSERT(flist->xbf_first != NULL);
+ for (free = flist->xbf_first; free; free = next) {
+ next = free->xbfi_next;
+ xfs_bmap_del_free(flist, NULL, free);
+ }
+ ASSERT(flist->xbf_count == 0);
+}
+
+/*
+ * Inode fork format manipulation functions
+ */
+
+/*
+ * Transform a btree format file with only one leaf node, where the
+ * extents list will fit in the inode, into an extents format file.
+ * Since the file extents are already in-core, all we have to do is
+ * give up the space for the btree root and pitch the leaf block.
+ */
+STATIC int /* error */
+xfs_bmap_btree_to_extents(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_btree_cur_t *cur, /* btree cursor */
+ int *logflagsp, /* inode logging flags */
+ int whichfork) /* data or attr fork */
+{
+ /* REFERENCED */
+ struct xfs_btree_block *cblock;/* child btree block */
+ xfs_fsblock_t cbno; /* child block number */
+ xfs_buf_t *cbp; /* child block's buffer */
+ int error; /* error return value */
+ xfs_ifork_t *ifp; /* inode fork data */
+ xfs_mount_t *mp; /* mount point structure */
+ __be64 *pp; /* ptr to block address */
+ struct xfs_btree_block *rblock;/* root btree block */
+
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+ rblock = ifp->if_broot;
+ ASSERT(be16_to_cpu(rblock->bb_level) == 1);
+ ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
+ ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
+ cbno = be64_to_cpu(*pp);
+ *logflagsp = 0;
+#ifdef DEBUG
+ if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
+ return error;
+#endif
+ error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ return error;
+ cblock = XFS_BUF_TO_BLOCK(cbp);
+ if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
+ return error;
+ xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
+ ip->i_d.di_nblocks--;
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+ xfs_trans_binval(tp, cbp);
+ if (cur->bc_bufs[0] == cbp)
+ cur->bc_bufs[0] = NULL;
+ xfs_iroot_realloc(ip, -1, whichfork);
+ ASSERT(ifp->if_broot == NULL);
+ ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
+ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ return 0;
+}
/*
* Convert an extents-format file into a btree-format file.
@@ -241,197 +752,255 @@ xfs_bmap_extents_to_btree(
xfs_btree_cur_t **curp, /* cursor returned to caller */
int wasdel, /* converting a delayed alloc */
int *logflagsp, /* inode logging flags */
- int whichfork); /* data or attr fork */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_btree_block *ablock; /* allocated (child) bt block */
+ xfs_buf_t *abp; /* buffer for ablock */
+ xfs_alloc_arg_t args; /* allocation arguments */
+ xfs_bmbt_rec_t *arp; /* child record pointer */
+ struct xfs_btree_block *block; /* btree root block */
+ xfs_btree_cur_t *cur; /* bmap btree cursor */
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
+ int error; /* error return value */
+ xfs_extnum_t i, cnt; /* extent record index */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_bmbt_key_t *kp; /* root block key pointer */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_extnum_t nextents; /* number of file extents */
+ xfs_bmbt_ptr_t *pp; /* root block address pointer */
-/*
- * Insert new item(s) in the extent list for inode "ip".
- * Count new items are inserted at offset idx.
- */
-STATIC void
-xfs_bmap_insert_exlist(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* starting index of new items */
- xfs_extnum_t count, /* number of inserted items */
- xfs_bmbt_irec_t *new, /* items to insert */
- int whichfork); /* data or attr fork */
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
-/*
- * Convert a local file to an extents file.
- * This code is sort of bogus, since the file data needs to get
- * logged so it won't be lost. The bmap-level manipulations are ok, though.
- */
-STATIC int /* error */
-xfs_bmap_local_to_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fsblock_t *firstblock, /* first block allocated in xaction */
- xfs_extlen_t total, /* total blocks needed by transaction */
- int *logflagsp, /* inode logging flags */
- int whichfork); /* data or attr fork */
+ /*
+ * Make space in the inode incore.
+ */
+ xfs_iroot_realloc(ip, 1, whichfork);
+ ifp->if_flags |= XFS_IFBROOT;
-/*
- * Search the extents list for the inode, for the extent containing bno.
- * If bno lies in a hole, point to the next entry. If bno lies past eof,
- * *eofp will be set, and *prevp will contain the last entry (null if none).
- * Else, *lastxp will be set to the index of the found
- * entry; *gotp will contain the entry.
- */
-STATIC xfs_bmbt_rec_t * /* pointer to found extent entry */
-xfs_bmap_search_extents(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fileoff_t bno, /* block number searched for */
- int whichfork, /* data or attr fork */
- int *eofp, /* out: end of file found */
- xfs_extnum_t *lastxp, /* out: last extent index */
- xfs_bmbt_irec_t *gotp, /* out: extent entry found */
- xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */
+ /*
+ * Fill in the root.
+ */
+ block = ifp->if_broot;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+ XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino,
+ XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+ XFS_BMAP_MAGIC, 1, 1, ip->i_ino,
+ XFS_BTREE_LONG_PTRS);
-/*
- * Check the last inode extent to determine whether this allocation will result
- * in blocks being allocated at the end of the file. When we allocate new data
- * blocks at the end of the file which do not start at the previous data block,
- * we will try to align the new blocks at stripe unit boundaries.
- */
-STATIC int /* error */
-xfs_bmap_isaeof(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fileoff_t off, /* file offset in fsblocks */
- int whichfork, /* data or attribute fork */
- char *aeof); /* return value */
+ /*
+ * Need a cursor. Can't allocate until bb_level is filled in.
+ */
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_private.b.firstblock = *firstblock;
+ cur->bc_private.b.flist = flist;
+ cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+ /*
+ * Convert to a btree with two levels, one record in root.
+ */
+ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+ memset(&args, 0, sizeof(args));
+ args.tp = tp;
+ args.mp = mp;
+ args.firstblock = *firstblock;
+ if (*firstblock == NULLFSBLOCK) {
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
+ } else if (flist->xbf_low) {
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.fsbno = *firstblock;
+ } else {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.fsbno = *firstblock;
+ }
+ args.minlen = args.maxlen = args.prod = 1;
+ args.wasdel = wasdel;
+ *logflagsp = 0;
+ if ((error = xfs_alloc_vextent(&args))) {
+ xfs_iroot_realloc(ip, -1, whichfork);
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ /*
+ * Allocation can't fail, the space was reserved.
+ */
+ ASSERT(args.fsbno != NULLFSBLOCK);
+ ASSERT(*firstblock == NULLFSBLOCK ||
+ args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
+ (flist->xbf_low &&
+ args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
+ *firstblock = cur->bc_private.b.firstblock = args.fsbno;
+ cur->bc_private.b.allocated++;
+ ip->i_d.di_nblocks++;
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+ abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
+ /*
+ * Fill in the child block.
+ */
+ abp->b_ops = &xfs_bmbt_buf_ops;
+ ablock = XFS_BUF_TO_BLOCK(abp);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+ XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+ XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+ XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+ XFS_BTREE_LONG_PTRS);
-#ifdef XFS_BMAP_TRACE
-/*
- * Add a bmap trace buffer entry. Base routine for the others.
- */
-STATIC void
-xfs_bmap_trace_addentry(
- int opcode, /* operation */
- char *fname, /* function name */
- char *desc, /* operation description */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* index of entry(ies) */
- xfs_extnum_t cnt, /* count of entries, 1 or 2 */
- xfs_bmbt_rec_t *r1, /* first record */
- xfs_bmbt_rec_t *r2, /* second record or null */
- int whichfork); /* data or attr fork */
+ arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ for (cnt = i = 0; i < nextents; i++) {
+ ep = xfs_iext_get_ext(ifp, i);
+ if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
+ arp->l0 = cpu_to_be64(ep->l0);
+ arp->l1 = cpu_to_be64(ep->l1);
+ arp++; cnt++;
+ }
+ }
+ ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
+ xfs_btree_set_numrecs(ablock, cnt);
-/*
- * Add bmap trace entry prior to a call to xfs_bmap_delete_exlist.
- */
-STATIC void
-xfs_bmap_trace_delete(
- char *fname, /* function name */
- char *desc, /* operation description */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* index of entry(entries) deleted */
- xfs_extnum_t cnt, /* count of entries deleted, 1 or 2 */
- int whichfork); /* data or attr fork */
+ /*
+ * Fill in the root key and pointer.
+ */
+ kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+ arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+ kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+ be16_to_cpu(block->bb_level)));
+ *pp = cpu_to_be64(args.fsbno);
-/*
- * Add bmap trace entry prior to a call to xfs_bmap_insert_exlist, or
- * reading in the extents list from the disk (in the btree).
- */
-STATIC void
-xfs_bmap_trace_insert(
- char *fname, /* function name */
- char *desc, /* operation description */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* index of entry(entries) inserted */
- xfs_extnum_t cnt, /* count of entries inserted, 1 or 2 */
- xfs_bmbt_irec_t *r1, /* inserted record 1 */
- xfs_bmbt_irec_t *r2, /* inserted record 2 or null */
- int whichfork); /* data or attr fork */
+ /*
+ * Do all this logging at the end so that
+ * the root is at the right level.
+ */
+ xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
+ xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
+ ASSERT(*curp == NULL);
+ *curp = cur;
+ *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
+ return 0;
+}
/*
- * Add bmap trace entry after updating an extent list entry in place.
+ * Convert a local file to an extents file.
+ * This code is out of bounds for data forks of regular files,
+ * since the file data needs to get logged so things will stay consistent.
+ * (The bmap-level manipulations are ok, though).
*/
-STATIC void
-xfs_bmap_trace_post_update(
- char *fname, /* function name */
- char *desc, /* operation description */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* index of entry updated */
- int whichfork); /* data or attr fork */
+void
+xfs_bmap_local_to_extents_empty(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
-/*
- * Add bmap trace entry prior to updating an extent list entry in place.
- */
-STATIC void
-xfs_bmap_trace_pre_update(
- char *fname, /* function name */
- char *desc, /* operation description */
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+ ASSERT(ifp->if_bytes == 0);
+ ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
+
+ xfs_bmap_forkoff_reset(ip, whichfork);
+ ifp->if_flags &= ~XFS_IFINLINE;
+ ifp->if_flags |= XFS_IFEXTENTS;
+ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+}
+
+
+STATIC int /* error */
+xfs_bmap_local_to_extents(
+ xfs_trans_t *tp, /* transaction pointer */
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* index of entry to be updated */
- int whichfork); /* data or attr fork */
+ xfs_fsblock_t *firstblock, /* first block allocated in xaction */
+ xfs_extlen_t total, /* total blocks needed by transaction */
+ int *logflagsp, /* inode logging flags */
+ int whichfork,
+ void (*init_fn)(struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp))
+{
+ int error = 0;
+ int flags; /* logging flags returned */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_alloc_arg_t args; /* allocation arguments */
+ xfs_buf_t *bp; /* buffer for extent block */
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
-#else
-#define xfs_bmap_trace_delete(f,d,ip,i,c,w)
-#define xfs_bmap_trace_insert(f,d,ip,i,c,r1,r2,w)
-#define xfs_bmap_trace_post_update(f,d,ip,i,w)
-#define xfs_bmap_trace_pre_update(f,d,ip,i,w)
-#endif /* XFS_BMAP_TRACE */
+ /*
+ * We don't want to deal with the case of keeping inode data inline yet.
+ * So sending the data fork of a regular inode is invalid.
+ */
+ ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
-/*
- * Compute the worst-case number of indirect blocks that will be used
- * for ip's delayed extent of length "len".
- */
-STATIC xfs_filblks_t
-xfs_bmap_worst_indlen(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_filblks_t len); /* delayed extent length */
+ if (!ifp->if_bytes) {
+ xfs_bmap_local_to_extents_empty(ip, whichfork);
+ flags = XFS_ILOG_CORE;
+ goto done;
+ }
-#ifdef DEBUG
-/*
- * Perform various validation checks on the values being returned
- * from xfs_bmapi().
- */
-STATIC void
-xfs_bmap_validate_ret(
- xfs_fileoff_t bno,
- xfs_filblks_t len,
- int flags,
- xfs_bmbt_irec_t *mval,
- int nmap,
- int ret_nmap);
-#else
-#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
-#endif /* DEBUG */
+ flags = 0;
+ error = 0;
+ ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) ==
+ XFS_IFINLINE);
+ memset(&args, 0, sizeof(args));
+ args.tp = tp;
+ args.mp = ip->i_mount;
+ args.firstblock = *firstblock;
+ /*
+ * Allocate a block. We know we need only one, since the
+ * file currently fits in an inode.
+ */
+ if (*firstblock == NULLFSBLOCK) {
+ args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ } else {
+ args.fsbno = *firstblock;
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ }
+ args.total = total;
+ args.minlen = args.maxlen = args.prod = 1;
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto done;
-#if defined(XFS_RW_TRACE)
-STATIC void
-xfs_bunmap_trace(
- xfs_inode_t *ip,
- xfs_fileoff_t bno,
- xfs_filblks_t len,
- int flags,
- inst_t *ra);
-#else
-#define xfs_bunmap_trace(ip, bno, len, flags, ra)
-#endif /* XFS_RW_TRACE */
+ /* Can't fail, the space was reserved. */
+ ASSERT(args.fsbno != NULLFSBLOCK);
+ ASSERT(args.len == 1);
+ *firstblock = args.fsbno;
+ bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
-STATIC int
-xfs_bmap_count_tree(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_fsblock_t blockno,
- int levelin,
- int *count);
+ /* initialise the block and copy the data */
+ init_fn(tp, bp, ip, ifp);
-STATIC int
-xfs_bmap_count_leaves(
- xfs_bmbt_rec_t *frp,
- int numrecs,
- int *count);
+ /* account for the change in fork size and log everything */
+ xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+ xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
+ xfs_bmap_local_to_extents_empty(ip, whichfork);
+ flags |= XFS_ILOG_CORE;
-STATIC int
-xfs_bmap_disk_count_leaves(
- xfs_bmbt_rec_t *frp,
- int numrecs,
- int *count);
+ xfs_iext_add(ifp, 0, 1);
+ ep = xfs_iext_get_ext(ifp, 0);
+ xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
+ trace_xfs_bmap_post_update(ip, 0,
+ whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
+ _THIS_IP_);
+ XFS_IFORK_NEXT_SET(ip, whichfork, 1);
+ ip->i_d.di_nblocks = 1;
+ xfs_trans_mod_dquot_byino(tp, ip,
+ XFS_TRANS_DQ_BCOUNT, 1L);
+ flags |= xfs_ilog_fext(whichfork);
-/*
- * Bmap internal routines.
- */
+done:
+ *logflagsp = flags;
+ return error;
+}
/*
* Called from xfs_bmap_add_attrfork to handle btree format files.
@@ -453,14 +1022,14 @@ xfs_bmap_add_attrfork_btree(
if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
*flags |= XFS_ILOG_DBROOT;
else {
- cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
- XFS_DATA_FORK);
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
cur->bc_private.b.flist = flist;
cur->bc_private.b.firstblock = *firstblock;
if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
goto error0;
- ASSERT(stat == 1); /* must be at least one entry */
- if ((error = xfs_bmbt_newroot(cur, flags, &stat)))
+ /* must be at least one entry */
+ XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
+ if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
goto error0;
if (stat == 0) {
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
@@ -504,7 +1073,15 @@ xfs_bmap_add_attrfork_extents(
}
/*
- * Called from xfs_bmap_add_attrfork to handle local format files.
+ * Called from xfs_bmap_add_attrfork to handle local format files. Each
+ * different data fork content type needs a different callout to do the
+ * conversion. Some are basic and only require special block initialisation
+ * callouts for the data formating, others (directories) are so specialised they
+ * handle everything themselves.
+ *
+ * XXX (dgc): investigate whether directory conversion can use the generic
+ * formatting callout. It should be possible - it's just a very complex
+ * formatter.
*/
STATIC int /* error */
xfs_bmap_add_attrfork_local(
@@ -515,623 +1092,1065 @@ xfs_bmap_add_attrfork_local(
int *flags) /* inode logging flags */
{
xfs_da_args_t dargs; /* args for dir/attr code */
- int error; /* error return value */
- xfs_mount_t *mp; /* mount structure pointer */
if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
return 0;
- if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
- mp = ip->i_mount;
+
+ if (S_ISDIR(ip->i_d.di_mode)) {
memset(&dargs, 0, sizeof(dargs));
+ dargs.geo = ip->i_mount->m_dir_geo;
dargs.dp = ip;
dargs.firstblock = firstblock;
dargs.flist = flist;
- dargs.total = mp->m_dirblkfsbs;
+ dargs.total = dargs.geo->fsbcount;
dargs.whichfork = XFS_DATA_FORK;
dargs.trans = tp;
- error = XFS_DIR_SHORTFORM_TO_SINGLE(mp, &dargs);
- } else
- error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags,
- XFS_DATA_FORK);
+ return xfs_dir2_sf_to_block(&dargs);
+ }
+
+ if (S_ISLNK(ip->i_d.di_mode))
+ return xfs_bmap_local_to_extents(tp, ip, firstblock, 1,
+ flags, XFS_DATA_FORK,
+ xfs_symlink_local_to_remote);
+
+ /* should only be called for types that support local format data */
+ ASSERT(0);
+ return EFSCORRUPTED;
+}
+
+/*
+ * Convert inode from non-attributed to attributed.
+ * Must not be in a transaction, ip must not be locked.
+ */
+int /* error code */
+xfs_bmap_add_attrfork(
+ xfs_inode_t *ip, /* incore inode pointer */
+ int size, /* space new attribute needs */
+ int rsvd) /* xact may use reserved blks */
+{
+ xfs_fsblock_t firstblock; /* 1st block/ag allocated */
+ xfs_bmap_free_t flist; /* freed extent records */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_trans_t *tp; /* transaction pointer */
+ int blks; /* space reservation */
+ int version = 1; /* superblock attr version */
+ int committed; /* xaction was committed */
+ int logflags; /* logging flags */
+ int error; /* error return value */
+ int cancel_flags = 0;
+
+ ASSERT(XFS_IFORK_Q(ip) == 0);
+
+ mp = ip->i_mount;
+ ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+ tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
+ blks = XFS_ADDAFORK_SPACE_RES(mp);
+ if (rsvd)
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+ cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
+ XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+ XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ goto trans_cancel;
+ cancel_flags |= XFS_TRANS_ABORT;
+ if (XFS_IFORK_Q(ip))
+ goto trans_cancel;
+ if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
+ /*
+ * For inodes coming from pre-6.2 filesystems.
+ */
+ ASSERT(ip->i_d.di_aformat == 0);
+ ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+ }
+ ASSERT(ip->i_d.di_anextents == 0);
+
+ xfs_trans_ijoin(tp, ip, 0);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ switch (ip->i_d.di_format) {
+ case XFS_DINODE_FMT_DEV:
+ ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+ break;
+ case XFS_DINODE_FMT_UUID:
+ ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+ if (!ip->i_d.di_forkoff)
+ ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
+ else if (mp->m_flags & XFS_MOUNT_ATTR2)
+ version = 2;
+ break;
+ default:
+ ASSERT(0);
+ error = XFS_ERROR(EINVAL);
+ goto trans_cancel;
+ }
+
+ ASSERT(ip->i_afp == NULL);
+ ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+ ip->i_afp->if_flags = XFS_IFEXTENTS;
+ logflags = 0;
+ xfs_bmap_init(&flist, &firstblock);
+ switch (ip->i_d.di_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
+ &logflags);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
+ &flist, &logflags);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
+ &logflags);
+ break;
+ default:
+ error = 0;
+ break;
+ }
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ if (error)
+ goto bmap_cancel;
+ if (!xfs_sb_version_hasattr(&mp->m_sb) ||
+ (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
+ __int64_t sbfields = 0;
+
+ spin_lock(&mp->m_sb_lock);
+ if (!xfs_sb_version_hasattr(&mp->m_sb)) {
+ xfs_sb_version_addattr(&mp->m_sb);
+ sbfields |= XFS_SB_VERSIONNUM;
+ }
+ if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
+ xfs_sb_version_addattr2(&mp->m_sb);
+ sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+ }
+ if (sbfields) {
+ spin_unlock(&mp->m_sb_lock);
+ xfs_mod_sb(tp, sbfields);
+ } else
+ spin_unlock(&mp->m_sb_lock);
+ }
+
+ error = xfs_bmap_finish(&tp, &flist, &committed);
+ if (error)
+ goto bmap_cancel;
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+bmap_cancel:
+ xfs_bmap_cancel(&flist);
+trans_cancel:
+ xfs_trans_cancel(tp, cancel_flags);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
/*
- * Called by xfs_bmapi to update extent list structure and the btree
- * after allocating space (or doing a delayed allocation).
+ * Internal and external extent tree search functions.
*/
-STATIC int /* error */
-xfs_bmap_add_extent(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
- xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to put in extent list */
- xfs_fsblock_t *first, /* pointer to firstblock variable */
- xfs_bmap_free_t *flist, /* list of extents to be freed */
- int *logflagsp, /* inode logging flags */
- int whichfork, /* data or attr fork */
- int rsvd) /* OK to use reserved data blocks */
+
+/*
+ * Read in the extents to if_extents.
+ * All inode fields are set up by caller, we just traverse the btree
+ * and copy the records in. If the file system cannot contain unwritten
+ * extents, the records are checked for no "state" flags.
+ */
+int /* error */
+xfs_bmap_read_extents(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode */
+ int whichfork) /* data or attr fork */
{
- xfs_btree_cur_t *cur; /* btree cursor or null */
- xfs_filblks_t da_new; /* new count del alloc blocks used */
- xfs_filblks_t da_old; /* old count del alloc blocks used */
+ struct xfs_btree_block *block; /* current btree block */
+ xfs_fsblock_t bno; /* block # of "block" */
+ xfs_buf_t *bp; /* buffer for "block" */
int error; /* error return value */
-#ifdef XFS_BMAP_TRACE
- static char fname[] = "xfs_bmap_add_extent";
-#endif
- xfs_ifork_t *ifp; /* inode fork ptr */
- int logflags; /* returned value */
- xfs_extnum_t nextents; /* number of extents in file now */
+ xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */
+ xfs_extnum_t i, j; /* index into the extents list */
+ xfs_ifork_t *ifp; /* fork structure */
+ int level; /* btree level, for checking */
+ xfs_mount_t *mp; /* file system mount structure */
+ __be64 *pp; /* pointer to block address */
+ /* REFERENCED */
+ xfs_extnum_t room; /* number of entries there's room for */
- XFS_STATS_INC(xs_add_exlist);
- cur = *curp;
+ bno = NULLFSBLOCK;
+ mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- ASSERT(idx <= nextents);
- da_old = da_new = 0;
- error = 0;
+ exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
+ XFS_EXTFMT_INODE(ip);
+ block = ifp->if_broot;
/*
- * This is the first extent added to a new/empty file.
- * Special case this one, so other routines get to assume there are
- * already extents in the list.
+ * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
*/
- if (nextents == 0) {
- xfs_bmap_trace_insert(fname, "insert empty", ip, 0, 1, new,
- NULL, whichfork);
- xfs_bmap_insert_exlist(ip, 0, 1, new, whichfork);
- ASSERT(cur == NULL);
- ifp->if_lastex = 0;
- if (!ISNULLSTARTBLOCK(new->br_startblock)) {
- XFS_IFORK_NEXT_SET(ip, whichfork, 1);
- logflags = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
- } else
- logflags = 0;
- }
+ level = be16_to_cpu(block->bb_level);
+ ASSERT(level > 0);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+ bno = be64_to_cpu(*pp);
+ ASSERT(bno != NULLDFSBNO);
+ ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
/*
- * Any kind of new delayed allocation goes here.
+ * Go down the tree until leaf level is reached, following the first
+ * pointer (leftmost) at each level.
*/
- else if (ISNULLSTARTBLOCK(new->br_startblock)) {
- if (cur)
- ASSERT((cur->bc_private.b.flags &
- XFS_BTCUR_BPRV_WASDEL) == 0);
- if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, cur, new,
- &logflags, rsvd)))
- goto done;
+ while (level-- > 0) {
+ error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+ if (error)
+ return error;
+ block = XFS_BUF_TO_BLOCK(bp);
+ XFS_WANT_CORRUPTED_GOTO(
+ xfs_bmap_sanity_check(mp, bp, level),
+ error0);
+ if (level == 0)
+ break;
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+ bno = be64_to_cpu(*pp);
+ XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+ xfs_trans_brelse(tp, bp);
}
/*
- * Real allocation off the end of the file.
+ * Here with bp and block set to the leftmost leaf node in the tree.
*/
- else if (idx == nextents) {
- if (cur)
- ASSERT((cur->bc_private.b.flags &
- XFS_BTCUR_BPRV_WASDEL) == 0);
- if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
- &logflags, whichfork)))
- goto done;
- } else {
- xfs_bmbt_irec_t prev; /* old extent at offset idx */
+ room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ i = 0;
+ /*
+ * Loop over all leaf nodes. Copy information to the extent records.
+ */
+ for (;;) {
+ xfs_bmbt_rec_t *frp;
+ xfs_fsblock_t nextbno;
+ xfs_extnum_t num_recs;
+ xfs_extnum_t start;
+ num_recs = xfs_btree_get_numrecs(block);
+ if (unlikely(i + num_recs > room)) {
+ ASSERT(i + num_recs <= room);
+ xfs_warn(ip->i_mount,
+ "corrupt dinode %Lu, (btree extents).",
+ (unsigned long long) ip->i_ino);
+ XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
+ XFS_ERRLEVEL_LOW, ip->i_mount, block);
+ goto error0;
+ }
+ XFS_WANT_CORRUPTED_GOTO(
+ xfs_bmap_sanity_check(mp, bp, 0),
+ error0);
/*
- * Get the record referred to by idx.
+ * Read-ahead the next leaf block, if any.
*/
- xfs_bmbt_get_all(&ifp->if_u1.if_extents[idx], &prev);
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+ if (nextbno != NULLFSBLOCK)
+ xfs_btree_reada_bufl(mp, nextbno, 1,
+ &xfs_bmbt_buf_ops);
/*
- * If it's a real allocation record, and the new allocation ends
- * after the start of the referred to record, then we're filling
- * in a delayed or unwritten allocation with a real one, or
- * converting real back to unwritten.
+ * Copy records into the extent records.
*/
- if (!ISNULLSTARTBLOCK(new->br_startblock) &&
- new->br_startoff + new->br_blockcount > prev.br_startoff) {
- if (prev.br_state != XFS_EXT_UNWRITTEN &&
- ISNULLSTARTBLOCK(prev.br_startblock)) {
- da_old = STARTBLOCKVAL(prev.br_startblock);
- if (cur)
- ASSERT(cur->bc_private.b.flags &
- XFS_BTCUR_BPRV_WASDEL);
- if ((error = xfs_bmap_add_extent_delay_real(ip,
- idx, &cur, new, &da_new, first, flist,
- &logflags, rsvd)))
- goto done;
- } else if (new->br_state == XFS_EXT_NORM) {
- ASSERT(new->br_state == XFS_EXT_NORM);
- if ((error = xfs_bmap_add_extent_unwritten_real(
- ip, idx, &cur, new, &logflags)))
- goto done;
- } else {
- ASSERT(new->br_state == XFS_EXT_UNWRITTEN);
- if ((error = xfs_bmap_add_extent_unwritten_real(
- ip, idx, &cur, new, &logflags)))
- goto done;
+ frp = XFS_BMBT_REC_ADDR(mp, block, 1);
+ start = i;
+ for (j = 0; j < num_recs; j++, i++, frp++) {
+ xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
+ trp->l0 = be64_to_cpu(frp->l0);
+ trp->l1 = be64_to_cpu(frp->l1);
+ }
+ if (exntf == XFS_EXTFMT_NOSTATE) {
+ /*
+ * Check all attribute bmap btree records and
+ * any "older" data bmap btree records for a
+ * set bit in the "extent flag" position.
+ */
+ if (unlikely(xfs_check_nostate_extents(ifp,
+ start, num_recs))) {
+ XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
+ XFS_ERRLEVEL_LOW,
+ ip->i_mount);
+ goto error0;
}
- ASSERT(*curp == cur || *curp == NULL);
}
+ xfs_trans_brelse(tp, bp);
+ bno = nextbno;
/*
- * Otherwise we're filling in a hole with an allocation.
+ * If we've reached the end, stop.
*/
- else {
- if (cur)
- ASSERT((cur->bc_private.b.flags &
- XFS_BTCUR_BPRV_WASDEL) == 0);
- if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
- new, &logflags, whichfork)))
- goto done;
- }
+ if (bno == NULLFSBLOCK)
+ break;
+ error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+ if (error)
+ return error;
+ block = XFS_BUF_TO_BLOCK(bp);
}
+ ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+ ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
+ XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
+ return 0;
+error0:
+ xfs_trans_brelse(tp, bp);
+ return XFS_ERROR(EFSCORRUPTED);
+}
+
+
+/*
+ * Search the extent records for the entry containing block bno.
+ * If bno lies in a hole, point to the next entry. If bno lies
+ * past eof, *eofp will be set, and *prevp will contain the last
+ * entry (null if none). Else, *lastxp will be set to the index
+ * of the found entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
+xfs_bmap_search_multi_extents(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_fileoff_t bno, /* block number searched for */
+ int *eofp, /* out: end of file found */
+ xfs_extnum_t *lastxp, /* out: last extent index */
+ xfs_bmbt_irec_t *gotp, /* out: extent entry found */
+ xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
+{
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
+ xfs_extnum_t lastx; /* last extent index */
- ASSERT(*curp == cur || *curp == NULL);
/*
- * Convert to a btree if necessary.
+ * Initialize the extent entry structure to catch access to
+ * uninitialized br_startblock field.
*/
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
- int tmp_logflags; /* partial log flag return val */
+ gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
+ gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
+ gotp->br_state = XFS_EXT_INVALID;
+#if XFS_BIG_BLKNOS
+ gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
+#else
+ gotp->br_startblock = 0xffffa5a5;
+#endif
+ prevp->br_startoff = NULLFILEOFF;
- ASSERT(cur == NULL);
- error = xfs_bmap_extents_to_btree(ip->i_transp, ip, first,
- flist, &cur, da_old > 0, &tmp_logflags, whichfork);
- logflags |= tmp_logflags;
- if (error)
- goto done;
+ ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
+ if (lastx > 0) {
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp);
+ }
+ if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
+ xfs_bmbt_get_all(ep, gotp);
+ *eofp = 0;
+ } else {
+ if (lastx > 0) {
+ *gotp = *prevp;
+ }
+ *eofp = 1;
+ ep = NULL;
+ }
+ *lastxp = lastx;
+ return ep;
+}
+
+/*
+ * Search the extents list for the inode, for the extent containing bno.
+ * If bno lies in a hole, point to the next entry. If bno lies past eof,
+ * *eofp will be set, and *prevp will contain the last entry (null if none).
+ * Else, *lastxp will be set to the index of the found
+ * entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
+xfs_bmap_search_extents(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fileoff_t bno, /* block number searched for */
+ int fork, /* data or attr fork */
+ int *eofp, /* out: end of file found */
+ xfs_extnum_t *lastxp, /* out: last extent index */
+ xfs_bmbt_irec_t *gotp, /* out: extent entry found */
+ xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
+{
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
+
+ XFS_STATS_INC(xs_look_exlist);
+ ifp = XFS_IFORK_PTR(ip, fork);
+
+ ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
+
+ if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
+ !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
+ xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+ "Access to block zero in inode %llu "
+ "start_block: %llx start_off: %llx "
+ "blkcnt: %llx extent-state: %x lastx: %x",
+ (unsigned long long)ip->i_ino,
+ (unsigned long long)gotp->br_startblock,
+ (unsigned long long)gotp->br_startoff,
+ (unsigned long long)gotp->br_blockcount,
+ gotp->br_state, *lastxp);
+ *lastxp = NULLEXTNUM;
+ *eofp = 1;
+ return NULL;
+ }
+ return ep;
+}
+
+/*
+ * Returns the file-relative block number of the first unused block(s)
+ * in the file with at least "len" logically contiguous blocks free.
+ * This is the lowest-address hole if the file has holes, else the first block
+ * past the end of file.
+ * Return 0 if the file is currently local (in-inode).
+ */
+int /* error */
+xfs_bmap_first_unused(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode */
+ xfs_extlen_t len, /* size of hole to find */
+ xfs_fileoff_t *first_unused, /* unused block */
+ int whichfork) /* data or attr fork */
+{
+ int error; /* error return value */
+ int idx; /* extent record index */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_fileoff_t lastaddr; /* last block number seen */
+ xfs_fileoff_t lowest; /* lowest useful block */
+ xfs_fileoff_t max; /* starting useful block */
+ xfs_fileoff_t off; /* offset for this block */
+ xfs_extnum_t nextents; /* number of extent entries */
+
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
+ XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
+ XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+ *first_unused = 0;
+ return 0;
+ }
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+ (error = xfs_iread_extents(tp, ip, whichfork)))
+ return error;
+ lowest = *first_unused;
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
+ xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
+ off = xfs_bmbt_get_startoff(ep);
+ /*
+ * See if the hole before this extent will work.
+ */
+ if (off >= lowest + len && off - max >= len) {
+ *first_unused = max;
+ return 0;
+ }
+ lastaddr = off + xfs_bmbt_get_blockcount(ep);
+ max = XFS_FILEOFF_MAX(lastaddr, lowest);
+ }
+ *first_unused = max;
+ return 0;
+}
+
+/*
+ * Returns the file-relative block number of the last block - 1 before
+ * last_block (input value) in the file.
+ * This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ */
+int /* error */
+xfs_bmap_last_before(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode */
+ xfs_fileoff_t *last_block, /* last block */
+ int whichfork) /* data or attr fork */
+{
+ xfs_fileoff_t bno; /* input file offset */
+ int eof; /* hit end of file */
+ xfs_bmbt_rec_host_t *ep; /* pointer to last extent */
+ int error; /* error return value */
+ xfs_bmbt_irec_t got; /* current extent value */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_extnum_t lastx; /* last extent used */
+ xfs_bmbt_irec_t prev; /* previous extent value */
+
+ if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
+ return XFS_ERROR(EIO);
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+ *last_block = 0;
+ return 0;
+ }
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+ (error = xfs_iread_extents(tp, ip, whichfork)))
+ return error;
+ bno = *last_block - 1;
+ ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+ &prev);
+ if (eof || xfs_bmbt_get_startoff(ep) > bno) {
+ if (prev.br_startoff == NULLFILEOFF)
+ *last_block = 0;
+ else
+ *last_block = prev.br_startoff + prev.br_blockcount;
}
/*
- * Adjust for changes in reserved delayed indirect blocks.
- * Nothing to do for disk quotas here.
+ * Otherwise *last_block is already the right answer.
*/
- if (da_old || da_new) {
- xfs_filblks_t nblks;
-
- nblks = da_new;
- if (cur)
- nblks += cur->bc_private.b.allocated;
- ASSERT(nblks <= da_old);
- if (nblks < da_old)
- xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS,
- (int)(da_old - nblks), rsvd);
+ return 0;
+}
+
+int
+xfs_bmap_last_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *rec,
+ int *is_empty)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ int error;
+ int nextents;
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+ }
+
+ nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ if (nextents == 0) {
+ *is_empty = 1;
+ return 0;
}
+
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
+ *is_empty = 0;
+ return 0;
+}
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file. When we allocate new data
+ * blocks at the end of the file which do not start at the previous data block,
+ * we will try to align the new blocks at stripe unit boundaries.
+ *
+ * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be
+ * at, or past the EOF.
+ */
+STATIC int
+xfs_bmap_isaeof(
+ struct xfs_bmalloca *bma,
+ int whichfork)
+{
+ struct xfs_bmbt_irec rec;
+ int is_empty;
+ int error;
+
+ bma->aeof = 0;
+ error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
+ &is_empty);
+ if (error)
+ return error;
+
+ if (is_empty) {
+ bma->aeof = 1;
+ return 0;
+ }
+
/*
- * Clear out the allocated field, done with it now in any case.
+ * Check if we are allocation or past the last extent, or at least into
+ * the last delayed allocated extent.
*/
- if (cur) {
- cur->bc_private.b.allocated = 0;
- *curp = cur;
- }
-done:
-#ifdef DEBUG
- if (!error)
- xfs_bmap_check_leaf_extents(*curp, ip, whichfork);
-#endif
- *logflagsp = logflags;
- return error;
+ bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
+ (bma->offset >= rec.br_startoff &&
+ isnullstartblock(rec.br_startblock));
+ return 0;
+}
+
+/*
+ * Returns the file-relative block number of the first block past eof in
+ * the file. This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ */
+int
+xfs_bmap_last_offset(
+ struct xfs_inode *ip,
+ xfs_fileoff_t *last_block,
+ int whichfork)
+{
+ struct xfs_bmbt_irec rec;
+ int is_empty;
+ int error;
+
+ *last_block = 0;
+
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
+ return 0;
+
+ if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ return XFS_ERROR(EIO);
+
+ error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
+ if (error || is_empty)
+ return error;
+
+ *last_block = rec.br_startoff + rec.br_blockcount;
+ return 0;
+}
+
+/*
+ * Returns whether the selected fork of the inode has exactly one
+ * block or not. For the data fork we check this matches di_size,
+ * implying the file's range is 0..bsize-1.
+ */
+int /* 1=>1 block, 0=>otherwise */
+xfs_bmap_one_block(
+ xfs_inode_t *ip, /* incore inode */
+ int whichfork) /* data or attr fork */
+{
+ xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ int rval; /* return value */
+ xfs_bmbt_irec_t s; /* internal version of extent */
+
+#ifndef DEBUG
+ if (whichfork == XFS_DATA_FORK)
+ return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
+#endif /* !DEBUG */
+ if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
+ return 0;
+ if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ return 0;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+ ep = xfs_iext_get_ext(ifp, 0);
+ xfs_bmbt_get_all(ep, &s);
+ rval = s.br_startoff == 0 && s.br_blockcount == 1;
+ if (rval && whichfork == XFS_DATA_FORK)
+ ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
+ return rval;
}
/*
- * Called by xfs_bmap_add_extent to handle cases converting a delayed
- * allocation to a real allocation.
+ * Extent tree manipulation functions used during allocation.
+ */
+
+/*
+ * Convert a delayed allocation to a real allocation.
*/
STATIC int /* error */
xfs_bmap_add_extent_delay_real(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
- xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to put in extent list */
- xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
- xfs_fsblock_t *first, /* pointer to firstblock variable */
- xfs_bmap_free_t *flist, /* list of extents to be freed */
- int *logflagsp, /* inode logging flags */
- int rsvd) /* OK to use reserved data block allocation */
+ struct xfs_bmalloca *bma)
{
- xfs_bmbt_rec_t *base; /* base of extent entry list */
- xfs_btree_cur_t *cur; /* btree cursor */
+ struct xfs_bmbt_irec *new = &bma->got;
int diff; /* temp value */
- xfs_bmbt_rec_t *ep; /* extent entry for idx */
+ xfs_bmbt_rec_host_t *ep; /* extent entry for idx */
int error; /* error return value */
-#ifdef XFS_BMAP_TRACE
- static char fname[] = "xfs_bmap_add_extent_delay_real";
-#endif
int i; /* temp state */
+ xfs_ifork_t *ifp; /* inode fork pointer */
xfs_fileoff_t new_endoff; /* end offset of new entry */
xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
int state = 0;/* state bits, accessed thru macros */
- xfs_filblks_t temp; /* value for dnew calculations */
- xfs_filblks_t temp2; /* value for dnew calculations */
+ xfs_filblks_t da_new; /* new count del alloc blocks used */
+ xfs_filblks_t da_old; /* old count del alloc blocks used */
+ xfs_filblks_t temp=0; /* value for da_new calculations */
+ xfs_filblks_t temp2=0;/* value for da_new calculations */
int tmp_rval; /* partial logging flags */
- enum { /* bit number definitions for state */
- LEFT_CONTIG, RIGHT_CONTIG,
- LEFT_FILLING, RIGHT_FILLING,
- LEFT_DELAY, RIGHT_DELAY,
- LEFT_VALID, RIGHT_VALID
- };
+
+ ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
+
+ ASSERT(bma->idx >= 0);
+ ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(!isnullstartblock(new->br_startblock));
+ ASSERT(!bma->cur ||
+ (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+
+ XFS_STATS_INC(xs_add_exlist);
#define LEFT r[0]
#define RIGHT r[1]
#define PREV r[2]
-#define MASK(b) (1 << (b))
-#define MASK2(a,b) (MASK(a) | MASK(b))
-#define MASK3(a,b,c) (MASK2(a,b) | MASK(c))
-#define MASK4(a,b,c,d) (MASK3(a,b,c) | MASK(d))
-#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
-#define STATE_TEST(b) (state & MASK(b))
-#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \
- ((state &= ~MASK(b)), 0))
-#define SWITCH_STATE \
- (state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG))
/*
* Set up a bunch of variables to make the tests simpler.
*/
- cur = *curp;
- base = ip->i_df.if_u1.if_extents;
- ep = &base[idx];
+ ep = xfs_iext_get_ext(ifp, bma->idx);
xfs_bmbt_get_all(ep, &PREV);
new_endoff = new->br_startoff + new->br_blockcount;
ASSERT(PREV.br_startoff <= new->br_startoff);
ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
+ da_old = startblockval(PREV.br_startblock);
+ da_new = 0;
+
/*
* Set flags determining what part of the previous delayed allocation
* extent is being replaced by a real allocation.
*/
- STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff);
- STATE_SET(RIGHT_FILLING,
- PREV.br_startoff + PREV.br_blockcount == new_endoff);
+ if (PREV.br_startoff == new->br_startoff)
+ state |= BMAP_LEFT_FILLING;
+ if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+ state |= BMAP_RIGHT_FILLING;
+
/*
* Check and set flags if this segment has a left neighbor.
* Don't set contiguous if the combined extent would be too large.
*/
- if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
- xfs_bmbt_get_all(ep - 1, &LEFT);
- STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock));
+ if (bma->idx > 0) {
+ state |= BMAP_LEFT_VALID;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT);
+
+ if (isnullstartblock(LEFT.br_startblock))
+ state |= BMAP_LEFT_DELAY;
}
- STATE_SET(LEFT_CONTIG,
- STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
- LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
- LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
- LEFT.br_state == new->br_state &&
- LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN);
+
+ if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+ LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+ LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+ LEFT.br_state == new->br_state &&
+ LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
/*
* Check and set flags if this segment has a right neighbor.
* Don't set contiguous if the combined extent would be too large.
* Also check for all-three-contiguous being too large.
*/
- if (STATE_SET_TEST(RIGHT_VALID,
- idx <
- ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
- xfs_bmbt_get_all(ep + 1, &RIGHT);
- STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock));
+ if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+ state |= BMAP_RIGHT_VALID;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
+
+ if (isnullstartblock(RIGHT.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
}
- STATE_SET(RIGHT_CONTIG,
- STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
- new_endoff == RIGHT.br_startoff &&
- new->br_startblock + new->br_blockcount ==
- RIGHT.br_startblock &&
- new->br_state == RIGHT.br_state &&
- new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
- ((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) !=
- MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) ||
- LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
- <= MAXEXTLEN));
+
+ if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+ new_endoff == RIGHT.br_startoff &&
+ new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+ new->br_state == RIGHT.br_state &&
+ new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+ ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+ BMAP_RIGHT_FILLING)) !=
+ (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+ BMAP_RIGHT_FILLING) ||
+ LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+ <= MAXEXTLEN))
+ state |= BMAP_RIGHT_CONTIG;
+
error = 0;
/*
* Switch out based on the FILLING and CONTIG state bits.
*/
- switch (SWITCH_STATE) {
-
- case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+ switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+ BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+ BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
/*
* Filling in all of a previously delayed allocation extent.
* The left and right neighbors are both contiguous with new.
*/
- xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1,
- XFS_DATA_FORK);
- xfs_bmbt_set_blockcount(ep - 1,
+ bma->idx--;
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
LEFT.br_blockcount + PREV.br_blockcount +
RIGHT.br_blockcount);
- xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1,
- XFS_DATA_FORK);
- xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2,
- XFS_DATA_FORK);
- xfs_bmap_delete_exlist(ip, idx, 2, XFS_DATA_FORK);
- ip->i_df.if_lastex = idx - 1;
- ip->i_d.di_nextents--;
- if (cur == NULL)
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+ xfs_iext_remove(bma->ip, bma->idx + 1, 2, state);
+ bma->ip->i_d.di_nextents--;
+ if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+ error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
RIGHT.br_startblock,
- RIGHT.br_blockcount, &i)))
+ RIGHT.br_blockcount, &i);
+ if (error)
goto done;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_delete(cur, &i)))
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ error = xfs_btree_delete(bma->cur, &i);
+ if (error)
goto done;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ error = xfs_btree_decrement(bma->cur, 0, &i);
+ if (error)
goto done;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount +
PREV.br_blockcount +
- RIGHT.br_blockcount, LEFT.br_state)))
+ RIGHT.br_blockcount, LEFT.br_state);
+ if (error)
goto done;
}
- *dnew = 0;
break;
- case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
/*
* Filling in all of a previously delayed allocation extent.
* The left neighbor is contiguous, the right is not.
*/
- xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1,
- XFS_DATA_FORK);
- xfs_bmbt_set_blockcount(ep - 1,
+ bma->idx--;
+
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
LEFT.br_blockcount + PREV.br_blockcount);
- xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1,
- XFS_DATA_FORK);
- ip->i_df.if_lastex = idx - 1;
- xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1,
- XFS_DATA_FORK);
- xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK);
- if (cur == NULL)
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+ xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+ if (bma->cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff,
+ error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
LEFT.br_startblock, LEFT.br_blockcount,
- &i)))
+ &i);
+ if (error)
goto done;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount +
- PREV.br_blockcount, LEFT.br_state)))
+ PREV.br_blockcount, LEFT.br_state);
+ if (error)
goto done;
}
- *dnew = 0;
break;
- case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
/*
* Filling in all of a previously delayed allocation extent.
* The right neighbor is contiguous, the left is not.
*/
- xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx,
- XFS_DATA_FORK);
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
xfs_bmbt_set_startblock(ep, new->br_startblock);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount + RIGHT.br_blockcount);
- xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx,
- XFS_DATA_FORK);
- ip->i_df.if_lastex = idx;
- xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1,
- XFS_DATA_FORK);
- xfs_bmap_delete_exlist(ip, idx + 1, 1, XFS_DATA_FORK);
- if (cur == NULL)
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+ xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+ if (bma->cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+ error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
RIGHT.br_startblock,
- RIGHT.br_blockcount, &i)))
+ RIGHT.br_blockcount, &i);
+ if (error)
goto done;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
new->br_startblock,
PREV.br_blockcount +
- RIGHT.br_blockcount, PREV.br_state)))
+ RIGHT.br_blockcount, PREV.br_state);
+ if (error)
goto done;
}
- *dnew = 0;
break;
- case MASK2(LEFT_FILLING, RIGHT_FILLING):
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
/*
* Filling in all of a previously delayed allocation extent.
* Neither the left nor right neighbors are contiguous with
* the new one.
*/
- xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx,
- XFS_DATA_FORK);
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
xfs_bmbt_set_startblock(ep, new->br_startblock);
- xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx,
- XFS_DATA_FORK);
- ip->i_df.if_lastex = idx;
- ip->i_d.di_nextents++;
- if (cur == NULL)
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+ bma->ip->i_d.di_nextents++;
+ if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+ error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
new->br_startblock, new->br_blockcount,
- &i)))
+ &i);
+ if (error)
goto done;
- ASSERT(i == 0);
- cur->bc_rec.b.br_state = XFS_EXT_NORM;
- if ((error = xfs_bmbt_insert(cur, &i)))
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+ error = xfs_btree_insert(bma->cur, &i);
+ if (error)
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- *dnew = 0;
break;
- case MASK2(LEFT_FILLING, LEFT_CONTIG):
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
/*
* Filling in the first part of a previous delayed allocation.
* The left neighbor is contiguous.
*/
- xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1,
- XFS_DATA_FORK);
- xfs_bmbt_set_blockcount(ep - 1,
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1),
LEFT.br_blockcount + new->br_blockcount);
xfs_bmbt_set_startoff(ep,
PREV.br_startoff + new->br_blockcount);
- xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1,
- XFS_DATA_FORK);
+ trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
+
temp = PREV.br_blockcount - new->br_blockcount;
- xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx,
- XFS_DATA_FORK);
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
- ip->i_df.if_lastex = idx - 1;
- if (cur == NULL)
+ if (bma->cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff,
+ error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
LEFT.br_startblock, LEFT.br_blockcount,
- &i)))
+ &i);
+ if (error)
goto done;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount +
new->br_blockcount,
- LEFT.br_state)))
+ LEFT.br_state);
+ if (error)
goto done;
}
- temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- STARTBLOCKVAL(PREV.br_startblock));
- xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
- xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx,
- XFS_DATA_FORK);
- *dnew = temp;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+ startblockval(PREV.br_startblock));
+ xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+ bma->idx--;
break;
- case MASK(LEFT_FILLING):
+ case BMAP_LEFT_FILLING:
/*
* Filling in the first part of a previous delayed allocation.
* The left neighbor is not contiguous.
*/
- xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK);
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
xfs_bmbt_set_startoff(ep, new_endoff);
temp = PREV.br_blockcount - new->br_blockcount;
xfs_bmbt_set_blockcount(ep, temp);
- xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL,
- XFS_DATA_FORK);
- xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK);
- ip->i_df.if_lastex = idx;
- ip->i_d.di_nextents++;
- if (cur == NULL)
+ xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
+ bma->ip->i_d.di_nextents++;
+ if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+ error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
new->br_startblock, new->br_blockcount,
- &i)))
+ &i);
+ if (error)
goto done;
- ASSERT(i == 0);
- cur->bc_rec.b.br_state = XFS_EXT_NORM;
- if ((error = xfs_bmbt_insert(cur, &i)))
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+ error = xfs_btree_insert(bma->cur, &i);
+ if (error)
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_nextents > ip->i_df.if_ext_max) {
- error = xfs_bmap_extents_to_btree(ip->i_transp, ip,
- first, flist, &cur, 1, &tmp_rval,
- XFS_DATA_FORK);
+
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+ error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+ bma->firstblock, bma->flist,
+ &bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
rval |= tmp_rval;
if (error)
goto done;
}
- temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- STARTBLOCKVAL(PREV.br_startblock) -
- (cur ? cur->bc_private.b.allocated : 0));
- base = ip->i_df.if_u1.if_extents;
- ep = &base[idx + 1];
- xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
- xfs_bmap_trace_post_update(fname, "LF", ip, idx + 1,
- XFS_DATA_FORK);
- *dnew = temp;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+ startblockval(PREV.br_startblock) -
+ (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+ ep = xfs_iext_get_ext(ifp, bma->idx + 1);
+ xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+ trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
break;
- case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
+ case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
/*
* Filling in the last part of a previous delayed allocation.
* The right neighbor is contiguous with the new allocation.
*/
temp = PREV.br_blockcount - new->br_blockcount;
- xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx,
- XFS_DATA_FORK);
- xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1,
- XFS_DATA_FORK);
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
- xfs_bmbt_set_allf(ep + 1, new->br_startoff, new->br_startblock,
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1),
+ new->br_startoff, new->br_startblock,
new->br_blockcount + RIGHT.br_blockcount,
RIGHT.br_state);
- xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1,
- XFS_DATA_FORK);
- ip->i_df.if_lastex = idx + 1;
- if (cur == NULL)
+ trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+ if (bma->cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+ error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
RIGHT.br_startblock,
- RIGHT.br_blockcount, &i)))
+ RIGHT.br_blockcount, &i);
+ if (error)
goto done;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_update(cur, new->br_startoff,
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ error = xfs_bmbt_update(bma->cur, new->br_startoff,
new->br_startblock,
new->br_blockcount +
RIGHT.br_blockcount,
- RIGHT.br_state)))
+ RIGHT.br_state);
+ if (error)
goto done;
}
- temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- STARTBLOCKVAL(PREV.br_startblock));
- xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
- xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx,
- XFS_DATA_FORK);
- *dnew = temp;
+
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+ startblockval(PREV.br_startblock));
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+ xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+ bma->idx++;
break;
- case MASK(RIGHT_FILLING):
+ case BMAP_RIGHT_FILLING:
/*
* Filling in the last part of a previous delayed allocation.
* The right neighbor is not contiguous.
*/
temp = PREV.br_blockcount - new->br_blockcount;
- xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK);
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
- xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1,
- new, NULL, XFS_DATA_FORK);
- xfs_bmap_insert_exlist(ip, idx + 1, 1, new, XFS_DATA_FORK);
- ip->i_df.if_lastex = idx + 1;
- ip->i_d.di_nextents++;
- if (cur == NULL)
+ xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state);
+ bma->ip->i_d.di_nextents++;
+ if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+ error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
new->br_startblock, new->br_blockcount,
- &i)))
+ &i);
+ if (error)
goto done;
- ASSERT(i == 0);
- cur->bc_rec.b.br_state = XFS_EXT_NORM;
- if ((error = xfs_bmbt_insert(cur, &i)))
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+ error = xfs_btree_insert(bma->cur, &i);
+ if (error)
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_nextents > ip->i_df.if_ext_max) {
- error = xfs_bmap_extents_to_btree(ip->i_transp, ip,
- first, flist, &cur, 1, &tmp_rval,
- XFS_DATA_FORK);
+
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+ error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+ bma->firstblock, bma->flist, &bma->cur, 1,
+ &tmp_rval, XFS_DATA_FORK);
rval |= tmp_rval;
if (error)
goto done;
}
- temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- STARTBLOCKVAL(PREV.br_startblock) -
- (cur ? cur->bc_private.b.allocated : 0));
- base = ip->i_df.if_u1.if_extents;
- ep = &base[idx];
- xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
- xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK);
- *dnew = temp;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+ startblockval(PREV.br_startblock) -
+ (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+ ep = xfs_iext_get_ext(ifp, bma->idx);
+ xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+ bma->idx++;
break;
case 0:
@@ -1139,132 +2158,155 @@ xfs_bmap_add_extent_delay_real(
* Filling in the middle part of a previous delayed allocation.
* Contiguity is impossible here.
* This case is avoided almost all the time.
+ *
+ * We start with a delayed allocation:
+ *
+ * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
+ * PREV @ idx
+ *
+ * and we are allocating:
+ * +rrrrrrrrrrrrrrrrr+
+ * new
+ *
+ * and we set it up for insertion as:
+ * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
+ * new
+ * PREV @ idx LEFT RIGHT
+ * inserted at idx + 1
*/
temp = new->br_startoff - PREV.br_startoff;
- xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK);
- xfs_bmbt_set_blockcount(ep, temp);
- r[0] = *new;
- r[1].br_startoff = new_endoff;
temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
- r[1].br_blockcount = temp2;
- xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1],
- XFS_DATA_FORK);
- xfs_bmap_insert_exlist(ip, idx + 1, 2, &r[0], XFS_DATA_FORK);
- ip->i_df.if_lastex = idx + 1;
- ip->i_d.di_nextents++;
- if (cur == NULL)
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_);
+ xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
+ LEFT = *new;
+ RIGHT.br_state = PREV.br_state;
+ RIGHT.br_startblock = nullstartblock(
+ (int)xfs_bmap_worst_indlen(bma->ip, temp2));
+ RIGHT.br_startoff = new_endoff;
+ RIGHT.br_blockcount = temp2;
+ /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
+ xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state);
+ bma->ip->i_d.di_nextents++;
+ if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+ error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
new->br_startblock, new->br_blockcount,
- &i)))
+ &i);
+ if (error)
goto done;
- ASSERT(i == 0);
- cur->bc_rec.b.br_state = XFS_EXT_NORM;
- if ((error = xfs_bmbt_insert(cur, &i)))
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+ error = xfs_btree_insert(bma->cur, &i);
+ if (error)
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_nextents > ip->i_df.if_ext_max) {
- error = xfs_bmap_extents_to_btree(ip->i_transp, ip,
- first, flist, &cur, 1, &tmp_rval,
- XFS_DATA_FORK);
+
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+ error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+ bma->firstblock, bma->flist, &bma->cur,
+ 1, &tmp_rval, XFS_DATA_FORK);
rval |= tmp_rval;
if (error)
goto done;
}
- temp = xfs_bmap_worst_indlen(ip, temp);
- temp2 = xfs_bmap_worst_indlen(ip, temp2);
- diff = (int)(temp + temp2 - STARTBLOCKVAL(PREV.br_startblock) -
- (cur ? cur->bc_private.b.allocated : 0));
- if (diff > 0 &&
- xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -diff, rsvd)) {
- /*
- * Ick gross gag me with a spoon.
- */
- ASSERT(0); /* want to see if this ever happens! */
- while (diff > 0) {
- if (temp) {
- temp--;
- diff--;
- if (!diff ||
- !xfs_mod_incore_sb(ip->i_mount,
- XFS_SBS_FDBLOCKS, -diff, rsvd))
- break;
- }
- if (temp2) {
- temp2--;
- diff--;
- if (!diff ||
- !xfs_mod_incore_sb(ip->i_mount,
- XFS_SBS_FDBLOCKS, -diff, rsvd))
- break;
- }
- }
+ temp = xfs_bmap_worst_indlen(bma->ip, temp);
+ temp2 = xfs_bmap_worst_indlen(bma->ip, temp2);
+ diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
+ (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+ if (diff > 0) {
+ error = xfs_icsb_modify_counters(bma->ip->i_mount,
+ XFS_SBS_FDBLOCKS,
+ -((int64_t)diff), 0);
+ ASSERT(!error);
+ if (error)
+ goto done;
}
- base = ip->i_df.if_u1.if_extents;
- ep = &base[idx];
- xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
- xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK);
- xfs_bmap_trace_pre_update(fname, "0", ip, idx + 2,
- XFS_DATA_FORK);
- xfs_bmbt_set_startblock(ep + 2, NULLSTARTBLOCK((int)temp2));
- xfs_bmap_trace_post_update(fname, "0", ip, idx + 2,
- XFS_DATA_FORK);
- *dnew = temp + temp2;
+
+ ep = xfs_iext_get_ext(ifp, bma->idx);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
+ xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2),
+ nullstartblock((int)temp2));
+ trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
+
+ bma->idx++;
+ da_new = temp + temp2;
break;
- case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
- case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
- case MASK2(LEFT_FILLING, RIGHT_CONTIG):
- case MASK2(RIGHT_FILLING, LEFT_CONTIG):
- case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
- case MASK(LEFT_CONTIG):
- case MASK(RIGHT_CONTIG):
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_LEFT_CONTIG:
+ case BMAP_RIGHT_CONTIG:
/*
* These cases are all impossible.
*/
ASSERT(0);
}
- *curp = cur;
+
+ /* convert to a btree if necessary */
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+ int tmp_logflags; /* partial log flag return val */
+
+ ASSERT(bma->cur == NULL);
+ error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+ bma->firstblock, bma->flist, &bma->cur,
+ da_old > 0, &tmp_logflags, XFS_DATA_FORK);
+ bma->logflags |= tmp_logflags;
+ if (error)
+ goto done;
+ }
+
+ /* adjust for changes in reserved delayed indirect blocks */
+ if (da_old || da_new) {
+ temp = da_new;
+ if (bma->cur)
+ temp += bma->cur->bc_private.b.allocated;
+ ASSERT(temp <= da_old);
+ if (temp < da_old)
+ xfs_icsb_modify_counters(bma->ip->i_mount,
+ XFS_SBS_FDBLOCKS,
+ (int64_t)(da_old - temp), 0);
+ }
+
+ /* clear out the allocated field, done with it now in any case. */
+ if (bma->cur)
+ bma->cur->bc_private.b.allocated = 0;
+
+ xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK);
done:
- *logflagsp = rval;
+ bma->logflags |= rval;
return error;
#undef LEFT
#undef RIGHT
#undef PREV
-#undef MASK
-#undef MASK2
-#undef MASK3
-#undef MASK4
-#undef STATE_SET
-#undef STATE_TEST
-#undef STATE_SET_TEST
-#undef SWITCH_STATE
}
/*
- * Called by xfs_bmap_add_extent to handle cases converting an unwritten
- * allocation to a real allocation or vice versa.
+ * Convert an unwritten allocation to a real allocation or vice versa.
*/
STATIC int /* error */
xfs_bmap_add_extent_unwritten_real(
+ struct xfs_trans *tp,
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to put in extent list */
+ xfs_bmbt_irec_t *new, /* new data to add to file extents */
+ xfs_fsblock_t *first, /* pointer to firstblock variable */
+ xfs_bmap_free_t *flist, /* list of extents to be freed */
int *logflagsp) /* inode logging flags */
{
- xfs_bmbt_rec_t *base; /* base of extent entry list */
xfs_btree_cur_t *cur; /* btree cursor */
- xfs_bmbt_rec_t *ep; /* extent entry for idx */
+ xfs_bmbt_rec_host_t *ep; /* extent entry for idx */
int error; /* error return value */
-#ifdef XFS_BMAP_TRACE
- static char fname[] = "xfs_bmap_add_extent_unwritten_real";
-#endif
int i; /* temp state */
+ xfs_ifork_t *ifp; /* inode fork pointer */
xfs_fileoff_t new_endoff; /* end offset of new entry */
xfs_exntst_t newext; /* new extent state */
xfs_exntst_t oldext; /* old extent state */
@@ -1272,34 +2314,27 @@ xfs_bmap_add_extent_unwritten_real(
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
int state = 0;/* state bits, accessed thru macros */
- enum { /* bit number definitions for state */
- LEFT_CONTIG, RIGHT_CONTIG,
- LEFT_FILLING, RIGHT_FILLING,
- LEFT_DELAY, RIGHT_DELAY,
- LEFT_VALID, RIGHT_VALID
- };
+
+ *logflagsp = 0;
+
+ cur = *curp;
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+
+ ASSERT(*idx >= 0);
+ ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(!isnullstartblock(new->br_startblock));
+
+ XFS_STATS_INC(xs_add_exlist);
#define LEFT r[0]
#define RIGHT r[1]
#define PREV r[2]
-#define MASK(b) (1 << (b))
-#define MASK2(a,b) (MASK(a) | MASK(b))
-#define MASK3(a,b,c) (MASK2(a,b) | MASK(c))
-#define MASK4(a,b,c,d) (MASK3(a,b,c) | MASK(d))
-#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
-#define STATE_TEST(b) (state & MASK(b))
-#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \
- ((state &= ~MASK(b)), 0))
-#define SWITCH_STATE \
- (state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG))
/*
* Set up a bunch of variables to make the tests simpler.
*/
error = 0;
- cur = *curp;
- base = ip->i_df.if_u1.if_extents;
- ep = &base[idx];
+ ep = xfs_iext_get_ext(ifp, *idx);
xfs_bmbt_get_all(ep, &PREV);
newext = new->br_state;
oldext = (newext == XFS_EXT_UNWRITTEN) ?
@@ -1308,70 +2343,80 @@ xfs_bmap_add_extent_unwritten_real(
new_endoff = new->br_startoff + new->br_blockcount;
ASSERT(PREV.br_startoff <= new->br_startoff);
ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
/*
* Set flags determining what part of the previous oldext allocation
* extent is being replaced by a newext allocation.
*/
- STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff);
- STATE_SET(RIGHT_FILLING,
- PREV.br_startoff + PREV.br_blockcount == new_endoff);
+ if (PREV.br_startoff == new->br_startoff)
+ state |= BMAP_LEFT_FILLING;
+ if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+ state |= BMAP_RIGHT_FILLING;
+
/*
* Check and set flags if this segment has a left neighbor.
* Don't set contiguous if the combined extent would be too large.
*/
- if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
- xfs_bmbt_get_all(ep - 1, &LEFT);
- STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock));
+ if (*idx > 0) {
+ state |= BMAP_LEFT_VALID;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
+
+ if (isnullstartblock(LEFT.br_startblock))
+ state |= BMAP_LEFT_DELAY;
}
- STATE_SET(LEFT_CONTIG,
- STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
- LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
- LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
- LEFT.br_state == newext &&
- LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN);
+
+ if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+ LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+ LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+ LEFT.br_state == newext &&
+ LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
/*
* Check and set flags if this segment has a right neighbor.
* Don't set contiguous if the combined extent would be too large.
* Also check for all-three-contiguous being too large.
*/
- if (STATE_SET_TEST(RIGHT_VALID,
- idx <
- ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
- xfs_bmbt_get_all(ep + 1, &RIGHT);
- STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock));
+ if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+ state |= BMAP_RIGHT_VALID;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
+ if (isnullstartblock(RIGHT.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
}
- STATE_SET(RIGHT_CONTIG,
- STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
- new_endoff == RIGHT.br_startoff &&
- new->br_startblock + new->br_blockcount ==
- RIGHT.br_startblock &&
- newext == RIGHT.br_state &&
- new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
- ((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) !=
- MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) ||
- LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
- <= MAXEXTLEN));
+
+ if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+ new_endoff == RIGHT.br_startoff &&
+ new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+ newext == RIGHT.br_state &&
+ new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+ ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+ BMAP_RIGHT_FILLING)) !=
+ (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+ BMAP_RIGHT_FILLING) ||
+ LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+ <= MAXEXTLEN))
+ state |= BMAP_RIGHT_CONTIG;
+
/*
* Switch out based on the FILLING and CONTIG state bits.
*/
- switch (SWITCH_STATE) {
-
- case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+ switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+ BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+ BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
/*
* Setting all of a previous oldext extent to newext.
* The left and right neighbors are both contiguous with new.
*/
- xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1,
- XFS_DATA_FORK);
- xfs_bmbt_set_blockcount(ep - 1,
+ --*idx;
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
LEFT.br_blockcount + PREV.br_blockcount +
RIGHT.br_blockcount);
- xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1,
- XFS_DATA_FORK);
- xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2,
- XFS_DATA_FORK);
- xfs_bmap_delete_exlist(ip, idx, 2, XFS_DATA_FORK);
- ip->i_df.if_lastex = idx - 1;
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ xfs_iext_remove(ip, *idx + 1, 2, state);
ip->i_d.di_nextents -= 2;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1381,19 +2426,19 @@ xfs_bmap_add_extent_unwritten_real(
RIGHT.br_startblock,
RIGHT.br_blockcount, &i)))
goto done;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_delete(cur, &i)))
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_delete(cur, &i)))
goto done;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_delete(cur, &i)))
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_delete(cur, &i)))
goto done;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount + PREV.br_blockcount +
@@ -1402,21 +2447,19 @@ xfs_bmap_add_extent_unwritten_real(
}
break;
- case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
/*
* Setting all of a previous oldext extent to newext.
* The left neighbor is contiguous, the right is not.
*/
- xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1,
- XFS_DATA_FORK);
- xfs_bmbt_set_blockcount(ep - 1,
+ --*idx;
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
LEFT.br_blockcount + PREV.br_blockcount);
- xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1,
- XFS_DATA_FORK);
- ip->i_df.if_lastex = idx - 1;
- xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1,
- XFS_DATA_FORK);
- xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ xfs_iext_remove(ip, *idx + 1, 1, state);
ip->i_d.di_nextents--;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1426,13 +2469,13 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startblock, PREV.br_blockcount,
&i)))
goto done;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_delete(cur, &i)))
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_delete(cur, &i)))
goto done;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount + PREV.br_blockcount,
@@ -1441,22 +2484,17 @@ xfs_bmap_add_extent_unwritten_real(
}
break;
- case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
/*
* Setting all of a previous oldext extent to newext.
* The right neighbor is contiguous, the left is not.
*/
- xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx,
- XFS_DATA_FORK);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount + RIGHT.br_blockcount);
xfs_bmbt_set_state(ep, newext);
- xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx,
- XFS_DATA_FORK);
- ip->i_df.if_lastex = idx;
- xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1,
- XFS_DATA_FORK);
- xfs_bmap_delete_exlist(ip, idx + 1, 1, XFS_DATA_FORK);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_remove(ip, *idx + 1, 1, state);
ip->i_d.di_nextents--;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1466,13 +2504,13 @@ xfs_bmap_add_extent_unwritten_real(
RIGHT.br_startblock,
RIGHT.br_blockcount, &i)))
goto done;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_delete(cur, &i)))
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_delete(cur, &i)))
goto done;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, new->br_startoff,
new->br_startblock,
new->br_blockcount + RIGHT.br_blockcount,
@@ -1481,18 +2519,16 @@ xfs_bmap_add_extent_unwritten_real(
}
break;
- case MASK2(LEFT_FILLING, RIGHT_FILLING):
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
/*
* Setting all of a previous oldext extent to newext.
* Neither the left nor right neighbors are contiguous with
* the new one.
*/
- xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx,
- XFS_DATA_FORK);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_state(ep, newext);
- xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx,
- XFS_DATA_FORK);
- ip->i_df.if_lastex = idx;
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
@@ -1501,7 +2537,7 @@ xfs_bmap_add_extent_unwritten_real(
new->br_startblock, new->br_blockcount,
&i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, new->br_startoff,
new->br_startblock, new->br_blockcount,
newext)))
@@ -1509,28 +2545,27 @@ xfs_bmap_add_extent_unwritten_real(
}
break;
- case MASK2(LEFT_FILLING, LEFT_CONTIG):
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
/*
* Setting the first part of a previous oldext extent to newext.
* The left neighbor is contiguous.
*/
- xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1,
- XFS_DATA_FORK);
- xfs_bmbt_set_blockcount(ep - 1,
+ trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
LEFT.br_blockcount + new->br_blockcount);
xfs_bmbt_set_startoff(ep,
PREV.br_startoff + new->br_blockcount);
- xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1,
- XFS_DATA_FORK);
- xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx,
- XFS_DATA_FORK);
+ trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_startblock(ep,
new->br_startblock + new->br_blockcount);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount - new->br_blockcount);
- xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx,
- XFS_DATA_FORK);
- ip->i_df.if_lastex = idx - 1;
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ --*idx;
+
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
@@ -1539,40 +2574,39 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startblock, PREV.br_blockcount,
&i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur,
PREV.br_startoff + new->br_blockcount,
PREV.br_startblock + new->br_blockcount,
PREV.br_blockcount - new->br_blockcount,
oldext)))
goto done;
- if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
- if (xfs_bmbt_update(cur, LEFT.br_startoff,
+ error = xfs_bmbt_update(cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount + new->br_blockcount,
- LEFT.br_state))
+ LEFT.br_state);
+ if (error)
goto done;
}
break;
- case MASK(LEFT_FILLING):
+ case BMAP_LEFT_FILLING:
/*
* Setting the first part of a previous oldext extent to newext.
* The left neighbor is not contiguous.
*/
- xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
xfs_bmbt_set_startoff(ep, new_endoff);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount - new->br_blockcount);
xfs_bmbt_set_startblock(ep,
new->br_startblock + new->br_blockcount);
- xfs_bmap_trace_post_update(fname, "LF", ip, idx, XFS_DATA_FORK);
- xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL,
- XFS_DATA_FORK);
- xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK);
- ip->i_df.if_lastex = idx;
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ xfs_iext_insert(ip, *idx, 1, new, state);
ip->i_d.di_nextents++;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1582,7 +2616,7 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startblock, PREV.br_blockcount,
&i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur,
PREV.br_startoff + new->br_blockcount,
PREV.br_startblock + new->br_blockcount,
@@ -1590,30 +2624,30 @@ xfs_bmap_add_extent_unwritten_real(
oldext)))
goto done;
cur->bc_rec.b = *new;
- if ((error = xfs_bmbt_insert(cur, &i)))
+ if ((error = xfs_btree_insert(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
break;
- case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
+ case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
/*
* Setting the last part of a previous oldext extent to newext.
* The right neighbor is contiguous with the new allocation.
*/
- xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx,
- XFS_DATA_FORK);
- xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1,
- XFS_DATA_FORK);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount - new->br_blockcount);
- xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx,
- XFS_DATA_FORK);
- xfs_bmbt_set_allf(ep + 1, new->br_startoff, new->br_startblock,
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ ++*idx;
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+ new->br_startoff, new->br_startblock,
new->br_blockcount + RIGHT.br_blockcount, newext);
- xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1,
- XFS_DATA_FORK);
- ip->i_df.if_lastex = idx + 1;
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
@@ -1622,13 +2656,13 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startblock,
PREV.br_blockcount, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
PREV.br_startblock,
PREV.br_blockcount - new->br_blockcount,
oldext)))
goto done;
- if ((error = xfs_bmbt_increment(cur, 0, &i)))
+ if ((error = xfs_btree_increment(cur, 0, &i)))
goto done;
if ((error = xfs_bmbt_update(cur, new->br_startoff,
new->br_startblock,
@@ -1638,19 +2672,19 @@ xfs_bmap_add_extent_unwritten_real(
}
break;
- case MASK(RIGHT_FILLING):
+ case BMAP_RIGHT_FILLING:
/*
* Setting the last part of a previous oldext extent to newext.
* The right neighbor is not contiguous.
*/
- xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount - new->br_blockcount);
- xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK);
- xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1,
- new, NULL, XFS_DATA_FORK);
- xfs_bmap_insert_exlist(ip, idx + 1, 1, new, XFS_DATA_FORK);
- ip->i_df.if_lastex = idx + 1;
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ ++*idx;
+ xfs_iext_insert(ip, *idx, 1, new, state);
+
ip->i_d.di_nextents++;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1660,7 +2694,7 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startblock, PREV.br_blockcount,
&i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
PREV.br_startblock,
PREV.br_blockcount - new->br_blockcount,
@@ -1670,11 +2704,11 @@ xfs_bmap_add_extent_unwritten_real(
new->br_startblock, new->br_blockcount,
&i)))
goto done;
- ASSERT(i == 0);
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
cur->bc_rec.b.br_state = XFS_EXT_NORM;
- if ((error = xfs_bmbt_insert(cur, &i)))
+ if ((error = xfs_btree_insert(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
break;
@@ -1684,20 +2718,21 @@ xfs_bmap_add_extent_unwritten_real(
* newext. Contiguity is impossible here.
* One extent becomes three extents.
*/
- xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep,
new->br_startoff - PREV.br_startoff);
- xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
r[0] = *new;
r[1].br_startoff = new_endoff;
r[1].br_blockcount =
PREV.br_startoff + PREV.br_blockcount - new_endoff;
r[1].br_startblock = new->br_startblock + new->br_blockcount;
r[1].br_state = oldext;
- xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1],
- XFS_DATA_FORK);
- xfs_bmap_insert_exlist(ip, idx + 1, 2, &r[0], XFS_DATA_FORK);
- ip->i_df.if_lastex = idx + 1;
+
+ ++*idx;
+ xfs_iext_insert(ip, *idx, 2, &r[0], state);
+
ip->i_d.di_nextents += 2;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1707,197 +2742,199 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startblock, PREV.br_blockcount,
&i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
/* new right extent - oldext */
if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
r[1].br_startblock, r[1].br_blockcount,
r[1].br_state)))
goto done;
/* new left extent - oldext */
- PREV.br_blockcount =
- new->br_startoff - PREV.br_startoff;
cur->bc_rec.b = PREV;
- if ((error = xfs_bmbt_insert(cur, &i)))
+ cur->bc_rec.b.br_blockcount =
+ new->br_startoff - PREV.br_startoff;
+ if ((error = xfs_btree_insert(cur, &i)))
goto done;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_increment(cur, 0, &i)))
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ /*
+ * Reset the cursor to the position of the new extent
+ * we are about to insert as we can't trust it after
+ * the previous insert.
+ */
+ if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+ new->br_startblock, new->br_blockcount,
+ &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
/* new middle extent - newext */
- cur->bc_rec.b = *new;
- if ((error = xfs_bmbt_insert(cur, &i)))
+ cur->bc_rec.b.br_state = new->br_state;
+ if ((error = xfs_btree_insert(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
break;
- case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
- case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
- case MASK2(LEFT_FILLING, RIGHT_CONTIG):
- case MASK2(RIGHT_FILLING, LEFT_CONTIG):
- case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
- case MASK(LEFT_CONTIG):
- case MASK(RIGHT_CONTIG):
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_LEFT_CONTIG:
+ case BMAP_RIGHT_CONTIG:
/*
* These cases are all impossible.
*/
ASSERT(0);
}
- *curp = cur;
+
+ /* convert to a btree if necessary */
+ if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
+ int tmp_logflags; /* partial log flag return val */
+
+ ASSERT(cur == NULL);
+ error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur,
+ 0, &tmp_logflags, XFS_DATA_FORK);
+ *logflagsp |= tmp_logflags;
+ if (error)
+ goto done;
+ }
+
+ /* clear out the allocated field, done with it now in any case. */
+ if (cur) {
+ cur->bc_private.b.allocated = 0;
+ *curp = cur;
+ }
+
+ xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK);
done:
- *logflagsp = rval;
+ *logflagsp |= rval;
return error;
#undef LEFT
#undef RIGHT
#undef PREV
-#undef MASK
-#undef MASK2
-#undef MASK3
-#undef MASK4
-#undef STATE_SET
-#undef STATE_TEST
-#undef STATE_SET_TEST
-#undef SWITCH_STATE
}
/*
- * Called by xfs_bmap_add_extent to handle cases converting a hole
- * to a delayed allocation.
+ * Convert a hole to a delayed allocation.
*/
-/*ARGSUSED*/
-STATIC int /* error */
+STATIC void
xfs_bmap_add_extent_hole_delay(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
- xfs_btree_cur_t *cur, /* if null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to put in extent list */
- int *logflagsp, /* inode logging flags */
- int rsvd) /* OK to allocate reserved blocks */
+ xfs_extnum_t *idx, /* extent number to update/insert */
+ xfs_bmbt_irec_t *new) /* new data to add to file extents */
{
- xfs_bmbt_rec_t *base; /* base of extent entry list */
- xfs_bmbt_rec_t *ep; /* extent list entry for idx */
-#ifdef XFS_BMAP_TRACE
- static char fname[] = "xfs_bmap_add_extent_hole_delay";
-#endif
+ xfs_ifork_t *ifp; /* inode fork pointer */
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_filblks_t newlen=0; /* new indirect size */
xfs_filblks_t oldlen=0; /* old indirect size */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
int state; /* state bits, accessed thru macros */
- xfs_filblks_t temp; /* temp for indirect calculations */
- enum { /* bit number definitions for state */
- LEFT_CONTIG, RIGHT_CONTIG,
- LEFT_DELAY, RIGHT_DELAY,
- LEFT_VALID, RIGHT_VALID
- };
-
-#define MASK(b) (1 << (b))
-#define MASK2(a,b) (MASK(a) | MASK(b))
-#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
-#define STATE_TEST(b) (state & MASK(b))
-#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \
- ((state &= ~MASK(b)), 0))
-#define SWITCH_STATE (state & MASK2(LEFT_CONTIG, RIGHT_CONTIG))
-
- base = ip->i_df.if_u1.if_extents;
- ep = &base[idx];
+ xfs_filblks_t temp=0; /* temp for indirect calculations */
+
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
state = 0;
- ASSERT(ISNULLSTARTBLOCK(new->br_startblock));
+ ASSERT(isnullstartblock(new->br_startblock));
+
/*
* Check and set flags if this segment has a left neighbor
*/
- if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
- xfs_bmbt_get_all(ep - 1, &left);
- STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock));
+ if (*idx > 0) {
+ state |= BMAP_LEFT_VALID;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
+
+ if (isnullstartblock(left.br_startblock))
+ state |= BMAP_LEFT_DELAY;
}
+
/*
* Check and set flags if the current (right) segment exists.
* If it doesn't exist, we're converting the hole at end-of-file.
*/
- if (STATE_SET_TEST(RIGHT_VALID,
- idx <
- ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
- xfs_bmbt_get_all(ep, &right);
- STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock));
+ if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+ state |= BMAP_RIGHT_VALID;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
+
+ if (isnullstartblock(right.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
}
+
/*
* Set contiguity flags on the left and right neighbors.
* Don't let extents get too large, even if the pieces are contiguous.
*/
- STATE_SET(LEFT_CONTIG,
- STATE_TEST(LEFT_VALID) && STATE_TEST(LEFT_DELAY) &&
- left.br_startoff + left.br_blockcount == new->br_startoff &&
- left.br_blockcount + new->br_blockcount <= MAXEXTLEN);
- STATE_SET(RIGHT_CONTIG,
- STATE_TEST(RIGHT_VALID) && STATE_TEST(RIGHT_DELAY) &&
- new->br_startoff + new->br_blockcount == right.br_startoff &&
- new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
- (!STATE_TEST(LEFT_CONTIG) ||
- (left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= MAXEXTLEN)));
+ if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
+ left.br_startoff + left.br_blockcount == new->br_startoff &&
+ left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
+ if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
+ new->br_startoff + new->br_blockcount == right.br_startoff &&
+ new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+ (!(state & BMAP_LEFT_CONTIG) ||
+ (left.br_blockcount + new->br_blockcount +
+ right.br_blockcount <= MAXEXTLEN)))
+ state |= BMAP_RIGHT_CONTIG;
+
/*
* Switch out based on the contiguity flags.
*/
- switch (SWITCH_STATE) {
-
- case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
+ switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
/*
* New allocation is contiguous with delayed allocations
* on the left and on the right.
- * Merge all three into a single extent list entry.
+ * Merge all three into a single extent record.
*/
+ --*idx;
temp = left.br_blockcount + new->br_blockcount +
right.br_blockcount;
- xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1,
- XFS_DATA_FORK);
- xfs_bmbt_set_blockcount(ep - 1, temp);
- oldlen = STARTBLOCKVAL(left.br_startblock) +
- STARTBLOCKVAL(new->br_startblock) +
- STARTBLOCKVAL(right.br_startblock);
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
+ oldlen = startblockval(left.br_startblock) +
+ startblockval(new->br_startblock) +
+ startblockval(right.br_startblock);
newlen = xfs_bmap_worst_indlen(ip, temp);
- xfs_bmbt_set_startblock(ep - 1, NULLSTARTBLOCK((int)newlen));
- xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1,
- XFS_DATA_FORK);
- xfs_bmap_trace_delete(fname, "LC|RC", ip, idx, 1,
- XFS_DATA_FORK);
- xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK);
- ip->i_df.if_lastex = idx - 1;
+ xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
+ nullstartblock((int)newlen));
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ xfs_iext_remove(ip, *idx + 1, 1, state);
break;
- case MASK(LEFT_CONTIG):
+ case BMAP_LEFT_CONTIG:
/*
* New allocation is contiguous with a delayed allocation
* on the left.
* Merge the new allocation with the left neighbor.
*/
+ --*idx;
temp = left.br_blockcount + new->br_blockcount;
- xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1,
- XFS_DATA_FORK);
- xfs_bmbt_set_blockcount(ep - 1, temp);
- oldlen = STARTBLOCKVAL(left.br_startblock) +
- STARTBLOCKVAL(new->br_startblock);
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
+ oldlen = startblockval(left.br_startblock) +
+ startblockval(new->br_startblock);
newlen = xfs_bmap_worst_indlen(ip, temp);
- xfs_bmbt_set_startblock(ep - 1, NULLSTARTBLOCK((int)newlen));
- xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1,
- XFS_DATA_FORK);
- ip->i_df.if_lastex = idx - 1;
+ xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
+ nullstartblock((int)newlen));
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
break;
- case MASK(RIGHT_CONTIG):
+ case BMAP_RIGHT_CONTIG:
/*
* New allocation is contiguous with a delayed allocation
* on the right.
* Merge the new allocation with the right neighbor.
*/
- xfs_bmap_trace_pre_update(fname, "RC", ip, idx, XFS_DATA_FORK);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
temp = new->br_blockcount + right.br_blockcount;
- oldlen = STARTBLOCKVAL(new->br_startblock) +
- STARTBLOCKVAL(right.br_startblock);
+ oldlen = startblockval(new->br_startblock) +
+ startblockval(right.br_startblock);
newlen = xfs_bmap_worst_indlen(ip, temp);
- xfs_bmbt_set_allf(ep, new->br_startoff,
- NULLSTARTBLOCK((int)newlen), temp, right.br_state);
- xfs_bmap_trace_post_update(fname, "RC", ip, idx, XFS_DATA_FORK);
- ip->i_df.if_lastex = idx;
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+ new->br_startoff,
+ nullstartblock((int)newlen), temp, right.br_state);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
break;
case 0:
@@ -1907,206 +2944,208 @@ xfs_bmap_add_extent_hole_delay(
* Insert a new entry.
*/
oldlen = newlen = 0;
- xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL,
- XFS_DATA_FORK);
- xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK);
- ip->i_df.if_lastex = idx;
+ xfs_iext_insert(ip, *idx, 1, new, state);
break;
}
if (oldlen != newlen) {
ASSERT(oldlen > newlen);
- xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS,
- (int)(oldlen - newlen), rsvd);
+ xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
+ (int64_t)(oldlen - newlen), 0);
/*
* Nothing to do for disk quota accounting here.
*/
}
- *logflagsp = 0;
- return 0;
-#undef MASK
-#undef MASK2
-#undef STATE_SET
-#undef STATE_TEST
-#undef STATE_SET_TEST
-#undef SWITCH_STATE
}
/*
- * Called by xfs_bmap_add_extent to handle cases converting a hole
- * to a real allocation.
+ * Convert a hole to a real allocation.
*/
STATIC int /* error */
xfs_bmap_add_extent_hole_real(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
- xfs_btree_cur_t *cur, /* if null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to put in extent list */
- int *logflagsp, /* inode logging flags */
- int whichfork) /* data or attr fork */
+ struct xfs_bmalloca *bma,
+ int whichfork)
{
- xfs_bmbt_rec_t *ep; /* pointer to extent entry ins. point */
+ struct xfs_bmbt_irec *new = &bma->got;
int error; /* error return value */
-#ifdef XFS_BMAP_TRACE
- static char fname[] = "xfs_bmap_add_extent_hole_real";
-#endif
int i; /* temp state */
xfs_ifork_t *ifp; /* inode fork pointer */
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
+ int rval=0; /* return value (logging flags) */
int state; /* state bits, accessed thru macros */
- enum { /* bit number definitions for state */
- LEFT_CONTIG, RIGHT_CONTIG,
- LEFT_DELAY, RIGHT_DELAY,
- LEFT_VALID, RIGHT_VALID
- };
-
-#define MASK(b) (1 << (b))
-#define MASK2(a,b) (MASK(a) | MASK(b))
-#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
-#define STATE_TEST(b) (state & MASK(b))
-#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \
- ((state &= ~MASK(b)), 0))
-#define SWITCH_STATE (state & MASK2(LEFT_CONTIG, RIGHT_CONTIG))
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
- ep = &ifp->if_u1.if_extents[idx];
+ ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+
+ ASSERT(bma->idx >= 0);
+ ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(!isnullstartblock(new->br_startblock));
+ ASSERT(!bma->cur ||
+ !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+
+ XFS_STATS_INC(xs_add_exlist);
+
state = 0;
+ if (whichfork == XFS_ATTR_FORK)
+ state |= BMAP_ATTRFORK;
+
/*
* Check and set flags if this segment has a left neighbor.
*/
- if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
- xfs_bmbt_get_all(ep - 1, &left);
- STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock));
+ if (bma->idx > 0) {
+ state |= BMAP_LEFT_VALID;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left);
+ if (isnullstartblock(left.br_startblock))
+ state |= BMAP_LEFT_DELAY;
}
+
/*
* Check and set flags if this segment has a current value.
* Not true if we're inserting into the "hole" at eof.
*/
- if (STATE_SET_TEST(RIGHT_VALID,
- idx <
- ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
- xfs_bmbt_get_all(ep, &right);
- STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock));
+ if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+ state |= BMAP_RIGHT_VALID;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
+ if (isnullstartblock(right.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
}
+
/*
* We're inserting a real allocation between "left" and "right".
* Set the contiguity flags. Don't let extents get too large.
*/
- STATE_SET(LEFT_CONTIG,
- STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
- left.br_startoff + left.br_blockcount == new->br_startoff &&
- left.br_startblock + left.br_blockcount == new->br_startblock &&
- left.br_state == new->br_state &&
- left.br_blockcount + new->br_blockcount <= MAXEXTLEN);
- STATE_SET(RIGHT_CONTIG,
- STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
- new->br_startoff + new->br_blockcount == right.br_startoff &&
- new->br_startblock + new->br_blockcount ==
- right.br_startblock &&
- new->br_state == right.br_state &&
- new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
- (!STATE_TEST(LEFT_CONTIG) ||
- left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= MAXEXTLEN));
+ if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+ left.br_startoff + left.br_blockcount == new->br_startoff &&
+ left.br_startblock + left.br_blockcount == new->br_startblock &&
+ left.br_state == new->br_state &&
+ left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
+ if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+ new->br_startoff + new->br_blockcount == right.br_startoff &&
+ new->br_startblock + new->br_blockcount == right.br_startblock &&
+ new->br_state == right.br_state &&
+ new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+ (!(state & BMAP_LEFT_CONTIG) ||
+ left.br_blockcount + new->br_blockcount +
+ right.br_blockcount <= MAXEXTLEN))
+ state |= BMAP_RIGHT_CONTIG;
+ error = 0;
/*
* Select which case we're in here, and implement it.
*/
- switch (SWITCH_STATE) {
-
- case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
+ switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
/*
* New allocation is contiguous with real allocations on the
* left and on the right.
- * Merge all three into a single extent list entry.
+ * Merge all three into a single extent record.
*/
- xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1,
- whichfork);
- xfs_bmbt_set_blockcount(ep - 1,
+ --bma->idx;
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
left.br_blockcount + new->br_blockcount +
right.br_blockcount);
- xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1,
- whichfork);
- xfs_bmap_trace_delete(fname, "LC|RC", ip,
- idx, 1, whichfork);
- xfs_bmap_delete_exlist(ip, idx, 1, whichfork);
- ifp->if_lastex = idx - 1;
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
- if (cur == NULL) {
- *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
- return 0;
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+ xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+
+ XFS_IFORK_NEXT_SET(bma->ip, whichfork,
+ XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1);
+ if (bma->cur == NULL) {
+ rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ } else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff,
+ right.br_startblock, right.br_blockcount,
+ &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ error = xfs_btree_delete(bma->cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ error = xfs_btree_decrement(bma->cur, 0, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ error = xfs_bmbt_update(bma->cur, left.br_startoff,
+ left.br_startblock,
+ left.br_blockcount +
+ new->br_blockcount +
+ right.br_blockcount,
+ left.br_state);
+ if (error)
+ goto done;
}
- *logflagsp = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
- right.br_startblock, right.br_blockcount, &i)))
- return error;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_delete(cur, &i)))
- return error;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_decrement(cur, 0, &i)))
- return error;
- ASSERT(i == 1);
- error = xfs_bmbt_update(cur, left.br_startoff,
- left.br_startblock,
- left.br_blockcount + new->br_blockcount +
- right.br_blockcount, left.br_state);
- return error;
+ break;
- case MASK(LEFT_CONTIG):
+ case BMAP_LEFT_CONTIG:
/*
* New allocation is contiguous with a real allocation
* on the left.
* Merge the new allocation with the left neighbor.
*/
- xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1, whichfork);
- xfs_bmbt_set_blockcount(ep - 1,
+ --bma->idx;
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
left.br_blockcount + new->br_blockcount);
- xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, whichfork);
- ifp->if_lastex = idx - 1;
- if (cur == NULL) {
- *logflagsp = XFS_ILOG_FEXT(whichfork);
- return 0;
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+ if (bma->cur == NULL) {
+ rval = xfs_ilog_fext(whichfork);
+ } else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff,
+ left.br_startblock, left.br_blockcount,
+ &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ error = xfs_bmbt_update(bma->cur, left.br_startoff,
+ left.br_startblock,
+ left.br_blockcount +
+ new->br_blockcount,
+ left.br_state);
+ if (error)
+ goto done;
}
- *logflagsp = 0;
- if ((error = xfs_bmbt_lookup_eq(cur, left.br_startoff,
- left.br_startblock, left.br_blockcount, &i)))
- return error;
- ASSERT(i == 1);
- error = xfs_bmbt_update(cur, left.br_startoff,
- left.br_startblock,
- left.br_blockcount + new->br_blockcount,
- left.br_state);
- return error;
+ break;
- case MASK(RIGHT_CONTIG):
+ case BMAP_RIGHT_CONTIG:
/*
* New allocation is contiguous with a real allocation
* on the right.
* Merge the new allocation with the right neighbor.
*/
- xfs_bmap_trace_pre_update(fname, "RC", ip, idx, whichfork);
- xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock,
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx),
+ new->br_startoff, new->br_startblock,
new->br_blockcount + right.br_blockcount,
right.br_state);
- xfs_bmap_trace_post_update(fname, "RC", ip, idx, whichfork);
- ifp->if_lastex = idx;
- if (cur == NULL) {
- *logflagsp = XFS_ILOG_FEXT(whichfork);
- return 0;
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+ if (bma->cur == NULL) {
+ rval = xfs_ilog_fext(whichfork);
+ } else {
+ rval = 0;
+ error = xfs_bmbt_lookup_eq(bma->cur,
+ right.br_startoff,
+ right.br_startblock,
+ right.br_blockcount, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ error = xfs_bmbt_update(bma->cur, new->br_startoff,
+ new->br_startblock,
+ new->br_blockcount +
+ right.br_blockcount,
+ right.br_state);
+ if (error)
+ goto done;
}
- *logflagsp = 0;
- if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
- right.br_startblock, right.br_blockcount, &i)))
- return error;
- ASSERT(i == 1);
- error = xfs_bmbt_update(cur, new->br_startoff,
- new->br_startblock,
- new->br_blockcount + right.br_blockcount,
- right.br_state);
- return error;
+ break;
case 0:
/*
@@ -2114,42 +3153,60 @@ xfs_bmap_add_extent_hole_real(
* real allocation.
* Insert a new entry.
*/
- xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL,
- whichfork);
- xfs_bmap_insert_exlist(ip, idx, 1, new, whichfork);
- ifp->if_lastex = idx;
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
- if (cur == NULL) {
- *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
- return 0;
+ xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
+ XFS_IFORK_NEXT_SET(bma->ip, whichfork,
+ XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1);
+ if (bma->cur == NULL) {
+ rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ } else {
+ rval = XFS_ILOG_CORE;
+ error = xfs_bmbt_lookup_eq(bma->cur,
+ new->br_startoff,
+ new->br_startblock,
+ new->br_blockcount, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ bma->cur->bc_rec.b.br_state = new->br_state;
+ error = xfs_btree_insert(bma->cur, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- *logflagsp = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
- new->br_startblock, new->br_blockcount, &i)))
- return error;
- ASSERT(i == 0);
- cur->bc_rec.b.br_state = new->br_state;
- if ((error = xfs_bmbt_insert(cur, &i)))
- return error;
- ASSERT(i == 1);
- return 0;
+ break;
}
-#undef MASK
-#undef MASK2
-#undef STATE_SET
-#undef STATE_TEST
-#undef STATE_SET_TEST
-#undef SWITCH_STATE
- /* NOTREACHED */
- ASSERT(0);
- return 0; /* keep gcc quite */
+
+ /* convert to a btree if necessary */
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
+ int tmp_logflags; /* partial log flag return val */
+
+ ASSERT(bma->cur == NULL);
+ error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+ bma->firstblock, bma->flist, &bma->cur,
+ 0, &tmp_logflags, whichfork);
+ bma->logflags |= tmp_logflags;
+ if (error)
+ goto done;
+ }
+
+ /* clear out the allocated field, done with it now in any case. */
+ if (bma->cur)
+ bma->cur->bc_private.b.allocated = 0;
+
+ xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
+done:
+ bma->logflags |= rval;
+ return error;
}
/*
+ * Functions used in the extent read, allocate and remove paths
+ */
+
+/*
* Adjust the size of the new extent based on di_extsize and rt extsize.
*/
-STATIC int
+int
xfs_bmap_extsize_align(
xfs_mount_t *mp,
xfs_bmbt_irec_t *gotp, /* next extent pointer */
@@ -2311,25 +3368,15 @@ xfs_bmap_extsize_align(
#define XFS_ALLOC_GAP_UNITS 4
-/*
- * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
- * It figures out where to ask the underlying allocator to put the new extent.
- */
-STATIC int
-xfs_bmap_alloc(
- xfs_bmalloca_t *ap) /* bmap alloc argument struct */
+void
+xfs_bmap_adjacent(
+ struct xfs_bmalloca *ap) /* bmap alloc argument struct */
{
xfs_fsblock_t adjust; /* adjustment to block numbers */
- xfs_alloctype_t atype=0; /* type for allocation routines */
- int error; /* error return value */
xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
xfs_mount_t *mp; /* mount point structure */
int nullfb; /* true if ap->firstblock isn't set */
int rt; /* true if inode is realtime */
- xfs_extlen_t prod = 0; /* product factor for allocators */
- xfs_extlen_t ralen = 0; /* realtime allocation length */
- xfs_extlen_t align; /* minimum allocation alignment */
- xfs_rtblock_t rtx;
#define ISVALID(x,y) \
(rt ? \
@@ -2338,92 +3385,27 @@ xfs_bmap_alloc(
XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \
XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
- /*
- * Set up variables.
- */
mp = ap->ip->i_mount;
- nullfb = ap->firstblock == NULLFSBLOCK;
+ nullfb = *ap->firstblock == NULLFSBLOCK;
rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
- fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
- if (rt) {
- align = ap->ip->i_d.di_extsize ?
- ap->ip->i_d.di_extsize : mp->m_sb.sb_rextsize;
- /* Set prod to match the extent size */
- prod = align / mp->m_sb.sb_rextsize;
-
- error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp,
- align, rt, ap->eof, 0,
- ap->conv, &ap->off, &ap->alen);
- if (error)
- return error;
- ASSERT(ap->alen);
- ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0);
-
- /*
- * If the offset & length are not perfectly aligned
- * then kill prod, it will just get us in trouble.
- */
- if (do_mod(ap->off, align) || ap->alen % align)
- prod = 1;
- /*
- * Set ralen to be the actual requested length in rtextents.
- */
- ralen = ap->alen / mp->m_sb.sb_rextsize;
- /*
- * If the old value was close enough to MAXEXTLEN that
- * we rounded up to it, cut it back so it's valid again.
- * Note that if it's a really large request (bigger than
- * MAXEXTLEN), we don't hear about that number, and can't
- * adjust the starting point to match it.
- */
- if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
- ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
- /*
- * If it's an allocation to an empty file at offset 0,
- * pick an extent that will space things out in the rt area.
- */
- if (ap->eof && ap->off == 0) {
- error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
- if (error)
- return error;
- ap->rval = rtx * mp->m_sb.sb_rextsize;
- } else
- ap->rval = 0;
- } else {
- align = (ap->userdata && ap->ip->i_d.di_extsize &&
- (ap->ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)) ?
- ap->ip->i_d.di_extsize : 0;
- if (unlikely(align)) {
- error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp,
- align, rt,
- ap->eof, 0, ap->conv,
- &ap->off, &ap->alen);
- ASSERT(!error);
- ASSERT(ap->alen);
- }
- if (nullfb)
- ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
- else
- ap->rval = ap->firstblock;
- }
-
+ fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
/*
* If allocating at eof, and there's a previous real block,
- * try to use it's last block as our starting point.
+ * try to use its last block as our starting point.
*/
- if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
- !ISNULLSTARTBLOCK(ap->prevp->br_startblock) &&
- ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount,
- ap->prevp->br_startblock)) {
- ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount;
+ if (ap->eof && ap->prev.br_startoff != NULLFILEOFF &&
+ !isnullstartblock(ap->prev.br_startblock) &&
+ ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount,
+ ap->prev.br_startblock)) {
+ ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount;
/*
* Adjust for the gap between prevp and us.
*/
- adjust = ap->off -
- (ap->prevp->br_startoff + ap->prevp->br_blockcount);
+ adjust = ap->offset -
+ (ap->prev.br_startoff + ap->prev.br_blockcount);
if (adjust &&
- ISVALID(ap->rval + adjust, ap->prevp->br_startblock))
- ap->rval += adjust;
+ ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
+ ap->blkno += adjust;
}
/*
* If not at eof, then compare the two neighbor blocks.
@@ -2440,17 +3422,17 @@ xfs_bmap_alloc(
* If there's a previous (left) block, select a requested
* start block based on it.
*/
- if (ap->prevp->br_startoff != NULLFILEOFF &&
- !ISNULLSTARTBLOCK(ap->prevp->br_startblock) &&
- (prevbno = ap->prevp->br_startblock +
- ap->prevp->br_blockcount) &&
- ISVALID(prevbno, ap->prevp->br_startblock)) {
+ if (ap->prev.br_startoff != NULLFILEOFF &&
+ !isnullstartblock(ap->prev.br_startblock) &&
+ (prevbno = ap->prev.br_startblock +
+ ap->prev.br_blockcount) &&
+ ISVALID(prevbno, ap->prev.br_startblock)) {
/*
* Calculate gap to end of previous block.
*/
- adjust = prevdiff = ap->off -
- (ap->prevp->br_startoff +
- ap->prevp->br_blockcount);
+ adjust = prevdiff = ap->offset -
+ (ap->prev.br_startoff +
+ ap->prev.br_blockcount);
/*
* Figure the startblock based on the previous block's
* end and the gap size.
@@ -2459,9 +3441,9 @@ xfs_bmap_alloc(
* allocating, or using it gives us an invalid block
* number, then just use the end of the previous block.
*/
- if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen &&
+ if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
ISVALID(prevbno + prevdiff,
- ap->prevp->br_startblock))
+ ap->prev.br_startblock))
prevbno += adjust;
else
prevdiff += adjust;
@@ -2482,16 +3464,16 @@ xfs_bmap_alloc(
* If there's a following (right) block, select a requested
* start block based on it.
*/
- if (!ISNULLSTARTBLOCK(ap->gotp->br_startblock)) {
+ if (!isnullstartblock(ap->got.br_startblock)) {
/*
* Calculate gap to start of next block.
*/
- adjust = gotdiff = ap->gotp->br_startoff - ap->off;
+ adjust = gotdiff = ap->got.br_startoff - ap->offset;
/*
* Figure the startblock based on the next block's
* start and the gap size.
*/
- gotbno = ap->gotp->br_startblock;
+ gotbno = ap->got.br_startblock;
/*
* Heuristic!
* If the gap is large relative to the piece we're
@@ -2499,12 +3481,12 @@ xfs_bmap_alloc(
* number, then just use the start of the next block
* offset by our length.
*/
- if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen &&
+ if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
ISVALID(gotbno - gotdiff, gotbno))
gotbno -= adjust;
- else if (ISVALID(gotbno - ap->alen, gotbno)) {
- gotbno -= ap->alen;
- gotdiff += adjust - ap->alen;
+ else if (ISVALID(gotbno - ap->length, gotbno)) {
+ gotbno -= ap->length;
+ gotdiff += adjust - ap->length;
} else
gotdiff += adjust;
/*
@@ -2522,366 +3504,1220 @@ xfs_bmap_alloc(
gotbno = NULLFSBLOCK;
/*
* If both valid, pick the better one, else the only good
- * one, else ap->rval is already set (to 0 or the inode block).
+ * one, else ap->blkno is already set (to 0 or the inode block).
*/
if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
- ap->rval = prevdiff <= gotdiff ? prevbno : gotbno;
+ ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno;
else if (prevbno != NULLFSBLOCK)
- ap->rval = prevbno;
+ ap->blkno = prevbno;
else if (gotbno != NULLFSBLOCK)
- ap->rval = gotbno;
+ ap->blkno = gotbno;
+ }
+#undef ISVALID
+}
+
+static int
+xfs_bmap_longest_free_extent(
+ struct xfs_trans *tp,
+ xfs_agnumber_t ag,
+ xfs_extlen_t *blen,
+ int *notinit)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ xfs_extlen_t longest;
+ int error = 0;
+
+ pag = xfs_perag_get(mp, ag);
+ if (!pag->pagf_init) {
+ error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK);
+ if (error)
+ goto out;
+
+ if (!pag->pagf_init) {
+ *notinit = 1;
+ goto out;
+ }
+ }
+
+ longest = xfs_alloc_longest_free_extent(mp, pag);
+ if (*blen < longest)
+ *blen = longest;
+
+out:
+ xfs_perag_put(pag);
+ return error;
+}
+
+static void
+xfs_bmap_select_minlen(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t *blen,
+ int notinit)
+{
+ if (notinit || *blen < ap->minlen) {
+ /*
+ * Since we did a BUF_TRYLOCK above, it is possible that
+ * there is space for this request.
+ */
+ args->minlen = ap->minlen;
+ } else if (*blen < args->maxlen) {
+ /*
+ * If the best seen length is less than the request length,
+ * use the best as the minimum.
+ */
+ args->minlen = *blen;
+ } else {
+ /*
+ * Otherwise we've seen an extent as big as maxlen, use that
+ * as the minimum.
+ */
+ args->minlen = args->maxlen;
+ }
+}
+
+STATIC int
+xfs_bmap_btalloc_nullfb(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t *blen)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ xfs_agnumber_t ag, startag;
+ int notinit = 0;
+ int error;
+
+ args->type = XFS_ALLOCTYPE_START_BNO;
+ args->total = ap->total;
+
+ startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ if (startag == NULLAGNUMBER)
+ startag = ag = 0;
+
+ while (*blen < args->maxlen) {
+ error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
+ &notinit);
+ if (error)
+ return error;
+
+ if (++ag == mp->m_sb.sb_agcount)
+ ag = 0;
+ if (ag == startag)
+ break;
+ }
+
+ xfs_bmap_select_minlen(ap, args, blen, notinit);
+ return 0;
+}
+
+STATIC int
+xfs_bmap_btalloc_filestreams(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t *blen)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ xfs_agnumber_t ag;
+ int notinit = 0;
+ int error;
+
+ args->type = XFS_ALLOCTYPE_NEAR_BNO;
+ args->total = ap->total;
+
+ ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ if (ag == NULLAGNUMBER)
+ ag = 0;
+
+ error = xfs_bmap_longest_free_extent(args->tp, ag, blen, &notinit);
+ if (error)
+ return error;
+
+ if (*blen < args->maxlen) {
+ error = xfs_filestream_new_ag(ap, &ag);
+ if (error)
+ return error;
+
+ error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
+ &notinit);
+ if (error)
+ return error;
+
}
+
+ xfs_bmap_select_minlen(ap, args, blen, notinit);
+
/*
- * If allowed, use ap->rval; otherwise must use firstblock since
+ * Set the failure fallback case to look in the selected AG as stream
+ * may have moved.
+ */
+ ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
+ return 0;
+}
+
+STATIC int
+xfs_bmap_btalloc(
+ struct xfs_bmalloca *ap) /* bmap alloc argument struct */
+{
+ xfs_mount_t *mp; /* mount point structure */
+ xfs_alloctype_t atype = 0; /* type for allocation routines */
+ xfs_extlen_t align; /* minimum allocation alignment */
+ xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
+ xfs_agnumber_t ag;
+ xfs_alloc_arg_t args;
+ xfs_extlen_t blen;
+ xfs_extlen_t nextminlen = 0;
+ int nullfb; /* true if ap->firstblock isn't set */
+ int isaligned;
+ int tryagain;
+ int error;
+ int stripe_align;
+
+ ASSERT(ap->length);
+
+ mp = ap->ip->i_mount;
+
+ /* stripe alignment for allocation is determined by mount parameters */
+ stripe_align = 0;
+ if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+ stripe_align = mp->m_swidth;
+ else if (mp->m_dalign)
+ stripe_align = mp->m_dalign;
+
+ align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
+ if (unlikely(align)) {
+ error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
+ align, 0, ap->eof, 0, ap->conv,
+ &ap->offset, &ap->length);
+ ASSERT(!error);
+ ASSERT(ap->length);
+ }
+
+
+ nullfb = *ap->firstblock == NULLFSBLOCK;
+ fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
+ if (nullfb) {
+ if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
+ ag = xfs_filestream_lookup_ag(ap->ip);
+ ag = (ag != NULLAGNUMBER) ? ag : 0;
+ ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
+ } else {
+ ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
+ }
+ } else
+ ap->blkno = *ap->firstblock;
+
+ xfs_bmap_adjacent(ap);
+
+ /*
+ * If allowed, use ap->blkno; otherwise must use firstblock since
* it's in the right allocation group.
*/
- if (nullfb || rt || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno)
+ if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno)
;
else
- ap->rval = ap->firstblock;
+ ap->blkno = *ap->firstblock;
/*
- * Realtime allocation, done through xfs_rtallocate_extent.
+ * Normal allocation, done through xfs_alloc_vextent.
*/
- if (rt) {
-#ifndef __KERNEL__
- ASSERT(0);
-#else
- xfs_rtblock_t rtb;
-
- atype = ap->rval == 0 ?
- XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
- do_div(ap->rval, mp->m_sb.sb_rextsize);
- rtb = ap->rval;
- ap->alen = ralen;
- if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen,
- &ralen, atype, ap->wasdel, prod, &rtb)))
- return error;
- if (rtb == NULLFSBLOCK && prod > 1 &&
- (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1,
- ap->alen, &ralen, atype,
- ap->wasdel, 1, &rtb)))
+ tryagain = isaligned = 0;
+ memset(&args, 0, sizeof(args));
+ args.tp = ap->tp;
+ args.mp = mp;
+ args.fsbno = ap->blkno;
+
+ /* Trim the allocation back to the maximum an AG can fit. */
+ args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
+ args.firstblock = *ap->firstblock;
+ blen = 0;
+ if (nullfb) {
+ /*
+ * Search for an allocation group with a single extent large
+ * enough for the request. If one isn't found, then adjust
+ * the minimum allocation size to the largest space found.
+ */
+ if (ap->userdata && xfs_inode_is_filestream(ap->ip))
+ error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
+ else
+ error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
+ if (error)
return error;
- ap->rval = rtb;
- if (ap->rval != NULLFSBLOCK) {
- ap->rval *= mp->m_sb.sb_rextsize;
- ralen *= mp->m_sb.sb_rextsize;
- ap->alen = ralen;
- ap->ip->i_d.di_nblocks += ralen;
- xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
- if (ap->wasdel)
- ap->ip->i_delayed_blks -= ralen;
- /*
- * Adjust the disk quota also. This was reserved
- * earlier.
- */
- XFS_TRANS_MOD_DQUOT_BYINO(mp, ap->tp, ap->ip,
- ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
- XFS_TRANS_DQ_RTBCOUNT,
- (long) ralen);
- } else
- ap->alen = 0;
-#endif /* __KERNEL__ */
+ } else if (ap->flist->xbf_low) {
+ if (xfs_inode_is_filestream(ap->ip))
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ else
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.total = args.minlen = ap->minlen;
+ } else {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.total = ap->total;
+ args.minlen = ap->minlen;
+ }
+ /* apply extent size hints if obtained earlier */
+ if (unlikely(align)) {
+ args.prod = align;
+ if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
+ args.mod = (xfs_extlen_t)(args.prod - args.mod);
+ } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) {
+ args.prod = 1;
+ args.mod = 0;
+ } else {
+ args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog;
+ if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod))))
+ args.mod = (xfs_extlen_t)(args.prod - args.mod);
}
/*
- * Normal allocation, done through xfs_alloc_vextent.
+ * If we are not low on available data blocks, and the
+ * underlying logical volume manager is a stripe, and
+ * the file offset is zero then try to allocate data
+ * blocks on stripe unit boundary.
+ * NOTE: ap->aeof is only set if the allocation length
+ * is >= the stripe unit and the allocation offset is
+ * at the end of file.
*/
- else {
- xfs_agnumber_t ag;
- xfs_alloc_arg_t args;
- xfs_extlen_t blen;
- xfs_extlen_t delta;
- int isaligned;
- xfs_extlen_t longest;
- xfs_extlen_t need;
- xfs_extlen_t nextminlen=0;
- int notinit;
- xfs_perag_t *pag;
- xfs_agnumber_t startag;
- int tryagain;
-
- tryagain = isaligned = 0;
- args.tp = ap->tp;
- args.mp = mp;
- args.fsbno = ap->rval;
- args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks);
- blen = 0;
- if (nullfb) {
- args.type = XFS_ALLOCTYPE_START_BNO;
- args.total = ap->total;
- /*
- * Find the longest available space.
- * We're going to try for the whole allocation at once.
- */
- startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno);
- notinit = 0;
- down_read(&mp->m_peraglock);
- while (blen < ap->alen) {
- pag = &mp->m_perag[ag];
- if (!pag->pagf_init &&
- (error = xfs_alloc_pagf_init(mp, args.tp,
- ag, XFS_ALLOC_FLAG_TRYLOCK))) {
- up_read(&mp->m_peraglock);
- return error;
- }
- /*
- * See xfs_alloc_fix_freelist...
- */
- if (pag->pagf_init) {
- need = XFS_MIN_FREELIST_PAG(pag, mp);
- delta = need > pag->pagf_flcount ?
- need - pag->pagf_flcount : 0;
- longest = (pag->pagf_longest > delta) ?
- (pag->pagf_longest - delta) :
- (pag->pagf_flcount > 0 ||
- pag->pagf_longest > 0);
- if (blen < longest)
- blen = longest;
- } else
- notinit = 1;
- if (++ag == mp->m_sb.sb_agcount)
- ag = 0;
- if (ag == startag)
- break;
- }
- up_read(&mp->m_peraglock);
+ if (!ap->flist->xbf_low && ap->aeof) {
+ if (!ap->offset) {
+ args.alignment = stripe_align;
+ atype = args.type;
+ isaligned = 1;
/*
- * Since the above loop did a BUF_TRYLOCK, it is
- * possible that there is space for this request.
+ * Adjust for alignment
*/
- if (notinit || blen < ap->minlen)
- args.minlen = ap->minlen;
+ if (blen > args.alignment && blen <= args.maxlen)
+ args.minlen = blen - args.alignment;
+ args.minalignslop = 0;
+ } else {
/*
- * If the best seen length is less than the request
- * length, use the best as the minimum.
+ * First try an exact bno allocation.
+ * If it fails then do a near or start bno
+ * allocation with alignment turned on.
*/
- else if (blen < ap->alen)
- args.minlen = blen;
+ atype = args.type;
+ tryagain = 1;
+ args.type = XFS_ALLOCTYPE_THIS_BNO;
+ args.alignment = 1;
/*
- * Otherwise we've seen an extent as big as alen,
- * use that as the minimum.
+ * Compute the minlen+alignment for the
+ * next case. Set slop so that the value
+ * of minlen+alignment+slop doesn't go up
+ * between the calls.
*/
+ if (blen > stripe_align && blen <= args.maxlen)
+ nextminlen = blen - stripe_align;
else
- args.minlen = ap->alen;
- } else if (ap->low) {
- args.type = XFS_ALLOCTYPE_FIRST_AG;
- args.total = args.minlen = ap->minlen;
- } else {
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.total = ap->total;
- args.minlen = ap->minlen;
- }
- if (unlikely(ap->userdata && ap->ip->i_d.di_extsize &&
- (ap->ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE))) {
- args.prod = ap->ip->i_d.di_extsize;
- if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod)))
- args.mod = (xfs_extlen_t)(args.prod - args.mod);
- } else if (unlikely(mp->m_sb.sb_blocksize >= NBPP)) {
- args.prod = 1;
- args.mod = 0;
- } else {
- args.prod = NBPP >> mp->m_sb.sb_blocklog;
- if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod))))
- args.mod = (xfs_extlen_t)(args.prod - args.mod);
+ nextminlen = args.minlen;
+ if (nextminlen + stripe_align > args.minlen + 1)
+ args.minalignslop =
+ nextminlen + stripe_align -
+ args.minlen - 1;
+ else
+ args.minalignslop = 0;
}
+ } else {
+ args.alignment = 1;
+ args.minalignslop = 0;
+ }
+ args.minleft = ap->minleft;
+ args.wasdel = ap->wasdel;
+ args.isfl = 0;
+ args.userdata = ap->userdata;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ if (tryagain && args.fsbno == NULLFSBLOCK) {
/*
- * If we are not low on available data blocks, and the
- * underlying logical volume manager is a stripe, and
- * the file offset is zero then try to allocate data
- * blocks on stripe unit boundary.
- * NOTE: ap->aeof is only set if the allocation length
- * is >= the stripe unit and the allocation offset is
- * at the end of file.
+ * Exact allocation failed. Now try with alignment
+ * turned on.
*/
- if (!ap->low && ap->aeof) {
- if (!ap->off) {
- args.alignment = mp->m_dalign;
- atype = args.type;
- isaligned = 1;
- /*
- * Adjust for alignment
- */
- if (blen > args.alignment && blen <= ap->alen)
- args.minlen = blen - args.alignment;
- args.minalignslop = 0;
- } else {
- /*
- * First try an exact bno allocation.
- * If it fails then do a near or start bno
- * allocation with alignment turned on.
- */
- atype = args.type;
- tryagain = 1;
- args.type = XFS_ALLOCTYPE_THIS_BNO;
- args.alignment = 1;
- /*
- * Compute the minlen+alignment for the
- * next case. Set slop so that the value
- * of minlen+alignment+slop doesn't go up
- * between the calls.
- */
- if (blen > mp->m_dalign && blen <= ap->alen)
- nextminlen = blen - mp->m_dalign;
- else
- nextminlen = args.minlen;
- if (nextminlen + mp->m_dalign > args.minlen + 1)
- args.minalignslop =
- nextminlen + mp->m_dalign -
- args.minlen - 1;
- else
- args.minalignslop = 0;
- }
- } else {
- args.alignment = 1;
- args.minalignslop = 0;
- }
- args.minleft = ap->minleft;
- args.wasdel = ap->wasdel;
- args.isfl = 0;
- args.userdata = ap->userdata;
+ args.type = atype;
+ args.fsbno = ap->blkno;
+ args.alignment = stripe_align;
+ args.minlen = nextminlen;
+ args.minalignslop = 0;
+ isaligned = 1;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ }
+ if (isaligned && args.fsbno == NULLFSBLOCK) {
+ /*
+ * allocation failed, so turn off alignment and
+ * try again.
+ */
+ args.type = atype;
+ args.fsbno = ap->blkno;
+ args.alignment = 0;
if ((error = xfs_alloc_vextent(&args)))
return error;
- if (tryagain && args.fsbno == NULLFSBLOCK) {
- /*
- * Exact allocation failed. Now try with alignment
- * turned on.
- */
- args.type = atype;
- args.fsbno = ap->rval;
- args.alignment = mp->m_dalign;
- args.minlen = nextminlen;
- args.minalignslop = 0;
- isaligned = 1;
- if ((error = xfs_alloc_vextent(&args)))
- return error;
- }
- if (isaligned && args.fsbno == NULLFSBLOCK) {
- /*
- * allocation failed, so turn off alignment and
- * try again.
- */
- args.type = atype;
- args.fsbno = ap->rval;
- args.alignment = 0;
- if ((error = xfs_alloc_vextent(&args)))
- return error;
- }
- if (args.fsbno == NULLFSBLOCK && nullfb &&
- args.minlen > ap->minlen) {
- args.minlen = ap->minlen;
- args.type = XFS_ALLOCTYPE_START_BNO;
- args.fsbno = ap->rval;
- if ((error = xfs_alloc_vextent(&args)))
- return error;
- }
- if (args.fsbno == NULLFSBLOCK && nullfb) {
- args.fsbno = 0;
- args.type = XFS_ALLOCTYPE_FIRST_AG;
- args.total = ap->minlen;
- args.minleft = 0;
- if ((error = xfs_alloc_vextent(&args)))
- return error;
- ap->low = 1;
- }
- if (args.fsbno != NULLFSBLOCK) {
- ap->firstblock = ap->rval = args.fsbno;
- ASSERT(nullfb || fb_agno == args.agno ||
- (ap->low && fb_agno < args.agno));
- ap->alen = args.len;
- ap->ip->i_d.di_nblocks += args.len;
- xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
- if (ap->wasdel)
- ap->ip->i_delayed_blks -= args.len;
- /*
- * Adjust the disk quota also. This was reserved
- * earlier.
- */
- XFS_TRANS_MOD_DQUOT_BYINO(mp, ap->tp, ap->ip,
- ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT :
- XFS_TRANS_DQ_BCOUNT,
- (long) args.len);
- } else {
- ap->rval = NULLFSBLOCK;
- ap->alen = 0;
- }
+ }
+ if (args.fsbno == NULLFSBLOCK && nullfb &&
+ args.minlen > ap->minlen) {
+ args.minlen = ap->minlen;
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.fsbno = ap->blkno;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ }
+ if (args.fsbno == NULLFSBLOCK && nullfb) {
+ args.fsbno = 0;
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ args.total = ap->minlen;
+ args.minleft = 0;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ ap->flist->xbf_low = 1;
+ }
+ if (args.fsbno != NULLFSBLOCK) {
+ /*
+ * check the allocation happened at the same or higher AG than
+ * the first block that was allocated.
+ */
+ ASSERT(*ap->firstblock == NULLFSBLOCK ||
+ XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
+ XFS_FSB_TO_AGNO(mp, args.fsbno) ||
+ (ap->flist->xbf_low &&
+ XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
+ XFS_FSB_TO_AGNO(mp, args.fsbno)));
+
+ ap->blkno = args.fsbno;
+ if (*ap->firstblock == NULLFSBLOCK)
+ *ap->firstblock = args.fsbno;
+ ASSERT(nullfb || fb_agno == args.agno ||
+ (ap->flist->xbf_low && fb_agno < args.agno));
+ ap->length = args.len;
+ ap->ip->i_d.di_nblocks += args.len;
+ xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+ if (ap->wasdel)
+ ap->ip->i_delayed_blks -= args.len;
+ /*
+ * Adjust the disk quota also. This was reserved
+ * earlier.
+ */
+ xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+ ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT :
+ XFS_TRANS_DQ_BCOUNT,
+ (long) args.len);
+ } else {
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
}
return 0;
-#undef ISVALID
}
/*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the extent list is already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
+ * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
+ * It figures out where to ask the underlying allocator to put the new extent.
*/
-STATIC int /* error */
-xfs_bmap_btree_to_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_btree_cur_t *cur, /* btree cursor */
- int *logflagsp, /* inode logging flags */
- int whichfork) /* data or attr fork */
+STATIC int
+xfs_bmap_alloc(
+ struct xfs_bmalloca *ap) /* bmap alloc argument struct */
{
- /* REFERENCED */
- xfs_bmbt_block_t *cblock;/* child btree block */
- xfs_fsblock_t cbno; /* child block number */
- xfs_buf_t *cbp; /* child block's buffer */
- int error; /* error return value */
- xfs_ifork_t *ifp; /* inode fork data */
- xfs_mount_t *mp; /* mount point structure */
- xfs_bmbt_ptr_t *pp; /* ptr to block address */
- xfs_bmbt_block_t *rblock;/* root btree block */
+ if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata)
+ return xfs_bmap_rtalloc(ap);
+ return xfs_bmap_btalloc(ap);
+}
+
+/*
+ * Trim the returned map to the required bounds
+ */
+STATIC void
+xfs_bmapi_trim_map(
+ struct xfs_bmbt_irec *mval,
+ struct xfs_bmbt_irec *got,
+ xfs_fileoff_t *bno,
+ xfs_filblks_t len,
+ xfs_fileoff_t obno,
+ xfs_fileoff_t end,
+ int n,
+ int flags)
+{
+ if ((flags & XFS_BMAPI_ENTIRE) ||
+ got->br_startoff + got->br_blockcount <= obno) {
+ *mval = *got;
+ if (isnullstartblock(got->br_startblock))
+ mval->br_startblock = DELAYSTARTBLOCK;
+ return;
+ }
+
+ if (obno > *bno)
+ *bno = obno;
+ ASSERT((*bno >= obno) || (n == 0));
+ ASSERT(*bno < end);
+ mval->br_startoff = *bno;
+ if (isnullstartblock(got->br_startblock))
+ mval->br_startblock = DELAYSTARTBLOCK;
+ else
+ mval->br_startblock = got->br_startblock +
+ (*bno - got->br_startoff);
+ /*
+ * Return the minimum of what we got and what we asked for for
+ * the length. We can use the len variable here because it is
+ * modified below and we could have been there before coming
+ * here if the first part of the allocation didn't overlap what
+ * was asked for.
+ */
+ mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno,
+ got->br_blockcount - (*bno - got->br_startoff));
+ mval->br_state = got->br_state;
+ ASSERT(mval->br_blockcount <= len);
+ return;
+}
+
+/*
+ * Update and validate the extent map to return
+ */
+STATIC void
+xfs_bmapi_update_map(
+ struct xfs_bmbt_irec **map,
+ xfs_fileoff_t *bno,
+ xfs_filblks_t *len,
+ xfs_fileoff_t obno,
+ xfs_fileoff_t end,
+ int *n,
+ int flags)
+{
+ xfs_bmbt_irec_t *mval = *map;
+
+ ASSERT((flags & XFS_BMAPI_ENTIRE) ||
+ ((mval->br_startoff + mval->br_blockcount) <= end));
+ ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) ||
+ (mval->br_startoff < obno));
+
+ *bno = mval->br_startoff + mval->br_blockcount;
+ *len = end - *bno;
+ if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) {
+ /* update previous map with new information */
+ ASSERT(mval->br_startblock == mval[-1].br_startblock);
+ ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
+ ASSERT(mval->br_state == mval[-1].br_state);
+ mval[-1].br_blockcount = mval->br_blockcount;
+ mval[-1].br_state = mval->br_state;
+ } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
+ mval[-1].br_startblock != DELAYSTARTBLOCK &&
+ mval[-1].br_startblock != HOLESTARTBLOCK &&
+ mval->br_startblock == mval[-1].br_startblock +
+ mval[-1].br_blockcount &&
+ ((flags & XFS_BMAPI_IGSTATE) ||
+ mval[-1].br_state == mval->br_state)) {
+ ASSERT(mval->br_startoff ==
+ mval[-1].br_startoff + mval[-1].br_blockcount);
+ mval[-1].br_blockcount += mval->br_blockcount;
+ } else if (*n > 0 &&
+ mval->br_startblock == DELAYSTARTBLOCK &&
+ mval[-1].br_startblock == DELAYSTARTBLOCK &&
+ mval->br_startoff ==
+ mval[-1].br_startoff + mval[-1].br_blockcount) {
+ mval[-1].br_blockcount += mval->br_blockcount;
+ mval[-1].br_state = mval->br_state;
+ } else if (!((*n == 0) &&
+ ((mval->br_startoff + mval->br_blockcount) <=
+ obno))) {
+ mval++;
+ (*n)++;
+ }
+ *map = mval;
+}
+
+/*
+ * Map file blocks to filesystem blocks without allocation.
+ */
+int
+xfs_bmapi_read(
+ struct xfs_inode *ip,
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ struct xfs_bmbt_irec *mval,
+ int *nmap,
+ int flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp;
+ struct xfs_bmbt_irec got;
+ struct xfs_bmbt_irec prev;
+ xfs_fileoff_t obno;
+ xfs_fileoff_t end;
+ xfs_extnum_t lastx;
+ int error;
+ int eof;
+ int n = 0;
+ int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+
+ ASSERT(*nmap >= 1);
+ ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
+ XFS_BMAPI_IGSTATE)));
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
+
+ if (unlikely(XFS_TEST_ERROR(
+ (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+ mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ XFS_STATS_INC(xs_blk_mapr);
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
- rblock = ifp->if_broot;
- ASSERT(be16_to_cpu(rblock->bb_level) == 1);
- ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
- ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1);
- mp = ip->i_mount;
- pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes);
- *logflagsp = 0;
-#ifdef DEBUG
- if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), 1)))
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, whichfork);
+ if (error)
+ return error;
+ }
+
+ xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev);
+ end = bno + len;
+ obno = bno;
+
+ while (bno < end && n < *nmap) {
+ /* Reading past eof, act as though there's a hole up to end. */
+ if (eof)
+ got.br_startoff = end;
+ if (got.br_startoff > bno) {
+ /* Reading in a hole. */
+ mval->br_startoff = bno;
+ mval->br_startblock = HOLESTARTBLOCK;
+ mval->br_blockcount =
+ XFS_FILBLKS_MIN(len, got.br_startoff - bno);
+ mval->br_state = XFS_EXT_NORM;
+ bno += mval->br_blockcount;
+ len -= mval->br_blockcount;
+ mval++;
+ n++;
+ continue;
+ }
+
+ /* set up the extent map to return. */
+ xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
+ xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+ /* If we're done, stop now. */
+ if (bno >= end || n >= *nmap)
+ break;
+
+ /* Else go on to the next record. */
+ if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
+ else
+ eof = 1;
+ }
+ *nmap = n;
+ return 0;
+}
+
+STATIC int
+xfs_bmapi_reserve_delalloc(
+ struct xfs_inode *ip,
+ xfs_fileoff_t aoff,
+ xfs_filblks_t len,
+ struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *prev,
+ xfs_extnum_t *lastx,
+ int eof)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ xfs_extlen_t alen;
+ xfs_extlen_t indlen;
+ char rt = XFS_IS_REALTIME_INODE(ip);
+ xfs_extlen_t extsz;
+ int error;
+
+ alen = XFS_FILBLKS_MIN(len, MAXEXTLEN);
+ if (!eof)
+ alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
+
+ /* Figure out the extent size, adjust alen */
+ extsz = xfs_get_extsz_hint(ip);
+ if (extsz) {
+ /*
+ * Make sure we don't exceed a single extent length when we
+ * align the extent by reducing length we are going to
+ * allocate by the maximum amount extent size aligment may
+ * require.
+ */
+ alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1));
+ error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
+ 1, 0, &aoff, &alen);
+ ASSERT(!error);
+ }
+
+ if (rt)
+ extsz = alen / mp->m_sb.sb_rextsize;
+
+ /*
+ * Make a transaction-less quota reservation for delayed allocation
+ * blocks. This number gets adjusted later. We return if we haven't
+ * allocated blocks already inside this loop.
+ */
+ error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
+ rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+ if (error)
return error;
-#endif
- cbno = INT_GET(*pp, ARCH_CONVERT);
- if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
- XFS_BMAP_BTREE_REF)))
+
+ /*
+ * Split changing sb for alen and indlen since they could be coming
+ * from different places.
+ */
+ indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
+ ASSERT(indlen > 0);
+
+ if (rt) {
+ error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
+ -((int64_t)extsz), 0);
+ } else {
+ error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+ -((int64_t)alen), 0);
+ }
+
+ if (error)
+ goto out_unreserve_quota;
+
+ error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+ -((int64_t)indlen), 0);
+ if (error)
+ goto out_unreserve_blocks;
+
+
+ ip->i_delayed_blks += alen;
+
+ got->br_startoff = aoff;
+ got->br_startblock = nullstartblock(indlen);
+ got->br_blockcount = alen;
+ got->br_state = XFS_EXT_NORM;
+ xfs_bmap_add_extent_hole_delay(ip, lastx, got);
+
+ /*
+ * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
+ * might have merged it into one of the neighbouring ones.
+ */
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
+
+ ASSERT(got->br_startoff <= aoff);
+ ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
+ ASSERT(isnullstartblock(got->br_startblock));
+ ASSERT(got->br_state == XFS_EXT_NORM);
+ return 0;
+
+out_unreserve_blocks:
+ if (rt)
+ xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0);
+ else
+ xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0);
+out_unreserve_quota:
+ if (XFS_IS_QUOTA_ON(mp))
+ xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ?
+ XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+ return error;
+}
+
+/*
+ * Map file blocks to filesystem blocks, adding delayed allocations as needed.
+ */
+int
+xfs_bmapi_delay(
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t bno, /* starting file offs. mapped */
+ xfs_filblks_t len, /* length to map in file */
+ struct xfs_bmbt_irec *mval, /* output: map values */
+ int *nmap, /* i/o: mval size/count */
+ int flags) /* XFS_BMAPI_... */
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_bmbt_irec got; /* current file extent record */
+ struct xfs_bmbt_irec prev; /* previous file extent record */
+ xfs_fileoff_t obno; /* old block number (offset) */
+ xfs_fileoff_t end; /* end of mapped file region */
+ xfs_extnum_t lastx; /* last useful extent number */
+ int eof; /* we've hit the end of extents */
+ int n = 0; /* current extent index */
+ int error = 0;
+
+ ASSERT(*nmap >= 1);
+ ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
+ ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (unlikely(XFS_TEST_ERROR(
+ (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
+ mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ XFS_STATS_INC(xs_blk_mapw);
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ return error;
+ }
+
+ xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev);
+ end = bno + len;
+ obno = bno;
+
+ while (bno < end && n < *nmap) {
+ if (eof || got.br_startoff > bno) {
+ error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got,
+ &prev, &lastx, eof);
+ if (error) {
+ if (n == 0) {
+ *nmap = 0;
+ return error;
+ }
+ break;
+ }
+ }
+
+ /* set up the extent map to return. */
+ xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
+ xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+ /* If we're done, stop now. */
+ if (bno >= end || n >= *nmap)
+ break;
+
+ /* Else go on to the next record. */
+ prev = got;
+ if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
+ else
+ eof = 1;
+ }
+
+ *nmap = n;
+ return 0;
+}
+
+
+static int
+xfs_bmapi_allocate(
+ struct xfs_bmalloca *bma)
+{
+ struct xfs_mount *mp = bma->ip->i_mount;
+ int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+ int tmp_logflags = 0;
+ int error;
+
+ ASSERT(bma->length > 0);
+
+ /*
+ * For the wasdelay case, we could also just allocate the stuff asked
+ * for in this bmap call but that wouldn't be as good.
+ */
+ if (bma->wasdel) {
+ bma->length = (xfs_extlen_t)bma->got.br_blockcount;
+ bma->offset = bma->got.br_startoff;
+ if (bma->idx != NULLEXTNUM && bma->idx) {
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1),
+ &bma->prev);
+ }
+ } else {
+ bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
+ if (!bma->eof)
+ bma->length = XFS_FILBLKS_MIN(bma->length,
+ bma->got.br_startoff - bma->offset);
+ }
+
+ /*
+ * Indicate if this is the first user data in the file, or just any
+ * user data.
+ */
+ if (!(bma->flags & XFS_BMAPI_METADATA)) {
+ bma->userdata = (bma->offset == 0) ?
+ XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+ }
+
+ bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
+
+ /*
+ * Only want to do the alignment at the eof if it is userdata and
+ * allocation length is larger than a stripe unit.
+ */
+ if (mp->m_dalign && bma->length >= mp->m_dalign &&
+ !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
+ error = xfs_bmap_isaeof(bma, whichfork);
+ if (error)
+ return error;
+ }
+
+ error = xfs_bmap_alloc(bma);
+ if (error)
return error;
- cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
- if ((error = xfs_btree_check_lblock(cur, cblock, 0, cbp)))
+
+ if (bma->flist->xbf_low)
+ bma->minleft = 0;
+ if (bma->cur)
+ bma->cur->bc_private.b.firstblock = *bma->firstblock;
+ if (bma->blkno == NULLFSBLOCK)
+ return 0;
+ if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
+ bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
+ bma->cur->bc_private.b.firstblock = *bma->firstblock;
+ bma->cur->bc_private.b.flist = bma->flist;
+ }
+ /*
+ * Bump the number of extents we've allocated
+ * in this call.
+ */
+ bma->nallocs++;
+
+ if (bma->cur)
+ bma->cur->bc_private.b.flags =
+ bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+
+ bma->got.br_startoff = bma->offset;
+ bma->got.br_startblock = bma->blkno;
+ bma->got.br_blockcount = bma->length;
+ bma->got.br_state = XFS_EXT_NORM;
+
+ /*
+ * A wasdelay extent has been initialized, so shouldn't be flagged
+ * as unwritten.
+ */
+ if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
+ xfs_sb_version_hasextflgbit(&mp->m_sb))
+ bma->got.br_state = XFS_EXT_UNWRITTEN;
+
+ if (bma->wasdel)
+ error = xfs_bmap_add_extent_delay_real(bma);
+ else
+ error = xfs_bmap_add_extent_hole_real(bma, whichfork);
+
+ bma->logflags |= tmp_logflags;
+ if (error)
return error;
- xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
- ip->i_d.di_nblocks--;
- XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
- xfs_trans_binval(tp, cbp);
- if (cur->bc_bufs[0] == cbp)
- cur->bc_bufs[0] = NULL;
- xfs_iroot_realloc(ip, -1, whichfork);
- ASSERT(ifp->if_broot == NULL);
- ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
- *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+
+ /*
+ * Update our extent pointer, given that xfs_bmap_add_extent_delay_real
+ * or xfs_bmap_add_extent_hole_real might have merged it into one of
+ * the neighbouring ones.
+ */
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
+
+ ASSERT(bma->got.br_startoff <= bma->offset);
+ ASSERT(bma->got.br_startoff + bma->got.br_blockcount >=
+ bma->offset + bma->length);
+ ASSERT(bma->got.br_state == XFS_EXT_NORM ||
+ bma->got.br_state == XFS_EXT_UNWRITTEN);
return 0;
}
+STATIC int
+xfs_bmapi_convert_unwritten(
+ struct xfs_bmalloca *bma,
+ struct xfs_bmbt_irec *mval,
+ xfs_filblks_t len,
+ int flags)
+{
+ int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+ int tmp_logflags = 0;
+ int error;
+
+ /* check if we need to do unwritten->real conversion */
+ if (mval->br_state == XFS_EXT_UNWRITTEN &&
+ (flags & XFS_BMAPI_PREALLOC))
+ return 0;
+
+ /* check if we need to do real->unwritten conversion */
+ if (mval->br_state == XFS_EXT_NORM &&
+ (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) !=
+ (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
+ return 0;
+
+ /*
+ * Modify (by adding) the state flag, if writing.
+ */
+ ASSERT(mval->br_blockcount <= len);
+ if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
+ bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp,
+ bma->ip, whichfork);
+ bma->cur->bc_private.b.firstblock = *bma->firstblock;
+ bma->cur->bc_private.b.flist = bma->flist;
+ }
+ mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
+ ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+
+ error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
+ &bma->cur, mval, bma->firstblock, bma->flist,
+ &tmp_logflags);
+ bma->logflags |= tmp_logflags;
+ if (error)
+ return error;
+
+ /*
+ * Update our extent pointer, given that
+ * xfs_bmap_add_extent_unwritten_real might have merged it into one
+ * of the neighbouring ones.
+ */
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
+
+ /*
+ * We may have combined previously unwritten space with written space,
+ * so generate another request.
+ */
+ if (mval->br_blockcount < len)
+ return EAGAIN;
+ return 0;
+}
+
+/*
+ * Map file blocks to filesystem blocks, and allocate blocks or convert the
+ * extent state if necessary. Details behaviour is controlled by the flags
+ * parameter. Only allocates blocks from a single allocation group, to avoid
+ * locking problems.
+ *
+ * The returned value in "firstblock" from the first call in a transaction
+ * must be remembered and presented to subsequent calls in "firstblock".
+ * An upper bound for the number of blocks to be allocated is supplied to
+ * the first call in "total"; if no allocation group has that many free
+ * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
+ */
+int
+xfs_bmapi_write(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t bno, /* starting file offs. mapped */
+ xfs_filblks_t len, /* length to map in file */
+ int flags, /* XFS_BMAPI_... */
+ xfs_fsblock_t *firstblock, /* first allocated block
+ controls a.g. for allocs */
+ xfs_extlen_t total, /* total blocks needed */
+ struct xfs_bmbt_irec *mval, /* output: map values */
+ int *nmap, /* i/o: mval size/count */
+ struct xfs_bmap_free *flist) /* i/o: list extents to free */
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp;
+ struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */
+ xfs_fileoff_t end; /* end of mapped file region */
+ int eof; /* after the end of extents */
+ int error; /* error return */
+ int n; /* current extent index */
+ xfs_fileoff_t obno; /* old block number (offset) */
+ int whichfork; /* data or attr fork */
+ char inhole; /* current location is hole in file */
+ char wasdelay; /* old extent was delayed */
+
+#ifdef DEBUG
+ xfs_fileoff_t orig_bno; /* original block number value */
+ int orig_flags; /* original flags arg value */
+ xfs_filblks_t orig_len; /* original value of len arg */
+ struct xfs_bmbt_irec *orig_mval; /* original value of mval */
+ int orig_nmap; /* original value of *nmap */
+
+ orig_bno = bno;
+ orig_len = len;
+ orig_flags = flags;
+ orig_mval = mval;
+ orig_nmap = *nmap;
+#endif
+ whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+
+ ASSERT(*nmap >= 1);
+ ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
+ ASSERT(!(flags & XFS_BMAPI_IGSTATE));
+ ASSERT(tp != NULL);
+ ASSERT(len > 0);
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (unlikely(XFS_TEST_ERROR(
+ (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+ mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+
+ XFS_STATS_INC(xs_blk_mapw);
+
+ if (*firstblock == NULLFSBLOCK) {
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
+ bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+ else
+ bma.minleft = 1;
+ } else {
+ bma.minleft = 0;
+ }
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ goto error0;
+ }
+
+ xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got,
+ &bma.prev);
+ n = 0;
+ end = bno + len;
+ obno = bno;
+
+ bma.tp = tp;
+ bma.ip = ip;
+ bma.total = total;
+ bma.userdata = 0;
+ bma.flist = flist;
+ bma.firstblock = firstblock;
+
+ while (bno < end && n < *nmap) {
+ inhole = eof || bma.got.br_startoff > bno;
+ wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
+
+ /*
+ * First, deal with the hole before the allocated space
+ * that we found, if any.
+ */
+ if (inhole || wasdelay) {
+ bma.eof = eof;
+ bma.conv = !!(flags & XFS_BMAPI_CONVERT);
+ bma.wasdel = wasdelay;
+ bma.offset = bno;
+ bma.flags = flags;
+
+ /*
+ * There's a 32/64 bit type mismatch between the
+ * allocation length request (which can be 64 bits in
+ * length) and the bma length request, which is
+ * xfs_extlen_t and therefore 32 bits. Hence we have to
+ * check for 32-bit overflows and handle them here.
+ */
+ if (len > (xfs_filblks_t)MAXEXTLEN)
+ bma.length = MAXEXTLEN;
+ else
+ bma.length = len;
+
+ ASSERT(len > 0);
+ ASSERT(bma.length > 0);
+ error = xfs_bmapi_allocate(&bma);
+ if (error)
+ goto error0;
+ if (bma.blkno == NULLFSBLOCK)
+ break;
+ }
+
+ /* Deal with the allocated space we found. */
+ xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno,
+ end, n, flags);
+
+ /* Execute unwritten extent conversion if necessary */
+ error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags);
+ if (error == EAGAIN)
+ continue;
+ if (error)
+ goto error0;
+
+ /* update the extent map to return */
+ xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+
+ /*
+ * If we're done, stop now. Stop when we've allocated
+ * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise
+ * the transaction may get too big.
+ */
+ if (bno >= end || n >= *nmap || bma.nallocs >= *nmap)
+ break;
+
+ /* Else go on to the next record. */
+ bma.prev = bma.got;
+ if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) {
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx),
+ &bma.got);
+ } else
+ eof = 1;
+ }
+ *nmap = n;
+
+ /*
+ * Transform from btree to extents, give it cur.
+ */
+ if (xfs_bmap_wants_extents(ip, whichfork)) {
+ int tmp_logflags = 0;
+
+ ASSERT(bma.cur);
+ error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
+ &tmp_logflags, whichfork);
+ bma.logflags |= tmp_logflags;
+ if (error)
+ goto error0;
+ }
+
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
+ XFS_IFORK_NEXTENTS(ip, whichfork) >
+ XFS_IFORK_MAXEXT(ip, whichfork));
+ error = 0;
+error0:
+ /*
+ * Log everything. Do this after conversion, there's no point in
+ * logging the extent records if we've converted to btree format.
+ */
+ if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ bma.logflags &= ~xfs_ilog_fext(whichfork);
+ else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ bma.logflags &= ~xfs_ilog_fbroot(whichfork);
+ /*
+ * Log whatever the flags say, even if error. Otherwise we might miss
+ * detecting a case where the data is changed, there's an error,
+ * and it's not logged so we don't shutdown when we should.
+ */
+ if (bma.logflags)
+ xfs_trans_log_inode(tp, ip, bma.logflags);
+
+ if (bma.cur) {
+ if (!error) {
+ ASSERT(*firstblock == NULLFSBLOCK ||
+ XFS_FSB_TO_AGNO(mp, *firstblock) ==
+ XFS_FSB_TO_AGNO(mp,
+ bma.cur->bc_private.b.firstblock) ||
+ (flist->xbf_low &&
+ XFS_FSB_TO_AGNO(mp, *firstblock) <
+ XFS_FSB_TO_AGNO(mp,
+ bma.cur->bc_private.b.firstblock)));
+ *firstblock = bma.cur->bc_private.b.firstblock;
+ }
+ xfs_btree_del_cursor(bma.cur,
+ error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ }
+ if (!error)
+ xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+ orig_nmap, *nmap);
+ return error;
+}
+
/*
- * Called by xfs_bmapi to update extent list structure and the btree
+ * Called by xfs_bmapi to update file extent records and the btree
* after removing space (or undoing a delayed allocation).
*/
STATIC int /* error */
xfs_bmap_del_extent(
xfs_inode_t *ip, /* incore inode pointer */
xfs_trans_t *tp, /* current transaction pointer */
- xfs_extnum_t idx, /* extent number to update/delete */
+ xfs_extnum_t *idx, /* extent number to update/delete */
xfs_bmap_free_t *flist, /* list of extents to be freed */
xfs_btree_cur_t *cur, /* if null, not a btree */
- xfs_bmbt_irec_t *del, /* data to remove from extent list */
+ xfs_bmbt_irec_t *del, /* data to remove from extents */
int *logflagsp, /* inode logging flags */
- int whichfork, /* data or attr fork */
- int rsvd) /* OK to allocate reserved blocks */
+ int whichfork) /* data or attr fork */
{
xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
@@ -2889,12 +4725,9 @@ xfs_bmap_del_extent(
xfs_fileoff_t del_endoff; /* first offset past del */
int delay; /* current block is delayed allocated */
int do_fx; /* free extent at end of routine */
- xfs_bmbt_rec_t *ep; /* current extent entry pointer */
+ xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */
int error; /* error return value */
int flags; /* inode logging flags */
-#ifdef XFS_BMAP_TRACE
- static char fname[] = "xfs_bmap_del_extent";
-#endif
xfs_bmbt_irec_t got; /* current extent entry */
xfs_fileoff_t got_endoff; /* first offset past got */
int i; /* temp state */
@@ -2903,25 +4736,29 @@ xfs_bmap_del_extent(
xfs_filblks_t nblks; /* quota/sb block count */
xfs_bmbt_irec_t new; /* new record to be inserted */
/* REFERENCED */
- xfs_extnum_t nextents; /* number of extents in list */
uint qfield; /* quota field to update */
xfs_filblks_t temp; /* for indirect length calculations */
xfs_filblks_t temp2; /* for indirect length calculations */
+ int state = 0;
XFS_STATS_INC(xs_del_exlist);
+
+ if (whichfork == XFS_ATTR_FORK)
+ state |= BMAP_ATTRFORK;
+
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- ASSERT(idx >= 0 && idx < nextents);
+ ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
+ (uint)sizeof(xfs_bmbt_rec_t)));
ASSERT(del->br_blockcount > 0);
- ep = &ifp->if_u1.if_extents[idx];
+ ep = xfs_iext_get_ext(ifp, *idx);
xfs_bmbt_get_all(ep, &got);
ASSERT(got.br_startoff <= del->br_startoff);
del_endoff = del->br_startoff + del->br_blockcount;
got_endoff = got.br_startoff + got.br_blockcount;
ASSERT(got_endoff >= del_endoff);
- delay = ISNULLSTARTBLOCK(got.br_startblock);
- ASSERT(ISNULLSTARTBLOCK(del->br_startblock) == delay);
+ delay = isnullstartblock(got.br_startblock);
+ ASSERT(isnullstartblock(del->br_startblock) == delay);
flags = 0;
qfield = 0;
error = 0;
@@ -2933,8 +4770,7 @@ xfs_bmap_del_extent(
/*
* Realtime allocation. Free it and record di_nblocks update.
*/
- if (whichfork == XFS_DATA_FORK &&
- (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
+ if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
xfs_fsblock_t bno;
xfs_filblks_t len;
@@ -2946,8 +4782,8 @@ xfs_bmap_del_extent(
len = del->br_blockcount;
do_div(bno, mp->m_sb.sb_rextsize);
do_div(len, mp->m_sb.sb_rextsize);
- if ((error = xfs_rtfree_extent(ip->i_transp, bno,
- (xfs_extlen_t)len)))
+ error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+ if (error)
goto done;
do_fx = 0;
nblks = len * mp->m_sb.sb_rextsize;
@@ -2970,11 +4806,11 @@ xfs_bmap_del_extent(
got.br_startblock, got.br_blockcount,
&i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
da_old = da_new = 0;
} else {
- da_old = STARTBLOCKVAL(got.br_startblock);
+ da_old = startblockval(got.br_startblock);
da_new = 0;
nblks = 0;
do_fx = 0;
@@ -2989,45 +4825,44 @@ xfs_bmap_del_extent(
/*
* Matches the whole extent. Delete the entry.
*/
- xfs_bmap_trace_delete(fname, "3", ip, idx, 1, whichfork);
- xfs_bmap_delete_exlist(ip, idx, 1, whichfork);
- ifp->if_lastex = idx;
+ xfs_iext_remove(ip, *idx, 1,
+ whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
+ --*idx;
if (delay)
break;
+
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
flags |= XFS_ILOG_CORE;
if (!cur) {
- flags |= XFS_ILOG_FEXT(whichfork);
+ flags |= xfs_ilog_fext(whichfork);
break;
}
- if ((error = xfs_bmbt_delete(cur, &i)))
+ if ((error = xfs_btree_delete(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
break;
case 2:
/*
* Deleting the first part of the extent.
*/
- xfs_bmap_trace_pre_update(fname, "2", ip, idx, whichfork);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_startoff(ep, del_endoff);
temp = got.br_blockcount - del->br_blockcount;
xfs_bmbt_set_blockcount(ep, temp);
- ifp->if_lastex = idx;
if (delay) {
temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
da_old);
- xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
- xfs_bmap_trace_post_update(fname, "2", ip, idx,
- whichfork);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
da_new = temp;
break;
}
xfs_bmbt_set_startblock(ep, del_endblock);
- xfs_bmap_trace_post_update(fname, "2", ip, idx, whichfork);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
if (!cur) {
- flags |= XFS_ILOG_FEXT(whichfork);
+ flags |= xfs_ilog_fext(whichfork);
break;
}
if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
@@ -3041,21 +4876,19 @@ xfs_bmap_del_extent(
* Deleting the last part of the extent.
*/
temp = got.br_blockcount - del->br_blockcount;
- xfs_bmap_trace_pre_update(fname, "1", ip, idx, whichfork);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
- ifp->if_lastex = idx;
if (delay) {
temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
da_old);
- xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
- xfs_bmap_trace_post_update(fname, "1", ip, idx,
- whichfork);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
da_new = temp;
break;
}
- xfs_bmap_trace_post_update(fname, "1", ip, idx, whichfork);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
if (!cur) {
- flags |= XFS_ILOG_FEXT(whichfork);
+ flags |= xfs_ilog_fext(whichfork);
break;
}
if ((error = xfs_bmbt_update(cur, got.br_startoff,
@@ -3070,7 +4903,7 @@ xfs_bmap_del_extent(
* Deleting the middle of the extent.
*/
temp = del->br_startoff - got.br_startoff;
- xfs_bmap_trace_pre_update(fname, "0", ip, idx, whichfork);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
new.br_startoff = del_endoff;
temp2 = got_endoff - del_endoff;
@@ -3085,10 +4918,10 @@ xfs_bmap_del_extent(
got.br_startblock, temp,
got.br_state)))
goto done;
- if ((error = xfs_bmbt_increment(cur, 0, &i)))
+ if ((error = xfs_btree_increment(cur, 0, &i)))
goto done;
cur->bc_rec.b = new;
- error = xfs_bmbt_insert(cur, &i);
+ error = xfs_btree_insert(cur, &i);
if (error && error != ENOSPC)
goto done;
/*
@@ -3107,7 +4940,7 @@ xfs_bmap_del_extent(
got.br_startblock,
temp, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
/*
* Update the btree record back
* to the original value.
@@ -3128,24 +4961,24 @@ xfs_bmap_del_extent(
error = XFS_ERROR(ENOSPC);
goto done;
}
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
} else
- flags |= XFS_ILOG_FEXT(whichfork);
+ flags |= xfs_ilog_fext(whichfork);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
} else {
ASSERT(whichfork == XFS_DATA_FORK);
temp = xfs_bmap_worst_indlen(ip, temp);
- xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
temp2 = xfs_bmap_worst_indlen(ip, temp2);
- new.br_startblock = NULLSTARTBLOCK((int)temp2);
+ new.br_startblock = nullstartblock((int)temp2);
da_new = temp + temp2;
while (da_new > da_old) {
if (temp) {
temp--;
da_new--;
xfs_bmbt_set_startblock(ep,
- NULLSTARTBLOCK((int)temp));
+ nullstartblock((int)temp));
}
if (da_new == da_old)
break;
@@ -3153,15 +4986,13 @@ xfs_bmap_del_extent(
temp2--;
da_new--;
new.br_startblock =
- NULLSTARTBLOCK((int)temp2);
+ nullstartblock((int)temp2);
}
}
}
- xfs_bmap_trace_post_update(fname, "0", ip, idx, whichfork);
- xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 1, &new, NULL,
- whichfork);
- xfs_bmap_insert_exlist(ip, idx + 1, 1, &new, whichfork);
- ifp->if_lastex = idx + 1;
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_insert(ip, *idx + 1, 1, &new, state);
+ ++*idx;
break;
}
/*
@@ -3179,2117 +5010,22 @@ xfs_bmap_del_extent(
* Adjust quota data.
*/
if (qfield)
- XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, qfield, (long)-nblks);
+ xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
/*
* Account for change in delayed indirect blocks.
* Nothing to do for disk quota accounting here.
*/
ASSERT(da_old >= da_new);
- if (da_old > da_new)
- xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int)(da_old - da_new),
- rsvd);
-done:
- *logflagsp = flags;
- return error;
-}
-
-/*
- * Remove the entry "free" from the free item list. Prev points to the
- * previous entry, unless "free" is the head of the list.
- */
-STATIC void
-xfs_bmap_del_free(
- xfs_bmap_free_t *flist, /* free item list header */
- xfs_bmap_free_item_t *prev, /* previous item on list, if any */
- xfs_bmap_free_item_t *free) /* list item to be freed */
-{
- if (prev)
- prev->xbfi_next = free->xbfi_next;
- else
- flist->xbf_first = free->xbfi_next;
- flist->xbf_count--;
- kmem_zone_free(xfs_bmap_free_item_zone, free);
-}
-
-/*
- * Remove count entries from the extents array for inode "ip", starting
- * at index "idx". Copies the remaining items down over the deleted ones,
- * and gives back the excess memory.
- */
-STATIC void
-xfs_bmap_delete_exlist(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* starting delete index */
- xfs_extnum_t count, /* count of items to delete */
- int whichfork) /* data or attr fork */
-{
- xfs_bmbt_rec_t *base; /* base of extent list */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extnum_t nextents; /* number of extents in list after */
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- base = ifp->if_u1.if_extents;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - count;
- memmove(&base[idx], &base[idx + count],
- (nextents - idx) * sizeof(*base));
- xfs_iext_realloc(ip, -count, whichfork);
-}
-
-/*
- * Convert an extents-format file into a btree-format file.
- * The new file will have a root block (in the inode) and a single child block.
- */
-STATIC int /* error */
-xfs_bmap_extents_to_btree(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fsblock_t *firstblock, /* first-block-allocated */
- xfs_bmap_free_t *flist, /* blocks freed in xaction */
- xfs_btree_cur_t **curp, /* cursor returned to caller */
- int wasdel, /* converting a delayed alloc */
- int *logflagsp, /* inode logging flags */
- int whichfork) /* data or attr fork */
-{
- xfs_bmbt_block_t *ablock; /* allocated (child) bt block */
- xfs_buf_t *abp; /* buffer for ablock */
- xfs_alloc_arg_t args; /* allocation arguments */
- xfs_bmbt_rec_t *arp; /* child record pointer */
- xfs_bmbt_block_t *block; /* btree root block */
- xfs_btree_cur_t *cur; /* bmap btree cursor */
- xfs_bmbt_rec_t *ep; /* extent list pointer */
- int error; /* error return value */
- xfs_extnum_t i, cnt; /* extent list index */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_bmbt_key_t *kp; /* root block key pointer */
- xfs_mount_t *mp; /* mount structure */
- xfs_extnum_t nextents; /* extent list size */
- xfs_bmbt_ptr_t *pp; /* root block address pointer */
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
- /*
- * Make space in the inode incore.
- */
- xfs_iroot_realloc(ip, 1, whichfork);
- ifp->if_flags |= XFS_IFBROOT;
- /*
- * Fill in the root.
- */
- block = ifp->if_broot;
- block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
- block->bb_level = cpu_to_be16(1);
- block->bb_numrecs = cpu_to_be16(1);
- block->bb_leftsib = cpu_to_be64(NULLDFSBNO);
- block->bb_rightsib = cpu_to_be64(NULLDFSBNO);
- /*
- * Need a cursor. Can't allocate until bb_level is filled in.
- */
- mp = ip->i_mount;
- cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
- whichfork);
- cur->bc_private.b.firstblock = *firstblock;
- cur->bc_private.b.flist = flist;
- cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
- /*
- * Convert to a btree with two levels, one record in root.
- */
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
- args.tp = tp;
- args.mp = mp;
- if (*firstblock == NULLFSBLOCK) {
- args.type = XFS_ALLOCTYPE_START_BNO;
- args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
- } else if (flist->xbf_low) {
- args.type = XFS_ALLOCTYPE_START_BNO;
- args.fsbno = *firstblock;
- } else {
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.fsbno = *firstblock;
- }
- args.minlen = args.maxlen = args.prod = 1;
- args.total = args.minleft = args.alignment = args.mod = args.isfl =
- args.minalignslop = 0;
- args.wasdel = wasdel;
- *logflagsp = 0;
- if ((error = xfs_alloc_vextent(&args))) {
- xfs_iroot_realloc(ip, -1, whichfork);
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- return error;
- }
- /*
- * Allocation can't fail, the space was reserved.
- */
- ASSERT(args.fsbno != NULLFSBLOCK);
- ASSERT(*firstblock == NULLFSBLOCK ||
- args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
- (flist->xbf_low &&
- args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
- *firstblock = cur->bc_private.b.firstblock = args.fsbno;
- cur->bc_private.b.allocated++;
- ip->i_d.di_nblocks++;
- XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
- abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
- /*
- * Fill in the child block.
- */
- ablock = XFS_BUF_TO_BMBT_BLOCK(abp);
- ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
- ablock->bb_level = 0;
- ablock->bb_leftsib = cpu_to_be64(NULLDFSBNO);
- ablock->bb_rightsib = cpu_to_be64(NULLDFSBNO);
- arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- for (ep = ifp->if_u1.if_extents, cnt = i = 0; i < nextents; i++, ep++) {
- if (!ISNULLSTARTBLOCK(xfs_bmbt_get_startblock(ep))) {
- arp->l0 = INT_GET(ep->l0, ARCH_CONVERT);
- arp->l1 = INT_GET(ep->l1, ARCH_CONVERT);
- arp++; cnt++;
- }
- }
- ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
- ablock->bb_numrecs = cpu_to_be16(cnt);
- /*
- * Fill in the root key and pointer.
- */
- kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
- arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
- INT_SET(kp->br_startoff, ARCH_CONVERT, xfs_bmbt_disk_get_startoff(arp));
- pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
- INT_SET(*pp, ARCH_CONVERT, args.fsbno);
- /*
- * Do all this logging at the end so that
- * the root is at the right level.
- */
- xfs_bmbt_log_block(cur, abp, XFS_BB_ALL_BITS);
- xfs_bmbt_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
- ASSERT(*curp == NULL);
- *curp = cur;
- *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork);
- return 0;
-}
-
-/*
- * Insert new item(s) in the extent list for inode "ip".
- * Count new items are inserted at offset idx.
- */
-STATIC void
-xfs_bmap_insert_exlist(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* starting index of new items */
- xfs_extnum_t count, /* number of inserted items */
- xfs_bmbt_irec_t *new, /* items to insert */
- int whichfork) /* data or attr fork */
-{
- xfs_bmbt_rec_t *base; /* extent list base */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extnum_t nextents; /* extent list size */
- xfs_extnum_t to; /* extent list index */
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- xfs_iext_realloc(ip, count, whichfork);
- base = ifp->if_u1.if_extents;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- memmove(&base[idx + count], &base[idx],
- (nextents - (idx + count)) * sizeof(*base));
- for (to = idx; to < idx + count; to++, new++)
- xfs_bmbt_set_all(&base[to], new);
-}
-
-/*
- * Helper routine to reset inode di_forkoff field when switching
- * attribute fork from local to extent format - we reset it where
- * possible to make space available for inline data fork extents.
- */
-STATIC void
-xfs_bmap_forkoff_reset(
- xfs_mount_t *mp,
- xfs_inode_t *ip,
- int whichfork)
-{
- if (whichfork == XFS_ATTR_FORK &&
- (ip->i_d.di_format != XFS_DINODE_FMT_DEV) &&
- (ip->i_d.di_format != XFS_DINODE_FMT_UUID) &&
- ((mp->m_attroffset >> 3) > ip->i_d.di_forkoff)) {
- ip->i_d.di_forkoff = mp->m_attroffset >> 3;
- ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) /
- (uint)sizeof(xfs_bmbt_rec_t);
- ip->i_afp->if_ext_max = XFS_IFORK_ASIZE(ip) /
- (uint)sizeof(xfs_bmbt_rec_t);
+ if (da_old > da_new) {
+ xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+ (int64_t)(da_old - da_new), 0);
}
-}
-
-/*
- * Convert a local file to an extents file.
- * This code is out of bounds for data forks of regular files,
- * since the file data needs to get logged so things will stay consistent.
- * (The bmap-level manipulations are ok, though).
- */
-STATIC int /* error */
-xfs_bmap_local_to_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fsblock_t *firstblock, /* first block allocated in xaction */
- xfs_extlen_t total, /* total blocks needed by transaction */
- int *logflagsp, /* inode logging flags */
- int whichfork) /* data or attr fork */
-{
- int error; /* error return value */
- int flags; /* logging flags returned */
-#ifdef XFS_BMAP_TRACE
- static char fname[] = "xfs_bmap_local_to_extents";
-#endif
- xfs_ifork_t *ifp; /* inode fork pointer */
-
- /*
- * We don't want to deal with the case of keeping inode data inline yet.
- * So sending the data fork of a regular inode is invalid.
- */
- ASSERT(!((ip->i_d.di_mode & S_IFMT) == S_IFREG &&
- whichfork == XFS_DATA_FORK));
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
- flags = 0;
- error = 0;
- if (ifp->if_bytes) {
- xfs_alloc_arg_t args; /* allocation arguments */
- xfs_buf_t *bp; /* buffer for extent list block */
- xfs_bmbt_rec_t *ep; /* extent list pointer */
-
- args.tp = tp;
- args.mp = ip->i_mount;
- ASSERT(ifp->if_flags & XFS_IFINLINE);
- /*
- * Allocate a block. We know we need only one, since the
- * file currently fits in an inode.
- */
- if (*firstblock == NULLFSBLOCK) {
- args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
- args.type = XFS_ALLOCTYPE_START_BNO;
- } else {
- args.fsbno = *firstblock;
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- }
- args.total = total;
- args.mod = args.minleft = args.alignment = args.wasdel =
- args.isfl = args.minalignslop = 0;
- args.minlen = args.maxlen = args.prod = 1;
- if ((error = xfs_alloc_vextent(&args)))
- goto done;
- /*
- * Can't fail, the space was reserved.
- */
- ASSERT(args.fsbno != NULLFSBLOCK);
- ASSERT(args.len == 1);
- *firstblock = args.fsbno;
- bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
- memcpy((char *)XFS_BUF_PTR(bp), ifp->if_u1.if_data,
- ifp->if_bytes);
- xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
- xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
- xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
- xfs_iext_realloc(ip, 1, whichfork);
- ep = ifp->if_u1.if_extents;
- xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
- xfs_bmap_trace_post_update(fname, "new", ip, 0, whichfork);
- XFS_IFORK_NEXT_SET(ip, whichfork, 1);
- ip->i_d.di_nblocks = 1;
- XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip,
- XFS_TRANS_DQ_BCOUNT, 1L);
- flags |= XFS_ILOG_FEXT(whichfork);
- } else {
- ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
- xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
- }
- ifp->if_flags &= ~XFS_IFINLINE;
- ifp->if_flags |= XFS_IFEXTENTS;
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
- flags |= XFS_ILOG_CORE;
done:
*logflagsp = flags;
return error;
}
-xfs_bmbt_rec_t * /* pointer to found extent entry */
-xfs_bmap_do_search_extents(
- xfs_bmbt_rec_t *base, /* base of extent list */
- xfs_extnum_t lastx, /* last extent index used */
- xfs_extnum_t nextents, /* extent list size */
- xfs_fileoff_t bno, /* block number searched for */
- int *eofp, /* out: end of file found */
- xfs_extnum_t *lastxp, /* out: last extent index */
- xfs_bmbt_irec_t *gotp, /* out: extent entry found */
- xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
-{
- xfs_bmbt_rec_t *ep; /* extent list entry pointer */
- xfs_bmbt_irec_t got; /* extent list entry, decoded */
- int high; /* high index of binary search */
- int low; /* low index of binary search */
-
- /*
- * Initialize the extent entry structure to catch access to
- * uninitialized br_startblock field.
- */
- got.br_startoff = 0xffa5a5a5a5a5a5a5LL;
- got.br_blockcount = 0xa55a5a5a5a5a5a5aLL;
- got.br_state = XFS_EXT_INVALID;
-
-#if XFS_BIG_BLKNOS
- got.br_startblock = 0xffffa5a5a5a5a5a5LL;
-#else
- got.br_startblock = 0xffffa5a5;
-#endif
-
- if (lastx != NULLEXTNUM && lastx < nextents)
- ep = base + lastx;
- else
- ep = NULL;
- prevp->br_startoff = NULLFILEOFF;
- if (ep && bno >= (got.br_startoff = xfs_bmbt_get_startoff(ep)) &&
- bno < got.br_startoff +
- (got.br_blockcount = xfs_bmbt_get_blockcount(ep)))
- *eofp = 0;
- else if (ep && lastx < nextents - 1 &&
- bno >= (got.br_startoff = xfs_bmbt_get_startoff(ep + 1)) &&
- bno < got.br_startoff +
- (got.br_blockcount = xfs_bmbt_get_blockcount(ep + 1))) {
- lastx++;
- ep++;
- *eofp = 0;
- } else if (nextents == 0)
- *eofp = 1;
- else if (bno == 0 &&
- (got.br_startoff = xfs_bmbt_get_startoff(base)) == 0) {
- ep = base;
- lastx = 0;
- got.br_blockcount = xfs_bmbt_get_blockcount(ep);
- *eofp = 0;
- } else {
- /* binary search the extents array */
- low = 0;
- high = nextents - 1;
- while (low <= high) {
- XFS_STATS_INC(xs_cmp_exlist);
- lastx = (low + high) >> 1;
- ep = base + lastx;
- got.br_startoff = xfs_bmbt_get_startoff(ep);
- got.br_blockcount = xfs_bmbt_get_blockcount(ep);
- if (bno < got.br_startoff)
- high = lastx - 1;
- else if (bno >= got.br_startoff + got.br_blockcount)
- low = lastx + 1;
- else {
- got.br_startblock = xfs_bmbt_get_startblock(ep);
- got.br_state = xfs_bmbt_get_state(ep);
- *eofp = 0;
- *lastxp = lastx;
- *gotp = got;
- return ep;
- }
- }
- if (bno >= got.br_startoff + got.br_blockcount) {
- lastx++;
- if (lastx == nextents) {
- *eofp = 1;
- got.br_startblock = xfs_bmbt_get_startblock(ep);
- got.br_state = xfs_bmbt_get_state(ep);
- *prevp = got;
- ep = NULL;
- } else {
- *eofp = 0;
- xfs_bmbt_get_all(ep, prevp);
- ep++;
- got.br_startoff = xfs_bmbt_get_startoff(ep);
- got.br_blockcount = xfs_bmbt_get_blockcount(ep);
- }
- } else {
- *eofp = 0;
- if (ep > base)
- xfs_bmbt_get_all(ep - 1, prevp);
- }
- }
- if (ep) {
- got.br_startblock = xfs_bmbt_get_startblock(ep);
- got.br_state = xfs_bmbt_get_state(ep);
- }
- *lastxp = lastx;
- *gotp = got;
- return ep;
-}
-
-/*
- * Search the extents list for the inode, for the extent containing bno.
- * If bno lies in a hole, point to the next entry. If bno lies past eof,
- * *eofp will be set, and *prevp will contain the last entry (null if none).
- * Else, *lastxp will be set to the index of the found
- * entry; *gotp will contain the entry.
- */
-STATIC xfs_bmbt_rec_t * /* pointer to found extent entry */
-xfs_bmap_search_extents(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fileoff_t bno, /* block number searched for */
- int whichfork, /* data or attr fork */
- int *eofp, /* out: end of file found */
- xfs_extnum_t *lastxp, /* out: last extent index */
- xfs_bmbt_irec_t *gotp, /* out: extent entry found */
- xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
-{
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_bmbt_rec_t *base; /* base of extent list */
- xfs_extnum_t lastx; /* last extent index used */
- xfs_extnum_t nextents; /* extent list size */
- xfs_bmbt_rec_t *ep; /* extent list entry pointer */
- int rt; /* realtime flag */
-
- XFS_STATS_INC(xs_look_exlist);
- ifp = XFS_IFORK_PTR(ip, whichfork);
- lastx = ifp->if_lastex;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- base = &ifp->if_u1.if_extents[0];
-
- ep = xfs_bmap_do_search_extents(base, lastx, nextents, bno, eofp,
- lastxp, gotp, prevp);
- rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
- if (unlikely(!rt && !gotp->br_startblock && (*lastxp != NULLEXTNUM))) {
- cmn_err(CE_PANIC,"Access to block zero: fs: <%s> inode: %lld "
- "start_block : %llx start_off : %llx blkcnt : %llx "
- "extent-state : %x \n",
- (ip->i_mount)->m_fsname, (long long)ip->i_ino,
- (unsigned long long)gotp->br_startblock,
- (unsigned long long)gotp->br_startoff,
- (unsigned long long)gotp->br_blockcount,
- gotp->br_state);
- }
- return ep;
-}
-
-
-#ifdef XFS_BMAP_TRACE
-ktrace_t *xfs_bmap_trace_buf;
-
-/*
- * Add a bmap trace buffer entry. Base routine for the others.
- */
-STATIC void
-xfs_bmap_trace_addentry(
- int opcode, /* operation */
- char *fname, /* function name */
- char *desc, /* operation description */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* index of entry(ies) */
- xfs_extnum_t cnt, /* count of entries, 1 or 2 */
- xfs_bmbt_rec_t *r1, /* first record */
- xfs_bmbt_rec_t *r2, /* second record or null */
- int whichfork) /* data or attr fork */
-{
- xfs_bmbt_rec_t tr2;
-
- ASSERT(cnt == 1 || cnt == 2);
- ASSERT(r1 != NULL);
- if (cnt == 1) {
- ASSERT(r2 == NULL);
- r2 = &tr2;
- memset(&tr2, 0, sizeof(tr2));
- } else
- ASSERT(r2 != NULL);
- ktrace_enter(xfs_bmap_trace_buf,
- (void *)(__psint_t)(opcode | (whichfork << 16)),
- (void *)fname, (void *)desc, (void *)ip,
- (void *)(__psint_t)idx,
- (void *)(__psint_t)cnt,
- (void *)(__psunsigned_t)(ip->i_ino >> 32),
- (void *)(__psunsigned_t)(unsigned)ip->i_ino,
- (void *)(__psunsigned_t)(r1->l0 >> 32),
- (void *)(__psunsigned_t)(unsigned)(r1->l0),
- (void *)(__psunsigned_t)(r1->l1 >> 32),
- (void *)(__psunsigned_t)(unsigned)(r1->l1),
- (void *)(__psunsigned_t)(r2->l0 >> 32),
- (void *)(__psunsigned_t)(unsigned)(r2->l0),
- (void *)(__psunsigned_t)(r2->l1 >> 32),
- (void *)(__psunsigned_t)(unsigned)(r2->l1)
- );
- ASSERT(ip->i_xtrace);
- ktrace_enter(ip->i_xtrace,
- (void *)(__psint_t)(opcode | (whichfork << 16)),
- (void *)fname, (void *)desc, (void *)ip,
- (void *)(__psint_t)idx,
- (void *)(__psint_t)cnt,
- (void *)(__psunsigned_t)(ip->i_ino >> 32),
- (void *)(__psunsigned_t)(unsigned)ip->i_ino,
- (void *)(__psunsigned_t)(r1->l0 >> 32),
- (void *)(__psunsigned_t)(unsigned)(r1->l0),
- (void *)(__psunsigned_t)(r1->l1 >> 32),
- (void *)(__psunsigned_t)(unsigned)(r1->l1),
- (void *)(__psunsigned_t)(r2->l0 >> 32),
- (void *)(__psunsigned_t)(unsigned)(r2->l0),
- (void *)(__psunsigned_t)(r2->l1 >> 32),
- (void *)(__psunsigned_t)(unsigned)(r2->l1)
- );
-}
-
-/*
- * Add bmap trace entry prior to a call to xfs_bmap_delete_exlist.
- */
-STATIC void
-xfs_bmap_trace_delete(
- char *fname, /* function name */
- char *desc, /* operation description */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* index of entry(entries) deleted */
- xfs_extnum_t cnt, /* count of entries deleted, 1 or 2 */
- int whichfork) /* data or attr fork */
-{
- xfs_ifork_t *ifp; /* inode fork pointer */
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_DELETE, fname, desc, ip, idx,
- cnt, &ifp->if_u1.if_extents[idx],
- cnt == 2 ? &ifp->if_u1.if_extents[idx + 1] : NULL,
- whichfork);
-}
-
-/*
- * Add bmap trace entry prior to a call to xfs_bmap_insert_exlist, or
- * reading in the extents list from the disk (in the btree).
- */
-STATIC void
-xfs_bmap_trace_insert(
- char *fname, /* function name */
- char *desc, /* operation description */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* index of entry(entries) inserted */
- xfs_extnum_t cnt, /* count of entries inserted, 1 or 2 */
- xfs_bmbt_irec_t *r1, /* inserted record 1 */
- xfs_bmbt_irec_t *r2, /* inserted record 2 or null */
- int whichfork) /* data or attr fork */
-{
- xfs_bmbt_rec_t tr1; /* compressed record 1 */
- xfs_bmbt_rec_t tr2; /* compressed record 2 if needed */
-
- xfs_bmbt_set_all(&tr1, r1);
- if (cnt == 2) {
- ASSERT(r2 != NULL);
- xfs_bmbt_set_all(&tr2, r2);
- } else {
- ASSERT(cnt == 1);
- ASSERT(r2 == NULL);
- }
- xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_INSERT, fname, desc, ip, idx,
- cnt, &tr1, cnt == 2 ? &tr2 : NULL, whichfork);
-}
-
-/*
- * Add bmap trace entry after updating an extent list entry in place.
- */
-STATIC void
-xfs_bmap_trace_post_update(
- char *fname, /* function name */
- char *desc, /* operation description */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* index of entry updated */
- int whichfork) /* data or attr fork */
-{
- xfs_ifork_t *ifp; /* inode fork pointer */
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_POST_UP, fname, desc, ip, idx,
- 1, &ifp->if_u1.if_extents[idx], NULL, whichfork);
-}
-
-/*
- * Add bmap trace entry prior to updating an extent list entry in place.
- */
-STATIC void
-xfs_bmap_trace_pre_update(
- char *fname, /* function name */
- char *desc, /* operation description */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* index of entry to be updated */
- int whichfork) /* data or attr fork */
-{
- xfs_ifork_t *ifp; /* inode fork pointer */
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_PRE_UP, fname, desc, ip, idx, 1,
- &ifp->if_u1.if_extents[idx], NULL, whichfork);
-}
-#endif /* XFS_BMAP_TRACE */
-
-/*
- * Compute the worst-case number of indirect blocks that will be used
- * for ip's delayed extent of length "len".
- */
-STATIC xfs_filblks_t
-xfs_bmap_worst_indlen(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_filblks_t len) /* delayed extent length */
-{
- int level; /* btree level number */
- int maxrecs; /* maximum record count at this level */
- xfs_mount_t *mp; /* mount structure */
- xfs_filblks_t rval; /* return value */
-
- mp = ip->i_mount;
- maxrecs = mp->m_bmap_dmxr[0];
- for (level = 0, rval = 0;
- level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
- level++) {
- len += maxrecs - 1;
- do_div(len, maxrecs);
- rval += len;
- if (len == 1)
- return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
- level - 1;
- if (level == 0)
- maxrecs = mp->m_bmap_dmxr[1];
- }
- return rval;
-}
-
-#if defined(XFS_RW_TRACE)
-STATIC void
-xfs_bunmap_trace(
- xfs_inode_t *ip,
- xfs_fileoff_t bno,
- xfs_filblks_t len,
- int flags,
- inst_t *ra)
-{
- if (ip->i_rwtrace == NULL)
- return;
- ktrace_enter(ip->i_rwtrace,
- (void *)(__psint_t)XFS_BUNMAPI,
- (void *)ip,
- (void *)(__psint_t)((ip->i_d.di_size >> 32) & 0xffffffff),
- (void *)(__psint_t)(ip->i_d.di_size & 0xffffffff),
- (void *)(__psint_t)(((xfs_dfiloff_t)bno >> 32) & 0xffffffff),
- (void *)(__psint_t)((xfs_dfiloff_t)bno & 0xffffffff),
- (void *)(__psint_t)len,
- (void *)(__psint_t)flags,
- (void *)(unsigned long)current_cpu(),
- (void *)ra,
- (void *)0,
- (void *)0,
- (void *)0,
- (void *)0,
- (void *)0,
- (void *)0);
-}
-#endif
-
-/*
- * Convert inode from non-attributed to attributed.
- * Must not be in a transaction, ip must not be locked.
- */
-int /* error code */
-xfs_bmap_add_attrfork(
- xfs_inode_t *ip, /* incore inode pointer */
- int size, /* space new attribute needs */
- int rsvd) /* xact may use reserved blks */
-{
- xfs_fsblock_t firstblock; /* 1st block/ag allocated */
- xfs_bmap_free_t flist; /* freed extent list */
- xfs_mount_t *mp; /* mount structure */
- xfs_trans_t *tp; /* transaction pointer */
- unsigned long s; /* spinlock spl value */
- int blks; /* space reservation */
- int version = 1; /* superblock attr version */
- int committed; /* xaction was committed */
- int logflags; /* logging flags */
- int error; /* error return value */
-
- ASSERT(XFS_IFORK_Q(ip) == 0);
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
-
- mp = ip->i_mount;
- ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
- tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
- blks = XFS_ADDAFORK_SPACE_RES(mp);
- if (rsvd)
- tp->t_flags |= XFS_TRANS_RESERVE;
- if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT)))
- goto error0;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, blks, 0, rsvd ?
- XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
- XFS_QMOPT_RES_REGBLKS);
- if (error) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
- return error;
- }
- if (XFS_IFORK_Q(ip))
- goto error1;
- if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
- /*
- * For inodes coming from pre-6.2 filesystems.
- */
- ASSERT(ip->i_d.di_aformat == 0);
- ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
- }
- ASSERT(ip->i_d.di_anextents == 0);
- VN_HOLD(XFS_ITOV(ip));
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- switch (ip->i_d.di_format) {
- case XFS_DINODE_FMT_DEV:
- ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
- break;
- case XFS_DINODE_FMT_UUID:
- ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
- break;
- case XFS_DINODE_FMT_LOCAL:
- case XFS_DINODE_FMT_EXTENTS:
- case XFS_DINODE_FMT_BTREE:
- ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
- if (!ip->i_d.di_forkoff)
- ip->i_d.di_forkoff = mp->m_attroffset >> 3;
- else if (mp->m_flags & XFS_MOUNT_ATTR2)
- version = 2;
- break;
- default:
- ASSERT(0);
- error = XFS_ERROR(EINVAL);
- goto error1;
- }
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
- ASSERT(ip->i_afp == NULL);
- ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
- ip->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
- ip->i_afp->if_flags = XFS_IFEXTENTS;
- logflags = 0;
- XFS_BMAP_INIT(&flist, &firstblock);
- switch (ip->i_d.di_format) {
- case XFS_DINODE_FMT_LOCAL:
- error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
- &logflags);
- break;
- case XFS_DINODE_FMT_EXTENTS:
- error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
- &flist, &logflags);
- break;
- case XFS_DINODE_FMT_BTREE:
- error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
- &logflags);
- break;
- default:
- error = 0;
- break;
- }
- if (logflags)
- xfs_trans_log_inode(tp, ip, logflags);
- if (error)
- goto error2;
- if (!XFS_SB_VERSION_HASATTR(&mp->m_sb) ||
- (!XFS_SB_VERSION_HASATTR2(&mp->m_sb) && version == 2)) {
- __int64_t sbfields = 0;
-
- s = XFS_SB_LOCK(mp);
- if (!XFS_SB_VERSION_HASATTR(&mp->m_sb)) {
- XFS_SB_VERSION_ADDATTR(&mp->m_sb);
- sbfields |= XFS_SB_VERSIONNUM;
- }
- if (!XFS_SB_VERSION_HASATTR2(&mp->m_sb) && version == 2) {
- XFS_SB_VERSION_ADDATTR2(&mp->m_sb);
- sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
- }
- if (sbfields) {
- XFS_SB_UNLOCK(mp, s);
- xfs_mod_sb(tp, sbfields);
- } else
- XFS_SB_UNLOCK(mp, s);
- }
- if ((error = xfs_bmap_finish(&tp, &flist, firstblock, &committed)))
- goto error2;
- error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES, NULL);
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
- return error;
-error2:
- xfs_bmap_cancel(&flist);
-error1:
- ASSERT(ismrlocked(&ip->i_lock,MR_UPDATE));
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-error0:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
- return error;
-}
-
-/*
- * Add the extent to the list of extents to be free at transaction end.
- * The list is maintained sorted (by block number).
- */
-/* ARGSUSED */
-void
-xfs_bmap_add_free(
- xfs_fsblock_t bno, /* fs block number of extent */
- xfs_filblks_t len, /* length of extent */
- xfs_bmap_free_t *flist, /* list of extents */
- xfs_mount_t *mp) /* mount point structure */
-{
- xfs_bmap_free_item_t *cur; /* current (next) element */
- xfs_bmap_free_item_t *new; /* new element */
- xfs_bmap_free_item_t *prev; /* previous element */
-#ifdef DEBUG
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
-
- ASSERT(bno != NULLFSBLOCK);
- ASSERT(len > 0);
- ASSERT(len <= MAXEXTLEN);
- ASSERT(!ISNULLSTARTBLOCK(bno));
- agno = XFS_FSB_TO_AGNO(mp, bno);
- agbno = XFS_FSB_TO_AGBNO(mp, bno);
- ASSERT(agno < mp->m_sb.sb_agcount);
- ASSERT(agbno < mp->m_sb.sb_agblocks);
- ASSERT(len < mp->m_sb.sb_agblocks);
- ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
-#endif
- ASSERT(xfs_bmap_free_item_zone != NULL);
- new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
- new->xbfi_startblock = bno;
- new->xbfi_blockcount = (xfs_extlen_t)len;
- for (prev = NULL, cur = flist->xbf_first;
- cur != NULL;
- prev = cur, cur = cur->xbfi_next) {
- if (cur->xbfi_startblock >= bno)
- break;
- }
- if (prev)
- prev->xbfi_next = new;
- else
- flist->xbf_first = new;
- new->xbfi_next = cur;
- flist->xbf_count++;
-}
-
-/*
- * Compute and fill in the value of the maximum depth of a bmap btree
- * in this filesystem. Done once, during mount.
- */
-void
-xfs_bmap_compute_maxlevels(
- xfs_mount_t *mp, /* file system mount structure */
- int whichfork) /* data or attr fork */
-{
- int level; /* btree level */
- uint maxblocks; /* max blocks at this level */
- uint maxleafents; /* max leaf entries possible */
- int maxrootrecs; /* max records in root block */
- int minleafrecs; /* min records in leaf block */
- int minnoderecs; /* min records in node block */
- int sz; /* root block size */
-
- /*
- * The maximum number of extents in a file, hence the maximum
- * number of leaf entries, is controlled by the type of di_nextents
- * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
- * (a signed 16-bit number, xfs_aextnum_t).
- */
- if (whichfork == XFS_DATA_FORK) {
- maxleafents = MAXEXTNUM;
- sz = (mp->m_flags & XFS_MOUNT_ATTR2) ?
- XFS_BMDR_SPACE_CALC(MINDBTPTRS) : mp->m_attroffset;
- } else {
- maxleafents = MAXAEXTNUM;
- sz = (mp->m_flags & XFS_MOUNT_ATTR2) ?
- XFS_BMDR_SPACE_CALC(MINABTPTRS) :
- mp->m_sb.sb_inodesize - mp->m_attroffset;
- }
- maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0);
- minleafrecs = mp->m_bmap_dmnr[0];
- minnoderecs = mp->m_bmap_dmnr[1];
- maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
- for (level = 1; maxblocks > 1; level++) {
- if (maxblocks <= maxrootrecs)
- maxblocks = 1;
- else
- maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
- }
- mp->m_bm_maxlevels[whichfork] = level;
-}
-
-/*
- * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
- * caller. Frees all the extents that need freeing, which must be done
- * last due to locking considerations. We never free any extents in
- * the first transaction. This is to allow the caller to make the first
- * transaction a synchronous one so that the pointers to the data being
- * broken in this transaction will be permanent before the data is actually
- * freed. This is necessary to prevent blocks from being reallocated
- * and written to before the free and reallocation are actually permanent.
- * We do not just make the first transaction synchronous here, because
- * there are more efficient ways to gain the same protection in some cases
- * (see the file truncation code).
- *
- * Return 1 if the given transaction was committed and a new one
- * started, and 0 otherwise in the committed parameter.
- */
-/*ARGSUSED*/
-int /* error */
-xfs_bmap_finish(
- xfs_trans_t **tp, /* transaction pointer addr */
- xfs_bmap_free_t *flist, /* i/o: list extents to free */
- xfs_fsblock_t firstblock, /* controlled ag for allocs */
- int *committed) /* xact committed or not */
-{
- xfs_efd_log_item_t *efd; /* extent free data */
- xfs_efi_log_item_t *efi; /* extent free intention */
- int error; /* error return value */
- xfs_bmap_free_item_t *free; /* free extent list item */
- unsigned int logres; /* new log reservation */
- unsigned int logcount; /* new log count */
- xfs_mount_t *mp; /* filesystem mount structure */
- xfs_bmap_free_item_t *next; /* next item on free list */
- xfs_trans_t *ntp; /* new transaction pointer */
-
- ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
- if (flist->xbf_count == 0) {
- *committed = 0;
- return 0;
- }
- ntp = *tp;
- efi = xfs_trans_get_efi(ntp, flist->xbf_count);
- for (free = flist->xbf_first; free; free = free->xbfi_next)
- xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
- free->xbfi_blockcount);
- logres = ntp->t_log_res;
- logcount = ntp->t_log_count;
- ntp = xfs_trans_dup(*tp);
- error = xfs_trans_commit(*tp, 0, NULL);
- *tp = ntp;
- *committed = 1;
- /*
- * We have a new transaction, so we should return committed=1,
- * even though we're returning an error.
- */
- if (error) {
- return error;
- }
- if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
- logcount)))
- return error;
- efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
- for (free = flist->xbf_first; free != NULL; free = next) {
- next = free->xbfi_next;
- if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
- free->xbfi_blockcount))) {
- /*
- * The bmap free list will be cleaned up at a
- * higher level. The EFI will be canceled when
- * this transaction is aborted.
- * Need to force shutdown here to make sure it
- * happens, since this transaction may not be
- * dirty yet.
- */
- mp = ntp->t_mountp;
- if (!XFS_FORCED_SHUTDOWN(mp))
- xfs_force_shutdown(mp,
- (error == EFSCORRUPTED) ?
- XFS_CORRUPT_INCORE :
- XFS_METADATA_IO_ERROR);
- return error;
- }
- xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
- free->xbfi_blockcount);
- xfs_bmap_del_free(flist, NULL, free);
- }
- return 0;
-}
-
-/*
- * Free up any items left in the list.
- */
-void
-xfs_bmap_cancel(
- xfs_bmap_free_t *flist) /* list of bmap_free_items */
-{
- xfs_bmap_free_item_t *free; /* free list item */
- xfs_bmap_free_item_t *next;
-
- if (flist->xbf_count == 0)
- return;
- ASSERT(flist->xbf_first != NULL);
- for (free = flist->xbf_first; free; free = next) {
- next = free->xbfi_next;
- xfs_bmap_del_free(flist, NULL, free);
- }
- ASSERT(flist->xbf_count == 0);
-}
-
-/*
- * Returns the file-relative block number of the first unused block(s)
- * in the file with at least "len" logically contiguous blocks free.
- * This is the lowest-address hole if the file has holes, else the first block
- * past the end of file.
- * Return 0 if the file is currently local (in-inode).
- */
-int /* error */
-xfs_bmap_first_unused(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- xfs_extlen_t len, /* size of hole to find */
- xfs_fileoff_t *first_unused, /* unused block */
- int whichfork) /* data or attr fork */
-{
- xfs_bmbt_rec_t *base; /* base of extent array */
- xfs_bmbt_rec_t *ep; /* pointer to an extent entry */
- int error; /* error return value */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_fileoff_t lastaddr; /* last block number seen */
- xfs_fileoff_t lowest; /* lowest useful block */
- xfs_fileoff_t max; /* starting useful block */
- xfs_fileoff_t off; /* offset for this block */
- xfs_extnum_t nextents; /* number of extent entries */
-
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- *first_unused = 0;
- return 0;
- }
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(tp, ip, whichfork)))
- return error;
- lowest = *first_unused;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- base = &ifp->if_u1.if_extents[0];
- for (lastaddr = 0, max = lowest, ep = base;
- ep < &base[nextents];
- ep++) {
- off = xfs_bmbt_get_startoff(ep);
- /*
- * See if the hole before this extent will work.
- */
- if (off >= lowest + len && off - max >= len) {
- *first_unused = max;
- return 0;
- }
- lastaddr = off + xfs_bmbt_get_blockcount(ep);
- max = XFS_FILEOFF_MAX(lastaddr, lowest);
- }
- *first_unused = max;
- return 0;
-}
-
-/*
- * Returns the file-relative block number of the last block + 1 before
- * last_block (input value) in the file.
- * This is not based on i_size, it is based on the extent list.
- * Returns 0 for local files, as they do not have an extent list.
- */
-int /* error */
-xfs_bmap_last_before(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- xfs_fileoff_t *last_block, /* last block */
- int whichfork) /* data or attr fork */
-{
- xfs_fileoff_t bno; /* input file offset */
- int eof; /* hit end of file */
- xfs_bmbt_rec_t *ep; /* pointer to last extent */
- int error; /* error return value */
- xfs_bmbt_irec_t got; /* current extent value */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extnum_t lastx; /* last extent used */
- xfs_bmbt_irec_t prev; /* previous extent value */
-
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
- return XFS_ERROR(EIO);
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- *last_block = 0;
- return 0;
- }
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(tp, ip, whichfork)))
- return error;
- bno = *last_block - 1;
- ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
- &prev);
- if (eof || xfs_bmbt_get_startoff(ep) > bno) {
- if (prev.br_startoff == NULLFILEOFF)
- *last_block = 0;
- else
- *last_block = prev.br_startoff + prev.br_blockcount;
- }
- /*
- * Otherwise *last_block is already the right answer.
- */
- return 0;
-}
-
-/*
- * Returns the file-relative block number of the first block past eof in
- * the file. This is not based on i_size, it is based on the extent list.
- * Returns 0 for local files, as they do not have an extent list.
- */
-int /* error */
-xfs_bmap_last_offset(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- xfs_fileoff_t *last_block, /* last block */
- int whichfork) /* data or attr fork */
-{
- xfs_bmbt_rec_t *base; /* base of extent array */
- xfs_bmbt_rec_t *ep; /* pointer to last extent */
- int error; /* error return value */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extnum_t nextents; /* number of extent entries */
-
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
- return XFS_ERROR(EIO);
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- *last_block = 0;
- return 0;
- }
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(tp, ip, whichfork)))
- return error;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- if (!nextents) {
- *last_block = 0;
- return 0;
- }
- base = &ifp->if_u1.if_extents[0];
- ASSERT(base != NULL);
- ep = &base[nextents - 1];
- *last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep);
- return 0;
-}
-
-/*
- * Returns whether the selected fork of the inode has exactly one
- * block or not. For the data fork we check this matches di_size,
- * implying the file's range is 0..bsize-1.
- */
-int /* 1=>1 block, 0=>otherwise */
-xfs_bmap_one_block(
- xfs_inode_t *ip, /* incore inode */
- int whichfork) /* data or attr fork */
-{
- xfs_bmbt_rec_t *ep; /* ptr to fork's extent */
- xfs_ifork_t *ifp; /* inode fork pointer */
- int rval; /* return value */
- xfs_bmbt_irec_t s; /* internal version of extent */
-
-#ifndef DEBUG
- if (whichfork == XFS_DATA_FORK)
- return ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize;
-#endif /* !DEBUG */
- if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
- return 0;
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- return 0;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- ep = ifp->if_u1.if_extents;
- xfs_bmbt_get_all(ep, &s);
- rval = s.br_startoff == 0 && s.br_blockcount == 1;
- if (rval && whichfork == XFS_DATA_FORK)
- ASSERT(ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
- return rval;
-}
-
-/*
- * Read in the extents to if_extents.
- * All inode fields are set up by caller, we just traverse the btree
- * and copy the records in. If the file system cannot contain unwritten
- * extents, the records are checked for no "state" flags.
- */
-int /* error */
-xfs_bmap_read_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- int whichfork) /* data or attr fork */
-{
- xfs_bmbt_block_t *block; /* current btree block */
- xfs_fsblock_t bno; /* block # of "block" */
- xfs_buf_t *bp; /* buffer for "block" */
- int error; /* error return value */
- xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */
-#ifdef XFS_BMAP_TRACE
- static char fname[] = "xfs_bmap_read_extents";
-#endif
- xfs_extnum_t i, j; /* index into the extents list */
- xfs_ifork_t *ifp; /* fork structure */
- int level; /* btree level, for checking */
- xfs_mount_t *mp; /* file system mount structure */
- xfs_bmbt_ptr_t *pp; /* pointer to block address */
- /* REFERENCED */
- xfs_extnum_t room; /* number of entries there's room for */
- xfs_bmbt_rec_t *trp; /* target record pointer */
-
- bno = NULLFSBLOCK;
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
- XFS_EXTFMT_INODE(ip);
- block = ifp->if_broot;
- /*
- * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
- */
- level = be16_to_cpu(block->bb_level);
- ASSERT(level > 0);
- pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
- ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
- ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
- ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
- bno = INT_GET(*pp, ARCH_CONVERT);
- /*
- * Go down the tree until leaf level is reached, following the first
- * pointer (leftmost) at each level.
- */
- while (level-- > 0) {
- if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
- XFS_BMAP_BTREE_REF)))
- return error;
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- XFS_WANT_CORRUPTED_GOTO(
- XFS_BMAP_SANITY_CHECK(mp, block, level),
- error0);
- if (level == 0)
- break;
- pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block,
- 1, mp->m_bmap_dmxr[1]);
- XFS_WANT_CORRUPTED_GOTO(
- XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)),
- error0);
- bno = INT_GET(*pp, ARCH_CONVERT);
- xfs_trans_brelse(tp, bp);
- }
- /*
- * Here with bp and block set to the leftmost leaf node in the tree.
- */
- room = ifp->if_bytes / (uint)sizeof(*trp);
- trp = ifp->if_u1.if_extents;
- i = 0;
- /*
- * Loop over all leaf nodes. Copy information to the extent list.
- */
- for (;;) {
- xfs_bmbt_rec_t *frp, *temp;
- xfs_fsblock_t nextbno;
- xfs_extnum_t num_recs;
-
-
- num_recs = be16_to_cpu(block->bb_numrecs);
- if (unlikely(i + num_recs > room)) {
- ASSERT(i + num_recs <= room);
- xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
- "corrupt dinode %Lu, (btree extents).",
- (unsigned long long) ip->i_ino);
- XFS_ERROR_REPORT("xfs_bmap_read_extents(1)",
- XFS_ERRLEVEL_LOW,
- ip->i_mount);
- goto error0;
- }
- XFS_WANT_CORRUPTED_GOTO(
- XFS_BMAP_SANITY_CHECK(mp, block, 0),
- error0);
- /*
- * Read-ahead the next leaf block, if any.
- */
- nextbno = be64_to_cpu(block->bb_rightsib);
- if (nextbno != NULLFSBLOCK)
- xfs_btree_reada_bufl(mp, nextbno, 1);
- /*
- * Copy records into the extent list.
- */
- frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt,
- block, 1, mp->m_bmap_dmxr[0]);
- temp = trp;
- for (j = 0; j < num_recs; j++, frp++, trp++) {
- trp->l0 = INT_GET(frp->l0, ARCH_CONVERT);
- trp->l1 = INT_GET(frp->l1, ARCH_CONVERT);
- }
- if (exntf == XFS_EXTFMT_NOSTATE) {
- /*
- * Check all attribute bmap btree records and
- * any "older" data bmap btree records for a
- * set bit in the "extent flag" position.
- */
- if (unlikely(xfs_check_nostate_extents(temp, num_recs))) {
- XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
- XFS_ERRLEVEL_LOW,
- ip->i_mount);
- goto error0;
- }
- }
- i += num_recs;
- xfs_trans_brelse(tp, bp);
- bno = nextbno;
- /*
- * If we've reached the end, stop.
- */
- if (bno == NULLFSBLOCK)
- break;
- if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
- XFS_BMAP_BTREE_REF)))
- return error;
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- }
- ASSERT(i == ifp->if_bytes / (uint)sizeof(*trp));
- ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
- xfs_bmap_trace_exlist(fname, ip, i, whichfork);
- return 0;
-error0:
- xfs_trans_brelse(tp, bp);
- return XFS_ERROR(EFSCORRUPTED);
-}
-
-#ifdef XFS_BMAP_TRACE
-/*
- * Add bmap trace insert entries for all the contents of the extent list.
- */
-void
-xfs_bmap_trace_exlist(
- char *fname, /* function name */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t cnt, /* count of entries in the list */
- int whichfork) /* data or attr fork */
-{
- xfs_bmbt_rec_t *base; /* base of extent list */
- xfs_bmbt_rec_t *ep; /* current entry in extent list */
- xfs_extnum_t idx; /* extent list entry number */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_bmbt_irec_t s; /* extent list record */
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(cnt == ifp->if_bytes / (uint)sizeof(*base));
- base = ifp->if_u1.if_extents;
- for (idx = 0, ep = base; idx < cnt; idx++, ep++) {
- xfs_bmbt_get_all(ep, &s);
- xfs_bmap_trace_insert(fname, "exlist", ip, idx, 1, &s, NULL,
- whichfork);
- }
-}
-#endif
-
-#ifdef DEBUG
-/*
- * Validate that the bmbt_irecs being returned from bmapi are valid
- * given the callers original parameters. Specifically check the
- * ranges of the returned irecs to ensure that they only extent beyond
- * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
- */
-STATIC void
-xfs_bmap_validate_ret(
- xfs_fileoff_t bno,
- xfs_filblks_t len,
- int flags,
- xfs_bmbt_irec_t *mval,
- int nmap,
- int ret_nmap)
-{
- int i; /* index to map values */
-
- ASSERT(ret_nmap <= nmap);
-
- for (i = 0; i < ret_nmap; i++) {
- ASSERT(mval[i].br_blockcount > 0);
- if (!(flags & XFS_BMAPI_ENTIRE)) {
- ASSERT(mval[i].br_startoff >= bno);
- ASSERT(mval[i].br_blockcount <= len);
- ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
- bno + len);
- } else {
- ASSERT(mval[i].br_startoff < bno + len);
- ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
- bno);
- }
- ASSERT(i == 0 ||
- mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
- mval[i].br_startoff);
- if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY))
- ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
- mval[i].br_startblock != HOLESTARTBLOCK);
- ASSERT(mval[i].br_state == XFS_EXT_NORM ||
- mval[i].br_state == XFS_EXT_UNWRITTEN);
- }
-}
-#endif /* DEBUG */
-
-
-/*
- * Map file blocks to filesystem blocks.
- * File range is given by the bno/len pair.
- * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set)
- * into a hole or past eof.
- * Only allocates blocks from a single allocation group,
- * to avoid locking problems.
- * The returned value in "firstblock" from the first call in a transaction
- * must be remembered and presented to subsequent calls in "firstblock".
- * An upper bound for the number of blocks to be allocated is supplied to
- * the first call in "total"; if no allocation group has that many free
- * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
- */
-int /* error */
-xfs_bmapi(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- xfs_fileoff_t bno, /* starting file offs. mapped */
- xfs_filblks_t len, /* length to map in file */
- int flags, /* XFS_BMAPI_... */
- xfs_fsblock_t *firstblock, /* first allocated block
- controls a.g. for allocs */
- xfs_extlen_t total, /* total blocks needed */
- xfs_bmbt_irec_t *mval, /* output: map values */
- int *nmap, /* i/o: mval size/count */
- xfs_bmap_free_t *flist) /* i/o: list extents to free */
-{
- xfs_fsblock_t abno; /* allocated block number */
- xfs_extlen_t alen; /* allocated extent length */
- xfs_fileoff_t aoff; /* allocated file offset */
- xfs_bmalloca_t bma; /* args for xfs_bmap_alloc */
- xfs_btree_cur_t *cur; /* bmap btree cursor */
- xfs_fileoff_t end; /* end of mapped file region */
- int eof; /* we've hit the end of extent list */
- char contig; /* allocation must be one extent */
- char delay; /* this request is for delayed alloc */
- char exact; /* don't do all of wasdelayed extent */
- char convert; /* unwritten extent I/O completion */
- xfs_bmbt_rec_t *ep; /* extent list entry pointer */
- int error; /* error return */
- xfs_bmbt_irec_t got; /* current extent list record */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extlen_t indlen; /* indirect blocks length */
- xfs_extnum_t lastx; /* last useful extent number */
- int logflags; /* flags for transaction logging */
- xfs_extlen_t minleft; /* min blocks left after allocation */
- xfs_extlen_t minlen; /* min allocation size */
- xfs_mount_t *mp; /* xfs mount structure */
- int n; /* current extent index */
- int nallocs; /* number of extents alloc\'d */
- xfs_extnum_t nextents; /* number of extents in file */
- xfs_fileoff_t obno; /* old block number (offset) */
- xfs_bmbt_irec_t prev; /* previous extent list record */
- int tmp_logflags; /* temp flags holder */
- int whichfork; /* data or attr fork */
- char inhole; /* current location is hole in file */
- char stateless; /* ignore state flag set */
- char trim; /* output trimmed to match range */
- char userdata; /* allocating non-metadata */
- char wasdelay; /* old extent was delayed */
- char wr; /* this is a write request */
- char rt; /* this is a realtime file */
- char rsvd; /* OK to allocate reserved blocks */
-#ifdef DEBUG
- xfs_fileoff_t orig_bno; /* original block number value */
- int orig_flags; /* original flags arg value */
- xfs_filblks_t orig_len; /* original value of len arg */
- xfs_bmbt_irec_t *orig_mval; /* original value of mval */
- int orig_nmap; /* original value of *nmap */
-
- orig_bno = bno;
- orig_len = len;
- orig_flags = flags;
- orig_mval = mval;
- orig_nmap = *nmap;
-#endif
- ASSERT(*nmap >= 1);
- ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE));
- whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
- mp = ip->i_mount;
- if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL),
- mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
- XFS_ERROR_REPORT("xfs_bmapi", XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
- }
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
- rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
- if ((wr = (flags & XFS_BMAPI_WRITE)) != 0)
- XFS_STATS_INC(xs_blk_mapw);
- else
- XFS_STATS_INC(xs_blk_mapr);
- delay = (flags & XFS_BMAPI_DELAY) != 0;
- trim = (flags & XFS_BMAPI_ENTIRE) == 0;
- userdata = (flags & XFS_BMAPI_METADATA) == 0;
- convert = (flags & XFS_BMAPI_CONVERT) != 0;
- exact = (flags & XFS_BMAPI_EXACT) != 0;
- rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0;
- contig = (flags & XFS_BMAPI_CONTIG) != 0;
- /*
- * stateless is used to combine extents which
- * differ only due to the state of the extents.
- * This technique is used from xfs_getbmap()
- * when the caller does not wish to see the
- * separation (which is the default).
- *
- * This technique is also used when writing a
- * buffer which has been partially written,
- * (usually by being flushed during a chunkread),
- * to ensure one write takes place. This also
- * prevents a change in the xfs inode extents at
- * this time, intentionally. This change occurs
- * on completion of the write operation, in
- * xfs_strat_comp(), where the xfs_bmapi() call
- * is transactioned, and the extents combined.
- */
- stateless = (flags & XFS_BMAPI_IGSTATE) != 0;
- if (stateless && wr) /* if writing unwritten space, no */
- wr = 0; /* allocations are allowed */
- ASSERT(wr || !delay);
- logflags = 0;
- nallocs = 0;
- cur = NULL;
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- ASSERT(wr && tp);
- if ((error = xfs_bmap_local_to_extents(tp, ip,
- firstblock, total, &logflags, whichfork)))
- goto error0;
- }
- if (wr && *firstblock == NULLFSBLOCK) {
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
- minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
- else
- minleft = 1;
- } else
- minleft = 0;
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(tp, ip, whichfork)))
- goto error0;
- ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
- &prev);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- n = 0;
- end = bno + len;
- obno = bno;
- bma.ip = NULL;
- while (bno < end && n < *nmap) {
- /*
- * Reading past eof, act as though there's a hole
- * up to end.
- */
- if (eof && !wr)
- got.br_startoff = end;
- inhole = eof || got.br_startoff > bno;
- wasdelay = wr && !inhole && !delay &&
- ISNULLSTARTBLOCK(got.br_startblock);
- /*
- * First, deal with the hole before the allocated space
- * that we found, if any.
- */
- if (wr && (inhole || wasdelay)) {
- /*
- * For the wasdelay case, we could also just
- * allocate the stuff asked for in this bmap call
- * but that wouldn't be as good.
- */
- if (wasdelay && !exact) {
- alen = (xfs_extlen_t)got.br_blockcount;
- aoff = got.br_startoff;
- if (lastx != NULLEXTNUM && lastx) {
- ep = &ifp->if_u1.if_extents[lastx - 1];
- xfs_bmbt_get_all(ep, &prev);
- }
- } else if (wasdelay) {
- alen = (xfs_extlen_t)
- XFS_FILBLKS_MIN(len,
- (got.br_startoff +
- got.br_blockcount) - bno);
- aoff = bno;
- } else {
- alen = (xfs_extlen_t)
- XFS_FILBLKS_MIN(len, MAXEXTLEN);
- if (!eof)
- alen = (xfs_extlen_t)
- XFS_FILBLKS_MIN(alen,
- got.br_startoff - bno);
- aoff = bno;
- }
- minlen = contig ? alen : 1;
- if (delay) {
- xfs_extlen_t extsz;
-
- /* Figure out the extent size, adjust alen */
- if (rt) {
- if (!(extsz = ip->i_d.di_extsize))
- extsz = mp->m_sb.sb_rextsize;
- } else {
- extsz = ip->i_d.di_extsize;
- }
- if (extsz) {
- error = xfs_bmap_extsize_align(mp,
- &got, &prev, extsz,
- rt, eof, delay, convert,
- &aoff, &alen);
- ASSERT(!error);
- }
-
- if (rt)
- extsz = alen / mp->m_sb.sb_rextsize;
-
- /*
- * Make a transaction-less quota reservation for
- * delayed allocation blocks. This number gets
- * adjusted later.
- * We return EDQUOT if we haven't allocated
- * blks already inside this loop;
- */
- if (XFS_TRANS_RESERVE_QUOTA_NBLKS(
- mp, NULL, ip, (long)alen, 0,
- rt ? XFS_QMOPT_RES_RTBLKS :
- XFS_QMOPT_RES_REGBLKS)) {
- if (n == 0) {
- *nmap = 0;
- ASSERT(cur == NULL);
- return XFS_ERROR(EDQUOT);
- }
- break;
- }
-
- /*
- * Split changing sb for alen and indlen since
- * they could be coming from different places.
- */
- indlen = (xfs_extlen_t)
- xfs_bmap_worst_indlen(ip, alen);
- ASSERT(indlen > 0);
-
- if (rt) {
- error = xfs_mod_incore_sb(mp,
- XFS_SBS_FREXTENTS,
- -(extsz), rsvd);
- } else {
- error = xfs_mod_incore_sb(mp,
- XFS_SBS_FDBLOCKS,
- -(alen), rsvd);
- }
- if (!error) {
- error = xfs_mod_incore_sb(mp,
- XFS_SBS_FDBLOCKS,
- -(indlen), rsvd);
- if (error && rt)
- xfs_mod_incore_sb(mp,
- XFS_SBS_FREXTENTS,
- extsz, rsvd);
- else if (error)
- xfs_mod_incore_sb(mp,
- XFS_SBS_FDBLOCKS,
- alen, rsvd);
- }
-
- if (error) {
- if (XFS_IS_QUOTA_ON(mp))
- /* unreserve the blocks now */
- (void)
- XFS_TRANS_UNRESERVE_QUOTA_NBLKS(
- mp, NULL, ip,
- (long)alen, 0, rt ?
- XFS_QMOPT_RES_RTBLKS :
- XFS_QMOPT_RES_REGBLKS);
- break;
- }
-
- ip->i_delayed_blks += alen;
- abno = NULLSTARTBLOCK(indlen);
- } else {
- /*
- * If first time, allocate and fill in
- * once-only bma fields.
- */
- if (bma.ip == NULL) {
- bma.tp = tp;
- bma.ip = ip;
- bma.prevp = &prev;
- bma.gotp = &got;
- bma.total = total;
- bma.userdata = 0;
- }
- /* Indicate if this is the first user data
- * in the file, or just any user data.
- */
- if (userdata) {
- bma.userdata = (aoff == 0) ?
- XFS_ALLOC_INITIAL_USER_DATA :
- XFS_ALLOC_USERDATA;
- }
- /*
- * Fill in changeable bma fields.
- */
- bma.eof = eof;
- bma.firstblock = *firstblock;
- bma.alen = alen;
- bma.off = aoff;
- bma.conv = convert;
- bma.wasdel = wasdelay;
- bma.minlen = minlen;
- bma.low = flist->xbf_low;
- bma.minleft = minleft;
- /*
- * Only want to do the alignment at the
- * eof if it is userdata and allocation length
- * is larger than a stripe unit.
- */
- if (mp->m_dalign && alen >= mp->m_dalign &&
- userdata && whichfork == XFS_DATA_FORK) {
- if ((error = xfs_bmap_isaeof(ip, aoff,
- whichfork, &bma.aeof)))
- goto error0;
- } else
- bma.aeof = 0;
- /*
- * Call allocator.
- */
- if ((error = xfs_bmap_alloc(&bma)))
- goto error0;
- /*
- * Copy out result fields.
- */
- abno = bma.rval;
- if ((flist->xbf_low = bma.low))
- minleft = 0;
- alen = bma.alen;
- aoff = bma.off;
- ASSERT(*firstblock == NULLFSBLOCK ||
- XFS_FSB_TO_AGNO(mp, *firstblock) ==
- XFS_FSB_TO_AGNO(mp, bma.firstblock) ||
- (flist->xbf_low &&
- XFS_FSB_TO_AGNO(mp, *firstblock) <
- XFS_FSB_TO_AGNO(mp, bma.firstblock)));
- *firstblock = bma.firstblock;
- if (cur)
- cur->bc_private.b.firstblock =
- *firstblock;
- if (abno == NULLFSBLOCK)
- break;
- if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
- cur = xfs_btree_init_cursor(mp,
- tp, NULL, 0, XFS_BTNUM_BMAP,
- ip, whichfork);
- cur->bc_private.b.firstblock =
- *firstblock;
- cur->bc_private.b.flist = flist;
- }
- /*
- * Bump the number of extents we've allocated
- * in this call.
- */
- nallocs++;
- }
- if (cur)
- cur->bc_private.b.flags =
- wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0;
- got.br_startoff = aoff;
- got.br_startblock = abno;
- got.br_blockcount = alen;
- got.br_state = XFS_EXT_NORM; /* assume normal */
- /*
- * Determine state of extent, and the filesystem.
- * A wasdelay extent has been initialized, so
- * shouldn't be flagged as unwritten.
- */
- if (wr && XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
- if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
- got.br_state = XFS_EXT_UNWRITTEN;
- }
- error = xfs_bmap_add_extent(ip, lastx, &cur, &got,
- firstblock, flist, &tmp_logflags, whichfork,
- rsvd);
- logflags |= tmp_logflags;
- if (error)
- goto error0;
- lastx = ifp->if_lastex;
- ep = &ifp->if_u1.if_extents[lastx];
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- xfs_bmbt_get_all(ep, &got);
- ASSERT(got.br_startoff <= aoff);
- ASSERT(got.br_startoff + got.br_blockcount >=
- aoff + alen);
-#ifdef DEBUG
- if (delay) {
- ASSERT(ISNULLSTARTBLOCK(got.br_startblock));
- ASSERT(STARTBLOCKVAL(got.br_startblock) > 0);
- }
- ASSERT(got.br_state == XFS_EXT_NORM ||
- got.br_state == XFS_EXT_UNWRITTEN);
-#endif
- /*
- * Fall down into the found allocated space case.
- */
- } else if (inhole) {
- /*
- * Reading in a hole.
- */
- mval->br_startoff = bno;
- mval->br_startblock = HOLESTARTBLOCK;
- mval->br_blockcount =
- XFS_FILBLKS_MIN(len, got.br_startoff - bno);
- mval->br_state = XFS_EXT_NORM;
- bno += mval->br_blockcount;
- len -= mval->br_blockcount;
- mval++;
- n++;
- continue;
- }
- /*
- * Then deal with the allocated space we found.
- */
- ASSERT(ep != NULL);
- if (trim && (got.br_startoff + got.br_blockcount > obno)) {
- if (obno > bno)
- bno = obno;
- ASSERT((bno >= obno) || (n == 0));
- ASSERT(bno < end);
- mval->br_startoff = bno;
- if (ISNULLSTARTBLOCK(got.br_startblock)) {
- ASSERT(!wr || delay);
- mval->br_startblock = DELAYSTARTBLOCK;
- } else
- mval->br_startblock =
- got.br_startblock +
- (bno - got.br_startoff);
- /*
- * Return the minimum of what we got and what we
- * asked for for the length. We can use the len
- * variable here because it is modified below
- * and we could have been there before coming
- * here if the first part of the allocation
- * didn't overlap what was asked for.
- */
- mval->br_blockcount =
- XFS_FILBLKS_MIN(end - bno, got.br_blockcount -
- (bno - got.br_startoff));
- mval->br_state = got.br_state;
- ASSERT(mval->br_blockcount <= len);
- } else {
- *mval = got;
- if (ISNULLSTARTBLOCK(mval->br_startblock)) {
- ASSERT(!wr || delay);
- mval->br_startblock = DELAYSTARTBLOCK;
- }
- }
-
- /*
- * Check if writing previously allocated but
- * unwritten extents.
- */
- if (wr && mval->br_state == XFS_EXT_UNWRITTEN &&
- ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) {
- /*
- * Modify (by adding) the state flag, if writing.
- */
- ASSERT(mval->br_blockcount <= len);
- if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
- cur = xfs_btree_init_cursor(mp,
- tp, NULL, 0, XFS_BTNUM_BMAP,
- ip, whichfork);
- cur->bc_private.b.firstblock =
- *firstblock;
- cur->bc_private.b.flist = flist;
- }
- mval->br_state = XFS_EXT_NORM;
- error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
- firstblock, flist, &tmp_logflags, whichfork,
- rsvd);
- logflags |= tmp_logflags;
- if (error)
- goto error0;
- lastx = ifp->if_lastex;
- ep = &ifp->if_u1.if_extents[lastx];
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- xfs_bmbt_get_all(ep, &got);
- /*
- * We may have combined previously unwritten
- * space with written space, so generate
- * another request.
- */
- if (mval->br_blockcount < len)
- continue;
- }
-
- ASSERT(!trim ||
- ((mval->br_startoff + mval->br_blockcount) <= end));
- ASSERT(!trim || (mval->br_blockcount <= len) ||
- (mval->br_startoff < obno));
- bno = mval->br_startoff + mval->br_blockcount;
- len = end - bno;
- if (n > 0 && mval->br_startoff == mval[-1].br_startoff) {
- ASSERT(mval->br_startblock == mval[-1].br_startblock);
- ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
- ASSERT(mval->br_state == mval[-1].br_state);
- mval[-1].br_blockcount = mval->br_blockcount;
- mval[-1].br_state = mval->br_state;
- } else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
- mval[-1].br_startblock != DELAYSTARTBLOCK &&
- mval[-1].br_startblock != HOLESTARTBLOCK &&
- mval->br_startblock ==
- mval[-1].br_startblock + mval[-1].br_blockcount &&
- (stateless || mval[-1].br_state == mval->br_state)) {
- ASSERT(mval->br_startoff ==
- mval[-1].br_startoff + mval[-1].br_blockcount);
- mval[-1].br_blockcount += mval->br_blockcount;
- } else if (n > 0 &&
- mval->br_startblock == DELAYSTARTBLOCK &&
- mval[-1].br_startblock == DELAYSTARTBLOCK &&
- mval->br_startoff ==
- mval[-1].br_startoff + mval[-1].br_blockcount) {
- mval[-1].br_blockcount += mval->br_blockcount;
- mval[-1].br_state = mval->br_state;
- } else if (!((n == 0) &&
- ((mval->br_startoff + mval->br_blockcount) <=
- obno))) {
- mval++;
- n++;
- }
- /*
- * If we're done, stop now. Stop when we've allocated
- * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise
- * the transaction may get too big.
- */
- if (bno >= end || n >= *nmap || nallocs >= *nmap)
- break;
- /*
- * Else go on to the next record.
- */
- ep++;
- lastx++;
- if (lastx >= nextents) {
- eof = 1;
- prev = got;
- } else
- xfs_bmbt_get_all(ep, &got);
- }
- ifp->if_lastex = lastx;
- *nmap = n;
- /*
- * Transform from btree to extents, give it cur.
- */
- if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
- ASSERT(wr && cur);
- error = xfs_bmap_btree_to_extents(tp, ip, cur,
- &tmp_logflags, whichfork);
- logflags |= tmp_logflags;
- if (error)
- goto error0;
- }
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
- XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
- error = 0;
-
-error0:
- /*
- * Log everything. Do this after conversion, there's no point in
- * logging the extent list if we've converted to btree format.
- */
- if ((logflags & XFS_ILOG_FEXT(whichfork)) &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- logflags &= ~XFS_ILOG_FEXT(whichfork);
- else if ((logflags & XFS_ILOG_FBROOT(whichfork)) &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
- logflags &= ~XFS_ILOG_FBROOT(whichfork);
- /*
- * Log whatever the flags say, even if error. Otherwise we might miss
- * detecting a case where the data is changed, there's an error,
- * and it's not logged so we don't shutdown when we should.
- */
- if (logflags) {
- ASSERT(tp && wr);
- xfs_trans_log_inode(tp, ip, logflags);
- }
- if (cur) {
- if (!error) {
- ASSERT(*firstblock == NULLFSBLOCK ||
- XFS_FSB_TO_AGNO(mp, *firstblock) ==
- XFS_FSB_TO_AGNO(mp,
- cur->bc_private.b.firstblock) ||
- (flist->xbf_low &&
- XFS_FSB_TO_AGNO(mp, *firstblock) <
- XFS_FSB_TO_AGNO(mp,
- cur->bc_private.b.firstblock)));
- *firstblock = cur->bc_private.b.firstblock;
- }
- xfs_btree_del_cursor(cur,
- error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
- }
- if (!error)
- xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
- orig_nmap, *nmap);
- return error;
-}
-
-/*
- * Map file blocks to filesystem blocks, simple version.
- * One block (extent) only, read-only.
- * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
- * For the other flag values, the effect is as if XFS_BMAPI_METADATA
- * was set and all the others were clear.
- */
-int /* error */
-xfs_bmapi_single(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- int whichfork, /* data or attr fork */
- xfs_fsblock_t *fsb, /* output: mapped block */
- xfs_fileoff_t bno) /* starting file offs. mapped */
-{
- int eof; /* we've hit the end of extent list */
- int error; /* error return */
- xfs_bmbt_irec_t got; /* current extent list record */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extnum_t lastx; /* last useful extent number */
- xfs_bmbt_irec_t prev; /* previous extent list record */
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (unlikely(
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)) {
- XFS_ERROR_REPORT("xfs_bmapi_single", XFS_ERRLEVEL_LOW,
- ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
- }
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return XFS_ERROR(EIO);
- XFS_STATS_INC(xs_blk_mapr);
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(tp, ip, whichfork)))
- return error;
- (void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
- &prev);
- /*
- * Reading past eof, act as though there's a hole
- * up to end.
- */
- if (eof || got.br_startoff > bno) {
- *fsb = NULLFSBLOCK;
- return 0;
- }
- ASSERT(!ISNULLSTARTBLOCK(got.br_startblock));
- ASSERT(bno < got.br_startoff + got.br_blockcount);
- *fsb = got.br_startblock + (bno - got.br_startoff);
- ifp->if_lastex = lastx;
- return 0;
-}
-
/*
* Unmap (remove) blocks from a file.
* If nexts is nonzero then the number of extents to remove is limited to
@@ -5312,26 +5048,26 @@ xfs_bunmapi(
xfs_btree_cur_t *cur; /* bmap btree cursor */
xfs_bmbt_irec_t del; /* extent being deleted */
int eof; /* is deleting at eof */
- xfs_bmbt_rec_t *ep; /* extent list entry pointer */
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
int error; /* error return value */
xfs_extnum_t extno; /* extent number in list */
- xfs_bmbt_irec_t got; /* current extent list entry */
+ xfs_bmbt_irec_t got; /* current extent record */
xfs_ifork_t *ifp; /* inode fork pointer */
int isrt; /* freeing in rt area */
xfs_extnum_t lastx; /* last extent index used */
int logflags; /* transaction logging flags */
xfs_extlen_t mod; /* rt extent offset */
xfs_mount_t *mp; /* mount structure */
- xfs_extnum_t nextents; /* size of extent list */
- xfs_bmbt_irec_t prev; /* previous extent list entry */
+ xfs_extnum_t nextents; /* number of file extents */
+ xfs_bmbt_irec_t prev; /* previous extent record */
xfs_fileoff_t start; /* first file offset deleted */
int tmp_logflags; /* partial logging flags */
int wasdel; /* was a delayed alloc extent */
int whichfork; /* data or attribute fork */
- int rsvd; /* OK to allocate reserved blocks */
xfs_fsblock_t sum;
- xfs_bunmap_trace(ip, bno, len, flags, (inst_t *)__return_address);
+ trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
+
whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
XFS_ATTR_FORK : XFS_DATA_FORK;
ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -5345,11 +5081,11 @@ xfs_bunmapi(
mp = ip->i_mount;
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
- rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(len > 0);
ASSERT(nexts >= 0);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
if (!(ifp->if_flags & XFS_IFEXTENTS) &&
(error = xfs_iread_extents(tp, ip, whichfork)))
return error;
@@ -5364,25 +5100,34 @@ xfs_bunmapi(
bno = start + len - 1;
ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
&prev);
+
/*
* Check to see if the given block number is past the end of the
* file, back up to the last block if so...
*/
if (eof) {
- ep = &ifp->if_u1.if_extents[--lastx];
+ ep = xfs_iext_get_ext(ifp, --lastx);
xfs_bmbt_get_all(ep, &got);
bno = got.br_startoff + got.br_blockcount - 1;
}
logflags = 0;
if (ifp->if_flags & XFS_IFBROOT) {
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
- cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
- whichfork);
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
cur->bc_private.b.firstblock = *firstblock;
cur->bc_private.b.flist = flist;
cur->bc_private.b.flags = 0;
} else
cur = NULL;
+
+ if (isrt) {
+ /*
+ * Synchronize by locking the bitmap inode.
+ */
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ }
+
extno = 0;
while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
(nexts == 0 || extno < nexts)) {
@@ -5393,7 +5138,7 @@ xfs_bunmapi(
if (got.br_startoff > bno) {
if (--lastx < 0)
break;
- ep--;
+ ep = xfs_iext_get_ext(ifp, lastx);
xfs_bmbt_get_all(ep, &got);
}
/*
@@ -5410,7 +5155,7 @@ xfs_bunmapi(
*/
ASSERT(ep != NULL);
del = got;
- wasdel = ISNULLSTARTBLOCK(del.br_startblock);
+ wasdel = isnullstartblock(del.br_startblock);
if (got.br_startoff < start) {
del.br_startoff = start;
del.br_blockcount -= start - got.br_startoff;
@@ -5430,7 +5175,7 @@ xfs_bunmapi(
* get rid of part of a realtime extent.
*/
if (del.br_state == XFS_EXT_UNWRITTEN ||
- !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
+ !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
/*
* This piece is unwritten, or we're not
* using unwritten extents. Skip over it.
@@ -5440,7 +5185,8 @@ xfs_bunmapi(
del.br_blockcount : mod;
if (bno < got.br_startoff) {
if (--lastx >= 0)
- xfs_bmbt_get_all(--ep, &got);
+ xfs_bmbt_get_all(xfs_iext_get_ext(
+ ifp, lastx), &got);
}
continue;
}
@@ -5460,8 +5206,9 @@ xfs_bunmapi(
del.br_blockcount = mod;
}
del.br_state = XFS_EXT_UNWRITTEN;
- error = xfs_bmap_add_extent(ip, lastx, &cur, &del,
- firstblock, flist, &logflags, XFS_DATA_FORK, 0);
+ error = xfs_bmap_add_extent_unwritten_real(tp, ip,
+ &lastx, &cur, &del, firstblock, flist,
+ &logflags);
if (error)
goto error0;
goto nodelete;
@@ -5480,16 +5227,19 @@ xfs_bunmapi(
} else if ((del.br_startoff == start &&
(del.br_state == XFS_EXT_UNWRITTEN ||
xfs_trans_get_block_res(tp) == 0)) ||
- !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
+ !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
/*
* Can't make it unwritten. There isn't
* a full extent here so just skip it.
*/
ASSERT(bno >= del.br_blockcount);
bno -= del.br_blockcount;
- if (bno < got.br_startoff) {
- if (--lastx >= 0)
- xfs_bmbt_get_all(--ep, &got);
+ if (got.br_startoff > bno) {
+ if (--lastx >= 0) {
+ ep = xfs_iext_get_ext(ifp,
+ lastx);
+ xfs_bmbt_get_all(ep, &got);
+ }
}
continue;
} else if (del.br_state == XFS_EXT_UNWRITTEN) {
@@ -5500,9 +5250,10 @@ xfs_bunmapi(
* try again.
*/
ASSERT(lastx > 0);
- xfs_bmbt_get_all(ep - 1, &prev);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
+ lastx - 1), &prev);
ASSERT(prev.br_state == XFS_EXT_NORM);
- ASSERT(!ISNULLSTARTBLOCK(prev.br_startblock));
+ ASSERT(!isnullstartblock(prev.br_startblock));
ASSERT(del.br_startblock ==
prev.br_startblock + prev.br_blockcount);
if (prev.br_startoff < start) {
@@ -5512,25 +5263,26 @@ xfs_bunmapi(
prev.br_startoff = start;
}
prev.br_state = XFS_EXT_UNWRITTEN;
- error = xfs_bmap_add_extent(ip, lastx - 1, &cur,
- &prev, firstblock, flist, &logflags,
- XFS_DATA_FORK, 0);
+ lastx--;
+ error = xfs_bmap_add_extent_unwritten_real(tp,
+ ip, &lastx, &cur, &prev,
+ firstblock, flist, &logflags);
if (error)
goto error0;
goto nodelete;
} else {
ASSERT(del.br_state == XFS_EXT_NORM);
del.br_state = XFS_EXT_UNWRITTEN;
- error = xfs_bmap_add_extent(ip, lastx, &cur,
- &del, firstblock, flist, &logflags,
- XFS_DATA_FORK, 0);
+ error = xfs_bmap_add_extent_unwritten_real(tp,
+ ip, &lastx, &cur, &del,
+ firstblock, flist, &logflags);
if (error)
goto error0;
goto nodelete;
}
}
if (wasdel) {
- ASSERT(STARTBLOCKVAL(del.br_startblock) > 0);
+ ASSERT(startblockval(del.br_startblock) > 0);
/* Update realtime/data freespace, unreserve quota */
if (isrt) {
xfs_filblks_t rtexts;
@@ -5538,15 +5290,15 @@ xfs_bunmapi(
rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
do_div(rtexts, mp->m_sb.sb_rextsize);
xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
- (int)rtexts, rsvd);
- (void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp,
- NULL, ip, -((long)del.br_blockcount), 0,
+ (int64_t)rtexts, 0);
+ (void)xfs_trans_reserve_quota_nblks(NULL,
+ ip, -((long)del.br_blockcount), 0,
XFS_QMOPT_RES_RTBLKS);
} else {
- xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS,
- (int)del.br_blockcount, rsvd);
- (void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp,
- NULL, ip, -((long)del.br_blockcount), 0,
+ xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+ (int64_t)del.br_blockcount, 0);
+ (void)xfs_trans_reserve_quota_nblks(NULL,
+ ip, -((long)del.br_blockcount), 0,
XFS_QMOPT_RES_REGBLKS);
}
ip->i_delayed_blks -= del.br_blockcount;
@@ -5568,46 +5320,43 @@ xfs_bunmapi(
*/
if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
+ XFS_IFORK_MAXEXT(ip, whichfork) &&
del.br_startoff > got.br_startoff &&
del.br_startoff + del.br_blockcount <
got.br_startoff + got.br_blockcount) {
error = XFS_ERROR(ENOSPC);
goto error0;
}
- error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del,
- &tmp_logflags, whichfork, rsvd);
+ error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
+ &tmp_logflags, whichfork);
logflags |= tmp_logflags;
if (error)
goto error0;
bno = del.br_startoff - 1;
nodelete:
- lastx = ifp->if_lastex;
/*
* If not done go on to the next (previous) record.
- * Reset ep in case the extents array was re-alloced.
*/
- ep = &ifp->if_u1.if_extents[lastx];
if (bno != (xfs_fileoff_t)-1 && bno >= start) {
- if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) ||
- xfs_bmbt_get_startoff(ep) > bno) {
- lastx--;
- ep--;
- }
- if (lastx >= 0)
+ if (lastx >= 0) {
+ ep = xfs_iext_get_ext(ifp, lastx);
+ if (xfs_bmbt_get_startoff(ep) > bno) {
+ if (--lastx >= 0)
+ ep = xfs_iext_get_ext(ifp,
+ lastx);
+ }
xfs_bmbt_get_all(ep, &got);
+ }
extno++;
}
}
- ifp->if_lastex = lastx;
*done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
/*
* Convert to a btree if necessary.
*/
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
+ if (xfs_bmap_needs_btree(ip, whichfork)) {
ASSERT(cur == NULL);
error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
&cur, 0, &tmp_logflags, whichfork);
@@ -5618,8 +5367,7 @@ nodelete:
/*
* transform from btree to extents, give it cur
*/
- else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+ else if (xfs_bmap_wants_extents(ip, whichfork)) {
ASSERT(cur != NULL);
error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
whichfork);
@@ -5630,20 +5378,18 @@ nodelete:
/*
* transform from extents to local?
*/
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
error = 0;
error0:
/*
* Log everything. Do this after conversion, there's no point in
- * logging the extent list if we've converted to btree format.
+ * logging the extent records if we've converted to btree format.
*/
- if ((logflags & XFS_ILOG_FEXT(whichfork)) &&
+ if ((logflags & xfs_ilog_fext(whichfork)) &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- logflags &= ~XFS_ILOG_FEXT(whichfork);
- else if ((logflags & XFS_ILOG_FBROOT(whichfork)) &&
+ logflags &= ~xfs_ilog_fext(whichfork);
+ else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
- logflags &= ~XFS_ILOG_FBROOT(whichfork);
+ logflags &= ~xfs_ilog_fbroot(whichfork);
/*
* Log inode even in the error case, if the transaction
* is dirty we'll need to shut down the filesystem.
@@ -5662,770 +5408,199 @@ error0:
}
/*
- * Fcntl interface to xfs_bmapi.
+ * Shift extent records to the left to cover a hole.
+ *
+ * The maximum number of extents to be shifted in a single operation
+ * is @num_exts, and @current_ext keeps track of the current extent
+ * index we have shifted. @offset_shift_fsb is the length by which each
+ * extent is shifted. If there is no hole to shift the extents
+ * into, this will be considered invalid operation and we abort immediately.
*/
-int /* error code */
-xfs_getbmap(
- bhv_desc_t *bdp, /* XFS behavior descriptor*/
- struct getbmap *bmv, /* user bmap structure */
- void __user *ap, /* pointer to user's array */
- int interface) /* interface flags */
+int
+xfs_bmap_shift_extents(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int *done,
+ xfs_fileoff_t start_fsb,
+ xfs_fileoff_t offset_shift_fsb,
+ xfs_extnum_t *current_ext,
+ xfs_fsblock_t *firstblock,
+ struct xfs_bmap_free *flist,
+ int num_exts)
{
- __int64_t bmvend; /* last block requested */
- int error; /* return value */
- __int64_t fixlen; /* length for -1 case */
- int i; /* extent number */
- xfs_inode_t *ip; /* xfs incore inode pointer */
- vnode_t *vp; /* corresponding vnode */
- int lock; /* lock state */
- xfs_bmbt_irec_t *map; /* buffer for user's data */
- xfs_mount_t *mp; /* file system mount point */
- int nex; /* # of user extents can do */
- int nexleft; /* # of user extents left */
- int subnex; /* # of bmapi's can do */
- int nmap; /* number of map entries */
- struct getbmap out; /* output structure */
- int whichfork; /* data or attr fork */
- int prealloced; /* this is a file with
- * preallocated data space */
- int sh_unwritten; /* true, if unwritten */
- /* extents listed separately */
- int bmapi_flags; /* flags for xfs_bmapi */
- __int32_t oflags; /* getbmapx bmv_oflags field */
-
- vp = BHV_TO_VNODE(bdp);
- ip = XFS_BHVTOI(bdp);
- mp = ip->i_mount;
+ struct xfs_btree_cur *cur;
+ struct xfs_bmbt_rec_host *gotp;
+ struct xfs_bmbt_irec got;
+ struct xfs_bmbt_irec left;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp;
+ xfs_extnum_t nexts = 0;
+ xfs_fileoff_t startoff;
+ int error = 0;
+ int i;
+ int whichfork = XFS_DATA_FORK;
+ int logflags;
+ xfs_filblks_t blockcount = 0;
+ int total_extents;
- whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
- sh_unwritten = (interface & BMV_IF_PREALLOC) != 0;
-
- /* If the BMV_IF_NO_DMAPI_READ interface bit specified, do not
- * generate a DMAPI read event. Otherwise, if the DM_EVENT_READ
- * bit is set for the file, generate a read event in order
- * that the DMAPI application may do its thing before we return
- * the extents. Usually this means restoring user file data to
- * regions of the file that look like holes.
- *
- * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify
- * BMV_IF_NO_DMAPI_READ so that read events are generated.
- * If this were not true, callers of ioctl( XFS_IOC_GETBMAP )
- * could misinterpret holes in a DMAPI file as true holes,
- * when in fact they may represent offline user data.
- */
- if ( (interface & BMV_IF_NO_DMAPI_READ) == 0
- && DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)
- && whichfork == XFS_DATA_FORK) {
-
- error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, 0, 0, 0, NULL);
- if (error)
- return XFS_ERROR(error);
- }
-
- if (whichfork == XFS_ATTR_FORK) {
- if (XFS_IFORK_Q(ip)) {
- if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
- ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
- return XFS_ERROR(EINVAL);
- } else if (unlikely(
- ip->i_d.di_aformat != 0 &&
- ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
- XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
- ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
- }
- } else if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
- ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
- return XFS_ERROR(EINVAL);
- if (whichfork == XFS_DATA_FORK) {
- if ((ip->i_d.di_extsize && (ip->i_d.di_flags &
- (XFS_DIFLAG_REALTIME|XFS_DIFLAG_EXTSIZE))) ||
- ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
- prealloced = 1;
- fixlen = XFS_MAXIOFFSET(mp);
- } else {
- prealloced = 0;
- fixlen = ip->i_d.di_size;
- }
- } else {
- prealloced = 0;
- fixlen = 1LL << 32;
- }
-
- if (bmv->bmv_length == -1) {
- fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
- bmv->bmv_length = MAX( (__int64_t)(fixlen - bmv->bmv_offset),
- (__int64_t)0);
- } else if (bmv->bmv_length < 0)
- return XFS_ERROR(EINVAL);
- if (bmv->bmv_length == 0) {
- bmv->bmv_entries = 0;
- return 0;
- }
- nex = bmv->bmv_count - 1;
- if (nex <= 0)
- return XFS_ERROR(EINVAL);
- bmvend = bmv->bmv_offset + bmv->bmv_length;
-
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
-
- if (whichfork == XFS_DATA_FORK && ip->i_delayed_blks) {
- /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
- VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
- }
-
- ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0);
-
- lock = xfs_ilock_map_shared(ip);
-
- /*
- * Don't let nex be bigger than the number of extents
- * we can have assuming alternating holes and real extents.
- */
- if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
- nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
-
- bmapi_flags = XFS_BMAPI_AFLAG(whichfork) |
- ((sh_unwritten) ? 0 : XFS_BMAPI_IGSTATE);
-
- /*
- * Allocate enough space to handle "subnex" maps at a time.
- */
- subnex = 16;
- map = kmem_alloc(subnex * sizeof(*map), KM_SLEEP);
-
- bmv->bmv_entries = 0;
-
- if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0) {
- error = 0;
- goto unlock_and_return;
+ if (unlikely(XFS_TEST_ERROR(
+ (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+ mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ XFS_ERROR_REPORT("xfs_bmap_shift_extents",
+ XFS_ERRLEVEL_LOW, mp);
+ return XFS_ERROR(EFSCORRUPTED);
}
- nexleft = nex;
-
- do {
- nmap = (nexleft > subnex) ? subnex : nexleft;
- error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
- XFS_BB_TO_FSB(mp, bmv->bmv_length),
- bmapi_flags, NULL, 0, map, &nmap, NULL);
- if (error)
- goto unlock_and_return;
- ASSERT(nmap <= subnex);
-
- for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
- nexleft--;
- oflags = (map[i].br_state == XFS_EXT_UNWRITTEN) ?
- BMV_OF_PREALLOC : 0;
- out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff);
- out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
- ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
- if (map[i].br_startblock == HOLESTARTBLOCK &&
- ((prealloced && out.bmv_offset + out.bmv_length == bmvend) ||
- whichfork == XFS_ATTR_FORK )) {
- /*
- * came to hole at end of file or the end of
- attribute fork
- */
- goto unlock_and_return;
- } else {
- out.bmv_block =
- (map[i].br_startblock == HOLESTARTBLOCK) ?
- -1 :
- XFS_FSB_TO_DB(ip, map[i].br_startblock);
-
- /* return either getbmap/getbmapx structure. */
- if (interface & BMV_IF_EXTENDED) {
- struct getbmapx outx;
-
- GETBMAP_CONVERT(out,outx);
- outx.bmv_oflags = oflags;
- outx.bmv_unused1 = outx.bmv_unused2 = 0;
- if (copy_to_user(ap, &outx,
- sizeof(outx))) {
- error = XFS_ERROR(EFAULT);
- goto unlock_and_return;
- }
- } else {
- if (copy_to_user(ap, &out,
- sizeof(out))) {
- error = XFS_ERROR(EFAULT);
- goto unlock_and_return;
- }
- }
- bmv->bmv_offset =
- out.bmv_offset + out.bmv_length;
- bmv->bmv_length = MAX((__int64_t)0,
- (__int64_t)(bmvend - bmv->bmv_offset));
- bmv->bmv_entries++;
- ap = (interface & BMV_IF_EXTENDED) ?
- (void __user *)
- ((struct getbmapx __user *)ap + 1) :
- (void __user *)
- ((struct getbmap __user *)ap + 1);
- }
- }
- } while (nmap && nexleft && bmv->bmv_length);
-
-unlock_and_return:
- xfs_iunlock_map_shared(ip, lock);
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
- kmem_free(map, subnex * sizeof(*map));
+ ASSERT(current_ext != NULL);
- return error;
-}
-
-/*
- * Check the last inode extent to determine whether this allocation will result
- * in blocks being allocated at the end of the file. When we allocate new data
- * blocks at the end of the file which do not start at the previous data block,
- * we will try to align the new blocks at stripe unit boundaries.
- */
-STATIC int /* error */
-xfs_bmap_isaeof(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fileoff_t off, /* file offset in fsblocks */
- int whichfork, /* data or attribute fork */
- char *aeof) /* return value */
-{
- int error; /* error return value */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_bmbt_rec_t *lastrec; /* extent list entry pointer */
- xfs_extnum_t nextents; /* size of extent list */
- xfs_bmbt_irec_t s; /* expanded extent list entry */
-
- ASSERT(whichfork == XFS_DATA_FORK);
ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(NULL, ip, whichfork)))
- return error;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- if (nextents == 0) {
- *aeof = 1;
- return 0;
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ /* Read in all the extents */
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
}
- /*
- * Go to the last extent
- */
- lastrec = &ifp->if_u1.if_extents[nextents - 1];
- xfs_bmbt_get_all(lastrec, &s);
- /*
- * Check we are allocating in the last extent (for delayed allocations)
- * or past the last extent for non-delayed allocations.
- */
- *aeof = (off >= s.br_startoff &&
- off < s.br_startoff + s.br_blockcount &&
- ISNULLSTARTBLOCK(s.br_startblock)) ||
- off >= s.br_startoff + s.br_blockcount;
- return 0;
-}
-
-/*
- * Check if the endoff is outside the last extent. If so the caller will grow
- * the allocation to a stripe unit boundary.
- */
-int /* error */
-xfs_bmap_eof(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fileoff_t endoff, /* file offset in fsblocks */
- int whichfork, /* data or attribute fork */
- int *eof) /* result value */
-{
- xfs_fsblock_t blockcount; /* extent block count */
- int error; /* error return value */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_bmbt_rec_t *lastrec; /* extent list entry pointer */
- xfs_extnum_t nextents; /* size of extent list */
- xfs_fileoff_t startoff; /* extent starting file offset */
- ASSERT(whichfork == XFS_DATA_FORK);
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(NULL, ip, whichfork)))
- return error;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- if (nextents == 0) {
- *eof = 1;
- return 0;
- }
/*
- * Go to the last extent
+ * If *current_ext is 0, we would need to lookup the extent
+ * from where we would start shifting and store it in gotp.
*/
- lastrec = &ifp->if_u1.if_extents[nextents - 1];
- startoff = xfs_bmbt_get_startoff(lastrec);
- blockcount = xfs_bmbt_get_blockcount(lastrec);
- *eof = endoff >= startoff + blockcount;
- return 0;
-}
-
-#ifdef DEBUG
-/*
- * Check that the extents list for the inode ip is in the right order.
- */
-STATIC void
-xfs_bmap_check_extents(
- xfs_inode_t *ip, /* incore inode pointer */
- int whichfork) /* data or attr fork */
-{
- xfs_bmbt_rec_t *base; /* base of extents list */
- xfs_bmbt_rec_t *ep; /* current extent entry */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extnum_t nextents; /* number of extents in list */
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- base = ifp->if_u1.if_extents;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- for (ep = base; ep < &base[nextents - 1]; ep++) {
- xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep,
- (void *)(ep + 1));
- }
-}
-
-STATIC
-xfs_buf_t *
-xfs_bmap_get_bp(
- xfs_btree_cur_t *cur,
- xfs_fsblock_t bno)
-{
- int i;
- xfs_buf_t *bp;
-
- if (!cur)
- return(NULL);
-
- bp = NULL;
- for(i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
- bp = cur->bc_bufs[i];
- if (!bp) break;
- if (XFS_BUF_ADDR(bp) == bno)
- break; /* Found it */
- }
- if (i == XFS_BTREE_MAXLEVELS)
- bp = NULL;
-
- if (!bp) { /* Chase down all the log items to see if the bp is there */
- xfs_log_item_chunk_t *licp;
- xfs_trans_t *tp;
-
- tp = cur->bc_tp;
- licp = &tp->t_items;
- while (!bp && licp != NULL) {
- if (XFS_LIC_ARE_ALL_FREE(licp)) {
- licp = licp->lic_next;
- continue;
- }
- for (i = 0; i < licp->lic_unused; i++) {
- xfs_log_item_desc_t *lidp;
- xfs_log_item_t *lip;
- xfs_buf_log_item_t *bip;
- xfs_buf_t *lbp;
-
- if (XFS_LIC_ISFREE(licp, i)) {
- continue;
- }
-
- lidp = XFS_LIC_SLOT(licp, i);
- lip = lidp->lid_item;
- if (lip->li_type != XFS_LI_BUF)
- continue;
-
- bip = (xfs_buf_log_item_t *)lip;
- lbp = bip->bli_buf;
-
- if (XFS_BUF_ADDR(lbp) == bno) {
- bp = lbp;
- break; /* Found it */
- }
- }
- licp = licp->lic_next;
- }
- }
- return(bp);
-}
-
-void
-xfs_check_block(
- xfs_bmbt_block_t *block,
- xfs_mount_t *mp,
- int root,
- short sz)
-{
- int i, j, dmxr;
- xfs_bmbt_ptr_t *pp, *thispa; /* pointer to block address */
- xfs_bmbt_key_t *prevp, *keyp;
-
- ASSERT(be16_to_cpu(block->bb_level) > 0);
-
- prevp = NULL;
- for( i = 1; i <= be16_to_cpu(block->bb_numrecs); i++) {
- dmxr = mp->m_bmap_dmxr[0];
-
- if (root) {
- keyp = XFS_BMAP_BROOT_KEY_ADDR(block, i, sz);
- } else {
- keyp = XFS_BTREE_KEY_ADDR(mp->m_sb.sb_blocksize,
- xfs_bmbt, block, i, dmxr);
- }
-
- if (prevp) {
- xfs_btree_check_key(XFS_BTNUM_BMAP, prevp, keyp);
- }
- prevp = keyp;
-
+ if (!*current_ext) {
+ gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
/*
- * Compare the block numbers to see if there are dups.
+ * gotp can be null in 2 cases: 1) if there are no extents
+ * or 2) start_fsb lies in a hole beyond which there are
+ * no extents. Either way, we are done.
*/
-
- if (root) {
- pp = XFS_BMAP_BROOT_PTR_ADDR(block, i, sz);
- } else {
- pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
- xfs_bmbt, block, i, dmxr);
- }
- for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
- if (root) {
- thispa = XFS_BMAP_BROOT_PTR_ADDR(block, j, sz);
- } else {
- thispa = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
- xfs_bmbt, block, j, dmxr);
- }
- if (INT_GET(*thispa, ARCH_CONVERT) ==
- INT_GET(*pp, ARCH_CONVERT)) {
- cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld",
- __FUNCTION__, j, i,
- INT_GET(*thispa, ARCH_CONVERT));
- panic("%s: ptrs are equal in node\n",
- __FUNCTION__);
- }
+ if (!gotp) {
+ *done = 1;
+ return 0;
}
}
-}
-
-/*
- * Check that the extents for the inode ip are in the right order in all
- * btree leaves.
- */
-
-STATIC void
-xfs_bmap_check_leaf_extents(
- xfs_btree_cur_t *cur, /* btree cursor or null */
- xfs_inode_t *ip, /* incore inode pointer */
- int whichfork) /* data or attr fork */
-{
- xfs_bmbt_block_t *block; /* current btree block */
- xfs_fsblock_t bno; /* block # of "block" */
- xfs_buf_t *bp; /* buffer for "block" */
- int error; /* error return value */
- xfs_extnum_t i=0; /* index into the extents list */
- xfs_ifork_t *ifp; /* fork structure */
- int level; /* btree level, for checking */
- xfs_mount_t *mp; /* file system mount structure */
- xfs_bmbt_ptr_t *pp; /* pointer to block address */
- xfs_bmbt_rec_t *ep, *lastp; /* extent pointers in block entry */
- int bp_release = 0;
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
- return;
+ /* We are going to change core inode */
+ logflags = XFS_ILOG_CORE;
+ if (ifp->if_flags & XFS_IFBROOT) {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_private.b.firstblock = *firstblock;
+ cur->bc_private.b.flist = flist;
+ cur->bc_private.b.flags = 0;
+ } else {
+ cur = NULL;
+ logflags |= XFS_ILOG_DEXT;
}
- bno = NULLFSBLOCK;
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- block = ifp->if_broot;
- /*
- * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
- */
- level = be16_to_cpu(block->bb_level);
- ASSERT(level > 0);
- xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
- pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
- ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
- ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
- ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
- bno = INT_GET(*pp, ARCH_CONVERT);
/*
- * Go down the tree until leaf level is reached, following the first
- * pointer (leftmost) at each level.
+ * There may be delalloc extents in the data fork before the range we
+ * are collapsing out, so we cannot
+ * use the count of real extents here. Instead we have to calculate it
+ * from the incore fork.
*/
- while (level-- > 0) {
- /* See if buf is in cur first */
- bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
- if (bp) {
- bp_release = 0;
- } else {
- bp_release = 1;
- }
- if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
- XFS_BMAP_BTREE_REF)))
- goto error_norelse;
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- XFS_WANT_CORRUPTED_GOTO(
- XFS_BMAP_SANITY_CHECK(mp, block, level),
- error0);
- if (level == 0)
- break;
-
- /*
- * Check this block for basic sanity (increasing keys and
- * no duplicate blocks).
- */
-
- xfs_check_block(block, mp, 0, 0);
- pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block,
- 1, mp->m_bmap_dmxr[1]);
- XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)), error0);
- bno = INT_GET(*pp, ARCH_CONVERT);
- if (bp_release) {
- bp_release = 0;
- xfs_trans_brelse(NULL, bp);
- }
- }
+ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ while (nexts++ < num_exts && *current_ext < total_extents) {
- /*
- * Here with bp and block set to the leftmost leaf node in the tree.
- */
- i = 0;
-
- /*
- * Loop over all leaf nodes checking that all extents are in the right order.
- */
- lastp = NULL;
- for (;;) {
- xfs_bmbt_rec_t *frp;
- xfs_fsblock_t nextbno;
- xfs_extnum_t num_recs;
-
-
- num_recs = be16_to_cpu(block->bb_numrecs);
+ gotp = xfs_iext_get_ext(ifp, *current_ext);
+ xfs_bmbt_get_all(gotp, &got);
+ startoff = got.br_startoff - offset_shift_fsb;
/*
- * Read-ahead the next leaf block, if any.
+ * Before shifting extent into hole, make sure that the hole
+ * is large enough to accomodate the shift.
*/
+ if (*current_ext) {
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
+ *current_ext - 1), &left);
- nextbno = be64_to_cpu(block->bb_rightsib);
-
- /*
- * Check all the extents to make sure they are OK.
- * If we had a previous block, the last entry should
- * conform with the first entry in this one.
- */
-
- frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt,
- block, 1, mp->m_bmap_dmxr[0]);
-
- for (ep = frp;ep < frp + (num_recs - 1); ep++) {
- if (lastp) {
- xfs_btree_check_rec(XFS_BTNUM_BMAP,
- (void *)lastp, (void *)ep);
- }
- xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep,
- (void *)(ep + 1));
+ if (startoff < left.br_startoff + left.br_blockcount)
+ error = XFS_ERROR(EINVAL);
+ } else if (offset_shift_fsb > got.br_startoff) {
+ /*
+ * When first extent is shifted, offset_shift_fsb
+ * should be less than the stating offset of
+ * the first extent.
+ */
+ error = XFS_ERROR(EINVAL);
}
- lastp = frp + num_recs - 1; /* For the next iteration */
- i += num_recs;
- if (bp_release) {
- bp_release = 0;
- xfs_trans_brelse(NULL, bp);
- }
- bno = nextbno;
- /*
- * If we've reached the end, stop.
- */
- if (bno == NULLFSBLOCK)
- break;
+ if (error)
+ goto del_cursor;
- bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
- if (bp) {
- bp_release = 0;
- } else {
- bp_release = 1;
+ if (cur) {
+ error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+ got.br_startblock,
+ got.br_blockcount,
+ &i);
+ if (error)
+ goto del_cursor;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
}
- if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
- XFS_BMAP_BTREE_REF)))
- goto error_norelse;
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- }
- if (bp_release) {
- bp_release = 0;
- xfs_trans_brelse(NULL, bp);
- }
- return;
-error0:
- cmn_err(CE_WARN, "%s: at error0", __FUNCTION__);
- if (bp_release)
- xfs_trans_brelse(NULL, bp);
-error_norelse:
- cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents",
- __FUNCTION__, i);
- panic("%s: CORRUPTED BTREE OR SOMETHING", __FUNCTION__);
- return;
-}
-#endif
+ /* Check if we can merge 2 adjacent extents */
+ if (*current_ext &&
+ left.br_startoff + left.br_blockcount == startoff &&
+ left.br_startblock + left.br_blockcount ==
+ got.br_startblock &&
+ left.br_state == got.br_state &&
+ left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
+ blockcount = left.br_blockcount +
+ got.br_blockcount;
+ xfs_iext_remove(ip, *current_ext, 1, 0);
+ if (cur) {
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto del_cursor;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+ }
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ gotp = xfs_iext_get_ext(ifp, --*current_ext);
+ xfs_bmbt_get_all(gotp, &got);
-/*
- * Count fsblocks of the given fork.
- */
-int /* error */
-xfs_bmap_count_blocks(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- int whichfork, /* data or attr fork */
- int *count) /* out: count of blocks */
-{
- xfs_bmbt_block_t *block; /* current btree block */
- xfs_fsblock_t bno; /* block # of "block" */
- xfs_ifork_t *ifp; /* fork structure */
- int level; /* btree level, for checking */
- xfs_mount_t *mp; /* file system mount structure */
- xfs_bmbt_ptr_t *pp; /* pointer to block address */
+ /* Make cursor point to the extent we will update */
+ if (cur) {
+ error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+ got.br_startblock,
+ got.br_blockcount,
+ &i);
+ if (error)
+ goto del_cursor;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+ }
- bno = NULLFSBLOCK;
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
- if (unlikely(xfs_bmap_count_leaves(ifp->if_u1.if_extents,
- ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
- count) < 0)) {
- XFS_ERROR_REPORT("xfs_bmap_count_blocks(1)",
- XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
+ xfs_bmbt_set_blockcount(gotp, blockcount);
+ got.br_blockcount = blockcount;
+ } else {
+ /* We have to update the startoff */
+ xfs_bmbt_set_startoff(gotp, startoff);
+ got.br_startoff = startoff;
}
- return 0;
- }
-
- /*
- * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
- */
- block = ifp->if_broot;
- level = be16_to_cpu(block->bb_level);
- ASSERT(level > 0);
- pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
- ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
- ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
- ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
- bno = INT_GET(*pp, ARCH_CONVERT);
-
- if (unlikely(xfs_bmap_count_tree(mp, tp, bno, level, count) < 0)) {
- XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
- mp);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- return 0;
-}
-/*
- * Recursively walks each level of a btree
- * to count total fsblocks is use.
- */
-int /* error */
-xfs_bmap_count_tree(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_fsblock_t blockno, /* file system block number */
- int levelin, /* level in btree */
- int *count) /* Count of blocks */
-{
- int error;
- xfs_buf_t *bp, *nbp;
- int level = levelin;
- xfs_bmbt_ptr_t *pp;
- xfs_fsblock_t bno = blockno;
- xfs_fsblock_t nextbno;
- xfs_bmbt_block_t *block, *nextblock;
- int numrecs;
- xfs_bmbt_rec_t *frp;
-
- if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF)))
- return error;
- *count += 1;
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
-
- if (--level) {
- /* Not at node above leafs, count this level of nodes */
- nextbno = be64_to_cpu(block->bb_rightsib);
- while (nextbno != NULLFSBLOCK) {
- if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
- 0, &nbp, XFS_BMAP_BTREE_REF)))
- return error;
- *count += 1;
- nextblock = XFS_BUF_TO_BMBT_BLOCK(nbp);
- nextbno = be64_to_cpu(nextblock->bb_rightsib);
- xfs_trans_brelse(tp, nbp);
+ if (cur) {
+ error = xfs_bmbt_update(cur, got.br_startoff,
+ got.br_startblock,
+ got.br_blockcount,
+ got.br_state);
+ if (error)
+ goto del_cursor;
}
- /* Dive to the next level */
- pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
- xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
- bno = INT_GET(*pp, ARCH_CONVERT);
- if (unlikely((error =
- xfs_bmap_count_tree(mp, tp, bno, level, count)) < 0)) {
- xfs_trans_brelse(tp, bp);
- XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
- XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
- }
- xfs_trans_brelse(tp, bp);
- } else {
- /* count all level 1 nodes and their leaves */
- for (;;) {
- nextbno = be64_to_cpu(block->bb_rightsib);
- numrecs = be16_to_cpu(block->bb_numrecs);
- frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize,
- xfs_bmbt, block, 1, mp->m_bmap_dmxr[0]);
- if (unlikely(xfs_bmap_disk_count_leaves(frp, numrecs, count) < 0)) {
- xfs_trans_brelse(tp, bp);
- XFS_ERROR_REPORT("xfs_bmap_count_tree(2)",
- XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
- }
- xfs_trans_brelse(tp, bp);
- if (nextbno == NULLFSBLOCK)
- break;
- bno = nextbno;
- if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
- XFS_BMAP_BTREE_REF)))
- return error;
- *count += 1;
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- }
+ (*current_ext)++;
+ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
}
- return 0;
-}
-
-/*
- * Count leaf blocks given a pointer to an extent list.
- */
-int
-xfs_bmap_count_leaves(
- xfs_bmbt_rec_t *frp,
- int numrecs,
- int *count)
-{
- int b;
- for ( b = 1; b <= numrecs; b++, frp++)
- *count += xfs_bmbt_get_blockcount(frp);
- return 0;
-}
+ /* Check if we are done */
+ if (*current_ext == total_extents)
+ *done = 1;
-/*
- * Count leaf blocks given a pointer to an extent list originally in btree format.
- */
-int
-xfs_bmap_disk_count_leaves(
- xfs_bmbt_rec_t *frp,
- int numrecs,
- int *count)
-{
- int b;
+del_cursor:
+ if (cur)
+ xfs_btree_del_cursor(cur,
+ error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
- for ( b = 1; b <= numrecs; b++, frp++)
- *count += xfs_bmbt_disk_get_blockcount(frp);
- return 0;
+ xfs_trans_log_inode(tp, ip, logflags);
+ return error;
}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 12cc63dfc2c..b879ca56a64 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -20,10 +20,13 @@
struct getbmap;
struct xfs_bmbt_irec;
+struct xfs_ifork;
struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
+extern kmem_zone_t *xfs_bmap_free_item_zone;
+
/*
* List of extents to be free "later".
* The list is kept sorted on xbf_startblock.
@@ -37,37 +40,54 @@ typedef struct xfs_bmap_free_item
/*
* Header for free extent list.
+ *
+ * xbf_low is used by the allocator to activate the lowspace algorithm -
+ * when free space is running low the extent allocator may choose to
+ * allocate an extent from an AG without leaving sufficient space for
+ * a btree split when inserting the new extent. In this case the allocator
+ * will enable the lowspace algorithm which is supposed to allow further
+ * allocations (such as btree splits and newroots) to allocate from
+ * sequential AGs. In order to avoid locking AGs out of order the lowspace
+ * algorithm will start searching for free space from AG 0. If the correct
+ * transaction reservations have been made then this algorithm will eventually
+ * find all the space it needs.
*/
typedef struct xfs_bmap_free
{
xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */
int xbf_count; /* count of items on list */
- int xbf_low; /* kludge: alloc in low mode */
+ int xbf_low; /* alloc in low mode */
} xfs_bmap_free_t;
#define XFS_BMAP_MAX_NMAP 4
/*
- * Flags for xfs_bmapi
+ * Flags for xfs_bmapi_*
*/
-#define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */
-#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */
-#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */
-#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */
-#define XFS_BMAPI_EXACT 0x010 /* allocate only to spec'd bounds */
-#define XFS_BMAPI_ATTRFORK 0x020 /* use attribute fork not data */
-#define XFS_BMAPI_ASYNC 0x040 /* bunmapi xactions can be async */
-#define XFS_BMAPI_RSVBLOCKS 0x080 /* OK to alloc. reserved data blocks */
-#define XFS_BMAPI_PREALLOC 0x100 /* preallocation op: unwritten space */
-#define XFS_BMAPI_IGSTATE 0x200 /* Ignore state - */
+#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */
+#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */
+#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */
+#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */
+#define XFS_BMAPI_IGSTATE 0x010 /* Ignore state - */
/* combine contig. space */
-#define XFS_BMAPI_CONTIG 0x400 /* must allocate only one extent */
-/* XFS_BMAPI_DIRECT_IO 0x800 */
-#define XFS_BMAPI_CONVERT 0x1000 /* unwritten extent conversion - */
- /* need write cache flushing and no */
- /* additional allocation alignments */
+#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */
+/*
+ * unwritten extent conversion - this needs write cache flushing and no additional
+ * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
+ * from written to unwritten, otherwise convert from unwritten to written.
+ */
+#define XFS_BMAPI_CONVERT 0x040
+
+#define XFS_BMAPI_FLAGS \
+ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \
+ { XFS_BMAPI_METADATA, "METADATA" }, \
+ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
+ { XFS_BMAPI_PREALLOC, "PREALLOC" }, \
+ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \
+ { XFS_BMAPI_CONTIG, "CONTIG" }, \
+ { XFS_BMAPI_CONVERT, "CONVERT" }
+
-#define XFS_BMAPI_AFLAG(w) xfs_bmapi_aflag(w)
static inline int xfs_bmapi_aflag(int w)
{
return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
@@ -79,7 +99,6 @@ static inline int xfs_bmapi_aflag(int w)
#define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL)
#define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL)
-#define XFS_BMAP_INIT(flp,fbp) xfs_bmap_init(flp,fbp)
static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
{
((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
@@ -87,269 +106,81 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
}
/*
- * Argument structure for xfs_bmap_alloc.
+ * Flags for xfs_bmap_add_extent*.
*/
-typedef struct xfs_bmalloca {
- xfs_fsblock_t firstblock; /* i/o first block allocated */
- xfs_fsblock_t rval; /* starting block of new extent */
- xfs_fileoff_t off; /* offset in file filling in */
- struct xfs_trans *tp; /* transaction pointer */
- struct xfs_inode *ip; /* incore inode pointer */
- struct xfs_bmbt_irec *prevp; /* extent before the new one */
- struct xfs_bmbt_irec *gotp; /* extent after, or delayed */
- xfs_extlen_t alen; /* i/o length asked/allocated */
- xfs_extlen_t total; /* total blocks needed for xaction */
- xfs_extlen_t minlen; /* mininum allocation size (blocks) */
- xfs_extlen_t minleft; /* amount must be left after alloc */
- char eof; /* set if allocating past last extent */
- char wasdel; /* replacing a delayed allocation */
- char userdata;/* set if is user data */
- char low; /* low on space, using seq'l ags */
- char aeof; /* allocated space at eof */
- char conv; /* overwriting unwritten extents */
-} xfs_bmalloca_t;
+#define BMAP_LEFT_CONTIG (1 << 0)
+#define BMAP_RIGHT_CONTIG (1 << 1)
+#define BMAP_LEFT_FILLING (1 << 2)
+#define BMAP_RIGHT_FILLING (1 << 3)
+#define BMAP_LEFT_DELAY (1 << 4)
+#define BMAP_RIGHT_DELAY (1 << 5)
+#define BMAP_LEFT_VALID (1 << 6)
+#define BMAP_RIGHT_VALID (1 << 7)
+#define BMAP_ATTRFORK (1 << 8)
-#ifdef __KERNEL__
-
-#if defined(XFS_BMAP_TRACE)
-/*
- * Trace operations for bmap extent tracing
- */
-#define XFS_BMAP_KTRACE_DELETE 1
-#define XFS_BMAP_KTRACE_INSERT 2
-#define XFS_BMAP_KTRACE_PRE_UP 3
-#define XFS_BMAP_KTRACE_POST_UP 4
+#define XFS_BMAP_EXT_FLAGS \
+ { BMAP_LEFT_CONTIG, "LC" }, \
+ { BMAP_RIGHT_CONTIG, "RC" }, \
+ { BMAP_LEFT_FILLING, "LF" }, \
+ { BMAP_RIGHT_FILLING, "RF" }, \
+ { BMAP_ATTRFORK, "ATTR" }
-#define XFS_BMAP_TRACE_SIZE 4096 /* size of global trace buffer */
-#define XFS_BMAP_KTRACE_SIZE 32 /* size of per-inode trace buffer */
-extern ktrace_t *xfs_bmap_trace_buf;
/*
- * Add bmap trace insert entries for all the contents of the extent list.
+ * This macro is used to determine how many extents will be shifted
+ * in one write transaction. We could require two splits,
+ * an extent move on the first and an extent merge on the second,
+ * So it is proper that one extent is shifted inside write transaction
+ * at a time.
*/
-void
-xfs_bmap_trace_exlist(
- char *fname, /* function name */
- struct xfs_inode *ip, /* incore inode pointer */
- xfs_extnum_t cnt, /* count of entries in list */
- int whichfork); /* data or attr fork */
+#define XFS_BMAP_MAX_SHIFT_EXTENTS 1
+
+#ifdef DEBUG
+void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
+ int whichfork, unsigned long caller_ip);
+#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
+ xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_)
#else
-#define xfs_bmap_trace_exlist(f,ip,c,w)
+#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
#endif
-/*
- * Convert inode from non-attributed to attributed.
- * Must not be in a transaction, ip must not be locked.
- */
-int /* error code */
-xfs_bmap_add_attrfork(
- struct xfs_inode *ip, /* incore inode pointer */
- int size, /* space needed for new attribute */
- int rsvd); /* flag for reserved block allocation */
-
-/*
- * Add the extent to the list of extents to be free at transaction end.
- * The list is maintained sorted (by block number).
- */
-void
-xfs_bmap_add_free(
- xfs_fsblock_t bno, /* fs block number of extent */
- xfs_filblks_t len, /* length of extent */
- xfs_bmap_free_t *flist, /* list of extents */
- struct xfs_mount *mp); /* mount point structure */
-
-/*
- * Routine to clean up the free list data structure when
- * an error occurs during a transaction.
- */
-void
-xfs_bmap_cancel(
- xfs_bmap_free_t *flist); /* free list to clean up */
-
-/*
- * Compute and fill in the value of the maximum depth of a bmap btree
- * in this filesystem. Done once, during mount.
- */
-void
-xfs_bmap_compute_maxlevels(
- struct xfs_mount *mp, /* file system mount structure */
- int whichfork); /* data or attr fork */
-
-/*
- * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
- * caller. Frees all the extents that need freeing, which must be done
- * last due to locking considerations.
- *
- * Return 1 if the given transaction was committed and a new one allocated,
- * and 0 otherwise.
- */
-int /* error */
-xfs_bmap_finish(
- struct xfs_trans **tp, /* transaction pointer addr */
- xfs_bmap_free_t *flist, /* i/o: list extents to free */
- xfs_fsblock_t firstblock, /* controlled a.g. for allocs */
- int *committed); /* xact committed or not */
-
-/*
- * Returns the file-relative block number of the first unused block in the file.
- * This is the lowest-address hole if the file has holes, else the first block
- * past the end of file.
- */
-int /* error */
-xfs_bmap_first_unused(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_inode *ip, /* incore inode */
- xfs_extlen_t len, /* size of hole to find */
- xfs_fileoff_t *unused, /* unused block num */
- int whichfork); /* data or attr fork */
-
-/*
- * Returns the file-relative block number of the last block + 1 before
- * last_block (input value) in the file.
- * This is not based on i_size, it is based on the extent list.
- * Returns 0 for local files, as they do not have an extent list.
- */
-int /* error */
-xfs_bmap_last_before(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_inode *ip, /* incore inode */
- xfs_fileoff_t *last_block, /* last block */
- int whichfork); /* data or attr fork */
-
-/*
- * Returns the file-relative block number of the first block past eof in
- * the file. This is not based on i_size, it is based on the extent list.
- * Returns 0 for local files, as they do not have an extent list.
- */
-int /* error */
-xfs_bmap_last_offset(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_inode *ip, /* incore inode */
- xfs_fileoff_t *unused, /* last block num */
- int whichfork); /* data or attr fork */
-
-/*
- * Returns whether the selected fork of the inode has exactly one
- * block or not. For the data fork we check this matches di_size,
- * implying the file's range is 0..bsize-1.
- */
-int
-xfs_bmap_one_block(
- struct xfs_inode *ip, /* incore inode */
- int whichfork); /* data or attr fork */
-
-/*
- * Read in the extents to iu_extents.
- * All inode fields are set up by caller, we just traverse the btree
- * and copy the records in.
- */
-int /* error */
-xfs_bmap_read_extents(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_inode *ip, /* incore inode */
- int whichfork); /* data or attr fork */
-
-/*
- * Map file blocks to filesystem blocks.
- * File range is given by the bno/len pair.
- * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set)
- * into a hole or past eof.
- * Only allocates blocks from a single allocation group,
- * to avoid locking problems.
- * The returned value in "firstblock" from the first call in a transaction
- * must be remembered and presented to subsequent calls in "firstblock".
- * An upper bound for the number of blocks to be allocated is supplied to
- * the first call in "total"; if no allocation group has that many free
- * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
- */
-int /* error */
-xfs_bmapi(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_inode *ip, /* incore inode */
- xfs_fileoff_t bno, /* starting file offs. mapped */
- xfs_filblks_t len, /* length to map in file */
- int flags, /* XFS_BMAPI_... */
- xfs_fsblock_t *firstblock, /* first allocated block
- controls a.g. for allocs */
- xfs_extlen_t total, /* total blocks needed */
- struct xfs_bmbt_irec *mval, /* output: map values */
- int *nmap, /* i/o: mval size/count */
- xfs_bmap_free_t *flist); /* i/o: list extents to free */
-
-/*
- * Map file blocks to filesystem blocks, simple version.
- * One block only, read-only.
- * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
- * For the other flag values, the effect is as if XFS_BMAPI_METADATA
- * was set and all the others were clear.
- */
-int /* error */
-xfs_bmapi_single(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_inode *ip, /* incore inode */
- int whichfork, /* data or attr fork */
- xfs_fsblock_t *fsb, /* output: mapped block */
- xfs_fileoff_t bno); /* starting file offs. mapped */
-
-/*
- * Unmap (remove) blocks from a file.
- * If nexts is nonzero then the number of extents to remove is limited to
- * that value. If not all extents in the block range can be removed then
- * *done is set.
- */
-int /* error */
-xfs_bunmapi(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_inode *ip, /* incore inode */
- xfs_fileoff_t bno, /* starting offset to unmap */
- xfs_filblks_t len, /* length to unmap in file */
- int flags, /* XFS_BMAPI_... */
- xfs_extnum_t nexts, /* number of extents max */
- xfs_fsblock_t *firstblock, /* first allocated block
- controls a.g. for allocs */
- xfs_bmap_free_t *flist, /* i/o: list extents to free */
- int *done); /* set if not done yet */
-
-/*
- * Fcntl interface to xfs_bmapi.
- */
-int /* error code */
-xfs_getbmap(
- bhv_desc_t *bdp, /* XFS behavior descriptor*/
- struct getbmap *bmv, /* user bmap structure */
- void __user *ap, /* pointer to user's array */
- int iflags); /* interface flags */
-
-/*
- * Check if the endoff is outside the last extent. If so the caller will grow
- * the allocation to a stripe unit boundary
- */
-int
-xfs_bmap_eof(
- struct xfs_inode *ip,
- xfs_fileoff_t endoff,
- int whichfork,
- int *eof);
-
-/*
- * Count fsblocks of the given fork.
- */
-int
-xfs_bmap_count_blocks(
- xfs_trans_t *tp,
- struct xfs_inode *ip,
- int whichfork,
- int *count);
-
-/*
- * Check an extent list, which has just been read, for
- * any bit in the extent flag field.
- */
-int
-xfs_check_nostate_extents(
- xfs_bmbt_rec_t *ep,
- xfs_extnum_t num);
-
-#endif /* __KERNEL__ */
+int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
+void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
+ struct xfs_bmap_free *flist, struct xfs_mount *mp);
+void xfs_bmap_cancel(struct xfs_bmap_free *flist);
+void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
+int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
+int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t *last_block, int whichfork);
+int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
+ int whichfork);
+int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork);
+int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork);
+int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
+ xfs_filblks_t len, struct xfs_bmbt_irec *mval,
+ int *nmap, int flags);
+int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno,
+ xfs_filblks_t len, struct xfs_bmbt_irec *mval,
+ int *nmap, int flags);
+int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+ xfs_fsblock_t *firstblock, xfs_extlen_t total,
+ struct xfs_bmbt_irec *mval, int *nmap,
+ struct xfs_bmap_free *flist);
+int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+ xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
+ struct xfs_bmap_free *flist, int *done);
+int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
+ xfs_extnum_t num);
+uint xfs_default_attroffset(struct xfs_inode *ip);
+int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+ int *done, xfs_fileoff_t start_fsb,
+ xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
+ xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist,
+ int num_exts);
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 3f1383d160e..948836c4fd9 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -17,1499 +17,26 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
+#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_quota.h"
-
-#if defined(XFS_BMBT_TRACE)
-ktrace_t *xfs_bmbt_trace_buf;
-#endif
-
-/*
- * Prototypes for internal btree functions.
- */
-
-
-STATIC int xfs_bmbt_killroot(xfs_btree_cur_t *);
-STATIC void xfs_bmbt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *,
- xfs_bmbt_key_t *, xfs_btree_cur_t **, int *);
-STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int);
-
-
-#if defined(XFS_BMBT_TRACE)
-
-static char ARGS[] = "args";
-static char ENTRY[] = "entry";
-static char ERROR[] = "error";
-#undef EXIT
-static char EXIT[] = "exit";
-
-/*
- * Add a trace buffer entry for the arguments given to the routine,
- * generic form.
- */
-STATIC void
-xfs_bmbt_trace_enter(
- char *func,
- xfs_btree_cur_t *cur,
- char *s,
- int type,
- int line,
- __psunsigned_t a0,
- __psunsigned_t a1,
- __psunsigned_t a2,
- __psunsigned_t a3,
- __psunsigned_t a4,
- __psunsigned_t a5,
- __psunsigned_t a6,
- __psunsigned_t a7,
- __psunsigned_t a8,
- __psunsigned_t a9,
- __psunsigned_t a10)
-{
- xfs_inode_t *ip;
- int whichfork;
-
- ip = cur->bc_private.b.ip;
- whichfork = cur->bc_private.b.whichfork;
- ktrace_enter(xfs_bmbt_trace_buf,
- (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
- (void *)func, (void *)s, (void *)ip, (void *)cur,
- (void *)a0, (void *)a1, (void *)a2, (void *)a3,
- (void *)a4, (void *)a5, (void *)a6, (void *)a7,
- (void *)a8, (void *)a9, (void *)a10);
- ASSERT(ip->i_btrace);
- ktrace_enter(ip->i_btrace,
- (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
- (void *)func, (void *)s, (void *)ip, (void *)cur,
- (void *)a0, (void *)a1, (void *)a2, (void *)a3,
- (void *)a4, (void *)a5, (void *)a6, (void *)a7,
- (void *)a8, (void *)a9, (void *)a10);
-}
-/*
- * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
- */
-STATIC void
-xfs_bmbt_trace_argbi(
- char *func,
- xfs_btree_cur_t *cur,
- xfs_buf_t *b,
- int i,
- int line)
-{
- xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBI, line,
- (__psunsigned_t)b, i, 0, 0,
- 0, 0, 0, 0,
- 0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
- */
-STATIC void
-xfs_bmbt_trace_argbii(
- char *func,
- xfs_btree_cur_t *cur,
- xfs_buf_t *b,
- int i0,
- int i1,
- int line)
-{
- xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBII, line,
- (__psunsigned_t)b, i0, i1, 0,
- 0, 0, 0, 0,
- 0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for 3 block-length args
- * and an integer arg.
- */
-STATIC void
-xfs_bmbt_trace_argfffi(
- char *func,
- xfs_btree_cur_t *cur,
- xfs_dfiloff_t o,
- xfs_dfsbno_t b,
- xfs_dfilblks_t i,
- int j,
- int line)
-{
- xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGFFFI, line,
- o >> 32, (int)o, b >> 32, (int)b,
- i >> 32, (int)i, (int)j, 0,
- 0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for one integer arg.
- */
-STATIC void
-xfs_bmbt_trace_argi(
- char *func,
- xfs_btree_cur_t *cur,
- int i,
- int line)
-{
- xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGI, line,
- i, 0, 0, 0,
- 0, 0, 0, 0,
- 0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for int, fsblock, key.
- */
-STATIC void
-xfs_bmbt_trace_argifk(
- char *func,
- xfs_btree_cur_t *cur,
- int i,
- xfs_fsblock_t f,
- xfs_bmbt_key_t *k,
- int line)
-{
- xfs_dfsbno_t d;
- xfs_dfiloff_t o;
-
- d = (xfs_dfsbno_t)f;
- o = INT_GET(k->br_startoff, ARCH_CONVERT);
- xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
- i, d >> 32, (int)d, o >> 32,
- (int)o, 0, 0, 0,
- 0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for int, fsblock, rec.
- */
-STATIC void
-xfs_bmbt_trace_argifr(
- char *func,
- xfs_btree_cur_t *cur,
- int i,
- xfs_fsblock_t f,
- xfs_bmbt_rec_t *r,
- int line)
-{
- xfs_dfsbno_t b;
- xfs_dfilblks_t c;
- xfs_dfsbno_t d;
- xfs_dfiloff_t o;
- xfs_bmbt_irec_t s;
-
- d = (xfs_dfsbno_t)f;
- xfs_bmbt_disk_get_all(r, &s);
- o = (xfs_dfiloff_t)s.br_startoff;
- b = (xfs_dfsbno_t)s.br_startblock;
- c = s.br_blockcount;
- xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFR, line,
- i, d >> 32, (int)d, o >> 32,
- (int)o, b >> 32, (int)b, c >> 32,
- (int)c, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for int, key.
- */
-STATIC void
-xfs_bmbt_trace_argik(
- char *func,
- xfs_btree_cur_t *cur,
- int i,
- xfs_bmbt_key_t *k,
- int line)
-{
- xfs_dfiloff_t o;
-
- o = INT_GET(k->br_startoff, ARCH_CONVERT);
- xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
- i, o >> 32, (int)o, 0,
- 0, 0, 0, 0,
- 0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for the cursor/operation.
- */
-STATIC void
-xfs_bmbt_trace_cursor(
- char *func,
- xfs_btree_cur_t *cur,
- char *s,
- int line)
-{
- xfs_bmbt_rec_t r;
-
- xfs_bmbt_set_all(&r, &cur->bc_rec.b);
- xfs_bmbt_trace_enter(func, cur, s, XFS_BMBT_KTRACE_CUR, line,
- (cur->bc_nlevels << 24) | (cur->bc_private.b.flags << 16) |
- cur->bc_private.b.allocated,
- INT_GET(r.l0, ARCH_CONVERT) >> 32, (int)INT_GET(r.l0, ARCH_CONVERT), INT_GET(r.l1, ARCH_CONVERT) >> 32, (int)INT_GET(r.l1, ARCH_CONVERT),
- (unsigned long)cur->bc_bufs[0], (unsigned long)cur->bc_bufs[1],
- (unsigned long)cur->bc_bufs[2], (unsigned long)cur->bc_bufs[3],
- (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
- (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
-}
-
-#define XFS_BMBT_TRACE_ARGBI(c,b,i) \
- xfs_bmbt_trace_argbi(fname, c, b, i, __LINE__)
-#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \
- xfs_bmbt_trace_argbii(fname, c, b, i, j, __LINE__)
-#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \
- xfs_bmbt_trace_argfffi(fname, c, o, b, i, j, __LINE__)
-#define XFS_BMBT_TRACE_ARGI(c,i) \
- xfs_bmbt_trace_argi(fname, c, i, __LINE__)
-#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k) \
- xfs_bmbt_trace_argifk(fname, c, i, f, k, __LINE__)
-#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \
- xfs_bmbt_trace_argifr(fname, c, i, f, r, __LINE__)
-#define XFS_BMBT_TRACE_ARGIK(c,i,k) \
- xfs_bmbt_trace_argik(fname, c, i, k, __LINE__)
-#define XFS_BMBT_TRACE_CURSOR(c,s) \
- xfs_bmbt_trace_cursor(fname, c, s, __LINE__)
-#else
-#define XFS_BMBT_TRACE_ARGBI(c,b,i)
-#define XFS_BMBT_TRACE_ARGBII(c,b,i,j)
-#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)
-#define XFS_BMBT_TRACE_ARGI(c,i)
-#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k)
-#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r)
-#define XFS_BMBT_TRACE_ARGIK(c,i,k)
-#define XFS_BMBT_TRACE_CURSOR(c,s)
-#endif /* XFS_BMBT_TRACE */
-
-
-/*
- * Internal functions.
- */
-
-/*
- * Delete record pointed to by cur/level.
- */
-STATIC int /* error */
-xfs_bmbt_delrec(
- xfs_btree_cur_t *cur,
- int level,
- int *stat) /* success/failure */
-{
- xfs_bmbt_block_t *block; /* bmap btree block */
- xfs_fsblock_t bno; /* fs-relative block number */
- xfs_buf_t *bp; /* buffer for block */
- int error; /* error return value */
-#ifdef XFS_BMBT_TRACE
- static char fname[] = "xfs_bmbt_delrec";
-#endif
- int i; /* loop counter */
- int j; /* temp state */
- xfs_bmbt_key_t key; /* bmap btree key */
- xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */
- xfs_fsblock_t lbno; /* left sibling block number */
- xfs_buf_t *lbp; /* left buffer pointer */
- xfs_bmbt_block_t *left; /* left btree block */
- xfs_bmbt_key_t *lkp; /* left btree key */
- xfs_bmbt_ptr_t *lpp; /* left address pointer */
- int lrecs=0; /* left record count */
- xfs_bmbt_rec_t *lrp; /* left record pointer */
- xfs_mount_t *mp; /* file system mount point */
- xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
- int ptr; /* key/record index */
- xfs_fsblock_t rbno; /* right sibling block number */
- xfs_buf_t *rbp; /* right buffer pointer */
- xfs_bmbt_block_t *right; /* right btree block */
- xfs_bmbt_key_t *rkp; /* right btree key */
- xfs_bmbt_rec_t *rp; /* pointer to bmap btree rec */
- xfs_bmbt_ptr_t *rpp; /* right address pointer */
- xfs_bmbt_block_t *rrblock; /* right-right btree block */
- xfs_buf_t *rrbp; /* right-right buffer pointer */
- int rrecs=0; /* right record count */
- xfs_bmbt_rec_t *rrp; /* right record pointer */
- xfs_btree_cur_t *tcur; /* temporary btree cursor */
- int numrecs; /* temporary numrec count */
- int numlrecs, numrrecs;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGI(cur, level);
- ptr = cur->bc_ptrs[level];
- tcur = (xfs_btree_cur_t *)0;
- if (ptr == 0) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- block = xfs_bmbt_get_block(cur, level, &bp);
- numrecs = be16_to_cpu(block->bb_numrecs);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
-#endif
- if (ptr > numrecs) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- XFS_STATS_INC(xs_bmbt_delrec);
- if (level > 0) {
- kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
- pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
-#ifdef DEBUG
- for (i = ptr; i < numrecs; i++) {
- if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- }
-#endif
- if (ptr < numrecs) {
- memmove(&kp[ptr - 1], &kp[ptr],
- (numrecs - ptr) * sizeof(*kp));
- memmove(&pp[ptr - 1], &pp[ptr], /* INT_: direct copy */
- (numrecs - ptr) * sizeof(*pp));
- xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs - 1);
- xfs_bmbt_log_keys(cur, bp, ptr, numrecs - 1);
- }
- } else {
- rp = XFS_BMAP_REC_IADDR(block, 1, cur);
- if (ptr < numrecs) {
- memmove(&rp[ptr - 1], &rp[ptr],
- (numrecs - ptr) * sizeof(*rp));
- xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1);
- }
- if (ptr == 1) {
- INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_disk_get_startoff(rp));
- kp = &key;
- }
- }
- numrecs--;
- block->bb_numrecs = cpu_to_be16(numrecs);
- xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
- /*
- * We're at the root level.
- * First, shrink the root block in-memory.
- * Try to get rid of the next level down.
- * If we can't then there's nothing left to do.
- */
- if (level == cur->bc_nlevels - 1) {
- xfs_iroot_realloc(cur->bc_private.b.ip, -1,
- cur->bc_private.b.whichfork);
- if ((error = xfs_bmbt_killroot(cur))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
- }
- if (ptr == 1 && (error = xfs_bmbt_updkey(cur, kp, level + 1))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- if (numrecs >= XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
- if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
- }
- rbno = be64_to_cpu(block->bb_rightsib);
- lbno = be64_to_cpu(block->bb_leftsib);
- /*
- * One child of root, need to get a chance to copy its contents
- * into the root and delete it. Can't go up to next level,
- * there's nothing to delete there.
- */
- if (lbno == NULLFSBLOCK && rbno == NULLFSBLOCK &&
- level == cur->bc_nlevels - 2) {
- if ((error = xfs_bmbt_killroot(cur))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
- }
- ASSERT(rbno != NULLFSBLOCK || lbno != NULLFSBLOCK);
- if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- bno = NULLFSBLOCK;
- if (rbno != NULLFSBLOCK) {
- i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_bmbt_increment(tcur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- rbp = tcur->bc_bufs[level];
- right = XFS_BUF_TO_BMBT_BLOCK(rbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
-#endif
- bno = be64_to_cpu(right->bb_leftsib);
- if (be16_to_cpu(right->bb_numrecs) - 1 >=
- XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
- if ((error = xfs_bmbt_lshift(tcur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- if (i) {
- ASSERT(be16_to_cpu(block->bb_numrecs) >=
- XFS_BMAP_BLOCK_IMINRECS(level, tcur));
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- tcur = NULL;
- if (level > 0) {
- if ((error = xfs_bmbt_decrement(cur,
- level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur,
- ERROR);
- goto error0;
- }
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
- }
- }
- rrecs = be16_to_cpu(right->bb_numrecs);
- if (lbno != NULLFSBLOCK) {
- i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- }
- }
- if (lbno != NULLFSBLOCK) {
- i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- /*
- * decrement to last in block
- */
- if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- lbp = tcur->bc_bufs[level];
- left = XFS_BUF_TO_BMBT_BLOCK(lbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
-#endif
- bno = be64_to_cpu(left->bb_rightsib);
- if (be16_to_cpu(left->bb_numrecs) - 1 >=
- XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
- if ((error = xfs_bmbt_rshift(tcur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- if (i) {
- ASSERT(be16_to_cpu(block->bb_numrecs) >=
- XFS_BMAP_BLOCK_IMINRECS(level, tcur));
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- tcur = NULL;
- if (level == 0)
- cur->bc_ptrs[0]++;
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
- }
- }
- lrecs = be16_to_cpu(left->bb_numrecs);
- }
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- tcur = NULL;
- mp = cur->bc_mp;
- ASSERT(bno != NULLFSBLOCK);
- if (lbno != NULLFSBLOCK &&
- lrecs + be16_to_cpu(block->bb_numrecs) <= XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
- rbno = bno;
- right = block;
- rbp = bp;
- if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, lbno, 0, &lbp,
- XFS_BMAP_BTREE_REF))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- left = XFS_BUF_TO_BMBT_BLOCK(lbp);
- if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- } else if (rbno != NULLFSBLOCK &&
- rrecs + be16_to_cpu(block->bb_numrecs) <=
- XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
- lbno = bno;
- left = block;
- lbp = bp;
- if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, rbno, 0, &rbp,
- XFS_BMAP_BTREE_REF))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- right = XFS_BUF_TO_BMBT_BLOCK(rbp);
- if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- lrecs = be16_to_cpu(left->bb_numrecs);
- } else {
- if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
- }
- numlrecs = be16_to_cpu(left->bb_numrecs);
- numrrecs = be16_to_cpu(right->bb_numrecs);
- if (level > 0) {
- lkp = XFS_BMAP_KEY_IADDR(left, numlrecs + 1, cur);
- lpp = XFS_BMAP_PTR_IADDR(left, numlrecs + 1, cur);
- rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
- rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
-#ifdef DEBUG
- for (i = 0; i < numrrecs; i++) {
- if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- }
-#endif
- memcpy(lkp, rkp, numrrecs * sizeof(*lkp));
- memcpy(lpp, rpp, numrrecs * sizeof(*lpp));
- xfs_bmbt_log_keys(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
- xfs_bmbt_log_ptrs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
- } else {
- lrp = XFS_BMAP_REC_IADDR(left, numlrecs + 1, cur);
- rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
- memcpy(lrp, rrp, numrrecs * sizeof(*lrp));
- xfs_bmbt_log_recs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
- }
- be16_add(&left->bb_numrecs, numrrecs);
- left->bb_rightsib = right->bb_rightsib;
- xfs_bmbt_log_block(cur, lbp, XFS_BB_RIGHTSIB | XFS_BB_NUMRECS);
- if (be64_to_cpu(left->bb_rightsib) != NULLDFSBNO) {
- if ((error = xfs_btree_read_bufl(mp, cur->bc_tp,
- be64_to_cpu(left->bb_rightsib),
- 0, &rrbp, XFS_BMAP_BTREE_REF))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
- if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- rrblock->bb_leftsib = cpu_to_be64(lbno);
- xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
- }
- xfs_bmap_add_free(XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(rbp)), 1,
- cur->bc_private.b.flist, mp);
- cur->bc_private.b.ip->i_d.di_nblocks--;
- xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
- XFS_TRANS_MOD_DQUOT_BYINO(mp, cur->bc_tp, cur->bc_private.b.ip,
- XFS_TRANS_DQ_BCOUNT, -1L);
- xfs_trans_binval(cur->bc_tp, rbp);
- if (bp != lbp) {
- cur->bc_bufs[level] = lbp;
- cur->bc_ptrs[level] += lrecs;
- cur->bc_ra[level] = 0;
- } else if ((error = xfs_bmbt_increment(cur, level + 1, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- if (level > 0)
- cur->bc_ptrs[level]--;
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 2;
- return 0;
-
-error0:
- if (tcur)
- xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
- return error;
-}
-
-#ifdef DEBUG
-/*
- * Get the data from the pointed-to record.
- */
-int
-xfs_bmbt_get_rec(
- xfs_btree_cur_t *cur,
- xfs_fileoff_t *off,
- xfs_fsblock_t *bno,
- xfs_filblks_t *len,
- xfs_exntst_t *state,
- int *stat)
-{
- xfs_bmbt_block_t *block;
- xfs_buf_t *bp;
-#ifdef DEBUG
- int error;
-#endif
- int ptr;
- xfs_bmbt_rec_t *rp;
-
- block = xfs_bmbt_get_block(cur, 0, &bp);
- ptr = cur->bc_ptrs[0];
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, block, 0, bp)))
- return error;
-#endif
- if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) {
- *stat = 0;
- return 0;
- }
- rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
- *off = xfs_bmbt_disk_get_startoff(rp);
- *bno = xfs_bmbt_disk_get_startblock(rp);
- *len = xfs_bmbt_disk_get_blockcount(rp);
- *state = xfs_bmbt_disk_get_state(rp);
- *stat = 1;
- return 0;
-}
-#endif
-
-/*
- * Insert one record/level. Return information to the caller
- * allowing the next level up to proceed if necessary.
- */
-STATIC int /* error */
-xfs_bmbt_insrec(
- xfs_btree_cur_t *cur,
- int level,
- xfs_fsblock_t *bnop,
- xfs_bmbt_rec_t *recp,
- xfs_btree_cur_t **curp,
- int *stat) /* no-go/done/continue */
-{
- xfs_bmbt_block_t *block; /* bmap btree block */
- xfs_buf_t *bp; /* buffer for block */
- int error; /* error return value */
-#ifdef XFS_BMBT_TRACE
- static char fname[] = "xfs_bmbt_insrec";
-#endif
- int i; /* loop index */
- xfs_bmbt_key_t key; /* bmap btree key */
- xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */
- int logflags; /* inode logging flags */
- xfs_fsblock_t nbno; /* new block number */
- struct xfs_btree_cur *ncur; /* new btree cursor */
- xfs_bmbt_key_t nkey; /* new btree key value */
- xfs_bmbt_rec_t nrec; /* new record count */
- int optr; /* old key/record index */
- xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
- int ptr; /* key/record index */
- xfs_bmbt_rec_t *rp=NULL; /* pointer to bmap btree rec */
- int numrecs;
-
- ASSERT(level < cur->bc_nlevels);
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp);
- ncur = (xfs_btree_cur_t *)0;
- INT_SET(key.br_startoff, ARCH_CONVERT,
- xfs_bmbt_disk_get_startoff(recp));
- optr = ptr = cur->bc_ptrs[level];
- if (ptr == 0) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- XFS_STATS_INC(xs_bmbt_insrec);
- block = xfs_bmbt_get_block(cur, level, &bp);
- numrecs = be16_to_cpu(block->bb_numrecs);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- if (ptr <= numrecs) {
- if (level == 0) {
- rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
- xfs_btree_check_rec(XFS_BTNUM_BMAP, recp, rp);
- } else {
- kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
- xfs_btree_check_key(XFS_BTNUM_BMAP, &key, kp);
- }
- }
-#endif
- nbno = NULLFSBLOCK;
- if (numrecs == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
- if (numrecs < XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
- /*
- * A root block, that can be made bigger.
- */
- xfs_iroot_realloc(cur->bc_private.b.ip, 1,
- cur->bc_private.b.whichfork);
- block = xfs_bmbt_get_block(cur, level, &bp);
- } else if (level == cur->bc_nlevels - 1) {
- if ((error = xfs_bmbt_newroot(cur, &logflags, stat)) ||
- *stat == 0) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
- logflags);
- block = xfs_bmbt_get_block(cur, level, &bp);
- } else {
- if ((error = xfs_bmbt_rshift(cur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- if (i) {
- /* nothing */
- } else {
- if ((error = xfs_bmbt_lshift(cur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- if (i) {
- optr = ptr = cur->bc_ptrs[level];
- } else {
- if ((error = xfs_bmbt_split(cur, level,
- &nbno, &nkey, &ncur,
- &i))) {
- XFS_BMBT_TRACE_CURSOR(cur,
- ERROR);
- return error;
- }
- if (i) {
- block = xfs_bmbt_get_block(
- cur, level, &bp);
-#ifdef DEBUG
- if ((error =
- xfs_btree_check_lblock(cur,
- block, level, bp))) {
- XFS_BMBT_TRACE_CURSOR(
- cur, ERROR);
- return error;
- }
-#endif
- ptr = cur->bc_ptrs[level];
- xfs_bmbt_disk_set_allf(&nrec,
- nkey.br_startoff, 0, 0,
- XFS_EXT_NORM);
- } else {
- XFS_BMBT_TRACE_CURSOR(cur,
- EXIT);
- *stat = 0;
- return 0;
- }
- }
- }
- }
- }
- numrecs = be16_to_cpu(block->bb_numrecs);
- if (level > 0) {
- kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
- pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
-#ifdef DEBUG
- for (i = numrecs; i >= ptr; i--) {
- if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT),
- level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- }
-#endif
- memmove(&kp[ptr], &kp[ptr - 1],
- (numrecs - ptr + 1) * sizeof(*kp));
- memmove(&pp[ptr], &pp[ptr - 1], /* INT_: direct copy */
- (numrecs - ptr + 1) * sizeof(*pp));
-#ifdef DEBUG
- if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)*bnop,
- level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- kp[ptr - 1] = key;
- INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop);
- numrecs++;
- block->bb_numrecs = cpu_to_be16(numrecs);
- xfs_bmbt_log_keys(cur, bp, ptr, numrecs);
- xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs);
- } else {
- rp = XFS_BMAP_REC_IADDR(block, 1, cur);
- memmove(&rp[ptr], &rp[ptr - 1],
- (numrecs - ptr + 1) * sizeof(*rp));
- rp[ptr - 1] = *recp;
- numrecs++;
- block->bb_numrecs = cpu_to_be16(numrecs);
- xfs_bmbt_log_recs(cur, bp, ptr, numrecs);
- }
- xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
-#ifdef DEBUG
- if (ptr < numrecs) {
- if (level == 0)
- xfs_btree_check_rec(XFS_BTNUM_BMAP, rp + ptr - 1,
- rp + ptr);
- else
- xfs_btree_check_key(XFS_BTNUM_BMAP, kp + ptr - 1,
- kp + ptr);
- }
-#endif
- if (optr == 1 && (error = xfs_bmbt_updkey(cur, &key, level + 1))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- *bnop = nbno;
- if (nbno != NULLFSBLOCK) {
- *recp = nrec;
- *curp = ncur;
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
-}
-
-STATIC int
-xfs_bmbt_killroot(
- xfs_btree_cur_t *cur)
-{
- xfs_bmbt_block_t *block;
- xfs_bmbt_block_t *cblock;
- xfs_buf_t *cbp;
- xfs_bmbt_key_t *ckp;
- xfs_bmbt_ptr_t *cpp;
-#ifdef DEBUG
- int error;
-#endif
-#ifdef XFS_BMBT_TRACE
- static char fname[] = "xfs_bmbt_killroot";
-#endif
- int i;
- xfs_bmbt_key_t *kp;
- xfs_inode_t *ip;
- xfs_ifork_t *ifp;
- int level;
- xfs_bmbt_ptr_t *pp;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- level = cur->bc_nlevels - 1;
- ASSERT(level >= 1);
- /*
- * Don't deal with the root block needs to be a leaf case.
- * We're just going to turn the thing back into extents anyway.
- */
- if (level == 1) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- return 0;
- }
- block = xfs_bmbt_get_block(cur, level, &cbp);
- /*
- * Give up if the root has multiple children.
- */
- if (be16_to_cpu(block->bb_numrecs) != 1) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- return 0;
- }
- /*
- * Only do this if the next level will fit.
- * Then the data must be copied up to the inode,
- * instead of freeing the root you free the next level.
- */
- cbp = cur->bc_bufs[level - 1];
- cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
- if (be16_to_cpu(cblock->bb_numrecs) > XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- return 0;
- }
- ASSERT(be64_to_cpu(cblock->bb_leftsib) == NULLDFSBNO);
- ASSERT(be64_to_cpu(cblock->bb_rightsib) == NULLDFSBNO);
- ip = cur->bc_private.b.ip;
- ifp = XFS_IFORK_PTR(ip, cur->bc_private.b.whichfork);
- ASSERT(XFS_BMAP_BLOCK_IMAXRECS(level, cur) ==
- XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes));
- i = (int)(be16_to_cpu(cblock->bb_numrecs) - XFS_BMAP_BLOCK_IMAXRECS(level, cur));
- if (i) {
- xfs_iroot_realloc(ip, i, cur->bc_private.b.whichfork);
- block = ifp->if_broot;
- }
- be16_add(&block->bb_numrecs, i);
- ASSERT(block->bb_numrecs == cblock->bb_numrecs);
- kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
- ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
- memcpy(kp, ckp, be16_to_cpu(block->bb_numrecs) * sizeof(*kp));
- pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
- cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
-#ifdef DEBUG
- for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
- if ((error = xfs_btree_check_lptr(cur, INT_GET(cpp[i], ARCH_CONVERT), level - 1))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- }
-#endif
- memcpy(pp, cpp, be16_to_cpu(block->bb_numrecs) * sizeof(*pp));
- xfs_bmap_add_free(XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(cbp)), 1,
- cur->bc_private.b.flist, cur->bc_mp);
- ip->i_d.di_nblocks--;
- XFS_TRANS_MOD_DQUOT_BYINO(cur->bc_mp, cur->bc_tp, ip,
- XFS_TRANS_DQ_BCOUNT, -1L);
- xfs_trans_binval(cur->bc_tp, cbp);
- cur->bc_bufs[level - 1] = NULL;
- be16_add(&block->bb_level, -1);
- xfs_trans_log_inode(cur->bc_tp, ip,
- XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
- cur->bc_nlevels--;
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- return 0;
-}
-
-/*
- * Log key values from the btree block.
- */
-STATIC void
-xfs_bmbt_log_keys(
- xfs_btree_cur_t *cur,
- xfs_buf_t *bp,
- int kfirst,
- int klast)
-{
-#ifdef XFS_BMBT_TRACE
- static char fname[] = "xfs_bmbt_log_keys";
-#endif
- xfs_trans_t *tp;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGBII(cur, bp, kfirst, klast);
- tp = cur->bc_tp;
- if (bp) {
- xfs_bmbt_block_t *block;
- int first;
- xfs_bmbt_key_t *kp;
- int last;
-
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- kp = XFS_BMAP_KEY_DADDR(block, 1, cur);
- first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
- last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
- xfs_trans_log_buf(tp, bp, first, last);
- } else {
- xfs_inode_t *ip;
-
- ip = cur->bc_private.b.ip;
- xfs_trans_log_inode(tp, ip,
- XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-}
-
-/*
- * Log pointer values from the btree block.
- */
-STATIC void
-xfs_bmbt_log_ptrs(
- xfs_btree_cur_t *cur,
- xfs_buf_t *bp,
- int pfirst,
- int plast)
-{
-#ifdef XFS_BMBT_TRACE
- static char fname[] = "xfs_bmbt_log_ptrs";
-#endif
- xfs_trans_t *tp;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGBII(cur, bp, pfirst, plast);
- tp = cur->bc_tp;
- if (bp) {
- xfs_bmbt_block_t *block;
- int first;
- int last;
- xfs_bmbt_ptr_t *pp;
-
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- pp = XFS_BMAP_PTR_DADDR(block, 1, cur);
- first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
- last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
- xfs_trans_log_buf(tp, bp, first, last);
- } else {
- xfs_inode_t *ip;
-
- ip = cur->bc_private.b.ip;
- xfs_trans_log_inode(tp, ip,
- XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-}
-
-/*
- * Lookup the record. The cursor is made to point to it, based on dir.
- */
-STATIC int /* error */
-xfs_bmbt_lookup(
- xfs_btree_cur_t *cur,
- xfs_lookup_t dir,
- int *stat) /* success/failure */
-{
- xfs_bmbt_block_t *block=NULL;
- xfs_buf_t *bp;
- xfs_daddr_t d;
- xfs_sfiloff_t diff;
- int error; /* error return value */
-#ifdef XFS_BMBT_TRACE
- static char fname[] = "xfs_bmbt_lookup";
-#endif
- xfs_fsblock_t fsbno=0;
- int high;
- int i;
- int keyno=0;
- xfs_bmbt_key_t *kkbase=NULL;
- xfs_bmbt_key_t *kkp;
- xfs_bmbt_rec_t *krbase=NULL;
- xfs_bmbt_rec_t *krp;
- int level;
- int low;
- xfs_mount_t *mp;
- xfs_bmbt_ptr_t *pp;
- xfs_bmbt_irec_t *rp;
- xfs_fileoff_t startoff;
- xfs_trans_t *tp;
-
- XFS_STATS_INC(xs_bmbt_lookup);
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGI(cur, (int)dir);
- tp = cur->bc_tp;
- mp = cur->bc_mp;
- rp = &cur->bc_rec.b;
- for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
- if (level < cur->bc_nlevels - 1) {
- d = XFS_FSB_TO_DADDR(mp, fsbno);
- bp = cur->bc_bufs[level];
- if (bp && XFS_BUF_ADDR(bp) != d)
- bp = (xfs_buf_t *)0;
- if (!bp) {
- if ((error = xfs_btree_read_bufl(mp, tp, fsbno,
- 0, &bp, XFS_BMAP_BTREE_REF))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- xfs_btree_setbuf(cur, level, bp);
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- if ((error = xfs_btree_check_lblock(cur, block,
- level, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- } else
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- } else
- block = xfs_bmbt_get_block(cur, level, &bp);
- if (diff == 0)
- keyno = 1;
- else {
- if (level > 0)
- kkbase = XFS_BMAP_KEY_IADDR(block, 1, cur);
- else
- krbase = XFS_BMAP_REC_IADDR(block, 1, cur);
- low = 1;
- if (!(high = be16_to_cpu(block->bb_numrecs))) {
- ASSERT(level == 0);
- cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- while (low <= high) {
- XFS_STATS_INC(xs_bmbt_compare);
- keyno = (low + high) >> 1;
- if (level > 0) {
- kkp = kkbase + keyno - 1;
- startoff = INT_GET(kkp->br_startoff, ARCH_CONVERT);
- } else {
- krp = krbase + keyno - 1;
- startoff = xfs_bmbt_disk_get_startoff(krp);
- }
- diff = (xfs_sfiloff_t)
- (startoff - rp->br_startoff);
- if (diff < 0)
- low = keyno + 1;
- else if (diff > 0)
- high = keyno - 1;
- else
- break;
- }
- }
- if (level > 0) {
- if (diff > 0 && --keyno < 1)
- keyno = 1;
- pp = XFS_BMAP_PTR_IADDR(block, keyno, cur);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- fsbno = INT_GET(*pp, ARCH_CONVERT);
- cur->bc_ptrs[level] = keyno;
- }
- }
- if (dir != XFS_LOOKUP_LE && diff < 0) {
- keyno++;
- /*
- * If ge search and we went off the end of the block, but it's
- * not the last block, we're in the wrong block.
- */
- if (dir == XFS_LOOKUP_GE && keyno > be16_to_cpu(block->bb_numrecs) &&
- be64_to_cpu(block->bb_rightsib) != NULLDFSBNO) {
- cur->bc_ptrs[0] = keyno;
- if ((error = xfs_bmbt_increment(cur, 0, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- XFS_WANT_CORRUPTED_RETURN(i == 1);
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
- }
- }
- else if (dir == XFS_LOOKUP_LE && diff > 0)
- keyno--;
- cur->bc_ptrs[0] = keyno;
- if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs)) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- } else {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
- }
- return 0;
-}
-
-/*
- * Move 1 record left from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int /* error */
-xfs_bmbt_lshift(
- xfs_btree_cur_t *cur,
- int level,
- int *stat) /* success/failure */
-{
- int error; /* error return value */
-#ifdef XFS_BMBT_TRACE
- static char fname[] = "xfs_bmbt_lshift";
-#endif
-#ifdef DEBUG
- int i; /* loop counter */
-#endif
- xfs_bmbt_key_t key; /* bmap btree key */
- xfs_buf_t *lbp; /* left buffer pointer */
- xfs_bmbt_block_t *left; /* left btree block */
- xfs_bmbt_key_t *lkp=NULL; /* left btree key */
- xfs_bmbt_ptr_t *lpp; /* left address pointer */
- int lrecs; /* left record count */
- xfs_bmbt_rec_t *lrp=NULL; /* left record pointer */
- xfs_mount_t *mp; /* file system mount point */
- xfs_buf_t *rbp; /* right buffer pointer */
- xfs_bmbt_block_t *right; /* right btree block */
- xfs_bmbt_key_t *rkp=NULL; /* right btree key */
- xfs_bmbt_ptr_t *rpp=NULL; /* right address pointer */
- xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */
- int rrecs; /* right record count */
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGI(cur, level);
- if (level == cur->bc_nlevels - 1) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- rbp = cur->bc_bufs[level];
- right = XFS_BUF_TO_BMBT_BLOCK(rbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- if (be64_to_cpu(right->bb_leftsib) == NULLDFSBNO) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- if (cur->bc_ptrs[level] <= 1) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- mp = cur->bc_mp;
- if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(right->bb_leftsib), 0,
- &lbp, XFS_BMAP_BTREE_REF))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- left = XFS_BUF_TO_BMBT_BLOCK(lbp);
- if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- if (be16_to_cpu(left->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- lrecs = be16_to_cpu(left->bb_numrecs) + 1;
- if (level > 0) {
- lkp = XFS_BMAP_KEY_IADDR(left, lrecs, cur);
- rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
- *lkp = *rkp;
- xfs_bmbt_log_keys(cur, lbp, lrecs, lrecs);
- lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur);
- rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lptr(cur, INT_GET(*rpp, ARCH_CONVERT), level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- *lpp = *rpp; /* INT_: direct copy */
- xfs_bmbt_log_ptrs(cur, lbp, lrecs, lrecs);
- } else {
- lrp = XFS_BMAP_REC_IADDR(left, lrecs, cur);
- rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
- *lrp = *rrp;
- xfs_bmbt_log_recs(cur, lbp, lrecs, lrecs);
- }
- left->bb_numrecs = cpu_to_be16(lrecs);
- xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
-#ifdef DEBUG
- if (level > 0)
- xfs_btree_check_key(XFS_BTNUM_BMAP, lkp - 1, lkp);
- else
- xfs_btree_check_rec(XFS_BTNUM_BMAP, lrp - 1, lrp);
-#endif
- rrecs = be16_to_cpu(right->bb_numrecs) - 1;
- right->bb_numrecs = cpu_to_be16(rrecs);
- xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
- if (level > 0) {
-#ifdef DEBUG
- for (i = 0; i < rrecs; i++) {
- if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT),
- level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- }
-#endif
- memmove(rkp, rkp + 1, rrecs * sizeof(*rkp));
- memmove(rpp, rpp + 1, rrecs * sizeof(*rpp));
- xfs_bmbt_log_keys(cur, rbp, 1, rrecs);
- xfs_bmbt_log_ptrs(cur, rbp, 1, rrecs);
- } else {
- memmove(rrp, rrp + 1, rrecs * sizeof(*rrp));
- xfs_bmbt_log_recs(cur, rbp, 1, rrecs);
- INT_SET(key.br_startoff, ARCH_CONVERT,
- xfs_bmbt_disk_get_startoff(rrp));
- rkp = &key;
- }
- if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- cur->bc_ptrs[level]--;
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
-}
-
-/*
- * Move 1 record right from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int /* error */
-xfs_bmbt_rshift(
- xfs_btree_cur_t *cur,
- int level,
- int *stat) /* success/failure */
-{
- int error; /* error return value */
-#ifdef XFS_BMBT_TRACE
- static char fname[] = "xfs_bmbt_rshift";
-#endif
- int i; /* loop counter */
- xfs_bmbt_key_t key; /* bmap btree key */
- xfs_buf_t *lbp; /* left buffer pointer */
- xfs_bmbt_block_t *left; /* left btree block */
- xfs_bmbt_key_t *lkp; /* left btree key */
- xfs_bmbt_ptr_t *lpp; /* left address pointer */
- xfs_bmbt_rec_t *lrp; /* left record pointer */
- xfs_mount_t *mp; /* file system mount point */
- xfs_buf_t *rbp; /* right buffer pointer */
- xfs_bmbt_block_t *right; /* right btree block */
- xfs_bmbt_key_t *rkp; /* right btree key */
- xfs_bmbt_ptr_t *rpp; /* right address pointer */
- xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */
- struct xfs_btree_cur *tcur; /* temporary btree cursor */
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGI(cur, level);
- if (level == cur->bc_nlevels - 1) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- lbp = cur->bc_bufs[level];
- left = XFS_BUF_TO_BMBT_BLOCK(lbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- if (be64_to_cpu(left->bb_rightsib) == NULLDFSBNO) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- mp = cur->bc_mp;
- if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(left->bb_rightsib), 0,
- &rbp, XFS_BMAP_BTREE_REF))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- right = XFS_BUF_TO_BMBT_BLOCK(rbp);
- if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- if (be16_to_cpu(right->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- if (level > 0) {
- lkp = XFS_BMAP_KEY_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
- lpp = XFS_BMAP_PTR_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
- rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
- rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
-#ifdef DEBUG
- for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
- if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- }
-#endif
- memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
- memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-#ifdef DEBUG
- if ((error = xfs_btree_check_lptr(cur, INT_GET(*lpp, ARCH_CONVERT), level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- *rkp = *lkp;
- *rpp = *lpp; /* INT_: direct copy */
- xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
- xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
- } else {
- lrp = XFS_BMAP_REC_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
- rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
- memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
- *rrp = *lrp;
- xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
- INT_SET(key.br_startoff, ARCH_CONVERT,
- xfs_bmbt_disk_get_startoff(rrp));
- rkp = &key;
- }
- be16_add(&left->bb_numrecs, -1);
- xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
- be16_add(&right->bb_numrecs, 1);
-#ifdef DEBUG
- if (level > 0)
- xfs_btree_check_key(XFS_BTNUM_BMAP, rkp, rkp + 1);
- else
- xfs_btree_check_rec(XFS_BTNUM_BMAP, rrp, rrp + 1);
-#endif
- xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
- if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_bmbt_increment(tcur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
- goto error1;
- }
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_bmbt_updkey(tcur, rkp, level + 1))) {
- XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
- goto error1;
- }
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
-error0:
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-error1:
- xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
- return error;
-}
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_dinode.h"
/*
* Determine the extent state.
@@ -1527,344 +54,44 @@ xfs_extent_state(
return XFS_EXT_NORM;
}
-
-/*
- * Split cur/level block in half.
- * Return new block number and its first record (to be inserted into parent).
- */
-STATIC int /* error */
-xfs_bmbt_split(
- xfs_btree_cur_t *cur,
- int level,
- xfs_fsblock_t *bnop,
- xfs_bmbt_key_t *keyp,
- xfs_btree_cur_t **curp,
- int *stat) /* success/failure */
-{
- xfs_alloc_arg_t args; /* block allocation args */
- int error; /* error return value */
-#ifdef XFS_BMBT_TRACE
- static char fname[] = "xfs_bmbt_split";
-#endif
- int i; /* loop counter */
- xfs_fsblock_t lbno; /* left sibling block number */
- xfs_buf_t *lbp; /* left buffer pointer */
- xfs_bmbt_block_t *left; /* left btree block */
- xfs_bmbt_key_t *lkp; /* left btree key */
- xfs_bmbt_ptr_t *lpp; /* left address pointer */
- xfs_bmbt_rec_t *lrp; /* left record pointer */
- xfs_buf_t *rbp; /* right buffer pointer */
- xfs_bmbt_block_t *right; /* right btree block */
- xfs_bmbt_key_t *rkp; /* right btree key */
- xfs_bmbt_ptr_t *rpp; /* right address pointer */
- xfs_bmbt_block_t *rrblock; /* right-right btree block */
- xfs_buf_t *rrbp; /* right-right buffer pointer */
- xfs_bmbt_rec_t *rrp; /* right record pointer */
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, keyp);
- args.tp = cur->bc_tp;
- args.mp = cur->bc_mp;
- lbp = cur->bc_bufs[level];
- lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp));
- left = XFS_BUF_TO_BMBT_BLOCK(lbp);
- args.fsbno = cur->bc_private.b.firstblock;
- if (args.fsbno == NULLFSBLOCK) {
- args.fsbno = lbno;
- args.type = XFS_ALLOCTYPE_START_BNO;
- } else if (cur->bc_private.b.flist->xbf_low)
- args.type = XFS_ALLOCTYPE_FIRST_AG;
- else
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.mod = args.minleft = args.alignment = args.total = args.isfl =
- args.userdata = args.minalignslop = 0;
- args.minlen = args.maxlen = args.prod = 1;
- args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
- if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return XFS_ERROR(ENOSPC);
- }
- if ((error = xfs_alloc_vextent(&args))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- if (args.fsbno == NULLFSBLOCK) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- ASSERT(args.len == 1);
- cur->bc_private.b.firstblock = args.fsbno;
- cur->bc_private.b.allocated++;
- cur->bc_private.b.ip->i_d.di_nblocks++;
- xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
- XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
- XFS_TRANS_DQ_BCOUNT, 1L);
- rbp = xfs_btree_get_bufl(args.mp, args.tp, args.fsbno, 0);
- right = XFS_BUF_TO_BMBT_BLOCK(rbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, left, level, rbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- right->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
- right->bb_level = left->bb_level;
- right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
- if ((be16_to_cpu(left->bb_numrecs) & 1) &&
- cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
- be16_add(&right->bb_numrecs, 1);
- i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
- if (level > 0) {
- lkp = XFS_BMAP_KEY_IADDR(left, i, cur);
- lpp = XFS_BMAP_PTR_IADDR(left, i, cur);
- rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
- rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
-#ifdef DEBUG
- for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
- if ((error = xfs_btree_check_lptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- }
-#endif
- memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
- memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
- xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- keyp->br_startoff = INT_GET(rkp->br_startoff, ARCH_CONVERT);
- } else {
- lrp = XFS_BMAP_REC_IADDR(left, i, cur);
- rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
- memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
- xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- keyp->br_startoff = xfs_bmbt_disk_get_startoff(rrp);
- }
- be16_add(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
- right->bb_rightsib = left->bb_rightsib;
- left->bb_rightsib = cpu_to_be64(args.fsbno);
- right->bb_leftsib = cpu_to_be64(lbno);
- xfs_bmbt_log_block(cur, rbp, XFS_BB_ALL_BITS);
- xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
- if (be64_to_cpu(right->bb_rightsib) != NULLDFSBNO) {
- if ((error = xfs_btree_read_bufl(args.mp, args.tp,
- be64_to_cpu(right->bb_rightsib), 0, &rrbp,
- XFS_BMAP_BTREE_REF))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
- if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- rrblock->bb_leftsib = cpu_to_be64(args.fsbno);
- xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
- }
- if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
- xfs_btree_setbuf(cur, level, rbp);
- cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
- }
- if (level + 1 < cur->bc_nlevels) {
- if ((error = xfs_btree_dup_cursor(cur, curp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- (*curp)->bc_ptrs[level + 1]++;
- }
- *bnop = args.fsbno;
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
-}
-
-
-/*
- * Update keys for the record.
- */
-STATIC int
-xfs_bmbt_updkey(
- xfs_btree_cur_t *cur,
- xfs_bmbt_key_t *keyp, /* on-disk format */
- int level)
-{
- xfs_bmbt_block_t *block;
- xfs_buf_t *bp;
-#ifdef DEBUG
- int error;
-#endif
-#ifdef XFS_BMBT_TRACE
- static char fname[] = "xfs_bmbt_updkey";
-#endif
- xfs_bmbt_key_t *kp;
- int ptr;
-
- ASSERT(level >= 1);
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGIK(cur, level, keyp);
- for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
- block = xfs_bmbt_get_block(cur, level, &bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- ptr = cur->bc_ptrs[level];
- kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
- *kp = *keyp;
- xfs_bmbt_log_keys(cur, bp, ptr, ptr);
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- return 0;
-}
-
/*
* Convert on-disk form of btree root to in-memory form.
*/
void
xfs_bmdr_to_bmbt(
+ struct xfs_inode *ip,
xfs_bmdr_block_t *dblock,
int dblocklen,
- xfs_bmbt_block_t *rblock,
+ struct xfs_btree_block *rblock,
int rblocklen)
{
+ struct xfs_mount *mp = ip->i_mount;
int dmxr;
xfs_bmbt_key_t *fkp;
- xfs_bmbt_ptr_t *fpp;
+ __be64 *fpp;
xfs_bmbt_key_t *tkp;
- xfs_bmbt_ptr_t *tpp;
+ __be64 *tpp;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+ XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+ XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+ XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+ XFS_BTREE_LONG_PTRS);
- rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
rblock->bb_level = dblock->bb_level;
ASSERT(be16_to_cpu(rblock->bb_level) > 0);
rblock->bb_numrecs = dblock->bb_numrecs;
- rblock->bb_leftsib = cpu_to_be64(NULLDFSBNO);
- rblock->bb_rightsib = cpu_to_be64(NULLDFSBNO);
- dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
- fkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
- tkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
- fpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
- tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
+ dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
+ fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+ tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+ fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+ tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
dmxr = be16_to_cpu(dblock->bb_numrecs);
memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
- memcpy(tpp, fpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */
-}
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int /* error */
-xfs_bmbt_decrement(
- xfs_btree_cur_t *cur,
- int level,
- int *stat) /* success/failure */
-{
- xfs_bmbt_block_t *block;
- xfs_buf_t *bp;
- int error; /* error return value */
-#ifdef XFS_BMBT_TRACE
- static char fname[] = "xfs_bmbt_decrement";
-#endif
- xfs_fsblock_t fsbno;
- int lev;
- xfs_mount_t *mp;
- xfs_trans_t *tp;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGI(cur, level);
- ASSERT(level < cur->bc_nlevels);
- if (level < cur->bc_nlevels - 1)
- xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
- if (--cur->bc_ptrs[level] > 0) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
- }
- block = xfs_bmbt_get_block(cur, level, &bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- if (be64_to_cpu(block->bb_leftsib) == NULLDFSBNO) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
- if (--cur->bc_ptrs[lev] > 0)
- break;
- if (lev < cur->bc_nlevels - 1)
- xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
- }
- if (lev == cur->bc_nlevels) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- tp = cur->bc_tp;
- mp = cur->bc_mp;
- for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
- fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
- if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
- XFS_BMAP_BTREE_REF))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- lev--;
- xfs_btree_setbuf(cur, lev, bp);
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
-}
-
-/*
- * Delete the record pointed to by cur.
- */
-int /* error */
-xfs_bmbt_delete(
- xfs_btree_cur_t *cur,
- int *stat) /* success/failure */
-{
- int error; /* error return value */
-#ifdef XFS_BMBT_TRACE
- static char fname[] = "xfs_bmbt_delete";
-#endif
- int i;
- int level;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- for (level = 0, i = 2; i == 2; level++) {
- if ((error = xfs_bmbt_delrec(cur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- }
- if (i == 0) {
- for (level = 1; level < cur->bc_nlevels; level++) {
- if (cur->bc_ptrs[level] == 0) {
- if ((error = xfs_bmbt_decrement(cur, level,
- &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- break;
- }
- }
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = i;
- return 0;
+ memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
}
/*
@@ -1872,8 +99,7 @@ xfs_bmbt_delete(
* This code must be in sync with the routines xfs_bmbt_get_startoff,
* xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
*/
-
-STATIC __inline__ void
+STATIC void
__xfs_bmbt_get_all(
__uint64_t l0,
__uint64_t l1,
@@ -1884,25 +110,25 @@ __xfs_bmbt_get_all(
ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
s->br_startoff = ((xfs_fileoff_t)l0 &
- XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+ xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
#if XFS_BIG_BLKNOS
- s->br_startblock = (((xfs_fsblock_t)l0 & XFS_MASK64LO(9)) << 43) |
+ s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
(((xfs_fsblock_t)l1) >> 21);
#else
#ifdef DEBUG
{
xfs_dfsbno_t b;
- b = (((xfs_dfsbno_t)l0 & XFS_MASK64LO(9)) << 43) |
+ b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) |
(((xfs_dfsbno_t)l1) >> 21);
- ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
+ ASSERT((b >> 32) == 0 || isnulldstartblock(b));
s->br_startblock = (xfs_fsblock_t)b;
}
#else /* !DEBUG */
s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
#endif /* DEBUG */
#endif /* XFS_BIG_BLKNOS */
- s->br_blockcount = (xfs_filblks_t)(l1 & XFS_MASK64LO(21));
+ s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
/* This is xfs_extent_state() in-line */
if (ext_flag) {
ASSERT(s->br_blockcount != 0); /* saved for DMIG */
@@ -1914,45 +140,20 @@ __xfs_bmbt_get_all(
void
xfs_bmbt_get_all(
- xfs_bmbt_rec_t *r,
+ xfs_bmbt_rec_host_t *r,
xfs_bmbt_irec_t *s)
{
__xfs_bmbt_get_all(r->l0, r->l1, s);
}
/*
- * Get the block pointer for the given level of the cursor.
- * Fill in the buffer pointer, if applicable.
- */
-xfs_bmbt_block_t *
-xfs_bmbt_get_block(
- xfs_btree_cur_t *cur,
- int level,
- xfs_buf_t **bpp)
-{
- xfs_ifork_t *ifp;
- xfs_bmbt_block_t *rval;
-
- if (level < cur->bc_nlevels - 1) {
- *bpp = cur->bc_bufs[level];
- rval = XFS_BUF_TO_BMBT_BLOCK(*bpp);
- } else {
- *bpp = NULL;
- ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
- cur->bc_private.b.whichfork);
- rval = ifp->if_broot;
- }
- return rval;
-}
-
-/*
* Extract the blockcount field from an in memory bmap extent record.
*/
xfs_filblks_t
xfs_bmbt_get_blockcount(
- xfs_bmbt_rec_t *r)
+ xfs_bmbt_rec_host_t *r)
{
- return (xfs_filblks_t)(r->l1 & XFS_MASK64LO(21));
+ return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21));
}
/*
@@ -1960,18 +161,18 @@ xfs_bmbt_get_blockcount(
*/
xfs_fsblock_t
xfs_bmbt_get_startblock(
- xfs_bmbt_rec_t *r)
+ xfs_bmbt_rec_host_t *r)
{
#if XFS_BIG_BLKNOS
- return (((xfs_fsblock_t)r->l0 & XFS_MASK64LO(9)) << 43) |
+ return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
(((xfs_fsblock_t)r->l1) >> 21);
#else
#ifdef DEBUG
xfs_dfsbno_t b;
- b = (((xfs_dfsbno_t)r->l0 & XFS_MASK64LO(9)) << 43) |
+ b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) |
(((xfs_dfsbno_t)r->l1) >> 21);
- ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
+ ASSERT((b >> 32) == 0 || isnulldstartblock(b));
return (xfs_fsblock_t)b;
#else /* !DEBUG */
return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21);
@@ -1984,15 +185,15 @@ xfs_bmbt_get_startblock(
*/
xfs_fileoff_t
xfs_bmbt_get_startoff(
- xfs_bmbt_rec_t *r)
+ xfs_bmbt_rec_host_t *r)
{
return ((xfs_fileoff_t)r->l0 &
- XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+ xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
}
xfs_exntst_t
xfs_bmbt_get_state(
- xfs_bmbt_rec_t *r)
+ xfs_bmbt_rec_host_t *r)
{
int ext_flag;
@@ -2001,21 +202,6 @@ xfs_bmbt_get_state(
ext_flag);
}
-#ifndef XFS_NATIVE_HOST
-/* Endian flipping versions of the bmbt extraction functions */
-void
-xfs_bmbt_disk_get_all(
- xfs_bmbt_rec_t *r,
- xfs_bmbt_irec_t *s)
-{
- __uint64_t l0, l1;
-
- l0 = INT_GET(r->l0, ARCH_CONVERT);
- l1 = INT_GET(r->l1, ARCH_CONVERT);
-
- __xfs_bmbt_get_all(l0, l1, s);
-}
-
/*
* Extract the blockcount field from an on disk bmap extent record.
*/
@@ -2023,31 +209,7 @@ xfs_filblks_t
xfs_bmbt_disk_get_blockcount(
xfs_bmbt_rec_t *r)
{
- return (xfs_filblks_t)(INT_GET(r->l1, ARCH_CONVERT) & XFS_MASK64LO(21));
-}
-
-/*
- * Extract the startblock field from an on disk bmap extent record.
- */
-xfs_fsblock_t
-xfs_bmbt_disk_get_startblock(
- xfs_bmbt_rec_t *r)
-{
-#if XFS_BIG_BLKNOS
- return (((xfs_fsblock_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) |
- (((xfs_fsblock_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21);
-#else
-#ifdef DEBUG
- xfs_dfsbno_t b;
-
- b = (((xfs_dfsbno_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) |
- (((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21);
- ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
- return (xfs_fsblock_t)b;
-#else /* !DEBUG */
- return (xfs_fsblock_t)(((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21);
-#endif /* DEBUG */
-#endif /* XFS_BIG_BLKNOS */
+ return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21));
}
/*
@@ -2057,563 +219,142 @@ xfs_fileoff_t
xfs_bmbt_disk_get_startoff(
xfs_bmbt_rec_t *r)
{
- return ((xfs_fileoff_t)INT_GET(r->l0, ARCH_CONVERT) &
- XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
-}
-
-xfs_exntst_t
-xfs_bmbt_disk_get_state(
- xfs_bmbt_rec_t *r)
-{
- int ext_flag;
-
- ext_flag = (int)((INT_GET(r->l0, ARCH_CONVERT)) >> (64 - BMBT_EXNTFLAG_BITLEN));
- return xfs_extent_state(xfs_bmbt_disk_get_blockcount(r),
- ext_flag);
-}
-#endif /* XFS_NATIVE_HOST */
-
-
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int /* error */
-xfs_bmbt_increment(
- xfs_btree_cur_t *cur,
- int level,
- int *stat) /* success/failure */
-{
- xfs_bmbt_block_t *block;
- xfs_buf_t *bp;
- int error; /* error return value */
-#ifdef XFS_BMBT_TRACE
- static char fname[] = "xfs_bmbt_increment";
-#endif
- xfs_fsblock_t fsbno;
- int lev;
- xfs_mount_t *mp;
- xfs_trans_t *tp;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGI(cur, level);
- ASSERT(level < cur->bc_nlevels);
- if (level < cur->bc_nlevels - 1)
- xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
- block = xfs_bmbt_get_block(cur, level, &bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
- }
- if (be64_to_cpu(block->bb_rightsib) == NULLDFSBNO) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
- block = xfs_bmbt_get_block(cur, lev, &bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
- break;
- if (lev < cur->bc_nlevels - 1)
- xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
- }
- if (lev == cur->bc_nlevels) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- tp = cur->bc_tp;
- mp = cur->bc_mp;
- for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
- fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
- if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
- XFS_BMAP_BTREE_REF))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- lev--;
- xfs_btree_setbuf(cur, lev, bp);
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- cur->bc_ptrs[lev] = 1;
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
-}
-
-/*
- * Insert the current record at the point referenced by cur.
- */
-int /* error */
-xfs_bmbt_insert(
- xfs_btree_cur_t *cur,
- int *stat) /* success/failure */
-{
- int error; /* error return value */
-#ifdef XFS_BMBT_TRACE
- static char fname[] = "xfs_bmbt_insert";
-#endif
- int i;
- int level;
- xfs_fsblock_t nbno;
- xfs_btree_cur_t *ncur;
- xfs_bmbt_rec_t nrec;
- xfs_btree_cur_t *pcur;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- level = 0;
- nbno = NULLFSBLOCK;
- xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
- ncur = (xfs_btree_cur_t *)0;
- pcur = cur;
- do {
- if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur,
- &i))) {
- if (pcur != cur)
- xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
- cur->bc_nlevels = pcur->bc_nlevels;
- cur->bc_private.b.allocated +=
- pcur->bc_private.b.allocated;
- pcur->bc_private.b.allocated = 0;
- ASSERT((cur->bc_private.b.firstblock != NULLFSBLOCK) ||
- (cur->bc_private.b.ip->i_d.di_flags &
- XFS_DIFLAG_REALTIME));
- cur->bc_private.b.firstblock =
- pcur->bc_private.b.firstblock;
- ASSERT(cur->bc_private.b.flist ==
- pcur->bc_private.b.flist);
- xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
- }
- if (ncur) {
- pcur = ncur;
- ncur = (xfs_btree_cur_t *)0;
- }
- } while (nbno != NULLFSBLOCK);
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = i;
- return 0;
-error0:
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
+ return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
+ xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
}
-/*
- * Log fields from the btree block header.
- */
-void
-xfs_bmbt_log_block(
- xfs_btree_cur_t *cur,
- xfs_buf_t *bp,
- int fields)
-{
- int first;
-#ifdef XFS_BMBT_TRACE
- static char fname[] = "xfs_bmbt_log_block";
-#endif
- int last;
- xfs_trans_t *tp;
- static const short offsets[] = {
- offsetof(xfs_bmbt_block_t, bb_magic),
- offsetof(xfs_bmbt_block_t, bb_level),
- offsetof(xfs_bmbt_block_t, bb_numrecs),
- offsetof(xfs_bmbt_block_t, bb_leftsib),
- offsetof(xfs_bmbt_block_t, bb_rightsib),
- sizeof(xfs_bmbt_block_t)
- };
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGBI(cur, bp, fields);
- tp = cur->bc_tp;
- if (bp) {
- xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first,
- &last);
- xfs_trans_log_buf(tp, bp, first, last);
- } else
- xfs_trans_log_inode(tp, cur->bc_private.b.ip,
- XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-}
/*
- * Log record values from the btree block.
+ * Set all the fields in a bmap extent record from the arguments.
*/
void
-xfs_bmbt_log_recs(
- xfs_btree_cur_t *cur,
- xfs_buf_t *bp,
- int rfirst,
- int rlast)
-{
- xfs_bmbt_block_t *block;
- int first;
-#ifdef XFS_BMBT_TRACE
- static char fname[] = "xfs_bmbt_log_recs";
-#endif
- int last;
- xfs_bmbt_rec_t *rp;
- xfs_trans_t *tp;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGBII(cur, bp, rfirst, rlast);
- ASSERT(bp);
- tp = cur->bc_tp;
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- rp = XFS_BMAP_REC_DADDR(block, 1, cur);
- first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
- last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
- xfs_trans_log_buf(tp, bp, first, last);
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-}
-
-int /* error */
-xfs_bmbt_lookup_eq(
- xfs_btree_cur_t *cur,
- xfs_fileoff_t off,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- int *stat) /* success/failure */
-{
- cur->bc_rec.b.br_startoff = off;
- cur->bc_rec.b.br_startblock = bno;
- cur->bc_rec.b.br_blockcount = len;
- return xfs_bmbt_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
-
-int /* error */
-xfs_bmbt_lookup_ge(
- xfs_btree_cur_t *cur,
- xfs_fileoff_t off,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- int *stat) /* success/failure */
-{
- cur->bc_rec.b.br_startoff = off;
- cur->bc_rec.b.br_startblock = bno;
- cur->bc_rec.b.br_blockcount = len;
- return xfs_bmbt_lookup(cur, XFS_LOOKUP_GE, stat);
-}
-
-/*
- * Give the bmap btree a new root block. Copy the old broot contents
- * down into a real block and make the broot point to it.
- */
-int /* error */
-xfs_bmbt_newroot(
- xfs_btree_cur_t *cur, /* btree cursor */
- int *logflags, /* logging flags for inode */
- int *stat) /* return status - 0 fail */
+xfs_bmbt_set_allf(
+ xfs_bmbt_rec_host_t *r,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount,
+ xfs_exntst_t state)
{
- xfs_alloc_arg_t args; /* allocation arguments */
- xfs_bmbt_block_t *block; /* bmap btree block */
- xfs_buf_t *bp; /* buffer for block */
- xfs_bmbt_block_t *cblock; /* child btree block */
- xfs_bmbt_key_t *ckp; /* child key pointer */
- xfs_bmbt_ptr_t *cpp; /* child ptr pointer */
- int error; /* error return code */
-#ifdef XFS_BMBT_TRACE
- static char fname[] = "xfs_bmbt_newroot";
-#endif
-#ifdef DEBUG
- int i; /* loop counter */
-#endif
- xfs_bmbt_key_t *kp; /* pointer to bmap btree key */
- int level; /* btree level */
- xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- level = cur->bc_nlevels - 1;
- block = xfs_bmbt_get_block(cur, level, &bp);
- /*
- * Copy the root into a real block.
- */
- args.mp = cur->bc_mp;
- pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
- args.tp = cur->bc_tp;
- args.fsbno = cur->bc_private.b.firstblock;
- args.mod = args.minleft = args.alignment = args.total = args.isfl =
- args.userdata = args.minalignslop = 0;
- args.minlen = args.maxlen = args.prod = 1;
- args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
- if (args.fsbno == NULLFSBLOCK) {
-#ifdef DEBUG
- if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- args.fsbno = INT_GET(*pp, ARCH_CONVERT);
- args.type = XFS_ALLOCTYPE_START_BNO;
- } else if (args.wasdel)
- args.type = XFS_ALLOCTYPE_FIRST_AG;
- else
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- if ((error = xfs_alloc_vextent(&args))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- if (args.fsbno == NULLFSBLOCK) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- ASSERT(args.len == 1);
- cur->bc_private.b.firstblock = args.fsbno;
- cur->bc_private.b.allocated++;
- cur->bc_private.b.ip->i_d.di_nblocks++;
- XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
- XFS_TRANS_DQ_BCOUNT, 1L);
- bp = xfs_btree_get_bufl(args.mp, cur->bc_tp, args.fsbno, 0);
- cblock = XFS_BUF_TO_BMBT_BLOCK(bp);
- *cblock = *block;
- be16_add(&block->bb_level, 1);
- block->bb_numrecs = cpu_to_be16(1);
- cur->bc_nlevels++;
- cur->bc_ptrs[level + 1] = 1;
- kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
- ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
- memcpy(ckp, kp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*kp));
- cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
-#ifdef DEBUG
- for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
- if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- }
-#endif
- memcpy(cpp, pp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*pp));
-#ifdef DEBUG
- if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)args.fsbno,
- level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- INT_SET(*pp, ARCH_CONVERT, args.fsbno);
- xfs_iroot_realloc(cur->bc_private.b.ip, 1 - be16_to_cpu(cblock->bb_numrecs),
- cur->bc_private.b.whichfork);
- xfs_btree_setbuf(cur, level, bp);
- /*
- * Do all this logging at the end so that
- * the root is at the right level.
- */
- xfs_bmbt_log_block(cur, bp, XFS_BB_ALL_BITS);
- xfs_bmbt_log_keys(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs));
- xfs_bmbt_log_ptrs(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs));
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *logflags |=
- XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
- *stat = 1;
- return 0;
-}
+ int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
-/*
- * Set all the fields in a bmap extent record from the uncompressed form.
- */
-void
-xfs_bmbt_set_all(
- xfs_bmbt_rec_t *r,
- xfs_bmbt_irec_t *s)
-{
- int extent_flag;
+ ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
+ ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
+ ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
- ASSERT((s->br_state == XFS_EXT_NORM) ||
- (s->br_state == XFS_EXT_UNWRITTEN));
- extent_flag = (s->br_state == XFS_EXT_NORM) ? 0 : 1;
- ASSERT((s->br_startoff & XFS_MASK64HI(9)) == 0);
- ASSERT((s->br_blockcount & XFS_MASK64HI(43)) == 0);
#if XFS_BIG_BLKNOS
- ASSERT((s->br_startblock & XFS_MASK64HI(12)) == 0);
+ ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+
r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
- ((xfs_bmbt_rec_base_t)s->br_startblock >> 43);
- r->l1 = ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
- ((xfs_bmbt_rec_base_t)s->br_blockcount &
- (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+ ((xfs_bmbt_rec_base_t)startoff << 9) |
+ ((xfs_bmbt_rec_base_t)startblock >> 43);
+ r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
+ ((xfs_bmbt_rec_base_t)blockcount &
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
#else /* !XFS_BIG_BLKNOS */
- if (ISNULLSTARTBLOCK(s->br_startblock)) {
+ if (isnullstartblock(startblock)) {
r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
- (xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
- r->l1 = XFS_MASK64HI(11) |
- ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
- ((xfs_bmbt_rec_base_t)s->br_blockcount &
- (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+ ((xfs_bmbt_rec_base_t)startoff << 9) |
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
+ r->l1 = xfs_mask64hi(11) |
+ ((xfs_bmbt_rec_base_t)startblock << 21) |
+ ((xfs_bmbt_rec_base_t)blockcount &
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
} else {
r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)s->br_startoff << 9);
- r->l1 = ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
- ((xfs_bmbt_rec_base_t)s->br_blockcount &
- (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+ ((xfs_bmbt_rec_base_t)startoff << 9);
+ r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
+ ((xfs_bmbt_rec_base_t)blockcount &
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
}
#endif /* XFS_BIG_BLKNOS */
}
/*
- * Set all the fields in a bmap extent record from the arguments.
+ * Set all the fields in a bmap extent record from the uncompressed form.
*/
void
-xfs_bmbt_set_allf(
- xfs_bmbt_rec_t *r,
- xfs_fileoff_t o,
- xfs_fsblock_t b,
- xfs_filblks_t c,
- xfs_exntst_t v)
+xfs_bmbt_set_all(
+ xfs_bmbt_rec_host_t *r,
+ xfs_bmbt_irec_t *s)
{
- int extent_flag;
-
- ASSERT((v == XFS_EXT_NORM) || (v == XFS_EXT_UNWRITTEN));
- extent_flag = (v == XFS_EXT_NORM) ? 0 : 1;
- ASSERT((o & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0);
- ASSERT((c & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
-#if XFS_BIG_BLKNOS
- ASSERT((b & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0);
- r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)o << 9) |
- ((xfs_bmbt_rec_base_t)b >> 43);
- r->l1 = ((xfs_bmbt_rec_base_t)b << 21) |
- ((xfs_bmbt_rec_base_t)c &
- (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
-#else /* !XFS_BIG_BLKNOS */
- if (ISNULLSTARTBLOCK(b)) {
- r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)o << 9) |
- (xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
- r->l1 = XFS_MASK64HI(11) |
- ((xfs_bmbt_rec_base_t)b << 21) |
- ((xfs_bmbt_rec_base_t)c &
- (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
- } else {
- r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)o << 9);
- r->l1 = ((xfs_bmbt_rec_base_t)b << 21) |
- ((xfs_bmbt_rec_base_t)c &
- (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
- }
-#endif /* XFS_BIG_BLKNOS */
+ xfs_bmbt_set_allf(r, s->br_startoff, s->br_startblock,
+ s->br_blockcount, s->br_state);
}
-#ifndef XFS_NATIVE_HOST
+
/*
- * Set all the fields in a bmap extent record from the uncompressed form.
+ * Set all the fields in a disk format bmap extent record from the arguments.
*/
void
-xfs_bmbt_disk_set_all(
- xfs_bmbt_rec_t *r,
- xfs_bmbt_irec_t *s)
+xfs_bmbt_disk_set_allf(
+ xfs_bmbt_rec_t *r,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount,
+ xfs_exntst_t state)
{
- int extent_flag;
+ int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
+
+ ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
+ ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
+ ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
- ASSERT((s->br_state == XFS_EXT_NORM) ||
- (s->br_state == XFS_EXT_UNWRITTEN));
- extent_flag = (s->br_state == XFS_EXT_NORM) ? 0 : 1;
- ASSERT((s->br_startoff & XFS_MASK64HI(9)) == 0);
- ASSERT((s->br_blockcount & XFS_MASK64HI(43)) == 0);
#if XFS_BIG_BLKNOS
- ASSERT((s->br_startblock & XFS_MASK64HI(12)) == 0);
- INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
- ((xfs_bmbt_rec_base_t)s->br_startblock >> 43));
- INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
- ((xfs_bmbt_rec_base_t)s->br_blockcount &
- (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+ ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+
+ r->l0 = cpu_to_be64(
+ ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+ ((xfs_bmbt_rec_base_t)startoff << 9) |
+ ((xfs_bmbt_rec_base_t)startblock >> 43));
+ r->l1 = cpu_to_be64(
+ ((xfs_bmbt_rec_base_t)startblock << 21) |
+ ((xfs_bmbt_rec_base_t)blockcount &
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
#else /* !XFS_BIG_BLKNOS */
- if (ISNULLSTARTBLOCK(s->br_startblock)) {
- INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
- (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
- INT_SET(r->l1, ARCH_CONVERT, XFS_MASK64HI(11) |
- ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
- ((xfs_bmbt_rec_base_t)s->br_blockcount &
- (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+ if (isnullstartblock(startblock)) {
+ r->l0 = cpu_to_be64(
+ ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+ ((xfs_bmbt_rec_base_t)startoff << 9) |
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
+ r->l1 = cpu_to_be64(xfs_mask64hi(11) |
+ ((xfs_bmbt_rec_base_t)startblock << 21) |
+ ((xfs_bmbt_rec_base_t)blockcount &
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
} else {
- INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)s->br_startoff << 9));
- INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
- ((xfs_bmbt_rec_base_t)s->br_blockcount &
- (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+ r->l0 = cpu_to_be64(
+ ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+ ((xfs_bmbt_rec_base_t)startoff << 9));
+ r->l1 = cpu_to_be64(
+ ((xfs_bmbt_rec_base_t)startblock << 21) |
+ ((xfs_bmbt_rec_base_t)blockcount &
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
}
#endif /* XFS_BIG_BLKNOS */
}
/*
- * Set all the fields in a disk format bmap extent record from the arguments.
+ * Set all the fields in a bmap extent record from the uncompressed form.
*/
-void
-xfs_bmbt_disk_set_allf(
+STATIC void
+xfs_bmbt_disk_set_all(
xfs_bmbt_rec_t *r,
- xfs_fileoff_t o,
- xfs_fsblock_t b,
- xfs_filblks_t c,
- xfs_exntst_t v)
+ xfs_bmbt_irec_t *s)
{
- int extent_flag;
-
- ASSERT((v == XFS_EXT_NORM) || (v == XFS_EXT_UNWRITTEN));
- extent_flag = (v == XFS_EXT_NORM) ? 0 : 1;
- ASSERT((o & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0);
- ASSERT((c & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
-#if XFS_BIG_BLKNOS
- ASSERT((b & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0);
- INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)o << 9) |
- ((xfs_bmbt_rec_base_t)b >> 43));
- INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)b << 21) |
- ((xfs_bmbt_rec_base_t)c &
- (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
-#else /* !XFS_BIG_BLKNOS */
- if (ISNULLSTARTBLOCK(b)) {
- INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)o << 9) |
- (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
- INT_SET(r->l1, ARCH_CONVERT, XFS_MASK64HI(11) |
- ((xfs_bmbt_rec_base_t)b << 21) |
- ((xfs_bmbt_rec_base_t)c &
- (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
- } else {
- INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)o << 9));
- INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)b << 21) |
- ((xfs_bmbt_rec_base_t)c &
- (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
- }
-#endif /* XFS_BIG_BLKNOS */
+ xfs_bmbt_disk_set_allf(r, s->br_startoff, s->br_startblock,
+ s->br_blockcount, s->br_state);
}
-#endif /* XFS_NATIVE_HOST */
/*
* Set the blockcount field in a bmap extent record.
*/
void
xfs_bmbt_set_blockcount(
- xfs_bmbt_rec_t *r,
+ xfs_bmbt_rec_host_t *r,
xfs_filblks_t v)
{
- ASSERT((v & XFS_MASK64HI(43)) == 0);
- r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64HI(43)) |
- (xfs_bmbt_rec_base_t)(v & XFS_MASK64LO(21));
+ ASSERT((v & xfs_mask64hi(43)) == 0);
+ r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) |
+ (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21));
}
/*
@@ -2621,25 +362,25 @@ xfs_bmbt_set_blockcount(
*/
void
xfs_bmbt_set_startblock(
- xfs_bmbt_rec_t *r,
+ xfs_bmbt_rec_host_t *r,
xfs_fsblock_t v)
{
#if XFS_BIG_BLKNOS
- ASSERT((v & XFS_MASK64HI(12)) == 0);
- r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)XFS_MASK64HI(55)) |
+ ASSERT((v & xfs_mask64hi(12)) == 0);
+ r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
(xfs_bmbt_rec_base_t)(v >> 43);
- r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)) |
+ r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
(xfs_bmbt_rec_base_t)(v << 21);
#else /* !XFS_BIG_BLKNOS */
- if (ISNULLSTARTBLOCK(v)) {
- r->l0 |= (xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
- r->l1 = (xfs_bmbt_rec_base_t)XFS_MASK64HI(11) |
+ if (isnullstartblock(v)) {
+ r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
+ r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) |
((xfs_bmbt_rec_base_t)v << 21) |
- (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+ (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
} else {
- r->l0 &= ~(xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
+ r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9);
r->l1 = ((xfs_bmbt_rec_base_t)v << 21) |
- (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+ (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
}
#endif /* XFS_BIG_BLKNOS */
}
@@ -2649,13 +390,13 @@ xfs_bmbt_set_startblock(
*/
void
xfs_bmbt_set_startoff(
- xfs_bmbt_rec_t *r,
+ xfs_bmbt_rec_host_t *r,
xfs_fileoff_t v)
{
- ASSERT((v & XFS_MASK64HI(9)) == 0);
- r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) XFS_MASK64HI(1)) |
+ ASSERT((v & xfs_mask64hi(9)) == 0);
+ r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) |
((xfs_bmbt_rec_base_t)v << 9) |
- (r->l0 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
+ (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
}
/*
@@ -2663,14 +404,14 @@ xfs_bmbt_set_startoff(
*/
void
xfs_bmbt_set_state(
- xfs_bmbt_rec_t *r,
+ xfs_bmbt_rec_host_t *r,
xfs_exntst_t v)
{
ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
if (v == XFS_EXT_NORM)
- r->l0 &= XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN);
+ r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN);
else
- r->l0 |= XFS_MASK64HI(BMBT_EXNTFLAG_BITLEN);
+ r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN);
}
/*
@@ -2678,83 +419,42 @@ xfs_bmbt_set_state(
*/
void
xfs_bmbt_to_bmdr(
- xfs_bmbt_block_t *rblock,
+ struct xfs_mount *mp,
+ struct xfs_btree_block *rblock,
int rblocklen,
xfs_bmdr_block_t *dblock,
int dblocklen)
{
int dmxr;
xfs_bmbt_key_t *fkp;
- xfs_bmbt_ptr_t *fpp;
+ __be64 *fpp;
xfs_bmbt_key_t *tkp;
- xfs_bmbt_ptr_t *tpp;
+ __be64 *tpp;
- ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC);
- ASSERT(be64_to_cpu(rblock->bb_leftsib) == NULLDFSBNO);
- ASSERT(be64_to_cpu(rblock->bb_rightsib) == NULLDFSBNO);
- ASSERT(be16_to_cpu(rblock->bb_level) > 0);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
+ ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid));
+ ASSERT(rblock->bb_u.l.bb_blkno ==
+ cpu_to_be64(XFS_BUF_DADDR_NULL));
+ } else
+ ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
+ ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO));
+ ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO));
+ ASSERT(rblock->bb_level != 0);
dblock->bb_level = rblock->bb_level;
dblock->bb_numrecs = rblock->bb_numrecs;
- dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
- fkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
- tkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
- fpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
- tpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
+ dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
+ fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+ tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+ fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+ tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
dmxr = be16_to_cpu(dblock->bb_numrecs);
memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
- memcpy(tpp, fpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */
-}
-
-/*
- * Update the record to the passed values.
- */
-int
-xfs_bmbt_update(
- xfs_btree_cur_t *cur,
- xfs_fileoff_t off,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- xfs_exntst_t state)
-{
- xfs_bmbt_block_t *block;
- xfs_buf_t *bp;
- int error;
-#ifdef XFS_BMBT_TRACE
- static char fname[] = "xfs_bmbt_update";
-#endif
- xfs_bmbt_key_t key;
- int ptr;
- xfs_bmbt_rec_t *rp;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGFFFI(cur, (xfs_dfiloff_t)off, (xfs_dfsbno_t)bno,
- (xfs_dfilblks_t)len, (int)state);
- block = xfs_bmbt_get_block(cur, 0, &bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- ptr = cur->bc_ptrs[0];
- rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
- xfs_bmbt_disk_set_allf(rp, off, bno, len, state);
- xfs_bmbt_log_recs(cur, bp, ptr, ptr);
- if (ptr > 1) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- return 0;
- }
- INT_SET(key.br_startoff, ARCH_CONVERT, off);
- if ((error = xfs_bmbt_updkey(cur, &key, 1))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- return 0;
+ memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
}
/*
- * Check an extent list, which has just been read, for
+ * Check extent records, which have just been read, for
* any bit in the extent flag field. ASSERT on debug
* kernels, as this condition should not occur.
* Return an error condition (1) if any flags found,
@@ -2763,10 +463,12 @@ xfs_bmbt_update(
int
xfs_check_nostate_extents(
- xfs_bmbt_rec_t *ep,
+ xfs_ifork_t *ifp,
+ xfs_extnum_t idx,
xfs_extnum_t num)
{
- for (; num > 0; num--, ep++) {
+ for (; num > 0; num--, idx++) {
+ xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
if ((ep->l0 >>
(64 - BMBT_EXNTFLAG_BITLEN)) != 0) {
ASSERT(0);
@@ -2775,3 +477,491 @@ xfs_check_nostate_extents(
}
return 0;
}
+
+
+STATIC struct xfs_btree_cur *
+xfs_bmbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_btree_cur *new;
+
+ new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+
+ /*
+ * Copy the firstblock, flist, and flags values,
+ * since init cursor doesn't get them.
+ */
+ new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
+ new->bc_private.b.flist = cur->bc_private.b.flist;
+ new->bc_private.b.flags = cur->bc_private.b.flags;
+
+ return new;
+}
+
+STATIC void
+xfs_bmbt_update_cursor(
+ struct xfs_btree_cur *src,
+ struct xfs_btree_cur *dst)
+{
+ ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
+ (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
+ ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
+
+ dst->bc_private.b.allocated += src->bc_private.b.allocated;
+ dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
+
+ src->bc_private.b.allocated = 0;
+}
+
+STATIC int
+xfs_bmbt_alloc_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ xfs_alloc_arg_t args; /* block allocation args */
+ int error; /* error return value */
+
+ memset(&args, 0, sizeof(args));
+ args.tp = cur->bc_tp;
+ args.mp = cur->bc_mp;
+ args.fsbno = cur->bc_private.b.firstblock;
+ args.firstblock = args.fsbno;
+
+ if (args.fsbno == NULLFSBLOCK) {
+ args.fsbno = be64_to_cpu(start->l);
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ /*
+ * Make sure there is sufficient room left in the AG to
+ * complete a full tree split for an extent insert. If
+ * we are converting the middle part of an extent then
+ * we may need space for two tree splits.
+ *
+ * We are relying on the caller to make the correct block
+ * reservation for this operation to succeed. If the
+ * reservation amount is insufficient then we may fail a
+ * block allocation here and corrupt the filesystem.
+ */
+ args.minleft = xfs_trans_get_block_res(args.tp);
+ } else if (cur->bc_private.b.flist->xbf_low) {
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ } else {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ }
+
+ args.minlen = args.maxlen = args.prod = 1;
+ args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+ if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
+ error = XFS_ERROR(ENOSPC);
+ goto error0;
+ }
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto error0;
+
+ if (args.fsbno == NULLFSBLOCK && args.minleft) {
+ /*
+ * Could not find an AG with enough free space to satisfy
+ * a full btree split. Try again without minleft and if
+ * successful activate the lowspace algorithm.
+ */
+ args.fsbno = 0;
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ args.minleft = 0;
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto error0;
+ cur->bc_private.b.flist->xbf_low = 1;
+ }
+ if (args.fsbno == NULLFSBLOCK) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+ ASSERT(args.len == 1);
+ cur->bc_private.b.firstblock = args.fsbno;
+ cur->bc_private.b.allocated++;
+ cur->bc_private.b.ip->i_d.di_nblocks++;
+ xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
+ xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
+ XFS_TRANS_DQ_BCOUNT, 1L);
+
+ new->l = cpu_to_be64(args.fsbno);
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+ error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+STATIC int
+xfs_bmbt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip = cur->bc_private.b.ip;
+ struct xfs_trans *tp = cur->bc_tp;
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+
+ xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
+ ip->i_d.di_nblocks--;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+ xfs_trans_binval(tp, bp);
+ return 0;
+}
+
+STATIC int
+xfs_bmbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp;
+
+ ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+ cur->bc_private.b.whichfork);
+
+ return xfs_bmbt_maxrecs(cur->bc_mp,
+ ifp->if_broot_bytes, level == 0) / 2;
+ }
+
+ return cur->bc_mp->m_bmap_dmnr[level != 0];
+}
+
+int
+xfs_bmbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp;
+
+ ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+ cur->bc_private.b.whichfork);
+
+ return xfs_bmbt_maxrecs(cur->bc_mp,
+ ifp->if_broot_bytes, level == 0);
+ }
+
+ return cur->bc_mp->m_bmap_dmxr[level != 0];
+
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but
+ * for the root node this checks the available space in the dinode fork
+ * so that we can resize the in-memory buffer to match it. After a
+ * resize to the maximum size this function returns the same value
+ * as xfs_bmbt_get_maxrecs for the root node, too.
+ */
+STATIC int
+xfs_bmbt_get_dmaxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level != cur->bc_nlevels - 1)
+ return cur->bc_mp->m_bmap_dmxr[level != 0];
+ return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0);
+}
+
+STATIC void
+xfs_bmbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ key->bmbt.br_startoff =
+ cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
+}
+
+STATIC void
+xfs_bmbt_init_rec_from_key(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ ASSERT(key->bmbt.br_startoff != 0);
+
+ xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff),
+ 0, 0, XFS_EXT_NORM);
+}
+
+STATIC void
+xfs_bmbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
+}
+
+STATIC void
+xfs_bmbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ ptr->l = 0;
+}
+
+STATIC __int64_t
+xfs_bmbt_key_diff(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
+{
+ return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) -
+ cur->bc_rec.b.br_startoff;
+}
+
+static bool
+xfs_bmbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ unsigned int level;
+
+ switch (block->bb_magic) {
+ case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn)
+ return false;
+ /*
+ * XXX: need a better way of verifying the owner here. Right now
+ * just make sure there has been one set.
+ */
+ if (be64_to_cpu(block->bb_u.l.bb_owner) == 0)
+ return false;
+ /* fall through */
+ case cpu_to_be32(XFS_BMAP_MAGIC):
+ break;
+ default:
+ return false;
+ }
+
+ /*
+ * numrecs and level verification.
+ *
+ * We don't know what fork we belong to, so just verify that the level
+ * is less than the maximum of the two. Later checks will be more
+ * precise.
+ */
+ level = be16_to_cpu(block->bb_level);
+ if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]))
+ return false;
+ if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+ return false;
+
+ /* sibling pointer verification */
+ if (!block->bb_u.l.bb_leftsib ||
+ (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) &&
+ !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))))
+ return false;
+ if (!block->bb_u.l.bb_rightsib ||
+ (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) &&
+ !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))))
+ return false;
+
+ return true;
+}
+
+static void
+xfs_bmbt_read_verify(
+ struct xfs_buf *bp)
+{
+ if (!xfs_btree_lblock_verify_crc(bp))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_bmbt_verify(bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp);
+ }
+}
+
+static void
+xfs_bmbt_write_verify(
+ struct xfs_buf *bp)
+{
+ if (!xfs_bmbt_verify(bp)) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+ xfs_btree_lblock_calc_crc(bp);
+}
+
+const struct xfs_buf_ops xfs_bmbt_buf_ops = {
+ .verify_read = xfs_bmbt_read_verify,
+ .verify_write = xfs_bmbt_write_verify,
+};
+
+
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int
+xfs_bmbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ return be64_to_cpu(k1->bmbt.br_startoff) <
+ be64_to_cpu(k2->bmbt.br_startoff);
+}
+
+STATIC int
+xfs_bmbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *r1,
+ union xfs_btree_rec *r2)
+{
+ return xfs_bmbt_disk_get_startoff(&r1->bmbt) +
+ xfs_bmbt_disk_get_blockcount(&r1->bmbt) <=
+ xfs_bmbt_disk_get_startoff(&r2->bmbt);
+}
+#endif /* DEBUG */
+
+static const struct xfs_btree_ops xfs_bmbt_ops = {
+ .rec_len = sizeof(xfs_bmbt_rec_t),
+ .key_len = sizeof(xfs_bmbt_key_t),
+
+ .dup_cursor = xfs_bmbt_dup_cursor,
+ .update_cursor = xfs_bmbt_update_cursor,
+ .alloc_block = xfs_bmbt_alloc_block,
+ .free_block = xfs_bmbt_free_block,
+ .get_maxrecs = xfs_bmbt_get_maxrecs,
+ .get_minrecs = xfs_bmbt_get_minrecs,
+ .get_dmaxrecs = xfs_bmbt_get_dmaxrecs,
+ .init_key_from_rec = xfs_bmbt_init_key_from_rec,
+ .init_rec_from_key = xfs_bmbt_init_rec_from_key,
+ .init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
+ .key_diff = xfs_bmbt_key_diff,
+ .buf_ops = &xfs_bmbt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+ .keys_inorder = xfs_bmbt_keys_inorder,
+ .recs_inorder = xfs_bmbt_recs_inorder,
+#endif
+};
+
+/*
+ * Allocate a new bmap btree cursor.
+ */
+struct xfs_btree_cur * /* new bmap btree cursor */
+xfs_bmbt_init_cursor(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* inode owning the btree */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_btree_cur *cur;
+
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+ cur->bc_btnum = XFS_BTNUM_BMAP;
+ cur->bc_blocklog = mp->m_sb.sb_blocklog;
+
+ cur->bc_ops = &xfs_bmbt_ops;
+ cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+ cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
+ cur->bc_private.b.ip = ip;
+ cur->bc_private.b.firstblock = NULLFSBLOCK;
+ cur->bc_private.b.flist = NULL;
+ cur->bc_private.b.allocated = 0;
+ cur->bc_private.b.flags = 0;
+ cur->bc_private.b.whichfork = whichfork;
+
+ return cur;
+}
+
+/*
+ * Calculate number of records in a bmap btree block.
+ */
+int
+xfs_bmbt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
+{
+ blocklen -= XFS_BMBT_BLOCK_LEN(mp);
+
+ if (leaf)
+ return blocklen / sizeof(xfs_bmbt_rec_t);
+ return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+}
+
+/*
+ * Calculate number of records in a bmap btree inode root.
+ */
+int
+xfs_bmdr_maxrecs(
+ int blocklen,
+ int leaf)
+{
+ blocklen -= sizeof(xfs_bmdr_block_t);
+
+ if (leaf)
+ return blocklen / sizeof(xfs_bmdr_rec_t);
+ return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
+}
+
+/*
+ * Change the owner of a btree format fork fo the inode passed in. Change it to
+ * the owner of that is passed in so that we can change owners before or after
+ * we switch forks between inodes. The operation that the caller is doing will
+ * determine whether is needs to change owner before or after the switch.
+ *
+ * For demand paged transactional modification, the fork switch should be done
+ * after reading in all the blocks, modifying them and pinning them in the
+ * transaction. For modification when the buffers are already pinned in memory,
+ * the fork switch can be done before changing the owner as we won't need to
+ * validate the owner until the btree buffers are unpinned and writes can occur
+ * again.
+ *
+ * For recovery based ownership change, there is no transactional context and
+ * so a buffer list must be supplied so that we can record the buffers that we
+ * modified for the caller to issue IO on.
+ */
+int
+xfs_bmbt_change_owner(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_ino_t new_owner,
+ struct list_head *buffer_list)
+{
+ struct xfs_btree_cur *cur;
+ int error;
+
+ ASSERT(tp || buffer_list);
+ ASSERT(!(tp && buffer_list));
+ if (whichfork == XFS_DATA_FORK)
+ ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE);
+ else
+ ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE);
+
+ cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
+ if (!cur)
+ return ENOMEM;
+
+ error = xfs_btree_change_owner(cur, new_owner, buffer_list);
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ return error;
+}
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index e095a2d344a..819a8a4dee9 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -18,368 +18,126 @@
#ifndef __XFS_BMAP_BTREE_H__
#define __XFS_BMAP_BTREE_H__
-#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
-
struct xfs_btree_cur;
-struct xfs_btree_lblock;
+struct xfs_btree_block;
struct xfs_mount;
struct xfs_inode;
-
-/*
- * Bmap root header, on-disk form only.
- */
-typedef struct xfs_bmdr_block {
- __be16 bb_level; /* 0 is a leaf */
- __be16 bb_numrecs; /* current # of data records */
-} xfs_bmdr_block_t;
-
-/*
- * Bmap btree record and extent descriptor.
- * For 32-bit kernels,
- * l0:31 is an extent flag (value 1 indicates non-normal).
- * l0:0-30 and l1:9-31 are startoff.
- * l1:0-8, l2:0-31, and l3:21-31 are startblock.
- * l3:0-20 are blockcount.
- * For 64-bit kernels,
- * l0:63 is an extent flag (value 1 indicates non-normal).
- * l0:9-62 are startoff.
- * l0:0-8 and l1:21-63 are startblock.
- * l1:0-20 are blockcount.
- */
-
-#ifndef XFS_NATIVE_HOST
-
-#define BMBT_TOTAL_BITLEN 128 /* 128 bits, 16 bytes */
-#define BMBT_EXNTFLAG_BITOFF 0
-#define BMBT_EXNTFLAG_BITLEN 1
-#define BMBT_STARTOFF_BITOFF (BMBT_EXNTFLAG_BITOFF + BMBT_EXNTFLAG_BITLEN)
-#define BMBT_STARTOFF_BITLEN 54
-#define BMBT_STARTBLOCK_BITOFF (BMBT_STARTOFF_BITOFF + BMBT_STARTOFF_BITLEN)
-#define BMBT_STARTBLOCK_BITLEN 52
-#define BMBT_BLOCKCOUNT_BITOFF \
- (BMBT_STARTBLOCK_BITOFF + BMBT_STARTBLOCK_BITLEN)
-#define BMBT_BLOCKCOUNT_BITLEN (BMBT_TOTAL_BITLEN - BMBT_BLOCKCOUNT_BITOFF)
-
-#else
-
-#define BMBT_TOTAL_BITLEN 128 /* 128 bits, 16 bytes */
-#define BMBT_EXNTFLAG_BITOFF 63
-#define BMBT_EXNTFLAG_BITLEN 1
-#define BMBT_STARTOFF_BITOFF (BMBT_EXNTFLAG_BITOFF - BMBT_STARTOFF_BITLEN)
-#define BMBT_STARTOFF_BITLEN 54
-#define BMBT_STARTBLOCK_BITOFF 85 /* 128 - 43 (other 9 is in first word) */
-#define BMBT_STARTBLOCK_BITLEN 52
-#define BMBT_BLOCKCOUNT_BITOFF 64 /* Start of second 64 bit container */
-#define BMBT_BLOCKCOUNT_BITLEN 21
-
-#endif /* XFS_NATIVE_HOST */
-
-
-#define BMBT_USE_64 1
-
-typedef struct xfs_bmbt_rec_32
-{
- __uint32_t l0, l1, l2, l3;
-} xfs_bmbt_rec_32_t;
-typedef struct xfs_bmbt_rec_64
-{
- __uint64_t l0, l1;
-} xfs_bmbt_rec_64_t;
-
-typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */
-typedef xfs_bmbt_rec_64_t xfs_bmbt_rec_t, xfs_bmdr_rec_t;
-
-/*
- * Values and macros for delayed-allocation startblock fields.
- */
-#define STARTBLOCKVALBITS 17
-#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20)
-#define DSTARTBLOCKMASKBITS (15 + 20)
-#define STARTBLOCKMASK \
- (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
-#define DSTARTBLOCKMASK \
- (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
-
-#define ISNULLSTARTBLOCK(x) isnullstartblock(x)
-static inline int isnullstartblock(xfs_fsblock_t x)
-{
- return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
-}
-
-#define ISNULLDSTARTBLOCK(x) isnulldstartblock(x)
-static inline int isnulldstartblock(xfs_dfsbno_t x)
-{
- return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
-}
-
-#define NULLSTARTBLOCK(k) nullstartblock(k)
-static inline xfs_fsblock_t nullstartblock(int k)
-{
- ASSERT(k < (1 << STARTBLOCKVALBITS));
- return STARTBLOCKMASK | (k);
-}
-
-#define STARTBLOCKVAL(x) startblockval(x)
-static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
-{
- return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
-}
-
-/*
- * Possible extent formats.
- */
-typedef enum {
- XFS_EXTFMT_NOSTATE = 0,
- XFS_EXTFMT_HASSTATE
-} xfs_exntfmt_t;
-
-/*
- * Possible extent states.
- */
-typedef enum {
- XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
- XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
-} xfs_exntst_t;
+struct xfs_trans;
/*
* Extent state and extent format macros.
*/
#define XFS_EXTFMT_INODE(x) \
- (XFS_SB_VERSION_HASEXTFLGBIT(&((x)->i_mount->m_sb)) ? \
+ (xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \
XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
#define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN)
/*
- * Incore version of above.
+ * Btree block header size depends on a superblock flag.
*/
-typedef struct xfs_bmbt_irec
-{
- xfs_fileoff_t br_startoff; /* starting file offset */
- xfs_fsblock_t br_startblock; /* starting block number */
- xfs_filblks_t br_blockcount; /* number of blocks */
- xfs_exntst_t br_state; /* extent state */
-} xfs_bmbt_irec_t;
-
-/*
- * Key structure for non-leaf levels of the tree.
- */
-typedef struct xfs_bmbt_key
-{
- xfs_dfiloff_t br_startoff; /* starting file offset */
-} xfs_bmbt_key_t, xfs_bmdr_key_t;
-
-typedef xfs_dfsbno_t xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* btree pointer type */
- /* btree block header type */
-typedef struct xfs_btree_lblock xfs_bmbt_block_t;
+#define XFS_BMBT_BLOCK_LEN(mp) \
+ (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN)
-#define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)XFS_BUF_PTR(bp))
+#define XFS_BMBT_REC_ADDR(mp, block, index) \
+ ((xfs_bmbt_rec_t *) \
+ ((char *)(block) + \
+ XFS_BMBT_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_bmbt_rec_t)))
-#define XFS_BMAP_IBLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog)
-#define XFS_BMAP_RBLOCK_DSIZE(lev,cur) ((cur)->bc_private.b.forksize)
-#define XFS_BMAP_RBLOCK_ISIZE(lev,cur) \
- ((int)XFS_IFORK_PTR((cur)->bc_private.b.ip, \
- (cur)->bc_private.b.whichfork)->if_broot_bytes)
+#define XFS_BMBT_KEY_ADDR(mp, block, index) \
+ ((xfs_bmbt_key_t *) \
+ ((char *)(block) + \
+ XFS_BMBT_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_bmbt_key_t)))
-#define XFS_BMAP_BLOCK_DSIZE(lev,cur) \
- (((lev) == (cur)->bc_nlevels - 1 ? \
- XFS_BMAP_RBLOCK_DSIZE(lev,cur) : XFS_BMAP_IBLOCK_SIZE(lev,cur)))
-#define XFS_BMAP_BLOCK_ISIZE(lev,cur) \
- (((lev) == (cur)->bc_nlevels - 1 ? \
- XFS_BMAP_RBLOCK_ISIZE(lev,cur) : XFS_BMAP_IBLOCK_SIZE(lev,cur)))
+#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
+ ((xfs_bmbt_ptr_t *) \
+ ((char *)(block) + \
+ XFS_BMBT_BLOCK_LEN(mp) + \
+ (maxrecs) * sizeof(xfs_bmbt_key_t) + \
+ ((index) - 1) * sizeof(xfs_bmbt_ptr_t)))
-#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) \
- (((lev) == (cur)->bc_nlevels - 1 ? \
- XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \
- xfs_bmdr, (lev) == 0) : \
- ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0])))
-#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) \
- (((lev) == (cur)->bc_nlevels - 1 ? \
- XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\
- xfs_bmbt, (lev) == 0) : \
- ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0])))
+#define XFS_BMDR_REC_ADDR(block, index) \
+ ((xfs_bmdr_rec_t *) \
+ ((char *)(block) + \
+ sizeof(struct xfs_bmdr_block) + \
+ ((index) - 1) * sizeof(xfs_bmdr_rec_t)))
-#define XFS_BMAP_BLOCK_DMINRECS(lev,cur) \
- (((lev) == (cur)->bc_nlevels - 1 ? \
- XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur),\
- xfs_bmdr, (lev) == 0) : \
- ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0])))
-#define XFS_BMAP_BLOCK_IMINRECS(lev,cur) \
- (((lev) == (cur)->bc_nlevels - 1 ? \
- XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\
- xfs_bmbt, (lev) == 0) : \
- ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0])))
+#define XFS_BMDR_KEY_ADDR(block, index) \
+ ((xfs_bmdr_key_t *) \
+ ((char *)(block) + \
+ sizeof(struct xfs_bmdr_block) + \
+ ((index) - 1) * sizeof(xfs_bmdr_key_t)))
-#define XFS_BMAP_REC_DADDR(bb,i,cur) \
- (XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_DSIZE( \
- be16_to_cpu((bb)->bb_level), cur), \
- xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \
- be16_to_cpu((bb)->bb_level), cur)))
-#define XFS_BMAP_REC_IADDR(bb,i,cur) \
- (XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_ISIZE( \
- be16_to_cpu((bb)->bb_level), cur), \
- xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \
- be16_to_cpu((bb)->bb_level), cur)))
-
-#define XFS_BMAP_KEY_DADDR(bb,i,cur) \
- (XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_DSIZE( \
- be16_to_cpu((bb)->bb_level), cur), \
- xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \
- be16_to_cpu((bb)->bb_level), cur)))
-#define XFS_BMAP_KEY_IADDR(bb,i,cur) \
- (XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_ISIZE( \
- be16_to_cpu((bb)->bb_level), cur), \
- xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \
- be16_to_cpu((bb)->bb_level), cur)))
-
-#define XFS_BMAP_PTR_DADDR(bb,i,cur) \
- (XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_DSIZE( \
- be16_to_cpu((bb)->bb_level), cur), \
- xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \
- be16_to_cpu((bb)->bb_level), cur)))
-#define XFS_BMAP_PTR_IADDR(bb,i,cur) \
- (XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_ISIZE( \
- be16_to_cpu((bb)->bb_level), cur), \
- xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \
- be16_to_cpu((bb)->bb_level), cur)))
+#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
+ ((xfs_bmdr_ptr_t *) \
+ ((char *)(block) + \
+ sizeof(struct xfs_bmdr_block) + \
+ (maxrecs) * sizeof(xfs_bmdr_key_t) + \
+ ((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
/*
* These are to be used when we know the size of the block and
* we don't have a cursor.
*/
-#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) \
- (XFS_BTREE_REC_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz)))
-#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) \
- (XFS_BTREE_KEY_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz)))
-#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) \
- (XFS_BTREE_PTR_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz)))
-
-#define XFS_BMAP_BROOT_NUMRECS(bb) be16_to_cpu((bb)->bb_numrecs)
-#define XFS_BMAP_BROOT_MAXRECS(sz) XFS_BTREE_BLOCK_MAXRECS(sz,xfs_bmbt,0)
+#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
+ XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
-#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \
- (int)(sizeof(xfs_bmbt_block_t) + \
+#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \
+ (int)(XFS_BMBT_BLOCK_LEN(mp) + \
((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
-#define XFS_BMAP_BROOT_SPACE(bb) \
- (XFS_BMAP_BROOT_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
+#define XFS_BMAP_BROOT_SPACE(mp, bb) \
+ (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs)))
#define XFS_BMDR_SPACE_CALC(nrecs) \
(int)(sizeof(xfs_bmdr_block_t) + \
((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+#define XFS_BMAP_BMDR_SPACE(bb) \
+ (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
/*
* Maximum number of bmap btree levels.
*/
#define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)])
-#define XFS_BMAP_SANITY_CHECK(mp,bb,level) \
- (be32_to_cpu((bb)->bb_magic) == XFS_BMAP_MAGIC && \
- be16_to_cpu((bb)->bb_level) == level && \
- be16_to_cpu((bb)->bb_numrecs) > 0 && \
- be16_to_cpu((bb)->bb_numrecs) <= (mp)->m_bmap_dmxr[(level) != 0])
-
-
-#ifdef __KERNEL__
-
-#if defined(XFS_BMBT_TRACE)
-/*
- * Trace buffer entry types.
- */
-#define XFS_BMBT_KTRACE_ARGBI 1
-#define XFS_BMBT_KTRACE_ARGBII 2
-#define XFS_BMBT_KTRACE_ARGFFFI 3
-#define XFS_BMBT_KTRACE_ARGI 4
-#define XFS_BMBT_KTRACE_ARGIFK 5
-#define XFS_BMBT_KTRACE_ARGIFR 6
-#define XFS_BMBT_KTRACE_ARGIK 7
-#define XFS_BMBT_KTRACE_CUR 8
-
-#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */
-#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */
-extern ktrace_t *xfs_bmbt_trace_buf;
-#endif
-
/*
* Prototypes for xfs_bmap.c to call.
*/
-extern void xfs_bmdr_to_bmbt(xfs_bmdr_block_t *, int, xfs_bmbt_block_t *, int);
-extern int xfs_bmbt_decrement(struct xfs_btree_cur *, int, int *);
-extern int xfs_bmbt_delete(struct xfs_btree_cur *, int *);
-extern void xfs_bmbt_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
-extern xfs_bmbt_block_t *xfs_bmbt_get_block(struct xfs_btree_cur *cur,
- int, struct xfs_buf **bpp);
-extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_t *r);
-extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_t *r);
-extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_t *r);
-extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_t *r);
+extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int,
+ struct xfs_btree_block *, int);
+extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
+extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
+extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
+extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
+extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);
-#ifndef XFS_NATIVE_HOST
-extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
-extern xfs_exntst_t xfs_bmbt_disk_get_state(xfs_bmbt_rec_t *r);
extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
-extern xfs_fsblock_t xfs_bmbt_disk_get_startblock(xfs_bmbt_rec_t *r);
extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
-#else
-#define xfs_bmbt_disk_get_all(r, s) xfs_bmbt_get_all(r, s)
-#define xfs_bmbt_disk_get_state(r) xfs_bmbt_get_state(r)
-#define xfs_bmbt_disk_get_blockcount(r) xfs_bmbt_get_blockcount(r)
-#define xfs_bmbt_disk_get_startblock(r) xfs_bmbt_get_blockcount(r)
-#define xfs_bmbt_disk_get_startoff(r) xfs_bmbt_get_startoff(r)
-#endif /* XFS_NATIVE_HOST */
-
-extern int xfs_bmbt_increment(struct xfs_btree_cur *, int, int *);
-extern int xfs_bmbt_insert(struct xfs_btree_cur *, int *);
-extern void xfs_bmbt_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
-extern void xfs_bmbt_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int,
- int);
-extern int xfs_bmbt_lookup_eq(struct xfs_btree_cur *, xfs_fileoff_t,
- xfs_fsblock_t, xfs_filblks_t, int *);
-extern int xfs_bmbt_lookup_ge(struct xfs_btree_cur *, xfs_fileoff_t,
- xfs_fsblock_t, xfs_filblks_t, int *);
-
-/*
- * Give the bmap btree a new root block. Copy the old broot contents
- * down into a real block and make the broot point to it.
- */
-extern int xfs_bmbt_newroot(struct xfs_btree_cur *cur, int *lflags, int *stat);
-extern void xfs_bmbt_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
-extern void xfs_bmbt_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
+extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
+extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o,
xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
-extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_t *r, xfs_filblks_t v);
-extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_t *r, xfs_fsblock_t v);
-extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_t *r, xfs_fileoff_t v);
-extern void xfs_bmbt_set_state(xfs_bmbt_rec_t *r, xfs_exntst_t v);
+extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_host_t *r, xfs_filblks_t v);
+extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v);
+extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v);
+extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v);
-#ifndef XFS_NATIVE_HOST
-extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
-#else
-#define xfs_bmbt_disk_set_all(r, s) xfs_bmbt_set_all(r, s)
-#define xfs_bmbt_disk_set_allf(r, o, b, c, v) xfs_bmbt_set_allf(r, o, b, c, v)
-#endif /* XFS_NATIVE_HOST */
-extern void xfs_bmbt_to_bmdr(xfs_bmbt_block_t *, int, xfs_bmdr_block_t *, int);
-extern int xfs_bmbt_update(struct xfs_btree_cur *, xfs_fileoff_t,
- xfs_fsblock_t, xfs_filblks_t, xfs_exntst_t);
+extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
+ xfs_bmdr_block_t *, int);
-#ifdef DEBUG
-/*
- * Get the data from the pointed-to record.
- */
-extern int xfs_bmbt_get_rec(struct xfs_btree_cur *, xfs_fileoff_t *,
- xfs_fsblock_t *, xfs_filblks_t *,
- xfs_exntst_t *, int *);
-#endif
+extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
+extern int xfs_bmdr_maxrecs(int blocklen, int leaf);
+extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
-/*
- * Search an extent list for the extent which includes block
- * bno.
- */
-xfs_bmbt_rec_t *xfs_bmap_do_search_extents(xfs_bmbt_rec_t *,
- xfs_extnum_t, xfs_extnum_t, xfs_fileoff_t, int *,
- xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *);
+extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, xfs_ino_t new_owner,
+ struct list_head *buffer_list);
-#endif /* __KERNEL__ */
+extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
+ struct xfs_trans *, struct xfs_inode *, int);
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
new file mode 100644
index 00000000000..64731ef3324
--- /dev/null
+++ b/fs/xfs/xfs_bmap_util.c
@@ -0,0 +1,1897 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_extfree_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_log.h"
+#include "xfs_dinode.h"
+
+/* Kernel only BMAP related definitions and functions */
+
+/*
+ * Convert the given file system block to a disk block. We have to treat it
+ * differently based on whether the file is a real time file or not, because the
+ * bmap code does.
+ */
+xfs_daddr_t
+xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
+{
+ return (XFS_IS_REALTIME_INODE(ip) ? \
+ (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
+ XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
+}
+
+/*
+ * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
+ * caller. Frees all the extents that need freeing, which must be done
+ * last due to locking considerations. We never free any extents in
+ * the first transaction.
+ *
+ * Return 1 if the given transaction was committed and a new one
+ * started, and 0 otherwise in the committed parameter.
+ */
+int /* error */
+xfs_bmap_finish(
+ xfs_trans_t **tp, /* transaction pointer addr */
+ xfs_bmap_free_t *flist, /* i/o: list extents to free */
+ int *committed) /* xact committed or not */
+{
+ xfs_efd_log_item_t *efd; /* extent free data */
+ xfs_efi_log_item_t *efi; /* extent free intention */
+ int error; /* error return value */
+ xfs_bmap_free_item_t *free; /* free extent item */
+ struct xfs_trans_res tres; /* new log reservation */
+ xfs_mount_t *mp; /* filesystem mount structure */
+ xfs_bmap_free_item_t *next; /* next item on free list */
+ xfs_trans_t *ntp; /* new transaction pointer */
+
+ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+ if (flist->xbf_count == 0) {
+ *committed = 0;
+ return 0;
+ }
+ ntp = *tp;
+ efi = xfs_trans_get_efi(ntp, flist->xbf_count);
+ for (free = flist->xbf_first; free; free = free->xbfi_next)
+ xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
+ free->xbfi_blockcount);
+
+ tres.tr_logres = ntp->t_log_res;
+ tres.tr_logcount = ntp->t_log_count;
+ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ ntp = xfs_trans_dup(*tp);
+ error = xfs_trans_commit(*tp, 0);
+ *tp = ntp;
+ *committed = 1;
+ /*
+ * We have a new transaction, so we should return committed=1,
+ * even though we're returning an error.
+ */
+ if (error)
+ return error;
+
+ /*
+ * transaction commit worked ok so we can drop the extra ticket
+ * reference that we gained in xfs_trans_dup()
+ */
+ xfs_log_ticket_put(ntp->t_ticket);
+
+ error = xfs_trans_reserve(ntp, &tres, 0, 0);
+ if (error)
+ return error;
+ efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
+ for (free = flist->xbf_first; free != NULL; free = next) {
+ next = free->xbfi_next;
+ if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
+ free->xbfi_blockcount))) {
+ /*
+ * The bmap free list will be cleaned up at a
+ * higher level. The EFI will be canceled when
+ * this transaction is aborted.
+ * Need to force shutdown here to make sure it
+ * happens, since this transaction may not be
+ * dirty yet.
+ */
+ mp = ntp->t_mountp;
+ if (!XFS_FORCED_SHUTDOWN(mp))
+ xfs_force_shutdown(mp,
+ (error == EFSCORRUPTED) ?
+ SHUTDOWN_CORRUPT_INCORE :
+ SHUTDOWN_META_IO_ERROR);
+ return error;
+ }
+ xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
+ free->xbfi_blockcount);
+ xfs_bmap_del_free(flist, NULL, free);
+ }
+ return 0;
+}
+
+int
+xfs_bmap_rtalloc(
+ struct xfs_bmalloca *ap) /* bmap alloc argument struct */
+{
+ xfs_alloctype_t atype = 0; /* type for allocation routines */
+ int error; /* error return value */
+ xfs_mount_t *mp; /* mount point structure */
+ xfs_extlen_t prod = 0; /* product factor for allocators */
+ xfs_extlen_t ralen = 0; /* realtime allocation length */
+ xfs_extlen_t align; /* minimum allocation alignment */
+ xfs_rtblock_t rtb;
+
+ mp = ap->ip->i_mount;
+ align = xfs_get_extsz_hint(ap->ip);
+ prod = align / mp->m_sb.sb_rextsize;
+ error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
+ align, 1, ap->eof, 0,
+ ap->conv, &ap->offset, &ap->length);
+ if (error)
+ return error;
+ ASSERT(ap->length);
+ ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
+
+ /*
+ * If the offset & length are not perfectly aligned
+ * then kill prod, it will just get us in trouble.
+ */
+ if (do_mod(ap->offset, align) || ap->length % align)
+ prod = 1;
+ /*
+ * Set ralen to be the actual requested length in rtextents.
+ */
+ ralen = ap->length / mp->m_sb.sb_rextsize;
+ /*
+ * If the old value was close enough to MAXEXTLEN that
+ * we rounded up to it, cut it back so it's valid again.
+ * Note that if it's a really large request (bigger than
+ * MAXEXTLEN), we don't hear about that number, and can't
+ * adjust the starting point to match it.
+ */
+ if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
+ ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
+
+ /*
+ * Lock out other modifications to the RT bitmap inode.
+ */
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+
+ /*
+ * If it's an allocation to an empty file at offset 0,
+ * pick an extent that will space things out in the rt area.
+ */
+ if (ap->eof && ap->offset == 0) {
+ xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
+
+ error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
+ if (error)
+ return error;
+ ap->blkno = rtx * mp->m_sb.sb_rextsize;
+ } else {
+ ap->blkno = 0;
+ }
+
+ xfs_bmap_adjacent(ap);
+
+ /*
+ * Realtime allocation, done through xfs_rtallocate_extent.
+ */
+ atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
+ do_div(ap->blkno, mp->m_sb.sb_rextsize);
+ rtb = ap->blkno;
+ ap->length = ralen;
+ if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
+ &ralen, atype, ap->wasdel, prod, &rtb)))
+ return error;
+ if (rtb == NULLFSBLOCK && prod > 1 &&
+ (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
+ ap->length, &ralen, atype,
+ ap->wasdel, 1, &rtb)))
+ return error;
+ ap->blkno = rtb;
+ if (ap->blkno != NULLFSBLOCK) {
+ ap->blkno *= mp->m_sb.sb_rextsize;
+ ralen *= mp->m_sb.sb_rextsize;
+ ap->length = ralen;
+ ap->ip->i_d.di_nblocks += ralen;
+ xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+ if (ap->wasdel)
+ ap->ip->i_delayed_blks -= ralen;
+ /*
+ * Adjust the disk quota also. This was reserved
+ * earlier.
+ */
+ xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+ ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
+ XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
+ } else {
+ ap->length = 0;
+ }
+ return 0;
+}
+
+/*
+ * Check if the endoff is outside the last extent. If so the caller will grow
+ * the allocation to a stripe unit boundary. All offsets are considered outside
+ * the end of file for an empty fork, so 1 is returned in *eof in that case.
+ */
+int
+xfs_bmap_eof(
+ struct xfs_inode *ip,
+ xfs_fileoff_t endoff,
+ int whichfork,
+ int *eof)
+{
+ struct xfs_bmbt_irec rec;
+ int error;
+
+ error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
+ if (error || *eof)
+ return error;
+
+ *eof = endoff >= rec.br_startoff + rec.br_blockcount;
+ return 0;
+}
+
+/*
+ * Extent tree block counting routines.
+ */
+
+/*
+ * Count leaf blocks given a range of extent records.
+ */
+STATIC void
+xfs_bmap_count_leaves(
+ xfs_ifork_t *ifp,
+ xfs_extnum_t idx,
+ int numrecs,
+ int *count)
+{
+ int b;
+
+ for (b = 0; b < numrecs; b++) {
+ xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
+ *count += xfs_bmbt_get_blockcount(frp);
+ }
+}
+
+/*
+ * Count leaf blocks given a range of extent records originally
+ * in btree format.
+ */
+STATIC void
+xfs_bmap_disk_count_leaves(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *block,
+ int numrecs,
+ int *count)
+{
+ int b;
+ xfs_bmbt_rec_t *frp;
+
+ for (b = 1; b <= numrecs; b++) {
+ frp = XFS_BMBT_REC_ADDR(mp, block, b);
+ *count += xfs_bmbt_disk_get_blockcount(frp);
+ }
+}
+
+/*
+ * Recursively walks each level of a btree
+ * to count total fsblocks in use.
+ */
+STATIC int /* error */
+xfs_bmap_count_tree(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_fsblock_t blockno, /* file system block number */
+ int levelin, /* level in btree */
+ int *count) /* Count of blocks */
+{
+ int error;
+ xfs_buf_t *bp, *nbp;
+ int level = levelin;
+ __be64 *pp;
+ xfs_fsblock_t bno = blockno;
+ xfs_fsblock_t nextbno;
+ struct xfs_btree_block *block, *nextblock;
+ int numrecs;
+
+ error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ return error;
+ *count += 1;
+ block = XFS_BUF_TO_BLOCK(bp);
+
+ if (--level) {
+ /* Not at node above leaves, count this level of nodes */
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+ while (nextbno != NULLFSBLOCK) {
+ error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ return error;
+ *count += 1;
+ nextblock = XFS_BUF_TO_BLOCK(nbp);
+ nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
+ xfs_trans_brelse(tp, nbp);
+ }
+
+ /* Dive to the next level */
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+ bno = be64_to_cpu(*pp);
+ if (unlikely((error =
+ xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
+ xfs_trans_brelse(tp, bp);
+ XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
+ XFS_ERRLEVEL_LOW, mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ xfs_trans_brelse(tp, bp);
+ } else {
+ /* count all level 1 nodes and their leaves */
+ for (;;) {
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+ numrecs = be16_to_cpu(block->bb_numrecs);
+ xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
+ xfs_trans_brelse(tp, bp);
+ if (nextbno == NULLFSBLOCK)
+ break;
+ bno = nextbno;
+ error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ return error;
+ *count += 1;
+ block = XFS_BUF_TO_BLOCK(bp);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Count fsblocks of the given fork.
+ */
+int /* error */
+xfs_bmap_count_blocks(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode */
+ int whichfork, /* data or attr fork */
+ int *count) /* out: count of blocks */
+{
+ struct xfs_btree_block *block; /* current btree block */
+ xfs_fsblock_t bno; /* block # of "block" */
+ xfs_ifork_t *ifp; /* fork structure */
+ int level; /* btree level, for checking */
+ xfs_mount_t *mp; /* file system mount structure */
+ __be64 *pp; /* pointer to block address */
+
+ bno = NULLFSBLOCK;
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
+ xfs_bmap_count_leaves(ifp, 0,
+ ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
+ count);
+ return 0;
+ }
+
+ /*
+ * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+ */
+ block = ifp->if_broot;
+ level = be16_to_cpu(block->bb_level);
+ ASSERT(level > 0);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+ bno = be64_to_cpu(*pp);
+ ASSERT(bno != NULLDFSBNO);
+ ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+ if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
+ XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
+ mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ return 0;
+}
+
+/*
+ * returns 1 for success, 0 if we failed to map the extent.
+ */
+STATIC int
+xfs_getbmapx_fix_eof_hole(
+ xfs_inode_t *ip, /* xfs incore inode pointer */
+ struct getbmapx *out, /* output structure */
+ int prealloced, /* this is a file with
+ * preallocated data space */
+ __int64_t end, /* last block requested */
+ xfs_fsblock_t startblock)
+{
+ __int64_t fixlen;
+ xfs_mount_t *mp; /* file system mount point */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_extnum_t lastx; /* last extent pointer */
+ xfs_fileoff_t fileblock;
+
+ if (startblock == HOLESTARTBLOCK) {
+ mp = ip->i_mount;
+ out->bmv_block = -1;
+ fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
+ fixlen -= out->bmv_offset;
+ if (prealloced && out->bmv_offset + out->bmv_length == end) {
+ /* Came to hole at EOF. Trim it. */
+ if (fixlen <= 0)
+ return 0;
+ out->bmv_length = fixlen;
+ }
+ } else {
+ if (startblock == DELAYSTARTBLOCK)
+ out->bmv_block = -2;
+ else
+ out->bmv_block = xfs_fsb_to_db(ip, startblock);
+ fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
+ (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
+ out->bmv_oflags |= BMV_OF_LAST;
+ }
+
+ return 1;
+}
+
+/*
+ * Get inode's extents as described in bmv, and format for output.
+ * Calls formatter to fill the user's buffer until all extents
+ * are mapped, until the passed-in bmv->bmv_count slots have
+ * been filled, or until the formatter short-circuits the loop,
+ * if it is tracking filled-in extents on its own.
+ */
+int /* error code */
+xfs_getbmap(
+ xfs_inode_t *ip,
+ struct getbmapx *bmv, /* user bmap structure */
+ xfs_bmap_format_t formatter, /* format to user */
+ void *arg) /* formatter arg */
+{
+ __int64_t bmvend; /* last block requested */
+ int error = 0; /* return value */
+ __int64_t fixlen; /* length for -1 case */
+ int i; /* extent number */
+ int lock; /* lock state */
+ xfs_bmbt_irec_t *map; /* buffer for user's data */
+ xfs_mount_t *mp; /* file system mount point */
+ int nex; /* # of user extents can do */
+ int nexleft; /* # of user extents left */
+ int subnex; /* # of bmapi's can do */
+ int nmap; /* number of map entries */
+ struct getbmapx *out; /* output structure */
+ int whichfork; /* data or attr fork */
+ int prealloced; /* this is a file with
+ * preallocated data space */
+ int iflags; /* interface flags */
+ int bmapi_flags; /* flags for xfs_bmapi */
+ int cur_ext = 0;
+
+ mp = ip->i_mount;
+ iflags = bmv->bmv_iflags;
+ whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
+
+ if (whichfork == XFS_ATTR_FORK) {
+ if (XFS_IFORK_Q(ip)) {
+ if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
+ ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
+ return XFS_ERROR(EINVAL);
+ } else if (unlikely(
+ ip->i_d.di_aformat != 0 &&
+ ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
+ XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
+ ip->i_mount);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ prealloced = 0;
+ fixlen = 1LL << 32;
+ } else {
+ if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
+ ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
+ return XFS_ERROR(EINVAL);
+
+ if (xfs_get_extsz_hint(ip) ||
+ ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
+ prealloced = 1;
+ fixlen = mp->m_super->s_maxbytes;
+ } else {
+ prealloced = 0;
+ fixlen = XFS_ISIZE(ip);
+ }
+ }
+
+ if (bmv->bmv_length == -1) {
+ fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
+ bmv->bmv_length =
+ max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
+ } else if (bmv->bmv_length == 0) {
+ bmv->bmv_entries = 0;
+ return 0;
+ } else if (bmv->bmv_length < 0) {
+ return XFS_ERROR(EINVAL);
+ }
+
+ nex = bmv->bmv_count - 1;
+ if (nex <= 0)
+ return XFS_ERROR(EINVAL);
+ bmvend = bmv->bmv_offset + bmv->bmv_length;
+
+
+ if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
+ return XFS_ERROR(ENOMEM);
+ out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
+ if (!out)
+ return XFS_ERROR(ENOMEM);
+
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ if (whichfork == XFS_DATA_FORK) {
+ if (!(iflags & BMV_IF_DELALLOC) &&
+ (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
+ error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ if (error)
+ goto out_unlock_iolock;
+
+ /*
+ * Even after flushing the inode, there can still be
+ * delalloc blocks on the inode beyond EOF due to
+ * speculative preallocation. These are not removed
+ * until the release function is called or the inode
+ * is inactivated. Hence we cannot assert here that
+ * ip->i_delayed_blks == 0.
+ */
+ }
+
+ lock = xfs_ilock_data_map_shared(ip);
+ } else {
+ lock = xfs_ilock_attr_map_shared(ip);
+ }
+
+ /*
+ * Don't let nex be bigger than the number of extents
+ * we can have assuming alternating holes and real extents.
+ */
+ if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
+ nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
+
+ bmapi_flags = xfs_bmapi_aflag(whichfork);
+ if (!(iflags & BMV_IF_PREALLOC))
+ bmapi_flags |= XFS_BMAPI_IGSTATE;
+
+ /*
+ * Allocate enough space to handle "subnex" maps at a time.
+ */
+ error = ENOMEM;
+ subnex = 16;
+ map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
+ if (!map)
+ goto out_unlock_ilock;
+
+ bmv->bmv_entries = 0;
+
+ if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
+ (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
+ error = 0;
+ goto out_free_map;
+ }
+
+ nexleft = nex;
+
+ do {
+ nmap = (nexleft > subnex) ? subnex : nexleft;
+ error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
+ XFS_BB_TO_FSB(mp, bmv->bmv_length),
+ map, &nmap, bmapi_flags);
+ if (error)
+ goto out_free_map;
+ ASSERT(nmap <= subnex);
+
+ for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
+ out[cur_ext].bmv_oflags = 0;
+ if (map[i].br_state == XFS_EXT_UNWRITTEN)
+ out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
+ else if (map[i].br_startblock == DELAYSTARTBLOCK)
+ out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
+ out[cur_ext].bmv_offset =
+ XFS_FSB_TO_BB(mp, map[i].br_startoff);
+ out[cur_ext].bmv_length =
+ XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+ out[cur_ext].bmv_unused1 = 0;
+ out[cur_ext].bmv_unused2 = 0;
+
+ /*
+ * delayed allocation extents that start beyond EOF can
+ * occur due to speculative EOF allocation when the
+ * delalloc extent is larger than the largest freespace
+ * extent at conversion time. These extents cannot be
+ * converted by data writeback, so can exist here even
+ * if we are not supposed to be finding delalloc
+ * extents.
+ */
+ if (map[i].br_startblock == DELAYSTARTBLOCK &&
+ map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
+ ASSERT((iflags & BMV_IF_DELALLOC) != 0);
+
+ if (map[i].br_startblock == HOLESTARTBLOCK &&
+ whichfork == XFS_ATTR_FORK) {
+ /* came to the end of attribute fork */
+ out[cur_ext].bmv_oflags |= BMV_OF_LAST;
+ goto out_free_map;
+ }
+
+ if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
+ prealloced, bmvend,
+ map[i].br_startblock))
+ goto out_free_map;
+
+ bmv->bmv_offset =
+ out[cur_ext].bmv_offset +
+ out[cur_ext].bmv_length;
+ bmv->bmv_length =
+ max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+
+ /*
+ * In case we don't want to return the hole,
+ * don't increase cur_ext so that we can reuse
+ * it in the next loop.
+ */
+ if ((iflags & BMV_IF_NO_HOLES) &&
+ map[i].br_startblock == HOLESTARTBLOCK) {
+ memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
+ continue;
+ }
+
+ nexleft--;
+ bmv->bmv_entries++;
+ cur_ext++;
+ }
+ } while (nmap && nexleft && bmv->bmv_length);
+
+ out_free_map:
+ kmem_free(map);
+ out_unlock_ilock:
+ xfs_iunlock(ip, lock);
+ out_unlock_iolock:
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ for (i = 0; i < cur_ext; i++) {
+ int full = 0; /* user array is full */
+
+ /* format results & advance arg */
+ error = formatter(&arg, &out[i], &full);
+ if (error || full)
+ break;
+ }
+
+ kmem_free(out);
+ return error;
+}
+
+/*
+ * dead simple method of punching delalyed allocation blocks from a range in
+ * the inode. Walks a block at a time so will be slow, but is only executed in
+ * rare error cases so the overhead is not critical. This will always punch out
+ * both the start and end blocks, even if the ranges only partially overlap
+ * them, so it is up to the caller to ensure that partial blocks are not
+ * passed in.
+ */
+int
+xfs_bmap_punch_delalloc_range(
+ struct xfs_inode *ip,
+ xfs_fileoff_t start_fsb,
+ xfs_fileoff_t length)
+{
+ xfs_fileoff_t remaining = length;
+ int error = 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ do {
+ int done;
+ xfs_bmbt_irec_t imap;
+ int nimaps = 1;
+ xfs_fsblock_t firstblock;
+ xfs_bmap_free_t flist;
+
+ /*
+ * Map the range first and check that it is a delalloc extent
+ * before trying to unmap the range. Otherwise we will be
+ * trying to remove a real extent (which requires a
+ * transaction) or a hole, which is probably a bad idea...
+ */
+ error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
+ XFS_BMAPI_ENTIRE);
+
+ if (error) {
+ /* something screwed, just bail */
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ xfs_alert(ip->i_mount,
+ "Failed delalloc mapping lookup ino %lld fsb %lld.",
+ ip->i_ino, start_fsb);
+ }
+ break;
+ }
+ if (!nimaps) {
+ /* nothing there */
+ goto next_block;
+ }
+ if (imap.br_startblock != DELAYSTARTBLOCK) {
+ /* been converted, ignore */
+ goto next_block;
+ }
+ WARN_ON(imap.br_blockcount == 0);
+
+ /*
+ * Note: while we initialise the firstblock/flist pair, they
+ * should never be used because blocks should never be
+ * allocated or freed for a delalloc extent and hence we need
+ * don't cancel or finish them after the xfs_bunmapi() call.
+ */
+ xfs_bmap_init(&flist, &firstblock);
+ error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
+ &flist, &done);
+ if (error)
+ break;
+
+ ASSERT(!flist.xbf_count && !flist.xbf_first);
+next_block:
+ start_fsb++;
+ remaining--;
+ } while(remaining > 0);
+
+ return error;
+}
+
+/*
+ * Test whether it is appropriate to check an inode for and free post EOF
+ * blocks. The 'force' parameter determines whether we should also consider
+ * regular files that are marked preallocated or append-only.
+ */
+bool
+xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
+{
+ /* prealloc/delalloc exists only on regular files */
+ if (!S_ISREG(ip->i_d.di_mode))
+ return false;
+
+ /*
+ * Zero sized files with no cached pages and delalloc blocks will not
+ * have speculative prealloc/delalloc blocks to remove.
+ */
+ if (VFS_I(ip)->i_size == 0 &&
+ VN_CACHED(VFS_I(ip)) == 0 &&
+ ip->i_delayed_blks == 0)
+ return false;
+
+ /* If we haven't read in the extent list, then don't do it now. */
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
+ return false;
+
+ /*
+ * Do not free real preallocated or append-only files unless the file
+ * has delalloc blocks and we are forced to remove them.
+ */
+ if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
+ if (!force || ip->i_delayed_blks == 0)
+ return false;
+
+ return true;
+}
+
+/*
+ * This is called by xfs_inactive to free any blocks beyond eof
+ * when the link count isn't zero and by xfs_dm_punch_hole() when
+ * punching a hole to EOF.
+ */
+int
+xfs_free_eofblocks(
+ xfs_mount_t *mp,
+ xfs_inode_t *ip,
+ bool need_iolock)
+{
+ xfs_trans_t *tp;
+ int error;
+ xfs_fileoff_t end_fsb;
+ xfs_fileoff_t last_fsb;
+ xfs_filblks_t map_len;
+ int nimaps;
+ xfs_bmbt_irec_t imap;
+
+ /*
+ * Figure out if there are any blocks beyond the end
+ * of the file. If not, then there is nothing to do.
+ */
+ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
+ last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+ if (last_fsb <= end_fsb)
+ return 0;
+ map_len = last_fsb - end_fsb;
+
+ nimaps = 1;
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!error && (nimaps != 0) &&
+ (imap.br_startblock != HOLESTARTBLOCK ||
+ ip->i_delayed_blks)) {
+ /*
+ * Attach the dquots to the inode up front.
+ */
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return error;
+
+ /*
+ * There are blocks after the end of file.
+ * Free them up now by truncating the file to
+ * its current size.
+ */
+ tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+
+ if (need_iolock) {
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ xfs_trans_cancel(tp, 0);
+ return EAGAIN;
+ }
+ }
+
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ if (error) {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ xfs_trans_cancel(tp, 0);
+ if (need_iolock)
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ /*
+ * Do not update the on-disk file size. If we update the
+ * on-disk file size and then the system crashes before the
+ * contents of the file are flushed to disk then the files
+ * may be full of holes (ie NULL files bug).
+ */
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
+ XFS_ISIZE(ip));
+ if (error) {
+ /*
+ * If we get an error at this point we simply don't
+ * bother truncating the file.
+ */
+ xfs_trans_cancel(tp,
+ (XFS_TRANS_RELEASE_LOG_RES |
+ XFS_TRANS_ABORT));
+ } else {
+ error = xfs_trans_commit(tp,
+ XFS_TRANS_RELEASE_LOG_RES);
+ if (!error)
+ xfs_inode_clear_eofblocks_tag(ip);
+ }
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (need_iolock)
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ }
+ return error;
+}
+
+int
+xfs_alloc_file_space(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len,
+ int alloc_type)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_off_t count;
+ xfs_filblks_t allocated_fsb;
+ xfs_filblks_t allocatesize_fsb;
+ xfs_extlen_t extsz, temp;
+ xfs_fileoff_t startoffset_fsb;
+ xfs_fsblock_t firstfsb;
+ int nimaps;
+ int quota_flag;
+ int rt;
+ xfs_trans_t *tp;
+ xfs_bmbt_irec_t imaps[1], *imapp;
+ xfs_bmap_free_t free_list;
+ uint qblocks, resblks, resrtextents;
+ int committed;
+ int error;
+
+ trace_xfs_alloc_file_space(ip);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return error;
+
+ if (len <= 0)
+ return XFS_ERROR(EINVAL);
+
+ rt = XFS_IS_REALTIME_INODE(ip);
+ extsz = xfs_get_extsz_hint(ip);
+
+ count = len;
+ imapp = &imaps[0];
+ nimaps = 1;
+ startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
+ allocatesize_fsb = XFS_B_TO_FSB(mp, count);
+
+ /*
+ * Allocate file space until done or until there is an error
+ */
+ while (allocatesize_fsb && !error) {
+ xfs_fileoff_t s, e;
+
+ /*
+ * Determine space reservations for data/realtime.
+ */
+ if (unlikely(extsz)) {
+ s = startoffset_fsb;
+ do_div(s, extsz);
+ s *= extsz;
+ e = startoffset_fsb + allocatesize_fsb;
+ if ((temp = do_mod(startoffset_fsb, extsz)))
+ e += temp;
+ if ((temp = do_mod(e, extsz)))
+ e += extsz - temp;
+ } else {
+ s = 0;
+ e = allocatesize_fsb;
+ }
+
+ /*
+ * The transaction reservation is limited to a 32-bit block
+ * count, hence we need to limit the number of blocks we are
+ * trying to reserve to avoid an overflow. We can't allocate
+ * more than @nimaps extents, and an extent is limited on disk
+ * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+ */
+ resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
+ if (unlikely(rt)) {
+ resrtextents = qblocks = resblks;
+ resrtextents /= mp->m_sb.sb_rextsize;
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ quota_flag = XFS_QMOPT_RES_RTBLKS;
+ } else {
+ resrtextents = 0;
+ resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
+ quota_flag = XFS_QMOPT_RES_REGBLKS;
+ }
+
+ /*
+ * Allocate and setup the transaction.
+ */
+ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+ resblks, resrtextents);
+ /*
+ * Check for running out of space
+ */
+ if (error) {
+ /*
+ * Free the transaction structure.
+ */
+ ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+ xfs_trans_cancel(tp, 0);
+ break;
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
+ 0, quota_flag);
+ if (error)
+ goto error1;
+
+ xfs_trans_ijoin(tp, ip, 0);
+
+ xfs_bmap_init(&free_list, &firstfsb);
+ error = xfs_bmapi_write(tp, ip, startoffset_fsb,
+ allocatesize_fsb, alloc_type, &firstfsb,
+ 0, imapp, &nimaps, &free_list);
+ if (error) {
+ goto error0;
+ }
+
+ /*
+ * Complete the transaction
+ */
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error) {
+ goto error0;
+ }
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error) {
+ break;
+ }
+
+ allocated_fsb = imapp->br_blockcount;
+
+ if (nimaps == 0) {
+ error = XFS_ERROR(ENOSPC);
+ break;
+ }
+
+ startoffset_fsb += allocated_fsb;
+ allocatesize_fsb -= allocated_fsb;
+ }
+
+ return error;
+
+error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
+ xfs_bmap_cancel(&free_list);
+ xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
+
+error1: /* Just cancel transaction */
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Zero file bytes between startoff and endoff inclusive.
+ * The iolock is held exclusive and no blocks are buffered.
+ *
+ * This function is used by xfs_free_file_space() to zero
+ * partial blocks when the range to free is not block aligned.
+ * When unreserving space with boundaries that are not block
+ * aligned we round up the start and round down the end
+ * boundaries and then use this function to zero the parts of
+ * the blocks that got dropped during the rounding.
+ */
+STATIC int
+xfs_zero_remaining_bytes(
+ xfs_inode_t *ip,
+ xfs_off_t startoff,
+ xfs_off_t endoff)
+{
+ xfs_bmbt_irec_t imap;
+ xfs_fileoff_t offset_fsb;
+ xfs_off_t lastoffset;
+ xfs_off_t offset;
+ xfs_buf_t *bp;
+ xfs_mount_t *mp = ip->i_mount;
+ int nimap;
+ int error = 0;
+
+ /*
+ * Avoid doing I/O beyond eof - it's not necessary
+ * since nothing can read beyond eof. The space will
+ * be zeroed when the file is extended anyway.
+ */
+ if (startoff >= XFS_ISIZE(ip))
+ return 0;
+
+ if (endoff > XFS_ISIZE(ip))
+ endoff = XFS_ISIZE(ip);
+
+ bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
+ mp->m_rtdev_targp : mp->m_ddev_targp,
+ BTOBB(mp->m_sb.sb_blocksize), 0);
+ if (!bp)
+ return XFS_ERROR(ENOMEM);
+
+ xfs_buf_unlock(bp);
+
+ for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
+ uint lock_mode;
+
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ nimap = 1;
+
+ lock_mode = xfs_ilock_data_map_shared(ip);
+ error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
+ xfs_iunlock(ip, lock_mode);
+
+ if (error || nimap < 1)
+ break;
+ ASSERT(imap.br_blockcount >= 1);
+ ASSERT(imap.br_startoff == offset_fsb);
+ lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
+ if (lastoffset > endoff)
+ lastoffset = endoff;
+ if (imap.br_startblock == HOLESTARTBLOCK)
+ continue;
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ if (imap.br_state == XFS_EXT_UNWRITTEN)
+ continue;
+ XFS_BUF_UNDONE(bp);
+ XFS_BUF_UNWRITE(bp);
+ XFS_BUF_READ(bp);
+ XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
+
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ error = XFS_ERROR(EIO);
+ break;
+ }
+ xfs_buf_iorequest(bp);
+ error = xfs_buf_iowait(bp);
+ if (error) {
+ xfs_buf_ioerror_alert(bp,
+ "xfs_zero_remaining_bytes(read)");
+ break;
+ }
+ memset(bp->b_addr +
+ (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
+ 0, lastoffset - offset + 1);
+ XFS_BUF_UNDONE(bp);
+ XFS_BUF_UNREAD(bp);
+ XFS_BUF_WRITE(bp);
+
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ error = XFS_ERROR(EIO);
+ break;
+ }
+ xfs_buf_iorequest(bp);
+ error = xfs_buf_iowait(bp);
+ if (error) {
+ xfs_buf_ioerror_alert(bp,
+ "xfs_zero_remaining_bytes(write)");
+ break;
+ }
+ }
+ xfs_buf_free(bp);
+ return error;
+}
+
+int
+xfs_free_file_space(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
+{
+ int committed;
+ int done;
+ xfs_fileoff_t endoffset_fsb;
+ int error;
+ xfs_fsblock_t firstfsb;
+ xfs_bmap_free_t free_list;
+ xfs_bmbt_irec_t imap;
+ xfs_off_t ioffset;
+ xfs_extlen_t mod=0;
+ xfs_mount_t *mp;
+ int nimap;
+ uint resblks;
+ xfs_off_t rounding;
+ int rt;
+ xfs_fileoff_t startoffset_fsb;
+ xfs_trans_t *tp;
+
+ mp = ip->i_mount;
+
+ trace_xfs_free_file_space(ip);
+
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return error;
+
+ error = 0;
+ if (len <= 0) /* if nothing being freed */
+ return error;
+ rt = XFS_IS_REALTIME_INODE(ip);
+ startoffset_fsb = XFS_B_TO_FSB(mp, offset);
+ endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
+
+ /* wait for the completion of any pending DIOs */
+ inode_dio_wait(VFS_I(ip));
+
+ rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+ ioffset = offset & ~(rounding - 1);
+ error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ ioffset, -1);
+ if (error)
+ goto out;
+ truncate_pagecache_range(VFS_I(ip), ioffset, -1);
+
+ /*
+ * Need to zero the stuff we're not freeing, on disk.
+ * If it's a realtime file & can't use unwritten extents then we
+ * actually need to zero the extent edges. Otherwise xfs_bunmapi
+ * will take care of it for us.
+ */
+ if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+ nimap = 1;
+ error = xfs_bmapi_read(ip, startoffset_fsb, 1,
+ &imap, &nimap, 0);
+ if (error)
+ goto out;
+ ASSERT(nimap == 0 || nimap == 1);
+ if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+ xfs_daddr_t block;
+
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ block = imap.br_startblock;
+ mod = do_div(block, mp->m_sb.sb_rextsize);
+ if (mod)
+ startoffset_fsb += mp->m_sb.sb_rextsize - mod;
+ }
+ nimap = 1;
+ error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
+ &imap, &nimap, 0);
+ if (error)
+ goto out;
+ ASSERT(nimap == 0 || nimap == 1);
+ if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ mod++;
+ if (mod && (mod != mp->m_sb.sb_rextsize))
+ endoffset_fsb -= mod;
+ }
+ }
+ if ((done = (endoffset_fsb <= startoffset_fsb)))
+ /*
+ * One contiguous piece to clear
+ */
+ error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
+ else {
+ /*
+ * Some full blocks, possibly two pieces to clear
+ */
+ if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
+ error = xfs_zero_remaining_bytes(ip, offset,
+ XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
+ if (!error &&
+ XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
+ error = xfs_zero_remaining_bytes(ip,
+ XFS_FSB_TO_B(mp, endoffset_fsb),
+ offset + len - 1);
+ }
+
+ /*
+ * free file space until done or until there is an error
+ */
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ while (!error && !done) {
+
+ /*
+ * allocate and setup the transaction. Allow this
+ * transaction to dip into the reserve blocks to ensure
+ * the freeing of the space succeeds at ENOSPC.
+ */
+ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
+
+ /*
+ * check for running out of space
+ */
+ if (error) {
+ /*
+ * Free the transaction structure.
+ */
+ ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+ xfs_trans_cancel(tp, 0);
+ break;
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_trans_reserve_quota(tp, mp,
+ ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
+ resblks, 0, XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ goto error1;
+
+ xfs_trans_ijoin(tp, ip, 0);
+
+ /*
+ * issue the bunmapi() call to free the blocks
+ */
+ xfs_bmap_init(&free_list, &firstfsb);
+ error = xfs_bunmapi(tp, ip, startoffset_fsb,
+ endoffset_fsb - startoffset_fsb,
+ 0, 2, &firstfsb, &free_list, &done);
+ if (error) {
+ goto error0;
+ }
+
+ /*
+ * complete the transaction
+ */
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error) {
+ goto error0;
+ }
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+
+ out:
+ return error;
+
+ error0:
+ xfs_bmap_cancel(&free_list);
+ error1:
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ goto out;
+}
+
+
+int
+xfs_zero_file_space(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ uint granularity;
+ xfs_off_t start_boundary;
+ xfs_off_t end_boundary;
+ int error;
+
+ trace_xfs_zero_file_space(ip);
+
+ granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+
+ /*
+ * Round the range of extents we are going to convert inwards. If the
+ * offset is aligned, then it doesn't get changed so we zero from the
+ * start of the block offset points to.
+ */
+ start_boundary = round_up(offset, granularity);
+ end_boundary = round_down(offset + len, granularity);
+
+ ASSERT(start_boundary >= offset);
+ ASSERT(end_boundary <= offset + len);
+
+ if (start_boundary < end_boundary - 1) {
+ /*
+ * punch out delayed allocation blocks and the page cache over
+ * the conversion range
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmap_punch_delalloc_range(ip,
+ XFS_B_TO_FSBT(mp, start_boundary),
+ XFS_B_TO_FSB(mp, end_boundary - start_boundary));
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ truncate_pagecache_range(VFS_I(ip), start_boundary,
+ end_boundary - 1);
+
+ /* convert the blocks */
+ error = xfs_alloc_file_space(ip, start_boundary,
+ end_boundary - start_boundary - 1,
+ XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT);
+ if (error)
+ goto out;
+
+ /* We've handled the interior of the range, now for the edges */
+ if (start_boundary != offset) {
+ error = xfs_iozero(ip, offset, start_boundary - offset);
+ if (error)
+ goto out;
+ }
+
+ if (end_boundary != offset + len)
+ error = xfs_iozero(ip, end_boundary,
+ offset + len - end_boundary);
+
+ } else {
+ /*
+ * It's either a sub-granularity range or the range spanned lies
+ * partially across two adjacent blocks.
+ */
+ error = xfs_iozero(ip, offset, len);
+ }
+
+out:
+ return error;
+
+}
+
+/*
+ * xfs_collapse_file_space()
+ * This routine frees disk space and shift extent for the given file.
+ * The first thing we do is to free data blocks in the specified range
+ * by calling xfs_free_file_space(). It would also sync dirty data
+ * and invalidate page cache over the region on which collapse range
+ * is working. And Shift extent records to the left to cover a hole.
+ * RETURNS:
+ * 0 on success
+ * errno on error
+ *
+ */
+int
+xfs_collapse_file_space(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
+{
+ int done = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+ xfs_extnum_t current_ext = 0;
+ struct xfs_bmap_free free_list;
+ xfs_fsblock_t first_block;
+ int committed;
+ xfs_fileoff_t start_fsb;
+ xfs_fileoff_t shift_fsb;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+
+ trace_xfs_collapse_file_space(ip);
+
+ start_fsb = XFS_B_TO_FSB(mp, offset + len);
+ shift_fsb = XFS_B_TO_FSB(mp, len);
+
+ error = xfs_free_file_space(ip, offset, len);
+ if (error)
+ return error;
+
+ while (!error && !done) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+ /*
+ * We would need to reserve permanent block for transaction.
+ * This will come into picture when after shifting extent into
+ * hole we found that adjacent extents can be merged which
+ * may lead to freeing of a block during record update.
+ */
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+ XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ break;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
+ ip->i_gdquot, ip->i_pdquot,
+ XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
+ XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ goto out;
+
+ xfs_trans_ijoin(tp, ip, 0);
+
+ xfs_bmap_init(&free_list, &first_block);
+
+ /*
+ * We are using the write transaction in which max 2 bmbt
+ * updates are allowed
+ */
+ error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
+ shift_fsb, &current_ext,
+ &first_block, &free_list,
+ XFS_BMAP_MAX_SHIFT_EXTENTS);
+ if (error)
+ goto out;
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ goto out;
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+
+ return error;
+
+out:
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * We need to check that the format of the data fork in the temporary inode is
+ * valid for the target inode before doing the swap. This is not a problem with
+ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
+ * data fork depending on the space the attribute fork is taking so we can get
+ * invalid formats on the target inode.
+ *
+ * E.g. target has space for 7 extents in extent format, temp inode only has
+ * space for 6. If we defragment down to 7 extents, then the tmp format is a
+ * btree, but when swapped it needs to be in extent format. Hence we can't just
+ * blindly swap data forks on attr2 filesystems.
+ *
+ * Note that we check the swap in both directions so that we don't end up with
+ * a corrupt temporary inode, either.
+ *
+ * Note that fixing the way xfs_fsr sets up the attribute fork in the source
+ * inode will prevent this situation from occurring, so all we do here is
+ * reject and log the attempt. basically we are putting the responsibility on
+ * userspace to get this right.
+ */
+static int
+xfs_swap_extents_check_format(
+ xfs_inode_t *ip, /* target inode */
+ xfs_inode_t *tip) /* tmp inode */
+{
+
+ /* Should never get a local format */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
+ tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+ return EINVAL;
+
+ /*
+ * if the target inode has less extents that then temporary inode then
+ * why did userspace call us?
+ */
+ if (ip->i_d.di_nextents < tip->i_d.di_nextents)
+ return EINVAL;
+
+ /*
+ * if the target inode is in extent form and the temp inode is in btree
+ * form then we will end up with the target inode in the wrong format
+ * as we already know there are less extents in the temp inode.
+ */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+ return EINVAL;
+
+ /* Check temp in extent form to max in target */
+ if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
+ XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+ return EINVAL;
+
+ /* Check target in extent form to max in temp */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
+ XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+ return EINVAL;
+
+ /*
+ * If we are in a btree format, check that the temp root block will fit
+ * in the target and that it has enough extents to be in btree format
+ * in the target.
+ *
+ * Note that we have to be careful to allow btree->extent conversions
+ * (a common defrag case) which will occur when the temp inode is in
+ * extent format...
+ */
+ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ if (XFS_IFORK_BOFF(ip) &&
+ XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
+ return EINVAL;
+ if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
+ XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+ return EINVAL;
+ }
+
+ /* Reciprocal target->temp btree format checks */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ if (XFS_IFORK_BOFF(tip) &&
+ XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
+ return EINVAL;
+ if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
+ XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+ return EINVAL;
+ }
+
+ return 0;
+}
+
+int
+xfs_swap_extents(
+ xfs_inode_t *ip, /* target inode */
+ xfs_inode_t *tip, /* tmp inode */
+ xfs_swapext_t *sxp)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_trans_t *tp;
+ xfs_bstat_t *sbp = &sxp->sx_stat;
+ xfs_ifork_t *tempifp, *ifp, *tifp;
+ int src_log_flags, target_log_flags;
+ int error = 0;
+ int aforkblks = 0;
+ int taforkblks = 0;
+ __uint64_t tmp;
+
+ tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
+ if (!tempifp) {
+ error = XFS_ERROR(ENOMEM);
+ goto out;
+ }
+
+ /*
+ * we have to do two separate lock calls here to keep lockdep
+ * happy. If we try to get all the locks in one call, lock will
+ * report false positives when we drop the ILOCK and regain them
+ * below.
+ */
+ xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+ xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+
+ /* Verify that both files have the same format */
+ if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
+ error = XFS_ERROR(EINVAL);
+ goto out_unlock;
+ }
+
+ /* Verify both files are either real-time or non-realtime */
+ if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
+ error = XFS_ERROR(EINVAL);
+ goto out_unlock;
+ }
+
+ error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
+ if (error)
+ goto out_unlock;
+ truncate_pagecache_range(VFS_I(tip), 0, -1);
+
+ /* Verify O_DIRECT for ftmp */
+ if (VN_CACHED(VFS_I(tip)) != 0) {
+ error = XFS_ERROR(EINVAL);
+ goto out_unlock;
+ }
+
+ /* Verify all data are being swapped */
+ if (sxp->sx_offset != 0 ||
+ sxp->sx_length != ip->i_d.di_size ||
+ sxp->sx_length != tip->i_d.di_size) {
+ error = XFS_ERROR(EFAULT);
+ goto out_unlock;
+ }
+
+ trace_xfs_swap_extent_before(ip, 0);
+ trace_xfs_swap_extent_before(tip, 1);
+
+ /* check inode formats now that data is flushed */
+ error = xfs_swap_extents_check_format(ip, tip);
+ if (error) {
+ xfs_notice(mp,
+ "%s: inode 0x%llx format is incompatible for exchanging.",
+ __func__, ip->i_ino);
+ goto out_unlock;
+ }
+
+ /*
+ * Compare the current change & modify times with that
+ * passed in. If they differ, we abort this swap.
+ * This is the mechanism used to ensure the calling
+ * process that the file was not changed out from
+ * under it.
+ */
+ if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
+ (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+ (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
+ (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
+ error = XFS_ERROR(EBUSY);
+ goto out_unlock;
+ }
+
+ /* We need to fail if the file is memory mapped. Once we have tossed
+ * all existing pages, the page fault will have no option
+ * but to go to the filesystem for pages. By making the page fault call
+ * vop_read (or write in the case of autogrow) they block on the iolock
+ * until we have switched the extents.
+ */
+ if (VN_MAPPED(VFS_I(ip))) {
+ error = XFS_ERROR(EBUSY);
+ goto out_unlock;
+ }
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(tip, XFS_ILOCK_EXCL);
+
+ /*
+ * There is a race condition here since we gave up the
+ * ilock. However, the data fork will not change since
+ * we have the iolock (locked for truncation too) so we
+ * are safe. We don't really care if non-io related
+ * fields change.
+ */
+ truncate_pagecache_range(VFS_I(ip), 0, -1);
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+ if (error) {
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ xfs_iunlock(tip, XFS_IOLOCK_EXCL);
+ xfs_trans_cancel(tp, 0);
+ goto out;
+ }
+ xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+
+ /*
+ * Count the number of extended attribute blocks
+ */
+ if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
+ (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+ error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
+ if (error)
+ goto out_trans_cancel;
+ }
+ if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
+ (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+ error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
+ &taforkblks);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+ /*
+ * Before we've swapped the forks, lets set the owners of the forks
+ * appropriately. We have to do this as we are demand paging the btree
+ * buffers, and so the validation done on read will expect the owner
+ * field to be correctly set. Once we change the owners, we can swap the
+ * inode forks.
+ *
+ * Note the trickiness in setting the log flags - we set the owner log
+ * flag on the opposite inode (i.e. the inode we are setting the new
+ * owner to be) because once we swap the forks and log that, log
+ * recovery is going to see the fork as owned by the swapped inode,
+ * not the pre-swapped inodes.
+ */
+ src_log_flags = XFS_ILOG_CORE;
+ target_log_flags = XFS_ILOG_CORE;
+ if (ip->i_d.di_version == 3 &&
+ ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ target_log_flags |= XFS_ILOG_DOWNER;
+ error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
+ tip->i_ino, NULL);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ if (tip->i_d.di_version == 3 &&
+ tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ src_log_flags |= XFS_ILOG_DOWNER;
+ error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
+ ip->i_ino, NULL);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
+ * Swap the data forks of the inodes
+ */
+ ifp = &ip->i_df;
+ tifp = &tip->i_df;
+ *tempifp = *ifp; /* struct copy */
+ *ifp = *tifp; /* struct copy */
+ *tifp = *tempifp; /* struct copy */
+
+ /*
+ * Fix the on-disk inode values
+ */
+ tmp = (__uint64_t)ip->i_d.di_nblocks;
+ ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
+ tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
+
+ tmp = (__uint64_t) ip->i_d.di_nextents;
+ ip->i_d.di_nextents = tip->i_d.di_nextents;
+ tip->i_d.di_nextents = tmp;
+
+ tmp = (__uint64_t) ip->i_d.di_format;
+ ip->i_d.di_format = tip->i_d.di_format;
+ tip->i_d.di_format = tmp;
+
+ /*
+ * The extents in the source inode could still contain speculative
+ * preallocation beyond EOF (e.g. the file is open but not modified
+ * while defrag is in progress). In that case, we need to copy over the
+ * number of delalloc blocks the data fork in the source inode is
+ * tracking beyond EOF so that when the fork is truncated away when the
+ * temporary inode is unlinked we don't underrun the i_delayed_blks
+ * counter on that inode.
+ */
+ ASSERT(tip->i_delayed_blks == 0);
+ tip->i_delayed_blks = ip->i_delayed_blks;
+ ip->i_delayed_blks = 0;
+
+ switch (ip->i_d.di_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ /* If the extents fit in the inode, fix the
+ * pointer. Otherwise it's already NULL or
+ * pointing to the extent.
+ */
+ if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
+ ifp->if_u1.if_extents =
+ ifp->if_u2.if_inline_ext;
+ }
+ src_log_flags |= XFS_ILOG_DEXT;
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ ASSERT(ip->i_d.di_version < 3 ||
+ (src_log_flags & XFS_ILOG_DOWNER));
+ src_log_flags |= XFS_ILOG_DBROOT;
+ break;
+ }
+
+ switch (tip->i_d.di_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ /* If the extents fit in the inode, fix the
+ * pointer. Otherwise it's already NULL or
+ * pointing to the extent.
+ */
+ if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
+ tifp->if_u1.if_extents =
+ tifp->if_u2.if_inline_ext;
+ }
+ target_log_flags |= XFS_ILOG_DEXT;
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ target_log_flags |= XFS_ILOG_DBROOT;
+ ASSERT(tip->i_d.di_version < 3 ||
+ (target_log_flags & XFS_ILOG_DOWNER));
+ break;
+ }
+
+ xfs_trans_log_inode(tp, ip, src_log_flags);
+ xfs_trans_log_inode(tp, tip, target_log_flags);
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * transaction goes to disk before returning to the user.
+ */
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
+ xfs_trans_set_sync(tp);
+
+ error = xfs_trans_commit(tp, 0);
+
+ trace_xfs_swap_extent_after(ip, 0);
+ trace_xfs_swap_extent_after(tip, 1);
+out:
+ kmem_free(tempifp);
+ return error;
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ goto out;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp, 0);
+ goto out_unlock;
+}
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
new file mode 100644
index 00000000000..2fdb72d2c90
--- /dev/null
+++ b/fs/xfs/xfs_bmap_util.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_BMAP_UTIL_H__
+#define __XFS_BMAP_UTIL_H__
+
+/* Kernel only BMAP related definitions and functions */
+
+struct xfs_bmbt_irec;
+struct xfs_bmap_free_item;
+struct xfs_ifork;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Argument structure for xfs_bmap_alloc.
+ */
+struct xfs_bmalloca {
+ xfs_fsblock_t *firstblock; /* i/o first block allocated */
+ struct xfs_bmap_free *flist; /* bmap freelist */
+ struct xfs_trans *tp; /* transaction pointer */
+ struct xfs_inode *ip; /* incore inode pointer */
+ struct xfs_bmbt_irec prev; /* extent before the new one */
+ struct xfs_bmbt_irec got; /* extent after, or delayed */
+
+ xfs_fileoff_t offset; /* offset in file filling in */
+ xfs_extlen_t length; /* i/o length asked/allocated */
+ xfs_fsblock_t blkno; /* starting block of new extent */
+
+ struct xfs_btree_cur *cur; /* btree cursor */
+ xfs_extnum_t idx; /* current extent index */
+ int nallocs;/* number of extents alloc'd */
+ int logflags;/* flags for transaction logging */
+
+ xfs_extlen_t total; /* total blocks needed for xaction */
+ xfs_extlen_t minlen; /* minimum allocation size (blocks) */
+ xfs_extlen_t minleft; /* amount must be left after alloc */
+ bool eof; /* set if allocating past last extent */
+ bool wasdel; /* replacing a delayed allocation */
+ bool userdata;/* set if is user data */
+ bool aeof; /* allocated space at eof */
+ bool conv; /* overwriting unwritten extents */
+ int flags;
+ struct completion *done;
+ struct work_struct work;
+ int result;
+};
+
+int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
+ int *committed);
+int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
+int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
+ int whichfork, int *eof);
+int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, int *count);
+int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
+ xfs_fileoff_t start_fsb, xfs_fileoff_t length);
+
+/* bmap to userspace formatter - copy to user & advance pointer */
+typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
+int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
+ xfs_bmap_format_t formatter, void *arg);
+
+/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
+void xfs_bmap_del_free(struct xfs_bmap_free *flist,
+ struct xfs_bmap_free_item *prev,
+ struct xfs_bmap_free_item *free);
+int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
+ struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
+ int rt, int eof, int delay, int convert,
+ xfs_fileoff_t *offp, xfs_extlen_t *lenp);
+void xfs_bmap_adjacent(struct xfs_bmalloca *ap);
+int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, struct xfs_bmbt_irec *rec,
+ int *is_empty);
+
+/* preallocation and hole punch interface */
+int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t len, int alloc_type);
+int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t len);
+int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t len);
+int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
+ xfs_off_t len);
+
+/* EOF block manipulation functions */
+bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
+int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
+ bool need_iolock);
+
+int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
+ struct xfs_swapext *sx);
+
+xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
+
+#endif /* __XFS_BMAP_UTIL_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 52d5d095fc3..cf893bc1e37 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -17,28 +17,23 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_buf_item.h"
#include "xfs_btree.h"
-#include "xfs_ialloc.h"
#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_alloc.h"
/*
* Cursor allocation zone.
@@ -48,316 +43,242 @@ kmem_zone_t *xfs_btree_cur_zone;
/*
* Btree magic numbers.
*/
-const __uint32_t xfs_magics[XFS_BTNUM_MAX] =
-{
- XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
+static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
+ { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
+ XFS_FIBT_MAGIC },
+ { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
+ XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
};
+#define xfs_btree_magic(cur) \
+ xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
-/*
- * Prototypes for internal routines.
- */
-/*
- * Checking routine: return maxrecs for the block.
- */
-STATIC int /* number of records fitting in block */
-xfs_btree_maxrecs(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_btree_block_t *block);/* generic btree block pointer */
+STATIC int /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lblock(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_btree_block *block, /* btree long form block pointer */
+ int level, /* level of the btree block */
+ struct xfs_buf *bp) /* buffer for block, if any */
+{
+ int lblock_ok = 1; /* block passes checks */
+ struct xfs_mount *mp; /* file system mount point */
-/*
- * Internal routines.
- */
+ mp = cur->bc_mp;
-/*
- * Retrieve the block pointer from the cursor at the given level.
- * This may be a bmap btree root or from a buffer.
- */
-STATIC xfs_btree_block_t * /* generic btree block pointer */
-xfs_btree_get_block(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level in btree */
- struct xfs_buf **bpp); /* buffer containing the block */
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ lblock_ok = lblock_ok &&
+ uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
+ block->bb_u.l.bb_blkno == cpu_to_be64(
+ bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+ }
-/*
- * Checking routine: return maxrecs for the block.
- */
-STATIC int /* number of records fitting in block */
-xfs_btree_maxrecs(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_btree_block_t *block) /* generic btree block pointer */
-{
- switch (cur->bc_btnum) {
- case XFS_BTNUM_BNO:
- case XFS_BTNUM_CNT:
- return (int)XFS_ALLOC_BLOCK_MAXRECS(
- be16_to_cpu(block->bb_h.bb_level), cur);
- case XFS_BTNUM_BMAP:
- return (int)XFS_BMAP_BLOCK_IMAXRECS(
- be16_to_cpu(block->bb_h.bb_level), cur);
- case XFS_BTNUM_INO:
- return (int)XFS_INOBT_BLOCK_MAXRECS(
- be16_to_cpu(block->bb_h.bb_level), cur);
- default:
- ASSERT(0);
- return 0;
+ lblock_ok = lblock_ok &&
+ be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
+ be16_to_cpu(block->bb_level) == level &&
+ be16_to_cpu(block->bb_numrecs) <=
+ cur->bc_ops->get_maxrecs(cur, level) &&
+ block->bb_u.l.bb_leftsib &&
+ (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
+ XFS_FSB_SANITY_CHECK(mp,
+ be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
+ block->bb_u.l.bb_rightsib &&
+ (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
+ XFS_FSB_SANITY_CHECK(mp,
+ be64_to_cpu(block->bb_u.l.bb_rightsib)));
+
+ if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
+ XFS_ERRTAG_BTREE_CHECK_LBLOCK,
+ XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
+ if (bp)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return XFS_ERROR(EFSCORRUPTED);
}
+ return 0;
}
-/*
- * External routines.
- */
+STATIC int /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sblock(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_btree_block *block, /* btree short form block pointer */
+ int level, /* level of the btree block */
+ struct xfs_buf *bp) /* buffer containing block */
+{
+ struct xfs_mount *mp; /* file system mount point */
+ struct xfs_buf *agbp; /* buffer for ag. freespace struct */
+ struct xfs_agf *agf; /* ag. freespace structure */
+ xfs_agblock_t agflen; /* native ag. freespace length */
+ int sblock_ok = 1; /* block passes checks */
+
+ mp = cur->bc_mp;
+ agbp = cur->bc_private.a.agbp;
+ agf = XFS_BUF_TO_AGF(agbp);
+ agflen = be32_to_cpu(agf->agf_length);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ sblock_ok = sblock_ok &&
+ uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
+ block->bb_u.s.bb_blkno == cpu_to_be64(
+ bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+ }
+
+ sblock_ok = sblock_ok &&
+ be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
+ be16_to_cpu(block->bb_level) == level &&
+ be16_to_cpu(block->bb_numrecs) <=
+ cur->bc_ops->get_maxrecs(cur, level) &&
+ (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
+ be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
+ block->bb_u.s.bb_leftsib &&
+ (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
+ be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
+ block->bb_u.s.bb_rightsib;
+
+ if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
+ XFS_ERRTAG_BTREE_CHECK_SBLOCK,
+ XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
+ if (bp)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ return 0;
+}
-#ifdef DEBUG
/*
* Debug routine: check that block header is ok.
*/
-void
+int
xfs_btree_check_block(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_btree_block_t *block, /* generic btree block pointer */
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_btree_block *block, /* generic btree block pointer */
int level, /* level of the btree block */
- xfs_buf_t *bp) /* buffer containing block, if any */
+ struct xfs_buf *bp) /* buffer containing block, if any */
{
- if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
- xfs_btree_check_lblock(cur, (xfs_btree_lblock_t *)block, level,
- bp);
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return xfs_btree_check_lblock(cur, block, level, bp);
else
- xfs_btree_check_sblock(cur, (xfs_btree_sblock_t *)block, level,
- bp);
-}
-
-/*
- * Debug routine: check that keys are in the right order.
- */
-void
-xfs_btree_check_key(
- xfs_btnum_t btnum, /* btree identifier */
- void *ak1, /* pointer to left (lower) key */
- void *ak2) /* pointer to right (higher) key */
-{
- switch (btnum) {
- case XFS_BTNUM_BNO: {
- xfs_alloc_key_t *k1;
- xfs_alloc_key_t *k2;
-
- k1 = ak1;
- k2 = ak2;
- ASSERT(be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock));
- break;
- }
- case XFS_BTNUM_CNT: {
- xfs_alloc_key_t *k1;
- xfs_alloc_key_t *k2;
-
- k1 = ak1;
- k2 = ak2;
- ASSERT(be32_to_cpu(k1->ar_blockcount) < be32_to_cpu(k2->ar_blockcount) ||
- (k1->ar_blockcount == k2->ar_blockcount &&
- be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock)));
- break;
- }
- case XFS_BTNUM_BMAP: {
- xfs_bmbt_key_t *k1;
- xfs_bmbt_key_t *k2;
-
- k1 = ak1;
- k2 = ak2;
- ASSERT(INT_GET(k1->br_startoff, ARCH_CONVERT) < INT_GET(k2->br_startoff, ARCH_CONVERT));
- break;
- }
- case XFS_BTNUM_INO: {
- xfs_inobt_key_t *k1;
- xfs_inobt_key_t *k2;
-
- k1 = ak1;
- k2 = ak2;
- ASSERT(INT_GET(k1->ir_startino, ARCH_CONVERT) < INT_GET(k2->ir_startino, ARCH_CONVERT));
- break;
- }
- default:
- ASSERT(0);
- }
+ return xfs_btree_check_sblock(cur, block, level, bp);
}
-#endif /* DEBUG */
/*
- * Checking routine: check that long form block header is ok.
+ * Check that (long) pointer is ok.
*/
-/* ARGSUSED */
int /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lblock(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_btree_lblock_t *block, /* btree long form block pointer */
- int level, /* level of the btree block */
- xfs_buf_t *bp) /* buffer for block, if any */
+xfs_btree_check_lptr(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_dfsbno_t bno, /* btree block disk address */
+ int level) /* btree block level */
{
- int lblock_ok; /* block passes checks */
- xfs_mount_t *mp; /* file system mount point */
-
- mp = cur->bc_mp;
- lblock_ok =
- be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
- be16_to_cpu(block->bb_level) == level &&
- be16_to_cpu(block->bb_numrecs) <=
- xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
- block->bb_leftsib &&
- (be64_to_cpu(block->bb_leftsib) == NULLDFSBNO ||
- XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_leftsib))) &&
- block->bb_rightsib &&
- (be64_to_cpu(block->bb_rightsib) == NULLDFSBNO ||
- XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_rightsib)));
- if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK,
- XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
- if (bp)
- xfs_buftrace("LBTREE ERROR", bp);
- XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW,
- mp);
- return XFS_ERROR(EFSCORRUPTED);
- }
+ XFS_WANT_CORRUPTED_RETURN(
+ level > 0 &&
+ bno != NULLDFSBNO &&
+ XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
return 0;
}
+#ifdef DEBUG
/*
- * Checking routine: check that (long) pointer is ok.
+ * Check that (short) pointer is ok.
*/
-int /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lptr(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_dfsbno_t ptr, /* btree block disk address */
- int level) /* btree block level */
+STATIC int /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sptr(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* btree block disk address */
+ int level) /* btree block level */
{
- xfs_mount_t *mp; /* file system mount point */
+ xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks;
- mp = cur->bc_mp;
XFS_WANT_CORRUPTED_RETURN(
level > 0 &&
- ptr != NULLDFSBNO &&
- XFS_FSB_SANITY_CHECK(mp, ptr));
+ bno != NULLAGBLOCK &&
+ bno != 0 &&
+ bno < agblocks);
return 0;
}
-#ifdef DEBUG
/*
- * Debug routine: check that records are in the right order.
+ * Check that block ptr is ok.
*/
-void
-xfs_btree_check_rec(
- xfs_btnum_t btnum, /* btree identifier */
- void *ar1, /* pointer to left (lower) record */
- void *ar2) /* pointer to right (higher) record */
-{
- switch (btnum) {
- case XFS_BTNUM_BNO: {
- xfs_alloc_rec_t *r1;
- xfs_alloc_rec_t *r2;
-
- r1 = ar1;
- r2 = ar2;
- ASSERT(be32_to_cpu(r1->ar_startblock) +
- be32_to_cpu(r1->ar_blockcount) <=
- be32_to_cpu(r2->ar_startblock));
- break;
- }
- case XFS_BTNUM_CNT: {
- xfs_alloc_rec_t *r1;
- xfs_alloc_rec_t *r2;
-
- r1 = ar1;
- r2 = ar2;
- ASSERT(be32_to_cpu(r1->ar_blockcount) < be32_to_cpu(r2->ar_blockcount) ||
- (r1->ar_blockcount == r2->ar_blockcount &&
- be32_to_cpu(r1->ar_startblock) < be32_to_cpu(r2->ar_startblock)));
- break;
- }
- case XFS_BTNUM_BMAP: {
- xfs_bmbt_rec_t *r1;
- xfs_bmbt_rec_t *r2;
-
- r1 = ar1;
- r2 = ar2;
- ASSERT(xfs_bmbt_disk_get_startoff(r1) +
- xfs_bmbt_disk_get_blockcount(r1) <=
- xfs_bmbt_disk_get_startoff(r2));
- break;
- }
- case XFS_BTNUM_INO: {
- xfs_inobt_rec_t *r1;
- xfs_inobt_rec_t *r2;
-
- r1 = ar1;
- r2 = ar2;
- ASSERT(INT_GET(r1->ir_startino, ARCH_CONVERT) + XFS_INODES_PER_CHUNK <=
- INT_GET(r2->ir_startino, ARCH_CONVERT));
- break;
- }
- default:
- ASSERT(0);
+STATIC int /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_ptr(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ union xfs_btree_ptr *ptr, /* btree block disk address */
+ int index, /* offset from ptr to check */
+ int level) /* btree block level */
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ return xfs_btree_check_lptr(cur,
+ be64_to_cpu((&ptr->l)[index]), level);
+ } else {
+ return xfs_btree_check_sptr(cur,
+ be32_to_cpu((&ptr->s)[index]), level);
}
}
-#endif /* DEBUG */
+#endif
/*
- * Checking routine: check that block header is ok.
+ * Calculate CRC on the whole btree block and stuff it into the
+ * long-form btree header.
+ *
+ * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
+ * it into the buffer so recovery knows what the last modifcation was that made
+ * it to disk.
*/
-/* ARGSUSED */
-int /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sblock(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_btree_sblock_t *block, /* btree short form block pointer */
- int level, /* level of the btree block */
- xfs_buf_t *bp) /* buffer containing block */
+void
+xfs_btree_lblock_calc_crc(
+ struct xfs_buf *bp)
{
- xfs_buf_t *agbp; /* buffer for ag. freespace struct */
- xfs_agf_t *agf; /* ag. freespace structure */
- xfs_agblock_t agflen; /* native ag. freespace length */
- int sblock_ok; /* block passes checks */
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
- agbp = cur->bc_private.a.agbp;
- agf = XFS_BUF_TO_AGF(agbp);
- agflen = be32_to_cpu(agf->agf_length);
- sblock_ok =
- be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
- be16_to_cpu(block->bb_level) == level &&
- be16_to_cpu(block->bb_numrecs) <=
- xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
- (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK ||
- be32_to_cpu(block->bb_leftsib) < agflen) &&
- block->bb_leftsib &&
- (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK ||
- be32_to_cpu(block->bb_rightsib) < agflen) &&
- block->bb_rightsib;
- if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp,
- XFS_ERRTAG_BTREE_CHECK_SBLOCK,
- XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
- if (bp)
- xfs_buftrace("SBTREE ERROR", bp);
- XFS_ERROR_REPORT("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW,
- cur->bc_mp);
- return XFS_ERROR(EFSCORRUPTED);
- }
- return 0;
+ if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ return;
+ if (bip)
+ block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+}
+
+bool
+xfs_btree_lblock_verify_crc(
+ struct xfs_buf *bp)
+{
+ if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+
+ return true;
}
/*
- * Checking routine: check that (short) pointer is ok.
+ * Calculate CRC on the whole btree block and stuff it into the
+ * short-form btree header.
+ *
+ * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
+ * it into the buffer so recovery knows what the last modifcation was that made
+ * it to disk.
*/
-int /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sptr(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agblock_t ptr, /* btree block disk address */
- int level) /* btree block level */
+void
+xfs_btree_sblock_calc_crc(
+ struct xfs_buf *bp)
{
- xfs_buf_t *agbp; /* buffer for ag. freespace struct */
- xfs_agf_t *agf; /* ag. freespace structure */
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
- agbp = cur->bc_private.a.agbp;
- agf = XFS_BUF_TO_AGF(agbp);
- XFS_WANT_CORRUPTED_RETURN(
- level > 0 &&
- ptr != NULLAGBLOCK && ptr != 0 &&
- ptr < be32_to_cpu(agf->agf_length));
- return 0;
+ if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ return;
+ if (bip)
+ block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+}
+
+bool
+xfs_btree_sblock_verify_crc(
+ struct xfs_buf *bp)
+{
+ if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+
+ return true;
}
/*
@@ -382,7 +303,7 @@ xfs_btree_del_cursor(
*/
for (i = 0; i < cur->bc_nlevels; i++) {
if (cur->bc_bufs[i])
- xfs_btree_setbuf(cur, i, NULL);
+ xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
else if (!error)
break;
}
@@ -416,104 +337,210 @@ xfs_btree_dup_cursor(
tp = cur->bc_tp;
mp = cur->bc_mp;
+
/*
* Allocate a new cursor like the old one.
*/
- new = xfs_btree_init_cursor(mp, tp, cur->bc_private.a.agbp,
- cur->bc_private.a.agno, cur->bc_btnum, cur->bc_private.b.ip,
- cur->bc_private.b.whichfork);
+ new = cur->bc_ops->dup_cursor(cur);
+
/*
* Copy the record currently in the cursor.
*/
new->bc_rec = cur->bc_rec;
+
/*
* For each level current, re-get the buffer and copy the ptr value.
*/
for (i = 0; i < new->bc_nlevels; i++) {
new->bc_ptrs[i] = cur->bc_ptrs[i];
new->bc_ra[i] = cur->bc_ra[i];
- if ((bp = cur->bc_bufs[i])) {
- if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) {
+ bp = cur->bc_bufs[i];
+ if (bp) {
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_BUF_ADDR(bp), mp->m_bsize,
+ 0, &bp,
+ cur->bc_ops->buf_ops);
+ if (error) {
xfs_btree_del_cursor(new, error);
*ncur = NULL;
return error;
}
- new->bc_bufs[i] = bp;
- ASSERT(bp);
- ASSERT(!XFS_BUF_GETERROR(bp));
- } else
- new->bc_bufs[i] = NULL;
- }
- /*
- * For bmap btrees, copy the firstblock, flist, and flags values,
- * since init cursor doesn't get them.
- */
- if (new->bc_btnum == XFS_BTNUM_BMAP) {
- new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
- new->bc_private.b.flist = cur->bc_private.b.flist;
- new->bc_private.b.flags = cur->bc_private.b.flags;
+ }
+ new->bc_bufs[i] = bp;
}
*ncur = new;
return 0;
}
/*
- * Change the cursor to point to the first record at the given level.
- * Other levels are unaffected.
+ * XFS btree block layout and addressing:
+ *
+ * There are two types of blocks in the btree: leaf and non-leaf blocks.
+ *
+ * The leaf record start with a header then followed by records containing
+ * the values. A non-leaf block also starts with the same header, and
+ * then first contains lookup keys followed by an equal number of pointers
+ * to the btree blocks at the previous level.
+ *
+ * +--------+-------+-------+-------+-------+-------+-------+
+ * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
+ * +--------+-------+-------+-------+-------+-------+-------+
+ *
+ * +--------+-------+-------+-------+-------+-------+-------+
+ * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
+ * +--------+-------+-------+-------+-------+-------+-------+
+ *
+ * The header is called struct xfs_btree_block for reasons better left unknown
+ * and comes in different versions for short (32bit) and long (64bit) block
+ * pointers. The record and key structures are defined by the btree instances
+ * and opaque to the btree core. The block pointers are simple disk endian
+ * integers, available in a short (32bit) and long (64bit) variant.
+ *
+ * The helpers below calculate the offset of a given record, key or pointer
+ * into a btree block (xfs_btree_*_offset) or return a pointer to the given
+ * record, key or pointer (xfs_btree_*_addr). Note that all addressing
+ * inside the btree block is done using indices starting at one, not zero!
*/
-int /* success=1, failure=0 */
-xfs_btree_firstrec(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level) /* level to change */
+
+/*
+ * Return size of the btree block header for this btree instance.
+ */
+static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
{
- xfs_btree_block_t *block; /* generic btree block pointer */
- xfs_buf_t *bp; /* buffer containing block */
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+ return XFS_BTREE_LBLOCK_CRC_LEN;
+ return XFS_BTREE_LBLOCK_LEN;
+ }
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+ return XFS_BTREE_SBLOCK_CRC_LEN;
+ return XFS_BTREE_SBLOCK_LEN;
+}
- /*
- * Get the block pointer for this level.
- */
- block = xfs_btree_get_block(cur, level, &bp);
- xfs_btree_check_block(cur, block, level, bp);
- /*
- * It's empty, there is no such record.
- */
- if (!block->bb_h.bb_numrecs)
- return 0;
- /*
- * Set the ptr value to 1, that's the first record/key.
- */
- cur->bc_ptrs[level] = 1;
- return 1;
+/*
+ * Return size of btree block pointers for this btree instance.
+ */
+static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
+{
+ return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+ sizeof(__be64) : sizeof(__be32);
+}
+
+/*
+ * Calculate offset of the n-th record in a btree block.
+ */
+STATIC size_t
+xfs_btree_rec_offset(
+ struct xfs_btree_cur *cur,
+ int n)
+{
+ return xfs_btree_block_len(cur) +
+ (n - 1) * cur->bc_ops->rec_len;
+}
+
+/*
+ * Calculate offset of the n-th key in a btree block.
+ */
+STATIC size_t
+xfs_btree_key_offset(
+ struct xfs_btree_cur *cur,
+ int n)
+{
+ return xfs_btree_block_len(cur) +
+ (n - 1) * cur->bc_ops->key_len;
+}
+
+/*
+ * Calculate offset of the n-th block pointer in a btree block.
+ */
+STATIC size_t
+xfs_btree_ptr_offset(
+ struct xfs_btree_cur *cur,
+ int n,
+ int level)
+{
+ return xfs_btree_block_len(cur) +
+ cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
+ (n - 1) * xfs_btree_ptr_len(cur);
+}
+
+/*
+ * Return a pointer to the n-th record in the btree block.
+ */
+STATIC union xfs_btree_rec *
+xfs_btree_rec_addr(
+ struct xfs_btree_cur *cur,
+ int n,
+ struct xfs_btree_block *block)
+{
+ return (union xfs_btree_rec *)
+ ((char *)block + xfs_btree_rec_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th key in the btree block.
+ */
+STATIC union xfs_btree_key *
+xfs_btree_key_addr(
+ struct xfs_btree_cur *cur,
+ int n,
+ struct xfs_btree_block *block)
+{
+ return (union xfs_btree_key *)
+ ((char *)block + xfs_btree_key_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th block pointer in the btree block.
+ */
+STATIC union xfs_btree_ptr *
+xfs_btree_ptr_addr(
+ struct xfs_btree_cur *cur,
+ int n,
+ struct xfs_btree_block *block)
+{
+ int level = xfs_btree_get_level(block);
+
+ ASSERT(block->bb_level != 0);
+
+ return (union xfs_btree_ptr *)
+ ((char *)block + xfs_btree_ptr_offset(cur, n, level));
+}
+
+/*
+ * Get the root block which is stored in the inode.
+ *
+ * For now this btree implementation assumes the btree root is always
+ * stored in the if_broot field of an inode fork.
+ */
+STATIC struct xfs_btree_block *
+xfs_btree_get_iroot(
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_ifork *ifp;
+
+ ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+ return (struct xfs_btree_block *)ifp->if_broot;
}
/*
* Retrieve the block pointer from the cursor at the given level.
- * This may be a bmap btree root or from a buffer.
+ * This may be an inode btree root or from a buffer.
*/
-STATIC xfs_btree_block_t * /* generic btree block pointer */
+STATIC struct xfs_btree_block * /* generic btree block pointer */
xfs_btree_get_block(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
int level, /* level in btree */
- xfs_buf_t **bpp) /* buffer containing the block */
-{
- xfs_btree_block_t *block; /* return value */
- xfs_buf_t *bp; /* return buffer */
- xfs_ifork_t *ifp; /* inode fork pointer */
- int whichfork; /* data or attr fork */
-
- if (cur->bc_btnum == XFS_BTNUM_BMAP && level == cur->bc_nlevels - 1) {
- whichfork = cur->bc_private.b.whichfork;
- ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, whichfork);
- block = (xfs_btree_block_t *)ifp->if_broot;
- bp = NULL;
- } else {
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_buf **bpp) /* buffer containing the block */
+{
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level == cur->bc_nlevels - 1)) {
+ *bpp = NULL;
+ return xfs_btree_get_iroot(cur);
}
- ASSERT(block != NULL);
- *bpp = bp;
- return block;
+
+ *bpp = cur->bc_bufs[level];
+ return XFS_BUF_TO_BLOCK(*bpp);
}
/*
@@ -527,15 +554,11 @@ xfs_btree_get_bufl(
xfs_fsblock_t fsbno, /* file system block number */
uint lock) /* lock flags for get_buf */
{
- xfs_buf_t *bp; /* buffer pointer (return value) */
xfs_daddr_t d; /* real disk block address */
ASSERT(fsbno != NULLFSBLOCK);
d = XFS_FSB_TO_DADDR(mp, fsbno);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
- ASSERT(bp);
- ASSERT(!XFS_BUF_GETERROR(bp));
- return bp;
+ return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
}
/*
@@ -550,138 +573,72 @@ xfs_btree_get_bufs(
xfs_agblock_t agbno, /* allocation group block number */
uint lock) /* lock flags for get_buf */
{
- xfs_buf_t *bp; /* buffer pointer (return value) */
xfs_daddr_t d; /* real disk block address */
ASSERT(agno != NULLAGNUMBER);
ASSERT(agbno != NULLAGBLOCK);
d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
- ASSERT(bp);
- ASSERT(!XFS_BUF_GETERROR(bp));
- return bp;
+ return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
}
/*
- * Allocate a new btree cursor.
- * The cursor is either for allocation (A) or bmap (B) or inodes (I).
+ * Check for the cursor referring to the last block at the given level.
*/
-xfs_btree_cur_t * /* new btree cursor */
-xfs_btree_init_cursor(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_buf_t *agbp, /* (A only) buffer for agf structure */
- /* (I only) buffer for agi structure */
- xfs_agnumber_t agno, /* (AI only) allocation group number */
- xfs_btnum_t btnum, /* btree identifier */
- xfs_inode_t *ip, /* (B only) inode owning the btree */
- int whichfork) /* (B only) data or attr fork */
+int /* 1=is last block, 0=not last block */
+xfs_btree_islastblock(
+ xfs_btree_cur_t *cur, /* btree cursor */
+ int level) /* level to check */
{
- xfs_agf_t *agf; /* (A) allocation group freespace */
- xfs_agi_t *agi; /* (I) allocation group inodespace */
- xfs_btree_cur_t *cur; /* return value */
- xfs_ifork_t *ifp; /* (I) inode fork pointer */
- int nlevels=0; /* number of levels in the btree */
+ struct xfs_btree_block *block; /* generic btree block pointer */
+ xfs_buf_t *bp; /* buffer containing block */
- ASSERT(xfs_btree_cur_zone != NULL);
- /*
- * Allocate a new cursor.
- */
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
- /*
- * Deduce the number of btree levels from the arguments.
- */
- switch (btnum) {
- case XFS_BTNUM_BNO:
- case XFS_BTNUM_CNT:
- agf = XFS_BUF_TO_AGF(agbp);
- nlevels = be32_to_cpu(agf->agf_levels[btnum]);
- break;
- case XFS_BTNUM_BMAP:
- ifp = XFS_IFORK_PTR(ip, whichfork);
- nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
- break;
- case XFS_BTNUM_INO:
- agi = XFS_BUF_TO_AGI(agbp);
- nlevels = be32_to_cpu(agi->agi_level);
- break;
- default:
- ASSERT(0);
- }
- /*
- * Fill in the common fields.
- */
- cur->bc_tp = tp;
- cur->bc_mp = mp;
- cur->bc_nlevels = nlevels;
- cur->bc_btnum = btnum;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
- /*
- * Fill in private fields.
- */
- switch (btnum) {
- case XFS_BTNUM_BNO:
- case XFS_BTNUM_CNT:
- /*
- * Allocation btree fields.
- */
- cur->bc_private.a.agbp = agbp;
- cur->bc_private.a.agno = agno;
- break;
- case XFS_BTNUM_BMAP:
- /*
- * Bmap btree fields.
- */
- cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
- cur->bc_private.b.ip = ip;
- cur->bc_private.b.firstblock = NULLFSBLOCK;
- cur->bc_private.b.flist = NULL;
- cur->bc_private.b.allocated = 0;
- cur->bc_private.b.flags = 0;
- cur->bc_private.b.whichfork = whichfork;
- break;
- case XFS_BTNUM_INO:
- /*
- * Inode allocation btree fields.
- */
- cur->bc_private.i.agbp = agbp;
- cur->bc_private.i.agno = agno;
- break;
- default:
- ASSERT(0);
- }
- return cur;
+ block = xfs_btree_get_block(cur, level, &bp);
+ xfs_btree_check_block(cur, block, level, bp);
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO);
+ else
+ return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
}
/*
- * Check for the cursor referring to the last block at the given level.
+ * Change the cursor to point to the first record at the given level.
+ * Other levels are unaffected.
*/
-int /* 1=is last block, 0=not last block */
-xfs_btree_islastblock(
+STATIC int /* success=1, failure=0 */
+xfs_btree_firstrec(
xfs_btree_cur_t *cur, /* btree cursor */
- int level) /* level to check */
+ int level) /* level to change */
{
- xfs_btree_block_t *block; /* generic btree block pointer */
+ struct xfs_btree_block *block; /* generic btree block pointer */
xfs_buf_t *bp; /* buffer containing block */
+ /*
+ * Get the block pointer for this level.
+ */
block = xfs_btree_get_block(cur, level, &bp);
xfs_btree_check_block(cur, block, level, bp);
- if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
- return be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO;
- else
- return be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK;
+ /*
+ * It's empty, there is no such record.
+ */
+ if (!block->bb_numrecs)
+ return 0;
+ /*
+ * Set the ptr value to 1, that's the first record/key.
+ */
+ cur->bc_ptrs[level] = 1;
+ return 1;
}
/*
* Change the cursor to point to the last record in the current block
* at the given level. Other levels are unaffected.
*/
-int /* success=1, failure=0 */
+STATIC int /* success=1, failure=0 */
xfs_btree_lastrec(
xfs_btree_cur_t *cur, /* btree cursor */
int level) /* level to change */
{
- xfs_btree_block_t *block; /* generic btree block pointer */
+ struct xfs_btree_block *block; /* generic btree block pointer */
xfs_buf_t *bp; /* buffer containing block */
/*
@@ -692,12 +649,12 @@ xfs_btree_lastrec(
/*
* It's empty, there is no such record.
*/
- if (!block->bb_h.bb_numrecs)
+ if (!block->bb_numrecs)
return 0;
/*
* Set the ptr value to numrecs, that's the last record/key.
*/
- cur->bc_ptrs[level] = be16_to_cpu(block->bb_h.bb_numrecs);
+ cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
return 1;
}
@@ -741,69 +698,28 @@ xfs_btree_offsets(
* Get a buffer for the block, return it read in.
* Long-form addressing.
*/
-int /* error */
+int
xfs_btree_read_bufl(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_fsblock_t fsbno, /* file system block number */
- uint lock, /* lock flags for read_buf */
- xfs_buf_t **bpp, /* buffer for fsbno */
- int refval) /* ref count value for buffer */
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t fsbno, /* file system block number */
+ uint lock, /* lock flags for read_buf */
+ struct xfs_buf **bpp, /* buffer for fsbno */
+ int refval, /* ref count value for buffer */
+ const struct xfs_buf_ops *ops)
{
- xfs_buf_t *bp; /* return value */
+ struct xfs_buf *bp; /* return value */
xfs_daddr_t d; /* real disk block address */
- int error;
+ int error;
ASSERT(fsbno != NULLFSBLOCK);
d = XFS_FSB_TO_DADDR(mp, fsbno);
- if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
- mp->m_bsize, lock, &bp))) {
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+ mp->m_bsize, lock, &bp, ops);
+ if (error)
return error;
- }
- ASSERT(!bp || !XFS_BUF_GETERROR(bp));
- if (bp != NULL) {
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
- }
- *bpp = bp;
- return 0;
-}
-
-/*
- * Get a buffer for the block, return it read in.
- * Short-form addressing.
- */
-int /* error */
-xfs_btree_read_bufs(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- uint lock, /* lock flags for read_buf */
- xfs_buf_t **bpp, /* buffer for agno/agbno */
- int refval) /* ref count value for buffer */
-{
- xfs_buf_t *bp; /* return value */
- xfs_daddr_t d; /* real disk block address */
- int error;
-
- ASSERT(agno != NULLAGNUMBER);
- ASSERT(agbno != NULLAGBLOCK);
- d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
- mp->m_bsize, lock, &bp))) {
- return error;
- }
- ASSERT(!bp || !XFS_BUF_GETERROR(bp));
- if (bp != NULL) {
- switch (refval) {
- case XFS_ALLOC_BTREE_REF:
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
- break;
- case XFS_INO_BTREE_REF:
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval);
- break;
- }
- }
+ if (bp)
+ xfs_buf_set_ref(bp, refval);
*bpp = bp;
return 0;
}
@@ -815,15 +731,16 @@ xfs_btree_read_bufs(
/* ARGSUSED */
void
xfs_btree_reada_bufl(
- xfs_mount_t *mp, /* file system mount point */
- xfs_fsblock_t fsbno, /* file system block number */
- xfs_extlen_t count) /* count of filesystem blocks */
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_fsblock_t fsbno, /* file system block number */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops)
{
xfs_daddr_t d;
ASSERT(fsbno != NULLFSBLOCK);
d = XFS_FSB_TO_DADDR(mp, fsbno);
- xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
+ xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
}
/*
@@ -833,111 +750,3320 @@ xfs_btree_reada_bufl(
/* ARGSUSED */
void
xfs_btree_reada_bufs(
- xfs_mount_t *mp, /* file system mount point */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- xfs_extlen_t count) /* count of filesystem blocks */
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_agblock_t agbno, /* allocation group block number */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops)
{
xfs_daddr_t d;
ASSERT(agno != NULLAGNUMBER);
ASSERT(agbno != NULLAGBLOCK);
d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
+ xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
+}
+
+STATIC int
+xfs_btree_readahead_lblock(
+ struct xfs_btree_cur *cur,
+ int lr,
+ struct xfs_btree_block *block)
+{
+ int rval = 0;
+ xfs_dfsbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+ xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+ if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
+ xfs_btree_reada_bufl(cur->bc_mp, left, 1,
+ cur->bc_ops->buf_ops);
+ rval++;
+ }
+
+ if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
+ xfs_btree_reada_bufl(cur->bc_mp, right, 1,
+ cur->bc_ops->buf_ops);
+ rval++;
+ }
+
+ return rval;
+}
+
+STATIC int
+xfs_btree_readahead_sblock(
+ struct xfs_btree_cur *cur,
+ int lr,
+ struct xfs_btree_block *block)
+{
+ int rval = 0;
+ xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib);
+ xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib);
+
+
+ if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+ left, 1, cur->bc_ops->buf_ops);
+ rval++;
+ }
+
+ if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+ right, 1, cur->bc_ops->buf_ops);
+ rval++;
+ }
+
+ return rval;
}
/*
* Read-ahead btree blocks, at the given level.
* Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
*/
-int
-xfs_btree_readahead_core(
- xfs_btree_cur_t *cur, /* btree cursor */
+STATIC int
+xfs_btree_readahead(
+ struct xfs_btree_cur *cur, /* btree cursor */
int lev, /* level in btree */
int lr) /* left/right bits */
{
- xfs_alloc_block_t *a;
- xfs_bmbt_block_t *b;
- xfs_inobt_block_t *i;
- int rval = 0;
+ struct xfs_btree_block *block;
+
+ /*
+ * No readahead needed if we are at the root level and the
+ * btree root is stored in the inode.
+ */
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (lev == cur->bc_nlevels - 1))
+ return 0;
+
+ if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
+ return 0;
- ASSERT(cur->bc_bufs[lev] != NULL);
cur->bc_ra[lev] |= lr;
- switch (cur->bc_btnum) {
- case XFS_BTNUM_BNO:
- case XFS_BTNUM_CNT:
- a = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]);
- if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(a->bb_leftsib) != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
- be32_to_cpu(a->bb_leftsib), 1);
- rval++;
- }
- if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(a->bb_rightsib) != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
- be32_to_cpu(a->bb_rightsib), 1);
- rval++;
- }
- break;
- case XFS_BTNUM_BMAP:
- b = XFS_BUF_TO_BMBT_BLOCK(cur->bc_bufs[lev]);
- if ((lr & XFS_BTCUR_LEFTRA) && be64_to_cpu(b->bb_leftsib) != NULLDFSBNO) {
- xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_leftsib), 1);
- rval++;
- }
- if ((lr & XFS_BTCUR_RIGHTRA) && be64_to_cpu(b->bb_rightsib) != NULLDFSBNO) {
- xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_rightsib), 1);
- rval++;
- }
- break;
- case XFS_BTNUM_INO:
- i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]);
- if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno,
- be32_to_cpu(i->bb_leftsib), 1);
- rval++;
- }
- if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno,
- be32_to_cpu(i->bb_rightsib), 1);
- rval++;
- }
- break;
- default:
- ASSERT(0);
+ block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return xfs_btree_readahead_lblock(cur, lr, block);
+ return xfs_btree_readahead_sblock(cur, lr, block);
+}
+
+STATIC xfs_daddr_t
+xfs_btree_ptr_to_daddr(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
+
+ return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+ } else {
+ ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
+ ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
+
+ return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+ be32_to_cpu(ptr->s));
}
- return rval;
+}
+
+/*
+ * Readahead @count btree blocks at the given @ptr location.
+ *
+ * We don't need to care about long or short form btrees here as we have a
+ * method of converting the ptr directly to a daddr available to us.
+ */
+STATIC void
+xfs_btree_readahead_ptr(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ xfs_extlen_t count)
+{
+ xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
+ xfs_btree_ptr_to_daddr(cur, ptr),
+ cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
}
/*
* Set the buffer for level "lev" in the cursor to bp, releasing
* any previous buffer.
*/
-void
+STATIC void
xfs_btree_setbuf(
xfs_btree_cur_t *cur, /* btree cursor */
int lev, /* level in btree */
xfs_buf_t *bp) /* new buffer to set */
{
- xfs_btree_block_t *b; /* btree block */
- xfs_buf_t *obp; /* old buffer pointer */
+ struct xfs_btree_block *b; /* btree block */
- obp = cur->bc_bufs[lev];
- if (obp)
- xfs_trans_brelse(cur->bc_tp, obp);
+ if (cur->bc_bufs[lev])
+ xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
cur->bc_bufs[lev] = bp;
cur->bc_ra[lev] = 0;
- if (!bp)
- return;
+
b = XFS_BUF_TO_BLOCK(bp);
- if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) {
- if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO)
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO))
cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
- if (be64_to_cpu(b->bb_u.l.bb_rightsib) == NULLDFSBNO)
+ if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO))
cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
} else {
- if (be32_to_cpu(b->bb_u.s.bb_leftsib) == NULLAGBLOCK)
+ if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
- if (be32_to_cpu(b->bb_u.s.bb_rightsib) == NULLAGBLOCK)
+ if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
}
}
+
+STATIC int
+xfs_btree_ptr_is_null(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return ptr->l == cpu_to_be64(NULLDFSBNO);
+ else
+ return ptr->s == cpu_to_be32(NULLAGBLOCK);
+}
+
+STATIC void
+xfs_btree_set_ptr_null(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ ptr->l = cpu_to_be64(NULLDFSBNO);
+ else
+ ptr->s = cpu_to_be32(NULLAGBLOCK);
+}
+
+/*
+ * Get/set/init sibling pointers
+ */
+STATIC void
+xfs_btree_get_sibling(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_ptr *ptr,
+ int lr)
+{
+ ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (lr == XFS_BB_RIGHTSIB)
+ ptr->l = block->bb_u.l.bb_rightsib;
+ else
+ ptr->l = block->bb_u.l.bb_leftsib;
+ } else {
+ if (lr == XFS_BB_RIGHTSIB)
+ ptr->s = block->bb_u.s.bb_rightsib;
+ else
+ ptr->s = block->bb_u.s.bb_leftsib;
+ }
+}
+
+STATIC void
+xfs_btree_set_sibling(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_ptr *ptr,
+ int lr)
+{
+ ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (lr == XFS_BB_RIGHTSIB)
+ block->bb_u.l.bb_rightsib = ptr->l;
+ else
+ block->bb_u.l.bb_leftsib = ptr->l;
+ } else {
+ if (lr == XFS_BB_RIGHTSIB)
+ block->bb_u.s.bb_rightsib = ptr->s;
+ else
+ block->bb_u.s.bb_leftsib = ptr->s;
+ }
+}
+
+void
+xfs_btree_init_block_int(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *buf,
+ xfs_daddr_t blkno,
+ __u32 magic,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner,
+ unsigned int flags)
+{
+ buf->bb_magic = cpu_to_be32(magic);
+ buf->bb_level = cpu_to_be16(level);
+ buf->bb_numrecs = cpu_to_be16(numrecs);
+
+ if (flags & XFS_BTREE_LONG_PTRS) {
+ buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+ buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+ if (flags & XFS_BTREE_CRC_BLOCKS) {
+ buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
+ buf->bb_u.l.bb_owner = cpu_to_be64(owner);
+ uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
+ buf->bb_u.l.bb_pad = 0;
+ buf->bb_u.l.bb_lsn = 0;
+ }
+ } else {
+ /* owner is a 32 bit value on short blocks */
+ __u32 __owner = (__u32)owner;
+
+ buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+ buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+ if (flags & XFS_BTREE_CRC_BLOCKS) {
+ buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
+ buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
+ uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
+ buf->bb_u.s.bb_lsn = 0;
+ }
+ }
+}
+
+void
+xfs_btree_init_block(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ __u32 magic,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner,
+ unsigned int flags)
+{
+ xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
+ magic, level, numrecs, owner, flags);
+}
+
+STATIC void
+xfs_btree_init_block_cur(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int level,
+ int numrecs)
+{
+ __u64 owner;
+
+ /*
+ * we can pull the owner from the cursor right now as the different
+ * owners align directly with the pointer size of the btree. This may
+ * change in future, but is safe for current users of the generic btree
+ * code.
+ */
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ owner = cur->bc_private.b.ip->i_ino;
+ else
+ owner = cur->bc_private.a.agno;
+
+ xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
+ xfs_btree_magic(cur), level, numrecs,
+ owner, cur->bc_flags);
+}
+
+/*
+ * Return true if ptr is the last record in the btree and
+ * we need to track updates to this record. The decision
+ * will be further refined in the update_lastrec method.
+ */
+STATIC int
+xfs_btree_is_lastrec(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level)
+{
+ union xfs_btree_ptr ptr;
+
+ if (level > 0)
+ return 0;
+ if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
+ return 0;
+
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ if (!xfs_btree_ptr_is_null(cur, &ptr))
+ return 0;
+ return 1;
+}
+
+STATIC void
+xfs_btree_buf_to_ptr(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
+ XFS_BUF_ADDR(bp)));
+ else {
+ ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
+ XFS_BUF_ADDR(bp)));
+ }
+}
+
+STATIC void
+xfs_btree_set_refs(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ switch (cur->bc_btnum) {
+ case XFS_BTNUM_BNO:
+ case XFS_BTNUM_CNT:
+ xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
+ break;
+ case XFS_BTNUM_INO:
+ case XFS_BTNUM_FINO:
+ xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
+ break;
+ case XFS_BTNUM_BMAP:
+ xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
+ break;
+ default:
+ ASSERT(0);
+ }
+}
+
+STATIC int
+xfs_btree_get_buf_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int flags,
+ struct xfs_btree_block **block,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_daddr_t d;
+
+ /* need to sort out how callers deal with failures first */
+ ASSERT(!(flags & XBF_TRYLOCK));
+
+ d = xfs_btree_ptr_to_daddr(cur, ptr);
+ *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
+ mp->m_bsize, flags);
+
+ if (!*bpp)
+ return ENOMEM;
+
+ (*bpp)->b_ops = cur->bc_ops->buf_ops;
+ *block = XFS_BUF_TO_BLOCK(*bpp);
+ return 0;
+}
+
+/*
+ * Read in the buffer at the given ptr and return the buffer and
+ * the block pointer within the buffer.
+ */
+STATIC int
+xfs_btree_read_buf_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int flags,
+ struct xfs_btree_block **block,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_daddr_t d;
+ int error;
+
+ /* need to sort out how callers deal with failures first */
+ ASSERT(!(flags & XBF_TRYLOCK));
+
+ d = xfs_btree_ptr_to_daddr(cur, ptr);
+ error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
+ mp->m_bsize, flags, bpp,
+ cur->bc_ops->buf_ops);
+ if (error)
+ return error;
+
+ xfs_btree_set_refs(cur, *bpp);
+ *block = XFS_BUF_TO_BLOCK(*bpp);
+ return 0;
+}
+
+/*
+ * Copy keys from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *dst_key,
+ union xfs_btree_key *src_key,
+ int numkeys)
+{
+ ASSERT(numkeys >= 0);
+ memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Copy records from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_recs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *dst_rec,
+ union xfs_btree_rec *src_rec,
+ int numrecs)
+{
+ ASSERT(numrecs >= 0);
+ memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Copy block pointers from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_ptrs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *dst_ptr,
+ union xfs_btree_ptr *src_ptr,
+ int numptrs)
+{
+ ASSERT(numptrs >= 0);
+ memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Shift keys one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key,
+ int dir,
+ int numkeys)
+{
+ char *dst_key;
+
+ ASSERT(numkeys >= 0);
+ ASSERT(dir == 1 || dir == -1);
+
+ dst_key = (char *)key + (dir * cur->bc_ops->key_len);
+ memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Shift records one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_recs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ int dir,
+ int numrecs)
+{
+ char *dst_rec;
+
+ ASSERT(numrecs >= 0);
+ ASSERT(dir == 1 || dir == -1);
+
+ dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
+ memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Shift block pointers one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_ptrs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int dir,
+ int numptrs)
+{
+ char *dst_ptr;
+
+ ASSERT(numptrs >= 0);
+ ASSERT(dir == 1 || dir == -1);
+
+ dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
+ memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Log key values from the btree block.
+ */
+STATIC void
+xfs_btree_log_keys(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int first,
+ int last)
+{
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+ if (bp) {
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+ xfs_trans_log_buf(cur->bc_tp, bp,
+ xfs_btree_key_offset(cur, first),
+ xfs_btree_key_offset(cur, last + 1) - 1);
+ } else {
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+ xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log record values from the btree block.
+ */
+void
+xfs_btree_log_recs(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int first,
+ int last)
+{
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+ xfs_trans_log_buf(cur->bc_tp, bp,
+ xfs_btree_rec_offset(cur, first),
+ xfs_btree_rec_offset(cur, last + 1) - 1);
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log block pointer fields from a btree block (nonleaf).
+ */
+STATIC void
+xfs_btree_log_ptrs(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_buf *bp, /* buffer containing btree block */
+ int first, /* index of first pointer to log */
+ int last) /* index of last pointer to log */
+{
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+ if (bp) {
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ int level = xfs_btree_get_level(block);
+
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+ xfs_trans_log_buf(cur->bc_tp, bp,
+ xfs_btree_ptr_offset(cur, first, level),
+ xfs_btree_ptr_offset(cur, last + 1, level) - 1);
+ } else {
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+ xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log fields from a btree block header.
+ */
+void
+xfs_btree_log_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_buf *bp, /* buffer containing btree block */
+ int fields) /* mask of fields: XFS_BB_... */
+{
+ int first; /* first byte offset logged */
+ int last; /* last byte offset logged */
+ static const short soffsets[] = { /* table of offsets (short) */
+ offsetof(struct xfs_btree_block, bb_magic),
+ offsetof(struct xfs_btree_block, bb_level),
+ offsetof(struct xfs_btree_block, bb_numrecs),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
+ XFS_BTREE_SBLOCK_CRC_LEN
+ };
+ static const short loffsets[] = { /* table of offsets (long) */
+ offsetof(struct xfs_btree_block, bb_magic),
+ offsetof(struct xfs_btree_block, bb_level),
+ offsetof(struct xfs_btree_block, bb_numrecs),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
+ XFS_BTREE_LBLOCK_CRC_LEN
+ };
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
+
+ if (bp) {
+ int nbits;
+
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ /*
+ * We don't log the CRC when updating a btree
+ * block but instead recreate it during log
+ * recovery. As the log buffers have checksums
+ * of their own this is safe and avoids logging a crc
+ * update in a lot of places.
+ */
+ if (fields == XFS_BB_ALL_BITS)
+ fields = XFS_BB_ALL_BITS_CRC;
+ nbits = XFS_BB_NUM_BITS_CRC;
+ } else {
+ nbits = XFS_BB_NUM_BITS;
+ }
+ xfs_btree_offsets(fields,
+ (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+ loffsets : soffsets,
+ nbits, &first, &last);
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+ xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+ } else {
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+ xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int /* error */
+xfs_btree_increment(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block;
+ union xfs_btree_ptr ptr;
+ struct xfs_buf *bp;
+ int error; /* error return value */
+ int lev;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGI(cur, level);
+
+ ASSERT(level < cur->bc_nlevels);
+
+ /* Read-ahead to the right at this level. */
+ xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+ /* Get a pointer to the btree block. */
+ block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+#endif
+
+ /* We're done if we remain in the block after the increment. */
+ if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
+ goto out1;
+
+ /* Fail if we just went off the right edge of the tree. */
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ if (xfs_btree_ptr_is_null(cur, &ptr))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, increment);
+
+ /*
+ * March up the tree incrementing pointers.
+ * Stop when we don't go off the right edge of a block.
+ */
+ for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+ block = xfs_btree_get_block(cur, lev, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, lev, bp);
+ if (error)
+ goto error0;
+#endif
+
+ if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
+ break;
+
+ /* Read-ahead the right block for the next loop. */
+ xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+ }
+
+ /*
+ * If we went off the root then we are either seriously
+ * confused or have the tree root in an inode.
+ */
+ if (lev == cur->bc_nlevels) {
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ goto out0;
+ ASSERT(0);
+ error = EFSCORRUPTED;
+ goto error0;
+ }
+ ASSERT(lev < cur->bc_nlevels);
+
+ /*
+ * Now walk back down the tree, fixing up the cursor's buffer
+ * pointers and key numbers.
+ */
+ for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+ union xfs_btree_ptr *ptrp;
+
+ ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+ --lev;
+ error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
+ if (error)
+ goto error0;
+
+ xfs_btree_setbuf(cur, lev, bp);
+ cur->bc_ptrs[lev] = 1;
+ }
+out1:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int /* error */
+xfs_btree_decrement(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block;
+ xfs_buf_t *bp;
+ int error; /* error return value */
+ int lev;
+ union xfs_btree_ptr ptr;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGI(cur, level);
+
+ ASSERT(level < cur->bc_nlevels);
+
+ /* Read-ahead to the left at this level. */
+ xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
+
+ /* We're done if we remain in the block after the decrement. */
+ if (--cur->bc_ptrs[level] > 0)
+ goto out1;
+
+ /* Get a pointer to the btree block. */
+ block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+#endif
+
+ /* Fail if we just went off the left edge of the tree. */
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+ if (xfs_btree_ptr_is_null(cur, &ptr))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, decrement);
+
+ /*
+ * March up the tree decrementing pointers.
+ * Stop when we don't go off the left edge of a block.
+ */
+ for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+ if (--cur->bc_ptrs[lev] > 0)
+ break;
+ /* Read-ahead the left block for the next loop. */
+ xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+ }
+
+ /*
+ * If we went off the root then we are seriously confused.
+ * or the root of the tree is in an inode.
+ */
+ if (lev == cur->bc_nlevels) {
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ goto out0;
+ ASSERT(0);
+ error = EFSCORRUPTED;
+ goto error0;
+ }
+ ASSERT(lev < cur->bc_nlevels);
+
+ /*
+ * Now walk back down the tree, fixing up the cursor's buffer
+ * pointers and key numbers.
+ */
+ for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+ union xfs_btree_ptr *ptrp;
+
+ ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+ --lev;
+ error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
+ if (error)
+ goto error0;
+ xfs_btree_setbuf(cur, lev, bp);
+ cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
+ }
+out1:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+STATIC int
+xfs_btree_lookup_get_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level in the btree */
+ union xfs_btree_ptr *pp, /* ptr to btree block */
+ struct xfs_btree_block **blkp) /* return btree block */
+{
+ struct xfs_buf *bp; /* buffer pointer for btree block */
+ int error = 0;
+
+ /* special case the root block if in an inode */
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level == cur->bc_nlevels - 1)) {
+ *blkp = xfs_btree_get_iroot(cur);
+ return 0;
+ }
+
+ /*
+ * If the old buffer at this level for the disk address we are
+ * looking for re-use it.
+ *
+ * Otherwise throw it away and get a new one.
+ */
+ bp = cur->bc_bufs[level];
+ if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
+ *blkp = XFS_BUF_TO_BLOCK(bp);
+ return 0;
+ }
+
+ error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
+ if (error)
+ return error;
+
+ xfs_btree_setbuf(cur, level, bp);
+ return 0;
+}
+
+/*
+ * Get current search key. For level 0 we don't actually have a key
+ * structure so we make one up from the record. For all other levels
+ * we just return the right key.
+ */
+STATIC union xfs_btree_key *
+xfs_lookup_get_search_key(
+ struct xfs_btree_cur *cur,
+ int level,
+ int keyno,
+ struct xfs_btree_block *block,
+ union xfs_btree_key *kp)
+{
+ if (level == 0) {
+ cur->bc_ops->init_key_from_rec(kp,
+ xfs_btree_rec_addr(cur, keyno, block));
+ return kp;
+ }
+
+ return xfs_btree_key_addr(cur, keyno, block);
+}
+
+/*
+ * Lookup the record. The cursor is made to point to it, based on dir.
+ * stat is set to 0 if can't find any such record, 1 for success.
+ */
+int /* error */
+xfs_btree_lookup(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_lookup_t dir, /* <=, ==, or >= */
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block; /* current btree block */
+ __int64_t diff; /* difference for the current key */
+ int error; /* error return value */
+ int keyno; /* current key number */
+ int level; /* level in the btree */
+ union xfs_btree_ptr *pp; /* ptr to btree block */
+ union xfs_btree_ptr ptr; /* ptr to btree block */
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGI(cur, dir);
+
+ XFS_BTREE_STATS_INC(cur, lookup);
+
+ block = NULL;
+ keyno = 0;
+
+ /* initialise start pointer from cursor */
+ cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ pp = &ptr;
+
+ /*
+ * Iterate over each level in the btree, starting at the root.
+ * For each level above the leaves, find the key we need, based
+ * on the lookup record, then follow the corresponding block
+ * pointer down to the next level.
+ */
+ for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+ /* Get the block we need to do the lookup on. */
+ error = xfs_btree_lookup_get_block(cur, level, pp, &block);
+ if (error)
+ goto error0;
+
+ if (diff == 0) {
+ /*
+ * If we already had a key match at a higher level, we
+ * know we need to use the first entry in this block.
+ */
+ keyno = 1;
+ } else {
+ /* Otherwise search this block. Do a binary search. */
+
+ int high; /* high entry number */
+ int low; /* low entry number */
+
+ /* Set low and high entry numbers, 1-based. */
+ low = 1;
+ high = xfs_btree_get_numrecs(block);
+ if (!high) {
+ /* Block is empty, must be an empty leaf. */
+ ASSERT(level == 0 && cur->bc_nlevels == 1);
+
+ cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+
+ /* Binary search the block. */
+ while (low <= high) {
+ union xfs_btree_key key;
+ union xfs_btree_key *kp;
+
+ XFS_BTREE_STATS_INC(cur, compare);
+
+ /* keyno is average of low and high. */
+ keyno = (low + high) >> 1;
+
+ /* Get current search key */
+ kp = xfs_lookup_get_search_key(cur, level,
+ keyno, block, &key);
+
+ /*
+ * Compute difference to get next direction:
+ * - less than, move right
+ * - greater than, move left
+ * - equal, we're done
+ */
+ diff = cur->bc_ops->key_diff(cur, kp);
+ if (diff < 0)
+ low = keyno + 1;
+ else if (diff > 0)
+ high = keyno - 1;
+ else
+ break;
+ }
+ }
+
+ /*
+ * If there are more levels, set up for the next level
+ * by getting the block number and filling in the cursor.
+ */
+ if (level > 0) {
+ /*
+ * If we moved left, need the previous key number,
+ * unless there isn't one.
+ */
+ if (diff > 0 && --keyno < 1)
+ keyno = 1;
+ pp = xfs_btree_ptr_addr(cur, keyno, block);
+
+#ifdef DEBUG
+ error = xfs_btree_check_ptr(cur, pp, 0, level);
+ if (error)
+ goto error0;
+#endif
+ cur->bc_ptrs[level] = keyno;
+ }
+ }
+
+ /* Done with the search. See if we need to adjust the results. */
+ if (dir != XFS_LOOKUP_LE && diff < 0) {
+ keyno++;
+ /*
+ * If ge search and we went off the end of the block, but it's
+ * not the last block, we're in the wrong block.
+ */
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ if (dir == XFS_LOOKUP_GE &&
+ keyno > xfs_btree_get_numrecs(block) &&
+ !xfs_btree_ptr_is_null(cur, &ptr)) {
+ int i;
+
+ cur->bc_ptrs[0] = keyno;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+ }
+ } else if (dir == XFS_LOOKUP_LE && diff > 0)
+ keyno--;
+ cur->bc_ptrs[0] = keyno;
+
+ /* Return if we succeeded or not. */
+ if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
+ *stat = 0;
+ else if (dir != XFS_LOOKUP_EQ || diff == 0)
+ *stat = 1;
+ else
+ *stat = 0;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Update keys at all levels from here to the root along the cursor's path.
+ */
+STATIC int
+xfs_btree_updkey(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *keyp,
+ int level)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ union xfs_btree_key *kp;
+ int ptr;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
+
+ ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
+
+ /*
+ * Go up the tree from this level toward the root.
+ * At each level, update the key value to the value input.
+ * Stop when we reach a level where the cursor isn't pointing
+ * at the first entry in the block.
+ */
+ for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+#ifdef DEBUG
+ int error;
+#endif
+ block = xfs_btree_get_block(cur, level, &bp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+ }
+#endif
+ ptr = cur->bc_ptrs[level];
+ kp = xfs_btree_key_addr(cur, ptr, block);
+ xfs_btree_copy_keys(cur, kp, keyp, 1);
+ xfs_btree_log_keys(cur, bp, ptr, ptr);
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+}
+
+/*
+ * Update the record referred to by cur to the value in the
+ * given record. This either works (return 0) or gets an
+ * EFSCORRUPTED error.
+ */
+int
+xfs_btree_update(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ int error;
+ int ptr;
+ union xfs_btree_rec *rp;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGR(cur, rec);
+
+ /* Pick up the current block. */
+ block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, 0, bp);
+ if (error)
+ goto error0;
+#endif
+ /* Get the address of the rec to be updated. */
+ ptr = cur->bc_ptrs[0];
+ rp = xfs_btree_rec_addr(cur, ptr, block);
+
+ /* Fill in the new contents and log them. */
+ xfs_btree_copy_recs(cur, rp, rec, 1);
+ xfs_btree_log_recs(cur, bp, ptr, ptr);
+
+ /*
+ * If we are tracking the last record in the tree and
+ * we are at the far right edge of the tree, update it.
+ */
+ if (xfs_btree_is_lastrec(cur, block, 0)) {
+ cur->bc_ops->update_lastrec(cur, block, rec,
+ ptr, LASTREC_UPDATE);
+ }
+
+ /* Updating first rec in leaf. Pass new key value up to our parent. */
+ if (ptr == 1) {
+ union xfs_btree_key key;
+
+ cur->bc_ops->init_key_from_rec(&key, rec);
+ error = xfs_btree_updkey(cur, &key, 1);
+ if (error)
+ goto error0;
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Move 1 record left from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int /* error */
+xfs_btree_lshift(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ union xfs_btree_key key; /* btree key */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ int lrecs; /* left record count */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ int rrecs; /* right record count */
+ union xfs_btree_ptr lptr; /* left btree pointer */
+ union xfs_btree_key *rkp = NULL; /* right btree key */
+ union xfs_btree_ptr *rpp = NULL; /* right address pointer */
+ union xfs_btree_rec *rrp = NULL; /* right record pointer */
+ int error; /* error return value */
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGI(cur, level);
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == cur->bc_nlevels - 1)
+ goto out0;
+
+ /* Set up variables for this block as "right". */
+ right = xfs_btree_get_block(cur, level, &rbp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, right, level, rbp);
+ if (error)
+ goto error0;
+#endif
+
+ /* If we've got no left sibling then we can't shift an entry left. */
+ xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+ if (xfs_btree_ptr_is_null(cur, &lptr))
+ goto out0;
+
+ /*
+ * If the cursor entry is the one that would be moved, don't
+ * do it... it's too complicated.
+ */
+ if (cur->bc_ptrs[level] <= 1)
+ goto out0;
+
+ /* Set up the left neighbor as "left". */
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+ if (error)
+ goto error0;
+
+ /* If it's full, it can't take another entry. */
+ lrecs = xfs_btree_get_numrecs(left);
+ if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
+ goto out0;
+
+ rrecs = xfs_btree_get_numrecs(right);
+
+ /*
+ * We add one entry to the left side and remove one for the right side.
+ * Account for it here, the changes will be updated on disk and logged
+ * later.
+ */
+ lrecs++;
+ rrecs--;
+
+ XFS_BTREE_STATS_INC(cur, lshift);
+ XFS_BTREE_STATS_ADD(cur, moves, 1);
+
+ /*
+ * If non-leaf, copy a key and a ptr to the left block.
+ * Log the changes to the left block.
+ */
+ if (level > 0) {
+ /* It's a non-leaf. Move keys and pointers. */
+ union xfs_btree_key *lkp; /* left btree key */
+ union xfs_btree_ptr *lpp; /* left address pointer */
+
+ lkp = xfs_btree_key_addr(cur, lrecs, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+
+ lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+ error = xfs_btree_check_ptr(cur, rpp, 0, level);
+ if (error)
+ goto error0;
+#endif
+ xfs_btree_copy_keys(cur, lkp, rkp, 1);
+ xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
+
+ xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
+ xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
+
+ ASSERT(cur->bc_ops->keys_inorder(cur,
+ xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
+ } else {
+ /* It's a leaf. Move records. */
+ union xfs_btree_rec *lrp; /* left record pointer */
+
+ lrp = xfs_btree_rec_addr(cur, lrecs, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ xfs_btree_copy_recs(cur, lrp, rrp, 1);
+ xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
+
+ ASSERT(cur->bc_ops->recs_inorder(cur,
+ xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
+ }
+
+ xfs_btree_set_numrecs(left, lrecs);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+ xfs_btree_set_numrecs(right, rrecs);
+ xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+ /*
+ * Slide the contents of right down one entry.
+ */
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
+ if (level > 0) {
+ /* It's a nonleaf. operate on keys and ptrs */
+#ifdef DEBUG
+ int i; /* loop index */
+
+ for (i = 0; i < rrecs; i++) {
+ error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
+ if (error)
+ goto error0;
+ }
+#endif
+ xfs_btree_shift_keys(cur,
+ xfs_btree_key_addr(cur, 2, right),
+ -1, rrecs);
+ xfs_btree_shift_ptrs(cur,
+ xfs_btree_ptr_addr(cur, 2, right),
+ -1, rrecs);
+
+ xfs_btree_log_keys(cur, rbp, 1, rrecs);
+ xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+ } else {
+ /* It's a leaf. operate on records */
+ xfs_btree_shift_recs(cur,
+ xfs_btree_rec_addr(cur, 2, right),
+ -1, rrecs);
+ xfs_btree_log_recs(cur, rbp, 1, rrecs);
+
+ /*
+ * If it's the first record in the block, we'll need a key
+ * structure to pass up to the next level (updkey).
+ */
+ cur->bc_ops->init_key_from_rec(&key,
+ xfs_btree_rec_addr(cur, 1, right));
+ rkp = &key;
+ }
+
+ /* Update the parent key values of right. */
+ error = xfs_btree_updkey(cur, rkp, level + 1);
+ if (error)
+ goto error0;
+
+ /* Slide the cursor value left one. */
+ cur->bc_ptrs[level]--;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Move 1 record right from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int /* error */
+xfs_btree_rshift(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ union xfs_btree_key key; /* btree key */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ struct xfs_btree_cur *tcur; /* temporary btree cursor */
+ union xfs_btree_ptr rptr; /* right block pointer */
+ union xfs_btree_key *rkp; /* right btree key */
+ int rrecs; /* right record count */
+ int lrecs; /* left record count */
+ int error; /* error return value */
+ int i; /* loop counter */
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGI(cur, level);
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level == cur->bc_nlevels - 1))
+ goto out0;
+
+ /* Set up variables for this block as "left". */
+ left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, left, level, lbp);
+ if (error)
+ goto error0;
+#endif
+
+ /* If we've got no right sibling then we can't shift an entry right. */
+ xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+ if (xfs_btree_ptr_is_null(cur, &rptr))
+ goto out0;
+
+ /*
+ * If the cursor entry is the one that would be moved, don't
+ * do it... it's too complicated.
+ */
+ lrecs = xfs_btree_get_numrecs(left);
+ if (cur->bc_ptrs[level] >= lrecs)
+ goto out0;
+
+ /* Set up the right neighbor as "right". */
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+ if (error)
+ goto error0;
+
+ /* If it's full, it can't take another entry. */
+ rrecs = xfs_btree_get_numrecs(right);
+ if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, rshift);
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+ /*
+ * Make a hole at the start of the right neighbor block, then
+ * copy the last left block entry to the hole.
+ */
+ if (level > 0) {
+ /* It's a nonleaf. make a hole in the keys and ptrs */
+ union xfs_btree_key *lkp;
+ union xfs_btree_ptr *lpp;
+ union xfs_btree_ptr *rpp;
+
+ lkp = xfs_btree_key_addr(cur, lrecs, left);
+ lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+#ifdef DEBUG
+ for (i = rrecs - 1; i >= 0; i--) {
+ error = xfs_btree_check_ptr(cur, rpp, i, level);
+ if (error)
+ goto error0;
+ }
+#endif
+
+ xfs_btree_shift_keys(cur, rkp, 1, rrecs);
+ xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
+
+#ifdef DEBUG
+ error = xfs_btree_check_ptr(cur, lpp, 0, level);
+ if (error)
+ goto error0;
+#endif
+
+ /* Now put the new data in, and log it. */
+ xfs_btree_copy_keys(cur, rkp, lkp, 1);
+ xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
+
+ xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
+ xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
+
+ ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
+ xfs_btree_key_addr(cur, 2, right)));
+ } else {
+ /* It's a leaf. make a hole in the records */
+ union xfs_btree_rec *lrp;
+ union xfs_btree_rec *rrp;
+
+ lrp = xfs_btree_rec_addr(cur, lrecs, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ xfs_btree_shift_recs(cur, rrp, 1, rrecs);
+
+ /* Now put the new data in, and log it. */
+ xfs_btree_copy_recs(cur, rrp, lrp, 1);
+ xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
+
+ cur->bc_ops->init_key_from_rec(&key, rrp);
+ rkp = &key;
+
+ ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
+ xfs_btree_rec_addr(cur, 2, right)));
+ }
+
+ /*
+ * Decrement and log left's numrecs, bump and log right's numrecs.
+ */
+ xfs_btree_set_numrecs(left, --lrecs);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+ xfs_btree_set_numrecs(right, ++rrecs);
+ xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+ /*
+ * Using a temporary cursor, update the parent key values of the
+ * block on the right.
+ */
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+ i = xfs_btree_lastrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ error = xfs_btree_increment(tcur, level, &i);
+ if (error)
+ goto error1;
+
+ error = xfs_btree_updkey(tcur, rkp, level + 1);
+ if (error)
+ goto error1;
+
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+
+error1:
+ XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
+ xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Split cur/level block in half.
+ * Return new block number and the key to its first
+ * record (to be inserted into parent).
+ */
+STATIC int /* error */
+__xfs_btree_split(
+ struct xfs_btree_cur *cur,
+ int level,
+ union xfs_btree_ptr *ptrp,
+ union xfs_btree_key *key,
+ struct xfs_btree_cur **curp,
+ int *stat) /* success/failure */
+{
+ union xfs_btree_ptr lptr; /* left sibling block ptr */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ union xfs_btree_ptr rptr; /* right sibling block ptr */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ union xfs_btree_ptr rrptr; /* right-right sibling ptr */
+ struct xfs_buf *rrbp; /* right-right buffer pointer */
+ struct xfs_btree_block *rrblock; /* right-right btree block */
+ int lrecs;
+ int rrecs;
+ int src_index;
+ int error; /* error return value */
+#ifdef DEBUG
+ int i;
+#endif
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
+
+ XFS_BTREE_STATS_INC(cur, split);
+
+ /* Set up left block (current one). */
+ left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, left, level, lbp);
+ if (error)
+ goto error0;
+#endif
+
+ xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+
+ /* Allocate the new block. If we can't do it, we're toast. Give up. */
+ error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
+ if (error)
+ goto error0;
+ if (*stat == 0)
+ goto out0;
+ XFS_BTREE_STATS_INC(cur, alloc);
+
+ /* Set up the new block as "right". */
+ error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
+ if (error)
+ goto error0;
+
+ /* Fill in the btree header for the new right block. */
+ xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
+
+ /*
+ * Split the entries between the old and the new block evenly.
+ * Make sure that if there's an odd number of entries now, that
+ * each new block will have the same number of entries.
+ */
+ lrecs = xfs_btree_get_numrecs(left);
+ rrecs = lrecs / 2;
+ if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
+ rrecs++;
+ src_index = (lrecs - rrecs + 1);
+
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+ /*
+ * Copy btree block entries from the left block over to the
+ * new block, the right. Update the right block and log the
+ * changes.
+ */
+ if (level > 0) {
+ /* It's a non-leaf. Move keys and pointers. */
+ union xfs_btree_key *lkp; /* left btree key */
+ union xfs_btree_ptr *lpp; /* left address pointer */
+ union xfs_btree_key *rkp; /* right btree key */
+ union xfs_btree_ptr *rpp; /* right address pointer */
+
+ lkp = xfs_btree_key_addr(cur, src_index, left);
+ lpp = xfs_btree_ptr_addr(cur, src_index, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+#ifdef DEBUG
+ for (i = src_index; i < rrecs; i++) {
+ error = xfs_btree_check_ptr(cur, lpp, i, level);
+ if (error)
+ goto error0;
+ }
+#endif
+
+ xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
+ xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
+
+ xfs_btree_log_keys(cur, rbp, 1, rrecs);
+ xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+
+ /* Grab the keys to the entries moved to the right block */
+ xfs_btree_copy_keys(cur, key, rkp, 1);
+ } else {
+ /* It's a leaf. Move records. */
+ union xfs_btree_rec *lrp; /* left record pointer */
+ union xfs_btree_rec *rrp; /* right record pointer */
+
+ lrp = xfs_btree_rec_addr(cur, src_index, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
+ xfs_btree_log_recs(cur, rbp, 1, rrecs);
+
+ cur->bc_ops->init_key_from_rec(key,
+ xfs_btree_rec_addr(cur, 1, right));
+ }
+
+
+ /*
+ * Find the left block number by looking in the buffer.
+ * Adjust numrecs, sibling pointers.
+ */
+ xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
+ xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
+ xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+ xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+
+ lrecs -= rrecs;
+ xfs_btree_set_numrecs(left, lrecs);
+ xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
+
+ xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+ /*
+ * If there's a block to the new block's right, make that block
+ * point back to right instead of to left.
+ */
+ if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
+ error = xfs_btree_read_buf_block(cur, &rrptr,
+ 0, &rrblock, &rrbp);
+ if (error)
+ goto error0;
+ xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
+ xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+ }
+ /*
+ * If the cursor is really in the right block, move it there.
+ * If it's just pointing past the last entry in left, then we'll
+ * insert there, so don't change anything in that case.
+ */
+ if (cur->bc_ptrs[level] > lrecs + 1) {
+ xfs_btree_setbuf(cur, level, rbp);
+ cur->bc_ptrs[level] -= lrecs;
+ }
+ /*
+ * If there are more levels, we'll need another cursor which refers
+ * the right block, no matter where this cursor was.
+ */
+ if (level + 1 < cur->bc_nlevels) {
+ error = xfs_btree_dup_cursor(cur, curp);
+ if (error)
+ goto error0;
+ (*curp)->bc_ptrs[level + 1]++;
+ }
+ *ptrp = rptr;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+struct xfs_btree_split_args {
+ struct xfs_btree_cur *cur;
+ int level;
+ union xfs_btree_ptr *ptrp;
+ union xfs_btree_key *key;
+ struct xfs_btree_cur **curp;
+ int *stat; /* success/failure */
+ int result;
+ bool kswapd; /* allocation in kswapd context */
+ struct completion *done;
+ struct work_struct work;
+};
+
+/*
+ * Stack switching interfaces for allocation
+ */
+static void
+xfs_btree_split_worker(
+ struct work_struct *work)
+{
+ struct xfs_btree_split_args *args = container_of(work,
+ struct xfs_btree_split_args, work);
+ unsigned long pflags;
+ unsigned long new_pflags = PF_FSTRANS;
+
+ /*
+ * we are in a transaction context here, but may also be doing work
+ * in kswapd context, and hence we may need to inherit that state
+ * temporarily to ensure that we don't block waiting for memory reclaim
+ * in any way.
+ */
+ if (args->kswapd)
+ new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+
+ current_set_flags_nested(&pflags, new_pflags);
+
+ args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
+ args->key, args->curp, args->stat);
+ complete(args->done);
+
+ current_restore_flags_nested(&pflags, new_pflags);
+}
+
+/*
+ * BMBT split requests often come in with little stack to work on. Push
+ * them off to a worker thread so there is lots of stack to use. For the other
+ * btree types, just call directly to avoid the context switch overhead here.
+ */
+STATIC int /* error */
+xfs_btree_split(
+ struct xfs_btree_cur *cur,
+ int level,
+ union xfs_btree_ptr *ptrp,
+ union xfs_btree_key *key,
+ struct xfs_btree_cur **curp,
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_split_args args;
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ if (cur->bc_btnum != XFS_BTNUM_BMAP)
+ return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
+
+ args.cur = cur;
+ args.level = level;
+ args.ptrp = ptrp;
+ args.key = key;
+ args.curp = curp;
+ args.stat = stat;
+ args.done = &done;
+ args.kswapd = current_is_kswapd();
+ INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
+ queue_work(xfs_alloc_wq, &args.work);
+ wait_for_completion(&done);
+ destroy_work_on_stack(&args.work);
+ return args.result;
+}
+
+
+/*
+ * Copy the old inode root contents into a real block and make the
+ * broot point to it.
+ */
+int /* error */
+xfs_btree_new_iroot(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int *logflags, /* logging flags for inode */
+ int *stat) /* return status - 0 fail */
+{
+ struct xfs_buf *cbp; /* buffer for cblock */
+ struct xfs_btree_block *block; /* btree block */
+ struct xfs_btree_block *cblock; /* child btree block */
+ union xfs_btree_key *ckp; /* child key pointer */
+ union xfs_btree_ptr *cpp; /* child ptr pointer */
+ union xfs_btree_key *kp; /* pointer to btree key */
+ union xfs_btree_ptr *pp; /* pointer to block addr */
+ union xfs_btree_ptr nptr; /* new block addr */
+ int level; /* btree level */
+ int error; /* error return code */
+#ifdef DEBUG
+ int i; /* loop counter */
+#endif
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_STATS_INC(cur, newroot);
+
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+ level = cur->bc_nlevels - 1;
+
+ block = xfs_btree_get_iroot(cur);
+ pp = xfs_btree_ptr_addr(cur, 1, block);
+
+ /* Allocate the new block. If we can't do it, we're toast. Give up. */
+ error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
+ if (error)
+ goto error0;
+ if (*stat == 0) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+ }
+ XFS_BTREE_STATS_INC(cur, alloc);
+
+ /* Copy the root into a real block. */
+ error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
+ if (error)
+ goto error0;
+
+ /*
+ * we can't just memcpy() the root in for CRC enabled btree blocks.
+ * In that case have to also ensure the blkno remains correct
+ */
+ memcpy(cblock, block, xfs_btree_block_len(cur));
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
+ else
+ cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
+ }
+
+ be16_add_cpu(&block->bb_level, 1);
+ xfs_btree_set_numrecs(block, 1);
+ cur->bc_nlevels++;
+ cur->bc_ptrs[level + 1] = 1;
+
+ kp = xfs_btree_key_addr(cur, 1, block);
+ ckp = xfs_btree_key_addr(cur, 1, cblock);
+ xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
+
+ cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+ for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
+ error = xfs_btree_check_ptr(cur, pp, i, level);
+ if (error)
+ goto error0;
+ }
+#endif
+ xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
+
+#ifdef DEBUG
+ error = xfs_btree_check_ptr(cur, &nptr, 0, level);
+ if (error)
+ goto error0;
+#endif
+ xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
+
+ xfs_iroot_realloc(cur->bc_private.b.ip,
+ 1 - xfs_btree_get_numrecs(cblock),
+ cur->bc_private.b.whichfork);
+
+ xfs_btree_setbuf(cur, level, cbp);
+
+ /*
+ * Do all this logging at the end so that
+ * the root is at the right level.
+ */
+ xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+ xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+ xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+
+ *logflags |=
+ XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
+ *stat = 1;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Allocate a new root block, fill it in.
+ */
+STATIC int /* error */
+xfs_btree_new_root(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block; /* one half of the old root block */
+ struct xfs_buf *bp; /* buffer containing block */
+ int error; /* error return value */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ struct xfs_buf *nbp; /* new (root) buffer */
+ struct xfs_btree_block *new; /* new (root) btree block */
+ int nptr; /* new value for key index, 1 or 2 */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ union xfs_btree_ptr rptr;
+ union xfs_btree_ptr lptr;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_STATS_INC(cur, newroot);
+
+ /* initialise our start point from the cursor */
+ cur->bc_ops->init_ptr_from_cur(cur, &rptr);
+
+ /* Allocate the new block. If we can't do it, we're toast. Give up. */
+ error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
+ if (error)
+ goto error0;
+ if (*stat == 0)
+ goto out0;
+ XFS_BTREE_STATS_INC(cur, alloc);
+
+ /* Set up the new block. */
+ error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
+ if (error)
+ goto error0;
+
+ /* Set the root in the holding structure increasing the level by 1. */
+ cur->bc_ops->set_root(cur, &lptr, 1);
+
+ /*
+ * At the previous root level there are now two blocks: the old root,
+ * and the new block generated when it was split. We don't know which
+ * one the cursor is pointing at, so we set up variables "left" and
+ * "right" for each case.
+ */
+ block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
+ if (error)
+ goto error0;
+#endif
+
+ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+ if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+ /* Our block is left, pick up the right block. */
+ lbp = bp;
+ xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+ left = block;
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+ if (error)
+ goto error0;
+ bp = rbp;
+ nptr = 1;
+ } else {
+ /* Our block is right, pick up the left block. */
+ rbp = bp;
+ xfs_btree_buf_to_ptr(cur, rbp, &rptr);
+ right = block;
+ xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+ if (error)
+ goto error0;
+ bp = lbp;
+ nptr = 2;
+ }
+ /* Fill in the new block's btree header and log it. */
+ xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
+ xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
+ ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
+ !xfs_btree_ptr_is_null(cur, &rptr));
+
+ /* Fill in the key data in the new root. */
+ if (xfs_btree_get_level(left) > 0) {
+ xfs_btree_copy_keys(cur,
+ xfs_btree_key_addr(cur, 1, new),
+ xfs_btree_key_addr(cur, 1, left), 1);
+ xfs_btree_copy_keys(cur,
+ xfs_btree_key_addr(cur, 2, new),
+ xfs_btree_key_addr(cur, 1, right), 1);
+ } else {
+ cur->bc_ops->init_key_from_rec(
+ xfs_btree_key_addr(cur, 1, new),
+ xfs_btree_rec_addr(cur, 1, left));
+ cur->bc_ops->init_key_from_rec(
+ xfs_btree_key_addr(cur, 2, new),
+ xfs_btree_rec_addr(cur, 1, right));
+ }
+ xfs_btree_log_keys(cur, nbp, 1, 2);
+
+ /* Fill in the pointer data in the new root. */
+ xfs_btree_copy_ptrs(cur,
+ xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
+ xfs_btree_copy_ptrs(cur,
+ xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
+ xfs_btree_log_ptrs(cur, nbp, 1, 2);
+
+ /* Fix up the cursor. */
+ xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
+ cur->bc_ptrs[cur->bc_nlevels] = nptr;
+ cur->bc_nlevels++;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+}
+
+STATIC int
+xfs_btree_make_block_unfull(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* btree level */
+ int numrecs,/* # of recs in block */
+ int *oindex,/* old tree index */
+ int *index, /* new tree index */
+ union xfs_btree_ptr *nptr, /* new btree ptr */
+ struct xfs_btree_cur **ncur, /* new btree cursor */
+ union xfs_btree_rec *nrec, /* new record */
+ int *stat)
+{
+ union xfs_btree_key key; /* new btree key value */
+ int error = 0;
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == cur->bc_nlevels - 1) {
+ struct xfs_inode *ip = cur->bc_private.b.ip;
+
+ if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
+ /* A root block that can be made bigger. */
+ xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
+ } else {
+ /* A root block that needs replacing */
+ int logflags = 0;
+
+ error = xfs_btree_new_iroot(cur, &logflags, stat);
+ if (error || *stat == 0)
+ return error;
+
+ xfs_trans_log_inode(cur->bc_tp, ip, logflags);
+ }
+
+ return 0;
+ }
+
+ /* First, try shifting an entry to the right neighbor. */
+ error = xfs_btree_rshift(cur, level, stat);
+ if (error || *stat)
+ return error;
+
+ /* Next, try shifting an entry to the left neighbor. */
+ error = xfs_btree_lshift(cur, level, stat);
+ if (error)
+ return error;
+
+ if (*stat) {
+ *oindex = *index = cur->bc_ptrs[level];
+ return 0;
+ }
+
+ /*
+ * Next, try splitting the current block in half.
+ *
+ * If this works we have to re-set our variables because we
+ * could be in a different block now.
+ */
+ error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
+ if (error || *stat == 0)
+ return error;
+
+
+ *index = cur->bc_ptrs[level];
+ cur->bc_ops->init_rec_from_key(&key, nrec);
+ return 0;
+}
+
+/*
+ * Insert one record/level. Return information to the caller
+ * allowing the next level up to proceed if necessary.
+ */
+STATIC int
+xfs_btree_insrec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level to insert record at */
+ union xfs_btree_ptr *ptrp, /* i/o: block number inserted */
+ union xfs_btree_rec *recp, /* i/o: record data inserted */
+ struct xfs_btree_cur **curp, /* output: new cursor replacing cur */
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block; /* btree block */
+ struct xfs_buf *bp; /* buffer for block */
+ union xfs_btree_key key; /* btree key */
+ union xfs_btree_ptr nptr; /* new block ptr */
+ struct xfs_btree_cur *ncur; /* new btree cursor */
+ union xfs_btree_rec nrec; /* new record count */
+ int optr; /* old key/record index */
+ int ptr; /* key/record index */
+ int numrecs;/* number of records */
+ int error; /* error return value */
+#ifdef DEBUG
+ int i;
+#endif
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
+
+ ncur = NULL;
+
+ /*
+ * If we have an external root pointer, and we've made it to the
+ * root level, allocate a new root block and we're done.
+ */
+ if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level >= cur->bc_nlevels)) {
+ error = xfs_btree_new_root(cur, stat);
+ xfs_btree_set_ptr_null(cur, ptrp);
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return error;
+ }
+
+ /* If we're off the left edge, return failure. */
+ ptr = cur->bc_ptrs[level];
+ if (ptr == 0) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+
+ /* Make a key out of the record data to be inserted, and save it. */
+ cur->bc_ops->init_key_from_rec(&key, recp);
+
+ optr = ptr;
+
+ XFS_BTREE_STATS_INC(cur, insrec);
+
+ /* Get pointers to the btree buffer and block. */
+ block = xfs_btree_get_block(cur, level, &bp);
+ numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+
+ /* Check that the new entry is being inserted in the right place. */
+ if (ptr <= numrecs) {
+ if (level == 0) {
+ ASSERT(cur->bc_ops->recs_inorder(cur, recp,
+ xfs_btree_rec_addr(cur, ptr, block)));
+ } else {
+ ASSERT(cur->bc_ops->keys_inorder(cur, &key,
+ xfs_btree_key_addr(cur, ptr, block)));
+ }
+ }
+#endif
+
+ /*
+ * If the block is full, we can't insert the new entry until we
+ * make the block un-full.
+ */
+ xfs_btree_set_ptr_null(cur, &nptr);
+ if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
+ error = xfs_btree_make_block_unfull(cur, level, numrecs,
+ &optr, &ptr, &nptr, &ncur, &nrec, stat);
+ if (error || *stat == 0)
+ goto error0;
+ }
+
+ /*
+ * The current block may have changed if the block was
+ * previously full and we have just made space in it.
+ */
+ block = xfs_btree_get_block(cur, level, &bp);
+ numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ return error;
+#endif
+
+ /*
+ * At this point we know there's room for our new entry in the block
+ * we're pointing at.
+ */
+ XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
+
+ if (level > 0) {
+ /* It's a nonleaf. make a hole in the keys and ptrs */
+ union xfs_btree_key *kp;
+ union xfs_btree_ptr *pp;
+
+ kp = xfs_btree_key_addr(cur, ptr, block);
+ pp = xfs_btree_ptr_addr(cur, ptr, block);
+
+#ifdef DEBUG
+ for (i = numrecs - ptr; i >= 0; i--) {
+ error = xfs_btree_check_ptr(cur, pp, i, level);
+ if (error)
+ return error;
+ }
+#endif
+
+ xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
+ xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
+
+#ifdef DEBUG
+ error = xfs_btree_check_ptr(cur, ptrp, 0, level);
+ if (error)
+ goto error0;
+#endif
+
+ /* Now put the new data in, bump numrecs and log it. */
+ xfs_btree_copy_keys(cur, kp, &key, 1);
+ xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
+ numrecs++;
+ xfs_btree_set_numrecs(block, numrecs);
+ xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
+ xfs_btree_log_keys(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+ if (ptr < numrecs) {
+ ASSERT(cur->bc_ops->keys_inorder(cur, kp,
+ xfs_btree_key_addr(cur, ptr + 1, block)));
+ }
+#endif
+ } else {
+ /* It's a leaf. make a hole in the records */
+ union xfs_btree_rec *rp;
+
+ rp = xfs_btree_rec_addr(cur, ptr, block);
+
+ xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
+
+ /* Now put the new data in, bump numrecs and log it. */
+ xfs_btree_copy_recs(cur, rp, recp, 1);
+ xfs_btree_set_numrecs(block, ++numrecs);
+ xfs_btree_log_recs(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+ if (ptr < numrecs) {
+ ASSERT(cur->bc_ops->recs_inorder(cur, rp,
+ xfs_btree_rec_addr(cur, ptr + 1, block)));
+ }
+#endif
+ }
+
+ /* Log the new number of records in the btree header. */
+ xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+ /* If we inserted at the start of a block, update the parents' keys. */
+ if (optr == 1) {
+ error = xfs_btree_updkey(cur, &key, level + 1);
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * If we are tracking the last record in the tree and
+ * we are at the far right edge of the tree, update it.
+ */
+ if (xfs_btree_is_lastrec(cur, block, level)) {
+ cur->bc_ops->update_lastrec(cur, block, recp,
+ ptr, LASTREC_INSREC);
+ }
+
+ /*
+ * Return the new block number, if any.
+ * If there is one, give back a record value and a cursor too.
+ */
+ *ptrp = nptr;
+ if (!xfs_btree_ptr_is_null(cur, &nptr)) {
+ *recp = nrec;
+ *curp = ncur;
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Insert the record at the point referenced by cur.
+ *
+ * A multi-level split of the tree on insert will invalidate the original
+ * cursor. All callers of this function should assume that the cursor is
+ * no longer valid and revalidate it.
+ */
+int
+xfs_btree_insert(
+ struct xfs_btree_cur *cur,
+ int *stat)
+{
+ int error; /* error return value */
+ int i; /* result value, 0 for failure */
+ int level; /* current level number in btree */
+ union xfs_btree_ptr nptr; /* new block number (split result) */
+ struct xfs_btree_cur *ncur; /* new cursor (split result) */
+ struct xfs_btree_cur *pcur; /* previous level's cursor */
+ union xfs_btree_rec rec; /* record to insert */
+
+ level = 0;
+ ncur = NULL;
+ pcur = cur;
+
+ xfs_btree_set_ptr_null(cur, &nptr);
+ cur->bc_ops->init_rec_from_cur(cur, &rec);
+
+ /*
+ * Loop going up the tree, starting at the leaf level.
+ * Stop when we don't get a split block, that must mean that
+ * the insert is finished with this level.
+ */
+ do {
+ /*
+ * Insert nrec/nptr into this level of the tree.
+ * Note if we fail, nptr will be null.
+ */
+ error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
+ if (error) {
+ if (pcur != cur)
+ xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
+ goto error0;
+ }
+
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ level++;
+
+ /*
+ * See if the cursor we just used is trash.
+ * Can't trash the caller's cursor, but otherwise we should
+ * if ncur is a new cursor or we're about to be done.
+ */
+ if (pcur != cur &&
+ (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
+ /* Save the state from the cursor before we trash it */
+ if (cur->bc_ops->update_cursor)
+ cur->bc_ops->update_cursor(pcur, cur);
+ cur->bc_nlevels = pcur->bc_nlevels;
+ xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
+ }
+ /* If we got a new cursor, switch to it. */
+ if (ncur) {
+ pcur = ncur;
+ ncur = NULL;
+ }
+ } while (!xfs_btree_ptr_is_null(cur, &nptr));
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = i;
+ return 0;
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Try to merge a non-leaf block back into the inode root.
+ *
+ * Note: the killroot names comes from the fact that we're effectively
+ * killing the old root block. But because we can't just delete the
+ * inode we have to copy the single block it was pointing to into the
+ * inode.
+ */
+STATIC int
+xfs_btree_kill_iroot(
+ struct xfs_btree_cur *cur)
+{
+ int whichfork = cur->bc_private.b.whichfork;
+ struct xfs_inode *ip = cur->bc_private.b.ip;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_btree_block *block;
+ struct xfs_btree_block *cblock;
+ union xfs_btree_key *kp;
+ union xfs_btree_key *ckp;
+ union xfs_btree_ptr *pp;
+ union xfs_btree_ptr *cpp;
+ struct xfs_buf *cbp;
+ int level;
+ int index;
+ int numrecs;
+#ifdef DEBUG
+ union xfs_btree_ptr ptr;
+ int i;
+#endif
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_nlevels > 1);
+
+ /*
+ * Don't deal with the root block needs to be a leaf case.
+ * We're just going to turn the thing back into extents anyway.
+ */
+ level = cur->bc_nlevels - 1;
+ if (level == 1)
+ goto out0;
+
+ /*
+ * Give up if the root has multiple children.
+ */
+ block = xfs_btree_get_iroot(cur);
+ if (xfs_btree_get_numrecs(block) != 1)
+ goto out0;
+
+ cblock = xfs_btree_get_block(cur, level - 1, &cbp);
+ numrecs = xfs_btree_get_numrecs(cblock);
+
+ /*
+ * Only do this if the next level will fit.
+ * Then the data must be copied up to the inode,
+ * instead of freeing the root you free the next level.
+ */
+ if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, killroot);
+
+#ifdef DEBUG
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+ ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+#endif
+
+ index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
+ if (index) {
+ xfs_iroot_realloc(cur->bc_private.b.ip, index,
+ cur->bc_private.b.whichfork);
+ block = ifp->if_broot;
+ }
+
+ be16_add_cpu(&block->bb_numrecs, index);
+ ASSERT(block->bb_numrecs == cblock->bb_numrecs);
+
+ kp = xfs_btree_key_addr(cur, 1, block);
+ ckp = xfs_btree_key_addr(cur, 1, cblock);
+ xfs_btree_copy_keys(cur, kp, ckp, numrecs);
+
+ pp = xfs_btree_ptr_addr(cur, 1, block);
+ cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+ for (i = 0; i < numrecs; i++) {
+ int error;
+
+ error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+ }
+ }
+#endif
+ xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+
+ cur->bc_ops->free_block(cur, cbp);
+ XFS_BTREE_STATS_INC(cur, free);
+
+ cur->bc_bufs[level - 1] = NULL;
+ be16_add_cpu(&block->bb_level, -1);
+ xfs_trans_log_inode(cur->bc_tp, ip,
+ XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ cur->bc_nlevels--;
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+}
+
+/*
+ * Kill the current root node, and replace it with it's only child node.
+ */
+STATIC int
+xfs_btree_kill_root(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int level,
+ union xfs_btree_ptr *newroot)
+{
+ int error;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_STATS_INC(cur, killroot);
+
+ /*
+ * Update the root pointer, decreasing the level by 1 and then
+ * free the old root.
+ */
+ cur->bc_ops->set_root(cur, newroot, -1);
+
+ error = cur->bc_ops->free_block(cur, bp);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+ }
+
+ XFS_BTREE_STATS_INC(cur, free);
+
+ cur->bc_bufs[level] = NULL;
+ cur->bc_ra[level] = 0;
+ cur->bc_nlevels--;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+}
+
+STATIC int
+xfs_btree_dec_cursor(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat)
+{
+ int error;
+ int i;
+
+ if (level > 0) {
+ error = xfs_btree_decrement(cur, level, &i);
+ if (error)
+ return error;
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+}
+
+/*
+ * Single level of the btree record deletion routine.
+ * Delete record pointed to by cur/level.
+ * Remove the record from its block then rebalance the tree.
+ * Return 0 for error, 1 for done, 2 to go on to the next level.
+ */
+STATIC int /* error */
+xfs_btree_delrec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level removing record from */
+ int *stat) /* fail/done/go-on */
+{
+ struct xfs_btree_block *block; /* btree block */
+ union xfs_btree_ptr cptr; /* current block ptr */
+ struct xfs_buf *bp; /* buffer for block */
+ int error; /* error return value */
+ int i; /* loop counter */
+ union xfs_btree_key key; /* storage for keyp */
+ union xfs_btree_key *keyp = &key; /* passed to the next level */
+ union xfs_btree_ptr lptr; /* left sibling block ptr */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ int lrecs = 0; /* left record count */
+ int ptr; /* key/record index */
+ union xfs_btree_ptr rptr; /* right sibling block ptr */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ struct xfs_btree_block *rrblock; /* right-right btree block */
+ struct xfs_buf *rrbp; /* right-right buffer pointer */
+ int rrecs = 0; /* right record count */
+ struct xfs_btree_cur *tcur; /* temporary btree cursor */
+ int numrecs; /* temporary numrec count */
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGI(cur, level);
+
+ tcur = NULL;
+
+ /* Get the index of the entry being deleted, check for nothing there. */
+ ptr = cur->bc_ptrs[level];
+ if (ptr == 0) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+
+ /* Get the buffer & block containing the record or key/ptr. */
+ block = xfs_btree_get_block(cur, level, &bp);
+ numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+#endif
+
+ /* Fail if we're off the end of the block. */
+ if (ptr > numrecs) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+
+ XFS_BTREE_STATS_INC(cur, delrec);
+ XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
+
+ /* Excise the entries being deleted. */
+ if (level > 0) {
+ /* It's a nonleaf. operate on keys and ptrs */
+ union xfs_btree_key *lkp;
+ union xfs_btree_ptr *lpp;
+
+ lkp = xfs_btree_key_addr(cur, ptr + 1, block);
+ lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
+
+#ifdef DEBUG
+ for (i = 0; i < numrecs - ptr; i++) {
+ error = xfs_btree_check_ptr(cur, lpp, i, level);
+ if (error)
+ goto error0;
+ }
+#endif
+
+ if (ptr < numrecs) {
+ xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
+ xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
+ xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
+ xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
+ }
+
+ /*
+ * If it's the first record in the block, we'll need to pass a
+ * key up to the next level (updkey).
+ */
+ if (ptr == 1)
+ keyp = xfs_btree_key_addr(cur, 1, block);
+ } else {
+ /* It's a leaf. operate on records */
+ if (ptr < numrecs) {
+ xfs_btree_shift_recs(cur,
+ xfs_btree_rec_addr(cur, ptr + 1, block),
+ -1, numrecs - ptr);
+ xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
+ }
+
+ /*
+ * If it's the first record in the block, we'll need a key
+ * structure to pass up to the next level (updkey).
+ */
+ if (ptr == 1) {
+ cur->bc_ops->init_key_from_rec(&key,
+ xfs_btree_rec_addr(cur, 1, block));
+ keyp = &key;
+ }
+ }
+
+ /*
+ * Decrement and log the number of entries in the block.
+ */
+ xfs_btree_set_numrecs(block, --numrecs);
+ xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+ /*
+ * If we are tracking the last record in the tree and
+ * we are at the far right edge of the tree, update it.
+ */
+ if (xfs_btree_is_lastrec(cur, block, level)) {
+ cur->bc_ops->update_lastrec(cur, block, NULL,
+ ptr, LASTREC_DELREC);
+ }
+
+ /*
+ * We're at the root level. First, shrink the root block in-memory.
+ * Try to get rid of the next level down. If we can't then there's
+ * nothing left to do.
+ */
+ if (level == cur->bc_nlevels - 1) {
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ xfs_iroot_realloc(cur->bc_private.b.ip, -1,
+ cur->bc_private.b.whichfork);
+
+ error = xfs_btree_kill_iroot(cur);
+ if (error)
+ goto error0;
+
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ *stat = 1;
+ return 0;
+ }
+
+ /*
+ * If this is the root level, and there's only one entry left,
+ * and it's NOT the leaf level, then we can get rid of this
+ * level.
+ */
+ if (numrecs == 1 && level > 0) {
+ union xfs_btree_ptr *pp;
+ /*
+ * pp is still set to the first pointer in the block.
+ * Make it the new root of the btree.
+ */
+ pp = xfs_btree_ptr_addr(cur, 1, block);
+ error = xfs_btree_kill_root(cur, bp, level, pp);
+ if (error)
+ goto error0;
+ } else if (level > 0) {
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ }
+ *stat = 1;
+ return 0;
+ }
+
+ /*
+ * If we deleted the leftmost entry in the block, update the
+ * key values above us in the tree.
+ */
+ if (ptr == 1) {
+ error = xfs_btree_updkey(cur, keyp, level + 1);
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * If the number of records remaining in the block is at least
+ * the minimum, we're done.
+ */
+ if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+
+ /*
+ * Otherwise, we have to move some records around to keep the
+ * tree balanced. Look at the left and right sibling blocks to
+ * see if we can re-balance by moving only one record.
+ */
+ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+ xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
+
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ /*
+ * One child of root, need to get a chance to copy its contents
+ * into the root and delete it. Can't go up to next level,
+ * there's nothing to delete there.
+ */
+ if (xfs_btree_ptr_is_null(cur, &rptr) &&
+ xfs_btree_ptr_is_null(cur, &lptr) &&
+ level == cur->bc_nlevels - 2) {
+ error = xfs_btree_kill_iroot(cur);
+ if (!error)
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+ }
+
+ ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
+ !xfs_btree_ptr_is_null(cur, &lptr));
+
+ /*
+ * Duplicate the cursor so our btree manipulations here won't
+ * disrupt the next level up.
+ */
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+
+ /*
+ * If there's a right sibling, see if it's ok to shift an entry
+ * out of it.
+ */
+ if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+ /*
+ * Move the temp cursor to the last entry in the next block.
+ * Actually any entry but the first would suffice.
+ */
+ i = xfs_btree_lastrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ error = xfs_btree_increment(tcur, level, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ i = xfs_btree_lastrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ /* Grab a pointer to the block. */
+ right = xfs_btree_get_block(tcur, level, &rbp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(tcur, right, level, rbp);
+ if (error)
+ goto error0;
+#endif
+ /* Grab the current block number, for future use. */
+ xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
+
+ /*
+ * If right block is full enough so that removing one entry
+ * won't make it too empty, and left-shifting an entry out
+ * of right to us works, we're done.
+ */
+ if (xfs_btree_get_numrecs(right) - 1 >=
+ cur->bc_ops->get_minrecs(tcur, level)) {
+ error = xfs_btree_lshift(tcur, level, &i);
+ if (error)
+ goto error0;
+ if (i) {
+ ASSERT(xfs_btree_get_numrecs(block) >=
+ cur->bc_ops->get_minrecs(tcur, level));
+
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ tcur = NULL;
+
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+ }
+
+ /*
+ * Otherwise, grab the number of records in right for
+ * future reference, and fix up the temp cursor to point
+ * to our block again (last record).
+ */
+ rrecs = xfs_btree_get_numrecs(right);
+ if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+ i = xfs_btree_firstrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ error = xfs_btree_decrement(tcur, level, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ }
+ }
+
+ /*
+ * If there's a left sibling, see if it's ok to shift an entry
+ * out of it.
+ */
+ if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+ /*
+ * Move the temp cursor to the first entry in the
+ * previous block.
+ */
+ i = xfs_btree_firstrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ error = xfs_btree_decrement(tcur, level, &i);
+ if (error)
+ goto error0;
+ i = xfs_btree_firstrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ /* Grab a pointer to the block. */
+ left = xfs_btree_get_block(tcur, level, &lbp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, left, level, lbp);
+ if (error)
+ goto error0;
+#endif
+ /* Grab the current block number, for future use. */
+ xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
+
+ /*
+ * If left block is full enough so that removing one entry
+ * won't make it too empty, and right-shifting an entry out
+ * of left to us works, we're done.
+ */
+ if (xfs_btree_get_numrecs(left) - 1 >=
+ cur->bc_ops->get_minrecs(tcur, level)) {
+ error = xfs_btree_rshift(tcur, level, &i);
+ if (error)
+ goto error0;
+ if (i) {
+ ASSERT(xfs_btree_get_numrecs(block) >=
+ cur->bc_ops->get_minrecs(tcur, level));
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ tcur = NULL;
+ if (level == 0)
+ cur->bc_ptrs[0]++;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+ }
+ }
+
+ /*
+ * Otherwise, grab the number of records in right for
+ * future reference.
+ */
+ lrecs = xfs_btree_get_numrecs(left);
+ }
+
+ /* Delete the temp cursor, we're done with it. */
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ tcur = NULL;
+
+ /* If here, we need to do a join to keep the tree balanced. */
+ ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
+
+ if (!xfs_btree_ptr_is_null(cur, &lptr) &&
+ lrecs + xfs_btree_get_numrecs(block) <=
+ cur->bc_ops->get_maxrecs(cur, level)) {
+ /*
+ * Set "right" to be the starting block,
+ * "left" to be the left neighbor.
+ */
+ rptr = cptr;
+ right = block;
+ rbp = bp;
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+ if (error)
+ goto error0;
+
+ /*
+ * If that won't work, see if we can join with the right neighbor block.
+ */
+ } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
+ rrecs + xfs_btree_get_numrecs(block) <=
+ cur->bc_ops->get_maxrecs(cur, level)) {
+ /*
+ * Set "left" to be the starting block,
+ * "right" to be the right neighbor.
+ */
+ lptr = cptr;
+ left = block;
+ lbp = bp;
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+ if (error)
+ goto error0;
+
+ /*
+ * Otherwise, we can't fix the imbalance.
+ * Just return. This is probably a logic error, but it's not fatal.
+ */
+ } else {
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+
+ rrecs = xfs_btree_get_numrecs(right);
+ lrecs = xfs_btree_get_numrecs(left);
+
+ /*
+ * We're now going to join "left" and "right" by moving all the stuff
+ * in "right" to "left" and deleting "right".
+ */
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+ if (level > 0) {
+ /* It's a non-leaf. Move keys and pointers. */
+ union xfs_btree_key *lkp; /* left btree key */
+ union xfs_btree_ptr *lpp; /* left address pointer */
+ union xfs_btree_key *rkp; /* right btree key */
+ union xfs_btree_ptr *rpp; /* right address pointer */
+
+ lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
+ lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+ for (i = 1; i < rrecs; i++) {
+ error = xfs_btree_check_ptr(cur, rpp, i, level);
+ if (error)
+ goto error0;
+ }
+#endif
+ xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
+ xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
+
+ xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
+ xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
+ } else {
+ /* It's a leaf. Move records. */
+ union xfs_btree_rec *lrp; /* left record pointer */
+ union xfs_btree_rec *rrp; /* right record pointer */
+
+ lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
+ xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
+ }
+
+ XFS_BTREE_STATS_INC(cur, join);
+
+ /*
+ * Fix up the number of records and right block pointer in the
+ * surviving block, and log it.
+ */
+ xfs_btree_set_numrecs(left, lrecs + rrecs);
+ xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
+ xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+ /* If there is a right sibling, point it to the remaining block. */
+ xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+ if (!xfs_btree_ptr_is_null(cur, &cptr)) {
+ error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
+ if (error)
+ goto error0;
+ xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
+ xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+ }
+
+ /* Free the deleted block. */
+ error = cur->bc_ops->free_block(cur, rbp);
+ if (error)
+ goto error0;
+ XFS_BTREE_STATS_INC(cur, free);
+
+ /*
+ * If we joined with the left neighbor, set the buffer in the
+ * cursor to the left block, and fix up the index.
+ */
+ if (bp != lbp) {
+ cur->bc_bufs[level] = lbp;
+ cur->bc_ptrs[level] += lrecs;
+ cur->bc_ra[level] = 0;
+ }
+ /*
+ * If we joined with the right neighbor and there's a level above
+ * us, increment the cursor at that level.
+ */
+ else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
+ (level + 1 < cur->bc_nlevels)) {
+ error = xfs_btree_increment(cur, level + 1, &i);
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * Readjust the ptr at this level if it's not a leaf, since it's
+ * still pointing at the deletion point, which makes the cursor
+ * inconsistent. If this makes the ptr 0, the caller fixes it up.
+ * We can't use decrement because it would change the next level up.
+ */
+ if (level > 0)
+ cur->bc_ptrs[level]--;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ /* Return value means the next level up has something to do. */
+ *stat = 2;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ if (tcur)
+ xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Delete the record pointed to by cur.
+ * The cursor refers to the place where the record was (could be inserted)
+ * when the operation returns.
+ */
+int /* error */
+xfs_btree_delete(
+ struct xfs_btree_cur *cur,
+ int *stat) /* success/failure */
+{
+ int error; /* error return value */
+ int level;
+ int i;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+ /*
+ * Go up the tree, starting at leaf level.
+ *
+ * If 2 is returned then a join was done; go to the next level.
+ * Otherwise we are done.
+ */
+ for (level = 0, i = 2; i == 2; level++) {
+ error = xfs_btree_delrec(cur, level, &i);
+ if (error)
+ goto error0;
+ }
+
+ if (i == 0) {
+ for (level = 1; level < cur->bc_nlevels; level++) {
+ if (cur->bc_ptrs[level] == 0) {
+ error = xfs_btree_decrement(cur, level, &i);
+ if (error)
+ goto error0;
+ break;
+ }
+ }
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = i;
+ return 0;
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int /* error */
+xfs_btree_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ union xfs_btree_rec **recp, /* output: btree record */
+ int *stat) /* output: success/failure */
+{
+ struct xfs_btree_block *block; /* btree block */
+ struct xfs_buf *bp; /* buffer pointer */
+ int ptr; /* record number */
+#ifdef DEBUG
+ int error; /* error return value */
+#endif
+
+ ptr = cur->bc_ptrs[0];
+ block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, 0, bp);
+ if (error)
+ return error;
+#endif
+
+ /*
+ * Off the right end or left end, return failure.
+ */
+ if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
+ *stat = 0;
+ return 0;
+ }
+
+ /*
+ * Point to the record and extract its data.
+ */
+ *recp = xfs_btree_rec_addr(cur, ptr, block);
+ *stat = 1;
+ return 0;
+}
+
+/*
+ * Change the owner of a btree.
+ *
+ * The mechanism we use here is ordered buffer logging. Because we don't know
+ * how many buffers were are going to need to modify, we don't really want to
+ * have to make transaction reservations for the worst case of every buffer in a
+ * full size btree as that may be more space that we can fit in the log....
+ *
+ * We do the btree walk in the most optimal manner possible - we have sibling
+ * pointers so we can just walk all the blocks on each level from left to right
+ * in a single pass, and then move to the next level and do the same. We can
+ * also do readahead on the sibling pointers to get IO moving more quickly,
+ * though for slow disks this is unlikely to make much difference to performance
+ * as the amount of CPU work we have to do before moving to the next block is
+ * relatively small.
+ *
+ * For each btree block that we load, modify the owner appropriately, set the
+ * buffer as an ordered buffer and log it appropriately. We need to ensure that
+ * we mark the region we change dirty so that if the buffer is relogged in
+ * a subsequent transaction the changes we make here as an ordered buffer are
+ * correctly relogged in that transaction. If we are in recovery context, then
+ * just queue the modified buffer as delayed write buffer so the transaction
+ * recovery completion writes the changes to disk.
+ */
+static int
+xfs_btree_block_change_owner(
+ struct xfs_btree_cur *cur,
+ int level,
+ __uint64_t new_owner,
+ struct list_head *buffer_list)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ union xfs_btree_ptr rptr;
+
+ /* do right sibling readahead */
+ xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+ /* modify the owner */
+ block = xfs_btree_get_block(cur, level, &bp);
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
+ else
+ block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
+
+ /*
+ * If the block is a root block hosted in an inode, we might not have a
+ * buffer pointer here and we shouldn't attempt to log the change as the
+ * information is already held in the inode and discarded when the root
+ * block is formatted into the on-disk inode fork. We still change it,
+ * though, so everything is consistent in memory.
+ */
+ if (bp) {
+ if (cur->bc_tp) {
+ xfs_trans_ordered_buf(cur->bc_tp, bp);
+ xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
+ } else {
+ xfs_buf_delwri_queue(bp, buffer_list);
+ }
+ } else {
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(level == cur->bc_nlevels - 1);
+ }
+
+ /* now read rh sibling block for next iteration */
+ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+ if (xfs_btree_ptr_is_null(cur, &rptr))
+ return ENOENT;
+
+ return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
+}
+
+int
+xfs_btree_change_owner(
+ struct xfs_btree_cur *cur,
+ __uint64_t new_owner,
+ struct list_head *buffer_list)
+{
+ union xfs_btree_ptr lptr;
+ int level;
+ struct xfs_btree_block *block = NULL;
+ int error = 0;
+
+ cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+
+ /* for each level */
+ for (level = cur->bc_nlevels - 1; level >= 0; level--) {
+ /* grab the left hand block */
+ error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
+ if (error)
+ return error;
+
+ /* readahead the left most block for the next level down */
+ if (level > 0) {
+ union xfs_btree_ptr *ptr;
+
+ ptr = xfs_btree_ptr_addr(cur, 1, block);
+ xfs_btree_readahead_ptr(cur, ptr, 1);
+
+ /* save for the next iteration of the loop */
+ lptr = *ptr;
+ }
+
+ /* for each buffer in the level */
+ do {
+ error = xfs_btree_block_change_owner(cur, level,
+ new_owner,
+ buffer_list);
+ } while (!error);
+
+ if (error != ENOENT)
+ return error;
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 44f1bd98064..a04b69422f6 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -24,6 +24,33 @@ struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
+extern kmem_zone_t *xfs_btree_cur_zone;
+
+/*
+ * Generic key, ptr and record wrapper structures.
+ *
+ * These are disk format structures, and are converted where necessary
+ * by the btree specific code that needs to interpret them.
+ */
+union xfs_btree_ptr {
+ __be32 s; /* short form ptr */
+ __be64 l; /* long form ptr */
+};
+
+union xfs_btree_key {
+ xfs_bmbt_key_t bmbt;
+ xfs_bmdr_key_t bmbr; /* bmbt root block */
+ xfs_alloc_key_t alloc;
+ xfs_inobt_key_t inobt;
+};
+
+union xfs_btree_rec {
+ xfs_bmbt_rec_t bmbt;
+ xfs_bmdr_rec_t bmbr; /* bmbt root block */
+ xfs_alloc_rec_t alloc;
+ xfs_inobt_rec_t inobt;
+};
+
/*
* This nonsense is to make -wlint happy.
*/
@@ -35,104 +62,128 @@ struct xfs_trans;
#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi)
#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi)
#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
+#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi)
/*
- * Short form header: space allocation btrees.
+ * For logging record fields.
*/
-typedef struct xfs_btree_sblock {
- __be32 bb_magic; /* magic number for block type */
- __be16 bb_level; /* 0 is a leaf */
- __be16 bb_numrecs; /* current # of data records */
- __be32 bb_leftsib; /* left sibling block or NULLAGBLOCK */
- __be32 bb_rightsib; /* right sibling block or NULLAGBLOCK */
-} xfs_btree_sblock_t;
+#define XFS_BB_MAGIC (1 << 0)
+#define XFS_BB_LEVEL (1 << 1)
+#define XFS_BB_NUMRECS (1 << 2)
+#define XFS_BB_LEFTSIB (1 << 3)
+#define XFS_BB_RIGHTSIB (1 << 4)
+#define XFS_BB_BLKNO (1 << 5)
+#define XFS_BB_LSN (1 << 6)
+#define XFS_BB_UUID (1 << 7)
+#define XFS_BB_OWNER (1 << 8)
+#define XFS_BB_NUM_BITS 5
+#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
+#define XFS_BB_NUM_BITS_CRC 9
+#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1)
+
+/*
+ * Generic stats interface
+ */
+#define __XFS_BTREE_STATS_INC(type, stat) \
+ XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
+#define XFS_BTREE_STATS_INC(cur, stat) \
+do { \
+ switch (cur->bc_btnum) { \
+ case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \
+ case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \
+ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \
+ case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \
+ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \
+ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
+ } \
+} while (0)
+
+#define __XFS_BTREE_STATS_ADD(type, stat, val) \
+ XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val)
+#define XFS_BTREE_STATS_ADD(cur, stat, val) \
+do { \
+ switch (cur->bc_btnum) { \
+ case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
+ case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
+ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
+ case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
+ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \
+ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
+ } \
+} while (0)
-/*
- * Long form header: bmap btrees.
- */
-typedef struct xfs_btree_lblock {
- __be32 bb_magic; /* magic number for block type */
- __be16 bb_level; /* 0 is a leaf */
- __be16 bb_numrecs; /* current # of data records */
- __be64 bb_leftsib; /* left sibling block or NULLDFSBNO */
- __be64 bb_rightsib; /* right sibling block or NULLDFSBNO */
-} xfs_btree_lblock_t;
+#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */
-/*
- * Combined header and structure, used by common code.
- */
-typedef struct xfs_btree_hdr
-{
- __be32 bb_magic; /* magic number for block type */
- __be16 bb_level; /* 0 is a leaf */
- __be16 bb_numrecs; /* current # of data records */
-} xfs_btree_hdr_t;
+struct xfs_btree_ops {
+ /* size of the key and record structures */
+ size_t key_len;
+ size_t rec_len;
-typedef struct xfs_btree_block {
- xfs_btree_hdr_t bb_h; /* header */
- union {
- struct {
- __be32 bb_leftsib;
- __be32 bb_rightsib;
- } s; /* short form pointers */
- struct {
- __be64 bb_leftsib;
- __be64 bb_rightsib;
- } l; /* long form pointers */
- } bb_u; /* rest */
-} xfs_btree_block_t;
+ /* cursor operations */
+ struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
+ void (*update_cursor)(struct xfs_btree_cur *src,
+ struct xfs_btree_cur *dst);
-/*
- * For logging record fields.
- */
-#define XFS_BB_MAGIC 0x01
-#define XFS_BB_LEVEL 0x02
-#define XFS_BB_NUMRECS 0x04
-#define XFS_BB_LEFTSIB 0x08
-#define XFS_BB_RIGHTSIB 0x10
-#define XFS_BB_NUM_BITS 5
-#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
+ /* update btree root pointer */
+ void (*set_root)(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *nptr, int level_change);
-/*
- * Boolean to select which form of xfs_btree_block_t.bb_u to use.
- */
-#define XFS_BTREE_LONG_PTRS(btnum) ((btnum) == XFS_BTNUM_BMAP)
+ /* block allocation / freeing */
+ int (*alloc_block)(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start_bno,
+ union xfs_btree_ptr *new_bno,
+ int *stat);
+ int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
-/*
- * Magic numbers for btree blocks.
- */
-extern const __uint32_t xfs_magics[];
+ /* update last record information */
+ void (*update_lastrec)(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_rec *rec,
+ int ptr, int reason);
-/*
- * Maximum and minimum records in a btree block.
- * Given block size, type prefix, and leaf flag (0 or 1).
- * The divisor below is equivalent to lf ? (e1) : (e2) but that produces
- * compiler warnings.
- */
-#define XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) \
- ((int)(((bsz) - (uint)sizeof(t ## _block_t)) / \
- (((lf) * (uint)sizeof(t ## _rec_t)) + \
- ((1 - (lf)) * \
- ((uint)sizeof(t ## _key_t) + (uint)sizeof(t ## _ptr_t))))))
-#define XFS_BTREE_BLOCK_MINRECS(bsz,t,lf) \
- (XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) / 2)
+ /* records in block/level */
+ int (*get_minrecs)(struct xfs_btree_cur *cur, int level);
+ int (*get_maxrecs)(struct xfs_btree_cur *cur, int level);
+
+ /* records on disk. Matter for the root in inode case. */
+ int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level);
+
+ /* init values of btree structures */
+ void (*init_key_from_rec)(union xfs_btree_key *key,
+ union xfs_btree_rec *rec);
+ void (*init_rec_from_key)(union xfs_btree_key *key,
+ union xfs_btree_rec *rec);
+ void (*init_rec_from_cur)(struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec);
+ void (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr);
+
+ /* difference between key value and cursor value */
+ __int64_t (*key_diff)(struct xfs_btree_cur *cur,
+ union xfs_btree_key *key);
+
+ const struct xfs_buf_ops *buf_ops;
+
+#if defined(DEBUG) || defined(XFS_WARN)
+ /* check that k1 is lower than k2 */
+ int (*keys_inorder)(struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2);
+
+ /* check that r1 is lower than r2 */
+ int (*recs_inorder)(struct xfs_btree_cur *cur,
+ union xfs_btree_rec *r1,
+ union xfs_btree_rec *r2);
+#endif
+};
/*
- * Record, key, and pointer address calculation macros.
- * Given block size, type prefix, block pointer, and index of requested entry
- * (first entry numbered 1).
+ * Reasons for the update_lastrec method to be called.
*/
-#define XFS_BTREE_REC_ADDR(bsz,t,bb,i,mxr) \
- ((t ## _rec_t *)((char *)(bb) + sizeof(t ## _block_t) + \
- ((i) - 1) * sizeof(t ## _rec_t)))
-#define XFS_BTREE_KEY_ADDR(bsz,t,bb,i,mxr) \
- ((t ## _key_t *)((char *)(bb) + sizeof(t ## _block_t) + \
- ((i) - 1) * sizeof(t ## _key_t)))
-#define XFS_BTREE_PTR_ADDR(bsz,t,bb,i,mxr) \
- ((t ## _ptr_t *)((char *)(bb) + sizeof(t ## _block_t) + \
- (mxr) * sizeof(t ## _key_t) + ((i) - 1) * sizeof(t ## _ptr_t)))
+#define LASTREC_UPDATE 0
+#define LASTREC_INSREC 1
+#define LASTREC_DELREC 2
-#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */
/*
* Btree cursor structure.
@@ -142,10 +193,12 @@ typedef struct xfs_btree_cur
{
struct xfs_trans *bc_tp; /* transaction we're in, if any */
struct xfs_mount *bc_mp; /* file system mount struct */
+ const struct xfs_btree_ops *bc_ops;
+ uint bc_flags; /* btree features - below */
union {
xfs_alloc_rec_incore_t a;
xfs_bmbt_irec_t b;
- xfs_inobt_rec_t i;
+ xfs_inobt_rec_incore_t i;
} bc_rec; /* current insert/search record value */
struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */
int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */
@@ -156,8 +209,8 @@ typedef struct xfs_btree_cur
__uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */
xfs_btnum_t bc_btnum; /* identifies which btree type */
union {
- struct { /* needed for BNO, CNT */
- struct xfs_buf *agbp; /* agf buffer pointer */
+ struct { /* needed for BNO, CNT, INO */
+ struct xfs_buf *agbp; /* agf/agi buffer pointer */
xfs_agnumber_t agno; /* ag number */
} a;
struct { /* needed for BMAP */
@@ -170,99 +223,45 @@ typedef struct xfs_btree_cur
char flags; /* flags */
#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */
} b;
- struct { /* needed for INO */
- struct xfs_buf *agbp; /* agi buffer pointer */
- xfs_agnumber_t agno; /* ag number */
- } i;
} bc_private; /* per-btree type data */
} xfs_btree_cur_t;
+/* cursor flags */
+#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
+#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
+#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
+#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */
+
+
#define XFS_BTREE_NOERROR 0
#define XFS_BTREE_ERROR 1
/*
* Convert from buffer to btree block header.
*/
-#define XFS_BUF_TO_BLOCK(bp) ((xfs_btree_block_t *)XFS_BUF_PTR(bp))
-#define XFS_BUF_TO_LBLOCK(bp) ((xfs_btree_lblock_t *)XFS_BUF_PTR(bp))
-#define XFS_BUF_TO_SBLOCK(bp) ((xfs_btree_sblock_t *)XFS_BUF_PTR(bp))
+#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr))
-#ifdef __KERNEL__
-
-#ifdef DEBUG
/*
- * Debug routine: check that block header is ok.
+ * Check that block header is ok.
*/
-void
+int
xfs_btree_check_block(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_btree_block_t *block, /* generic btree block pointer */
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_btree_block *block, /* generic btree block pointer */
int level, /* level of the btree block */
struct xfs_buf *bp); /* buffer containing block, if any */
/*
- * Debug routine: check that keys are in the right order.
- */
-void
-xfs_btree_check_key(
- xfs_btnum_t btnum, /* btree identifier */
- void *ak1, /* pointer to left (lower) key */
- void *ak2); /* pointer to right (higher) key */
-
-/*
- * Debug routine: check that records are in the right order.
- */
-void
-xfs_btree_check_rec(
- xfs_btnum_t btnum, /* btree identifier */
- void *ar1, /* pointer to left (lower) record */
- void *ar2); /* pointer to right (higher) record */
-#else
-#define xfs_btree_check_block(a,b,c,d)
-#define xfs_btree_check_key(a,b,c)
-#define xfs_btree_check_rec(a,b,c)
-#endif /* DEBUG */
-
-/*
- * Checking routine: check that long form block header is ok.
- */
-int /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lblock(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_btree_lblock_t *block, /* btree long form block pointer */
- int level, /* level of the btree block */
- struct xfs_buf *bp); /* buffer containing block, if any */
-
-/*
- * Checking routine: check that (long) pointer is ok.
+ * Check that (long) pointer is ok.
*/
int /* error (0 or EFSCORRUPTED) */
xfs_btree_check_lptr(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
xfs_dfsbno_t ptr, /* btree block disk address */
int level); /* btree block level */
/*
- * Checking routine: check that short form block header is ok.
- */
-int /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sblock(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_btree_sblock_t *block, /* btree short form block pointer */
- int level, /* level of the btree block */
- struct xfs_buf *bp); /* buffer containing block */
-
-/*
- * Checking routine: check that (short) pointer is ok.
- */
-int /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sptr(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agblock_t ptr, /* btree block disk address */
- int level); /* btree block level */
-
-/*
* Delete the btree cursor.
*/
void
@@ -280,15 +279,6 @@ xfs_btree_dup_cursor(
xfs_btree_cur_t **ncur);/* output cursor */
/*
- * Change the cursor to point to the first record in the current block
- * at the given level. Other levels are unaffected.
- */
-int /* success=1, failure=0 */
-xfs_btree_firstrec(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level); /* level to change */
-
-/*
* Get a buffer for the block, return it with no data read.
* Long-form addressing.
*/
@@ -312,20 +302,6 @@ xfs_btree_get_bufs(
uint lock); /* lock flags for get_buf */
/*
- * Allocate a new btree cursor.
- * The cursor is either for allocation (A) or bmap (B).
- */
-xfs_btree_cur_t * /* new btree cursor */
-xfs_btree_init_cursor(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_buf *agbp, /* (A only) buffer for agf structure */
- xfs_agnumber_t agno, /* (A only) allocation group number */
- xfs_btnum_t btnum, /* btree identifier */
- struct xfs_inode *ip, /* (B only) inode owning the btree */
- int whichfork); /* (B only) data/attr fork */
-
-/*
* Check for the cursor referring to the last block at the given level.
*/
int /* 1=is last block, 0=not last block */
@@ -334,15 +310,6 @@ xfs_btree_islastblock(
int level); /* level to check */
/*
- * Change the cursor to point to the last record in the current block
- * at the given level. Other levels are unaffected.
- */
-int /* success=1, failure=0 */
-xfs_btree_lastrec(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level); /* level to change */
-
-/*
* Compute first and last byte offsets for the fields given.
* Interprets the offsets table, which contains struct field offsets.
*/
@@ -365,21 +332,8 @@ xfs_btree_read_bufl(
xfs_fsblock_t fsbno, /* file system block number */
uint lock, /* lock flags for read_buf */
struct xfs_buf **bpp, /* buffer for fsbno */
- int refval);/* ref count value for buffer */
-
-/*
- * Get a buffer for the block, return it read in.
- * Short-form addressing.
- */
-int /* error */
-xfs_btree_read_bufs(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- uint lock, /* lock flags for read_buf */
- struct xfs_buf **bpp, /* buffer for agno/agbno */
- int refval);/* ref count value for buffer */
+ int refval, /* ref count value for buffer */
+ const struct xfs_buf_ops *ops);
/*
* Read-ahead the block, don't wait for it, don't return a buffer.
@@ -389,7 +343,8 @@ void /* error */
xfs_btree_reada_bufl(
struct xfs_mount *mp, /* file system mount point */
xfs_fsblock_t fsbno, /* file system block number */
- xfs_extlen_t count); /* count of filesystem blocks */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops);
/*
* Read-ahead the block, don't wait for it, don't return a buffer.
@@ -400,74 +355,114 @@ xfs_btree_reada_bufs(
struct xfs_mount *mp, /* file system mount point */
xfs_agnumber_t agno, /* allocation group number */
xfs_agblock_t agbno, /* allocation group block number */
- xfs_extlen_t count); /* count of filesystem blocks */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops);
/*
- * Read-ahead btree blocks, at the given level.
- * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
+ * Initialise a new btree block header
*/
-int /* readahead block count */
-xfs_btree_readahead_core(
- xfs_btree_cur_t *cur, /* btree cursor */
- int lev, /* level in btree */
- int lr); /* left/right bits */
+void
+xfs_btree_init_block(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ __u32 magic,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner,
+ unsigned int flags);
-static inline int /* readahead block count */
-xfs_btree_readahead(
- xfs_btree_cur_t *cur, /* btree cursor */
- int lev, /* level in btree */
- int lr) /* left/right bits */
-{
- if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
- return 0;
+void
+xfs_btree_init_block_int(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *buf,
+ xfs_daddr_t blkno,
+ __u32 magic,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner,
+ unsigned int flags);
- return xfs_btree_readahead_core(cur, lev, lr);
-}
+/*
+ * Common btree core entry points.
+ */
+int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
+int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
+int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
+int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
+int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
+int xfs_btree_insert(struct xfs_btree_cur *, int *);
+int xfs_btree_delete(struct xfs_btree_cur *, int *);
+int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
+int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner,
+ struct list_head *buffer_list);
+/*
+ * btree block CRC helpers
+ */
+void xfs_btree_lblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_lblock_verify_crc(struct xfs_buf *);
+void xfs_btree_sblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
/*
- * Set the buffer for level "lev" in the cursor to bp, releasing
- * any previous buffer.
+ * Internal btree helpers also used by xfs_bmap.c.
*/
-void
-xfs_btree_setbuf(
- xfs_btree_cur_t *cur, /* btree cursor */
- int lev, /* level in btree */
- struct xfs_buf *bp); /* new buffer to set */
+void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
+void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
+
+/*
+ * Helpers.
+ */
+static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block)
+{
+ return be16_to_cpu(block->bb_numrecs);
+}
+
+static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
+ __uint16_t numrecs)
+{
+ block->bb_numrecs = cpu_to_be16(numrecs);
+}
-#endif /* __KERNEL__ */
+static inline int xfs_btree_get_level(struct xfs_btree_block *block)
+{
+ return be16_to_cpu(block->bb_level);
+}
/*
* Min and max functions for extlen, agblock, fileoff, and filblks types.
*/
-#define XFS_EXTLEN_MIN(a,b) \
- ((xfs_extlen_t)(a) < (xfs_extlen_t)(b) ? \
- (xfs_extlen_t)(a) : (xfs_extlen_t)(b))
-#define XFS_EXTLEN_MAX(a,b) \
- ((xfs_extlen_t)(a) > (xfs_extlen_t)(b) ? \
- (xfs_extlen_t)(a) : (xfs_extlen_t)(b))
-#define XFS_AGBLOCK_MIN(a,b) \
- ((xfs_agblock_t)(a) < (xfs_agblock_t)(b) ? \
- (xfs_agblock_t)(a) : (xfs_agblock_t)(b))
-#define XFS_AGBLOCK_MAX(a,b) \
- ((xfs_agblock_t)(a) > (xfs_agblock_t)(b) ? \
- (xfs_agblock_t)(a) : (xfs_agblock_t)(b))
-#define XFS_FILEOFF_MIN(a,b) \
- ((xfs_fileoff_t)(a) < (xfs_fileoff_t)(b) ? \
- (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b))
-#define XFS_FILEOFF_MAX(a,b) \
- ((xfs_fileoff_t)(a) > (xfs_fileoff_t)(b) ? \
- (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b))
-#define XFS_FILBLKS_MIN(a,b) \
- ((xfs_filblks_t)(a) < (xfs_filblks_t)(b) ? \
- (xfs_filblks_t)(a) : (xfs_filblks_t)(b))
-#define XFS_FILBLKS_MAX(a,b) \
- ((xfs_filblks_t)(a) > (xfs_filblks_t)(b) ? \
- (xfs_filblks_t)(a) : (xfs_filblks_t)(b))
+#define XFS_EXTLEN_MIN(a,b) min_t(xfs_extlen_t, (a), (b))
+#define XFS_EXTLEN_MAX(a,b) max_t(xfs_extlen_t, (a), (b))
+#define XFS_AGBLOCK_MIN(a,b) min_t(xfs_agblock_t, (a), (b))
+#define XFS_AGBLOCK_MAX(a,b) max_t(xfs_agblock_t, (a), (b))
+#define XFS_FILEOFF_MIN(a,b) min_t(xfs_fileoff_t, (a), (b))
+#define XFS_FILEOFF_MAX(a,b) max_t(xfs_fileoff_t, (a), (b))
+#define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b))
+#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b))
#define XFS_FSB_SANITY_CHECK(mp,fsb) \
(XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks)
+/*
+ * Trace hooks. Currently not implemented as they need to be ported
+ * over to the generic tracing functionality, which is some effort.
+ *
+ * i,j = integer (32 bit)
+ * b = btree block buffer (xfs_buf_t)
+ * p = btree ptr
+ * r = btree record
+ * k = btree key
+ */
+#define XFS_BTREE_TRACE_ARGBI(c, b, i)
+#define XFS_BTREE_TRACE_ARGBII(c, b, i, j)
+#define XFS_BTREE_TRACE_ARGI(c, i)
+#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s)
+#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r)
+#define XFS_BTREE_TRACE_ARGIK(c, i, k)
+#define XFS_BTREE_TRACE_ARGR(c, r)
+#define XFS_BTREE_TRACE_CURSOR(c, t)
+
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
new file mode 100644
index 00000000000..7a34a1ae655
--- /dev/null
+++ b/fs/xfs/xfs_buf.c
@@ -0,0 +1,1890 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include <linux/stddef.h>
+#include <linux/errno.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/bio.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/workqueue.h>
+#include <linux/percpu.h>
+#include <linux/blkdev.h>
+#include <linux/hash.h>
+#include <linux/kthread.h>
+#include <linux/migrate.h>
+#include <linux/backing-dev.h>
+#include <linux/freezer.h>
+
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+
+static kmem_zone_t *xfs_buf_zone;
+
+static struct workqueue_struct *xfslogd_workqueue;
+
+#ifdef XFS_BUF_LOCK_TRACKING
+# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
+# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1)
+# define XB_GET_OWNER(bp) ((bp)->b_last_holder)
+#else
+# define XB_SET_OWNER(bp) do { } while (0)
+# define XB_CLEAR_OWNER(bp) do { } while (0)
+# define XB_GET_OWNER(bp) do { } while (0)
+#endif
+
+#define xb_to_gfp(flags) \
+ ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
+
+
+static inline int
+xfs_buf_is_vmapped(
+ struct xfs_buf *bp)
+{
+ /*
+ * Return true if the buffer is vmapped.
+ *
+ * b_addr is null if the buffer is not mapped, but the code is clever
+ * enough to know it doesn't have to map a single page, so the check has
+ * to be both for b_addr and bp->b_page_count > 1.
+ */
+ return bp->b_addr && bp->b_page_count > 1;
+}
+
+static inline int
+xfs_buf_vmap_len(
+ struct xfs_buf *bp)
+{
+ return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
+}
+
+/*
+ * When we mark a buffer stale, we remove the buffer from the LRU and clear the
+ * b_lru_ref count so that the buffer is freed immediately when the buffer
+ * reference count falls to zero. If the buffer is already on the LRU, we need
+ * to remove the reference that LRU holds on the buffer.
+ *
+ * This prevents build-up of stale buffers on the LRU.
+ */
+void
+xfs_buf_stale(
+ struct xfs_buf *bp)
+{
+ ASSERT(xfs_buf_islocked(bp));
+
+ bp->b_flags |= XBF_STALE;
+
+ /*
+ * Clear the delwri status so that a delwri queue walker will not
+ * flush this buffer to disk now that it is stale. The delwri queue has
+ * a reference to the buffer, so this is safe to do.
+ */
+ bp->b_flags &= ~_XBF_DELWRI_Q;
+
+ spin_lock(&bp->b_lock);
+ atomic_set(&bp->b_lru_ref, 0);
+ if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
+ (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
+ atomic_dec(&bp->b_hold);
+
+ ASSERT(atomic_read(&bp->b_hold) >= 1);
+ spin_unlock(&bp->b_lock);
+}
+
+static int
+xfs_buf_get_maps(
+ struct xfs_buf *bp,
+ int map_count)
+{
+ ASSERT(bp->b_maps == NULL);
+ bp->b_map_count = map_count;
+
+ if (map_count == 1) {
+ bp->b_maps = &bp->__b_map;
+ return 0;
+ }
+
+ bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
+ KM_NOFS);
+ if (!bp->b_maps)
+ return ENOMEM;
+ return 0;
+}
+
+/*
+ * Frees b_pages if it was allocated.
+ */
+static void
+xfs_buf_free_maps(
+ struct xfs_buf *bp)
+{
+ if (bp->b_maps != &bp->__b_map) {
+ kmem_free(bp->b_maps);
+ bp->b_maps = NULL;
+ }
+}
+
+struct xfs_buf *
+_xfs_buf_alloc(
+ struct xfs_buftarg *target,
+ struct xfs_buf_map *map,
+ int nmaps,
+ xfs_buf_flags_t flags)
+{
+ struct xfs_buf *bp;
+ int error;
+ int i;
+
+ bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
+ if (unlikely(!bp))
+ return NULL;
+
+ /*
+ * We don't want certain flags to appear in b_flags unless they are
+ * specifically set by later operations on the buffer.
+ */
+ flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
+
+ atomic_set(&bp->b_hold, 1);
+ atomic_set(&bp->b_lru_ref, 1);
+ init_completion(&bp->b_iowait);
+ INIT_LIST_HEAD(&bp->b_lru);
+ INIT_LIST_HEAD(&bp->b_list);
+ RB_CLEAR_NODE(&bp->b_rbnode);
+ sema_init(&bp->b_sema, 0); /* held, no waiters */
+ spin_lock_init(&bp->b_lock);
+ XB_SET_OWNER(bp);
+ bp->b_target = target;
+ bp->b_flags = flags;
+
+ /*
+ * Set length and io_length to the same value initially.
+ * I/O routines should use io_length, which will be the same in
+ * most cases but may be reset (e.g. XFS recovery).
+ */
+ error = xfs_buf_get_maps(bp, nmaps);
+ if (error) {
+ kmem_zone_free(xfs_buf_zone, bp);
+ return NULL;
+ }
+
+ bp->b_bn = map[0].bm_bn;
+ bp->b_length = 0;
+ for (i = 0; i < nmaps; i++) {
+ bp->b_maps[i].bm_bn = map[i].bm_bn;
+ bp->b_maps[i].bm_len = map[i].bm_len;
+ bp->b_length += map[i].bm_len;
+ }
+ bp->b_io_length = bp->b_length;
+
+ atomic_set(&bp->b_pin_count, 0);
+ init_waitqueue_head(&bp->b_waiters);
+
+ XFS_STATS_INC(xb_create);
+ trace_xfs_buf_init(bp, _RET_IP_);
+
+ return bp;
+}
+
+/*
+ * Allocate a page array capable of holding a specified number
+ * of pages, and point the page buf at it.
+ */
+STATIC int
+_xfs_buf_get_pages(
+ xfs_buf_t *bp,
+ int page_count)
+{
+ /* Make sure that we have a page list */
+ if (bp->b_pages == NULL) {
+ bp->b_page_count = page_count;
+ if (page_count <= XB_PAGES) {
+ bp->b_pages = bp->b_page_array;
+ } else {
+ bp->b_pages = kmem_alloc(sizeof(struct page *) *
+ page_count, KM_NOFS);
+ if (bp->b_pages == NULL)
+ return -ENOMEM;
+ }
+ memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
+ }
+ return 0;
+}
+
+/*
+ * Frees b_pages if it was allocated.
+ */
+STATIC void
+_xfs_buf_free_pages(
+ xfs_buf_t *bp)
+{
+ if (bp->b_pages != bp->b_page_array) {
+ kmem_free(bp->b_pages);
+ bp->b_pages = NULL;
+ }
+}
+
+/*
+ * Releases the specified buffer.
+ *
+ * The modification state of any associated pages is left unchanged.
+ * The buffer must not be on any hash - use xfs_buf_rele instead for
+ * hashed and refcounted buffers
+ */
+void
+xfs_buf_free(
+ xfs_buf_t *bp)
+{
+ trace_xfs_buf_free(bp, _RET_IP_);
+
+ ASSERT(list_empty(&bp->b_lru));
+
+ if (bp->b_flags & _XBF_PAGES) {
+ uint i;
+
+ if (xfs_buf_is_vmapped(bp))
+ vm_unmap_ram(bp->b_addr - bp->b_offset,
+ bp->b_page_count);
+
+ for (i = 0; i < bp->b_page_count; i++) {
+ struct page *page = bp->b_pages[i];
+
+ __free_page(page);
+ }
+ } else if (bp->b_flags & _XBF_KMEM)
+ kmem_free(bp->b_addr);
+ _xfs_buf_free_pages(bp);
+ xfs_buf_free_maps(bp);
+ kmem_zone_free(xfs_buf_zone, bp);
+}
+
+/*
+ * Allocates all the pages for buffer in question and builds it's page list.
+ */
+STATIC int
+xfs_buf_allocate_memory(
+ xfs_buf_t *bp,
+ uint flags)
+{
+ size_t size;
+ size_t nbytes, offset;
+ gfp_t gfp_mask = xb_to_gfp(flags);
+ unsigned short page_count, i;
+ xfs_off_t start, end;
+ int error;
+
+ /*
+ * for buffers that are contained within a single page, just allocate
+ * the memory from the heap - there's no need for the complexity of
+ * page arrays to keep allocation down to order 0.
+ */
+ size = BBTOB(bp->b_length);
+ if (size < PAGE_SIZE) {
+ bp->b_addr = kmem_alloc(size, KM_NOFS);
+ if (!bp->b_addr) {
+ /* low memory - use alloc_page loop instead */
+ goto use_alloc_page;
+ }
+
+ if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
+ ((unsigned long)bp->b_addr & PAGE_MASK)) {
+ /* b_addr spans two pages - use alloc_page instead */
+ kmem_free(bp->b_addr);
+ bp->b_addr = NULL;
+ goto use_alloc_page;
+ }
+ bp->b_offset = offset_in_page(bp->b_addr);
+ bp->b_pages = bp->b_page_array;
+ bp->b_pages[0] = virt_to_page(bp->b_addr);
+ bp->b_page_count = 1;
+ bp->b_flags |= _XBF_KMEM;
+ return 0;
+ }
+
+use_alloc_page:
+ start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
+ end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
+ >> PAGE_SHIFT;
+ page_count = end - start;
+ error = _xfs_buf_get_pages(bp, page_count);
+ if (unlikely(error))
+ return error;
+
+ offset = bp->b_offset;
+ bp->b_flags |= _XBF_PAGES;
+
+ for (i = 0; i < bp->b_page_count; i++) {
+ struct page *page;
+ uint retries = 0;
+retry:
+ page = alloc_page(gfp_mask);
+ if (unlikely(page == NULL)) {
+ if (flags & XBF_READ_AHEAD) {
+ bp->b_page_count = i;
+ error = ENOMEM;
+ goto out_free_pages;
+ }
+
+ /*
+ * This could deadlock.
+ *
+ * But until all the XFS lowlevel code is revamped to
+ * handle buffer allocation failures we can't do much.
+ */
+ if (!(++retries % 100))
+ xfs_err(NULL,
+ "possible memory allocation deadlock in %s (mode:0x%x)",
+ __func__, gfp_mask);
+
+ XFS_STATS_INC(xb_page_retries);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
+
+ XFS_STATS_INC(xb_page_found);
+
+ nbytes = min_t(size_t, size, PAGE_SIZE - offset);
+ size -= nbytes;
+ bp->b_pages[i] = page;
+ offset = 0;
+ }
+ return 0;
+
+out_free_pages:
+ for (i = 0; i < bp->b_page_count; i++)
+ __free_page(bp->b_pages[i]);
+ return error;
+}
+
+/*
+ * Map buffer into kernel address-space if necessary.
+ */
+STATIC int
+_xfs_buf_map_pages(
+ xfs_buf_t *bp,
+ uint flags)
+{
+ ASSERT(bp->b_flags & _XBF_PAGES);
+ if (bp->b_page_count == 1) {
+ /* A single page buffer is always mappable */
+ bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
+ } else if (flags & XBF_UNMAPPED) {
+ bp->b_addr = NULL;
+ } else {
+ int retried = 0;
+ unsigned noio_flag;
+
+ /*
+ * vm_map_ram() will allocate auxillary structures (e.g.
+ * pagetables) with GFP_KERNEL, yet we are likely to be under
+ * GFP_NOFS context here. Hence we need to tell memory reclaim
+ * that we are in such a context via PF_MEMALLOC_NOIO to prevent
+ * memory reclaim re-entering the filesystem here and
+ * potentially deadlocking.
+ */
+ noio_flag = memalloc_noio_save();
+ do {
+ bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
+ -1, PAGE_KERNEL);
+ if (bp->b_addr)
+ break;
+ vm_unmap_aliases();
+ } while (retried++ <= 1);
+ memalloc_noio_restore(noio_flag);
+
+ if (!bp->b_addr)
+ return -ENOMEM;
+ bp->b_addr += bp->b_offset;
+ }
+
+ return 0;
+}
+
+/*
+ * Finding and Reading Buffers
+ */
+
+/*
+ * Look up, and creates if absent, a lockable buffer for
+ * a given range of an inode. The buffer is returned
+ * locked. No I/O is implied by this call.
+ */
+xfs_buf_t *
+_xfs_buf_find(
+ struct xfs_buftarg *btp,
+ struct xfs_buf_map *map,
+ int nmaps,
+ xfs_buf_flags_t flags,
+ xfs_buf_t *new_bp)
+{
+ size_t numbytes;
+ struct xfs_perag *pag;
+ struct rb_node **rbp;
+ struct rb_node *parent;
+ xfs_buf_t *bp;
+ xfs_daddr_t blkno = map[0].bm_bn;
+ xfs_daddr_t eofs;
+ int numblks = 0;
+ int i;
+
+ for (i = 0; i < nmaps; i++)
+ numblks += map[i].bm_len;
+ numbytes = BBTOB(numblks);
+
+ /* Check for IOs smaller than the sector size / not sector aligned */
+ ASSERT(!(numbytes < btp->bt_meta_sectorsize));
+ ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
+
+ /*
+ * Corrupted block numbers can get through to here, unfortunately, so we
+ * have to check that the buffer falls within the filesystem bounds.
+ */
+ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
+ if (blkno >= eofs) {
+ /*
+ * XXX (dgc): we should really be returning EFSCORRUPTED here,
+ * but none of the higher level infrastructure supports
+ * returning a specific error on buffer lookup failures.
+ */
+ xfs_alert(btp->bt_mount,
+ "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
+ __func__, blkno, eofs);
+ WARN_ON(1);
+ return NULL;
+ }
+
+ /* get tree root */
+ pag = xfs_perag_get(btp->bt_mount,
+ xfs_daddr_to_agno(btp->bt_mount, blkno));
+
+ /* walk tree */
+ spin_lock(&pag->pag_buf_lock);
+ rbp = &pag->pag_buf_tree.rb_node;
+ parent = NULL;
+ bp = NULL;
+ while (*rbp) {
+ parent = *rbp;
+ bp = rb_entry(parent, struct xfs_buf, b_rbnode);
+
+ if (blkno < bp->b_bn)
+ rbp = &(*rbp)->rb_left;
+ else if (blkno > bp->b_bn)
+ rbp = &(*rbp)->rb_right;
+ else {
+ /*
+ * found a block number match. If the range doesn't
+ * match, the only way this is allowed is if the buffer
+ * in the cache is stale and the transaction that made
+ * it stale has not yet committed. i.e. we are
+ * reallocating a busy extent. Skip this buffer and
+ * continue searching to the right for an exact match.
+ */
+ if (bp->b_length != numblks) {
+ ASSERT(bp->b_flags & XBF_STALE);
+ rbp = &(*rbp)->rb_right;
+ continue;
+ }
+ atomic_inc(&bp->b_hold);
+ goto found;
+ }
+ }
+
+ /* No match found */
+ if (new_bp) {
+ rb_link_node(&new_bp->b_rbnode, parent, rbp);
+ rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
+ /* the buffer keeps the perag reference until it is freed */
+ new_bp->b_pag = pag;
+ spin_unlock(&pag->pag_buf_lock);
+ } else {
+ XFS_STATS_INC(xb_miss_locked);
+ spin_unlock(&pag->pag_buf_lock);
+ xfs_perag_put(pag);
+ }
+ return new_bp;
+
+found:
+ spin_unlock(&pag->pag_buf_lock);
+ xfs_perag_put(pag);
+
+ if (!xfs_buf_trylock(bp)) {
+ if (flags & XBF_TRYLOCK) {
+ xfs_buf_rele(bp);
+ XFS_STATS_INC(xb_busy_locked);
+ return NULL;
+ }
+ xfs_buf_lock(bp);
+ XFS_STATS_INC(xb_get_locked_waited);
+ }
+
+ /*
+ * if the buffer is stale, clear all the external state associated with
+ * it. We need to keep flags such as how we allocated the buffer memory
+ * intact here.
+ */
+ if (bp->b_flags & XBF_STALE) {
+ ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
+ ASSERT(bp->b_iodone == NULL);
+ bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
+ bp->b_ops = NULL;
+ }
+
+ trace_xfs_buf_find(bp, flags, _RET_IP_);
+ XFS_STATS_INC(xb_get_locked);
+ return bp;
+}
+
+/*
+ * Assembles a buffer covering the specified range. The code is optimised for
+ * cache hits, as metadata intensive workloads will see 3 orders of magnitude
+ * more hits than misses.
+ */
+struct xfs_buf *
+xfs_buf_get_map(
+ struct xfs_buftarg *target,
+ struct xfs_buf_map *map,
+ int nmaps,
+ xfs_buf_flags_t flags)
+{
+ struct xfs_buf *bp;
+ struct xfs_buf *new_bp;
+ int error = 0;
+
+ bp = _xfs_buf_find(target, map, nmaps, flags, NULL);
+ if (likely(bp))
+ goto found;
+
+ new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
+ if (unlikely(!new_bp))
+ return NULL;
+
+ error = xfs_buf_allocate_memory(new_bp, flags);
+ if (error) {
+ xfs_buf_free(new_bp);
+ return NULL;
+ }
+
+ bp = _xfs_buf_find(target, map, nmaps, flags, new_bp);
+ if (!bp) {
+ xfs_buf_free(new_bp);
+ return NULL;
+ }
+
+ if (bp != new_bp)
+ xfs_buf_free(new_bp);
+
+found:
+ if (!bp->b_addr) {
+ error = _xfs_buf_map_pages(bp, flags);
+ if (unlikely(error)) {
+ xfs_warn(target->bt_mount,
+ "%s: failed to map pagesn", __func__);
+ xfs_buf_relse(bp);
+ return NULL;
+ }
+ }
+
+ XFS_STATS_INC(xb_get);
+ trace_xfs_buf_get(bp, flags, _RET_IP_);
+ return bp;
+}
+
+STATIC int
+_xfs_buf_read(
+ xfs_buf_t *bp,
+ xfs_buf_flags_t flags)
+{
+ ASSERT(!(flags & XBF_WRITE));
+ ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
+
+ bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
+ bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
+
+ xfs_buf_iorequest(bp);
+ if (flags & XBF_ASYNC)
+ return 0;
+ return xfs_buf_iowait(bp);
+}
+
+xfs_buf_t *
+xfs_buf_read_map(
+ struct xfs_buftarg *target,
+ struct xfs_buf_map *map,
+ int nmaps,
+ xfs_buf_flags_t flags,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf *bp;
+
+ flags |= XBF_READ;
+
+ bp = xfs_buf_get_map(target, map, nmaps, flags);
+ if (bp) {
+ trace_xfs_buf_read(bp, flags, _RET_IP_);
+
+ if (!XFS_BUF_ISDONE(bp)) {
+ XFS_STATS_INC(xb_get_read);
+ bp->b_ops = ops;
+ _xfs_buf_read(bp, flags);
+ } else if (flags & XBF_ASYNC) {
+ /*
+ * Read ahead call which is already satisfied,
+ * drop the buffer
+ */
+ xfs_buf_relse(bp);
+ return NULL;
+ } else {
+ /* We do not want read in the flags */
+ bp->b_flags &= ~XBF_READ;
+ }
+ }
+
+ return bp;
+}
+
+/*
+ * If we are not low on memory then do the readahead in a deadlock
+ * safe manner.
+ */
+void
+xfs_buf_readahead_map(
+ struct xfs_buftarg *target,
+ struct xfs_buf_map *map,
+ int nmaps,
+ const struct xfs_buf_ops *ops)
+{
+ if (bdi_read_congested(target->bt_bdi))
+ return;
+
+ xfs_buf_read_map(target, map, nmaps,
+ XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
+}
+
+/*
+ * Read an uncached buffer from disk. Allocates and returns a locked
+ * buffer containing the disk contents or nothing.
+ */
+struct xfs_buf *
+xfs_buf_read_uncached(
+ struct xfs_buftarg *target,
+ xfs_daddr_t daddr,
+ size_t numblks,
+ int flags,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf *bp;
+
+ bp = xfs_buf_get_uncached(target, numblks, flags);
+ if (!bp)
+ return NULL;
+
+ /* set up the buffer for a read IO */
+ ASSERT(bp->b_map_count == 1);
+ bp->b_bn = daddr;
+ bp->b_maps[0].bm_bn = daddr;
+ bp->b_flags |= XBF_READ;
+ bp->b_ops = ops;
+
+ if (XFS_FORCED_SHUTDOWN(target->bt_mount)) {
+ xfs_buf_relse(bp);
+ return NULL;
+ }
+ xfs_buf_iorequest(bp);
+ xfs_buf_iowait(bp);
+ return bp;
+}
+
+/*
+ * Return a buffer allocated as an empty buffer and associated to external
+ * memory via xfs_buf_associate_memory() back to it's empty state.
+ */
+void
+xfs_buf_set_empty(
+ struct xfs_buf *bp,
+ size_t numblks)
+{
+ if (bp->b_pages)
+ _xfs_buf_free_pages(bp);
+
+ bp->b_pages = NULL;
+ bp->b_page_count = 0;
+ bp->b_addr = NULL;
+ bp->b_length = numblks;
+ bp->b_io_length = numblks;
+
+ ASSERT(bp->b_map_count == 1);
+ bp->b_bn = XFS_BUF_DADDR_NULL;
+ bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL;
+ bp->b_maps[0].bm_len = bp->b_length;
+}
+
+static inline struct page *
+mem_to_page(
+ void *addr)
+{
+ if ((!is_vmalloc_addr(addr))) {
+ return virt_to_page(addr);
+ } else {
+ return vmalloc_to_page(addr);
+ }
+}
+
+int
+xfs_buf_associate_memory(
+ xfs_buf_t *bp,
+ void *mem,
+ size_t len)
+{
+ int rval;
+ int i = 0;
+ unsigned long pageaddr;
+ unsigned long offset;
+ size_t buflen;
+ int page_count;
+
+ pageaddr = (unsigned long)mem & PAGE_MASK;
+ offset = (unsigned long)mem - pageaddr;
+ buflen = PAGE_ALIGN(len + offset);
+ page_count = buflen >> PAGE_SHIFT;
+
+ /* Free any previous set of page pointers */
+ if (bp->b_pages)
+ _xfs_buf_free_pages(bp);
+
+ bp->b_pages = NULL;
+ bp->b_addr = mem;
+
+ rval = _xfs_buf_get_pages(bp, page_count);
+ if (rval)
+ return rval;
+
+ bp->b_offset = offset;
+
+ for (i = 0; i < bp->b_page_count; i++) {
+ bp->b_pages[i] = mem_to_page((void *)pageaddr);
+ pageaddr += PAGE_SIZE;
+ }
+
+ bp->b_io_length = BTOBB(len);
+ bp->b_length = BTOBB(buflen);
+
+ return 0;
+}
+
+xfs_buf_t *
+xfs_buf_get_uncached(
+ struct xfs_buftarg *target,
+ size_t numblks,
+ int flags)
+{
+ unsigned long page_count;
+ int error, i;
+ struct xfs_buf *bp;
+ DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
+
+ bp = _xfs_buf_alloc(target, &map, 1, 0);
+ if (unlikely(bp == NULL))
+ goto fail;
+
+ page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
+ error = _xfs_buf_get_pages(bp, page_count);
+ if (error)
+ goto fail_free_buf;
+
+ for (i = 0; i < page_count; i++) {
+ bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
+ if (!bp->b_pages[i])
+ goto fail_free_mem;
+ }
+ bp->b_flags |= _XBF_PAGES;
+
+ error = _xfs_buf_map_pages(bp, 0);
+ if (unlikely(error)) {
+ xfs_warn(target->bt_mount,
+ "%s: failed to map pages", __func__);
+ goto fail_free_mem;
+ }
+
+ trace_xfs_buf_get_uncached(bp, _RET_IP_);
+ return bp;
+
+ fail_free_mem:
+ while (--i >= 0)
+ __free_page(bp->b_pages[i]);
+ _xfs_buf_free_pages(bp);
+ fail_free_buf:
+ xfs_buf_free_maps(bp);
+ kmem_zone_free(xfs_buf_zone, bp);
+ fail:
+ return NULL;
+}
+
+/*
+ * Increment reference count on buffer, to hold the buffer concurrently
+ * with another thread which may release (free) the buffer asynchronously.
+ * Must hold the buffer already to call this function.
+ */
+void
+xfs_buf_hold(
+ xfs_buf_t *bp)
+{
+ trace_xfs_buf_hold(bp, _RET_IP_);
+ atomic_inc(&bp->b_hold);
+}
+
+/*
+ * Releases a hold on the specified buffer. If the
+ * the hold count is 1, calls xfs_buf_free.
+ */
+void
+xfs_buf_rele(
+ xfs_buf_t *bp)
+{
+ struct xfs_perag *pag = bp->b_pag;
+
+ trace_xfs_buf_rele(bp, _RET_IP_);
+
+ if (!pag) {
+ ASSERT(list_empty(&bp->b_lru));
+ ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
+ if (atomic_dec_and_test(&bp->b_hold))
+ xfs_buf_free(bp);
+ return;
+ }
+
+ ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
+
+ ASSERT(atomic_read(&bp->b_hold) > 0);
+ if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
+ spin_lock(&bp->b_lock);
+ if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
+ /*
+ * If the buffer is added to the LRU take a new
+ * reference to the buffer for the LRU and clear the
+ * (now stale) dispose list state flag
+ */
+ if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+ bp->b_state &= ~XFS_BSTATE_DISPOSE;
+ atomic_inc(&bp->b_hold);
+ }
+ spin_unlock(&bp->b_lock);
+ spin_unlock(&pag->pag_buf_lock);
+ } else {
+ /*
+ * most of the time buffers will already be removed from
+ * the LRU, so optimise that case by checking for the
+ * XFS_BSTATE_DISPOSE flag indicating the last list the
+ * buffer was on was the disposal list
+ */
+ if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
+ list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+ } else {
+ ASSERT(list_empty(&bp->b_lru));
+ }
+ spin_unlock(&bp->b_lock);
+
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+ rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+ spin_unlock(&pag->pag_buf_lock);
+ xfs_perag_put(pag);
+ xfs_buf_free(bp);
+ }
+ }
+}
+
+
+/*
+ * Lock a buffer object, if it is not already locked.
+ *
+ * If we come across a stale, pinned, locked buffer, we know that we are
+ * being asked to lock a buffer that has been reallocated. Because it is
+ * pinned, we know that the log has not been pushed to disk and hence it
+ * will still be locked. Rather than continuing to have trylock attempts
+ * fail until someone else pushes the log, push it ourselves before
+ * returning. This means that the xfsaild will not get stuck trying
+ * to push on stale inode buffers.
+ */
+int
+xfs_buf_trylock(
+ struct xfs_buf *bp)
+{
+ int locked;
+
+ locked = down_trylock(&bp->b_sema) == 0;
+ if (locked)
+ XB_SET_OWNER(bp);
+
+ trace_xfs_buf_trylock(bp, _RET_IP_);
+ return locked;
+}
+
+/*
+ * Lock a buffer object.
+ *
+ * If we come across a stale, pinned, locked buffer, we know that we
+ * are being asked to lock a buffer that has been reallocated. Because
+ * it is pinned, we know that the log has not been pushed to disk and
+ * hence it will still be locked. Rather than sleeping until someone
+ * else pushes the log, push it ourselves before trying to get the lock.
+ */
+void
+xfs_buf_lock(
+ struct xfs_buf *bp)
+{
+ trace_xfs_buf_lock(bp, _RET_IP_);
+
+ if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
+ xfs_log_force(bp->b_target->bt_mount, 0);
+ down(&bp->b_sema);
+ XB_SET_OWNER(bp);
+
+ trace_xfs_buf_lock_done(bp, _RET_IP_);
+}
+
+void
+xfs_buf_unlock(
+ struct xfs_buf *bp)
+{
+ XB_CLEAR_OWNER(bp);
+ up(&bp->b_sema);
+
+ trace_xfs_buf_unlock(bp, _RET_IP_);
+}
+
+STATIC void
+xfs_buf_wait_unpin(
+ xfs_buf_t *bp)
+{
+ DECLARE_WAITQUEUE (wait, current);
+
+ if (atomic_read(&bp->b_pin_count) == 0)
+ return;
+
+ add_wait_queue(&bp->b_waiters, &wait);
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&bp->b_pin_count) == 0)
+ break;
+ io_schedule();
+ }
+ remove_wait_queue(&bp->b_waiters, &wait);
+ set_current_state(TASK_RUNNING);
+}
+
+/*
+ * Buffer Utility Routines
+ */
+
+STATIC void
+xfs_buf_iodone_work(
+ struct work_struct *work)
+{
+ struct xfs_buf *bp =
+ container_of(work, xfs_buf_t, b_iodone_work);
+ bool read = !!(bp->b_flags & XBF_READ);
+
+ bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
+
+ /* only validate buffers that were read without errors */
+ if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE))
+ bp->b_ops->verify_read(bp);
+
+ if (bp->b_iodone)
+ (*(bp->b_iodone))(bp);
+ else if (bp->b_flags & XBF_ASYNC)
+ xfs_buf_relse(bp);
+ else {
+ ASSERT(read && bp->b_ops);
+ complete(&bp->b_iowait);
+ }
+}
+
+void
+xfs_buf_ioend(
+ struct xfs_buf *bp,
+ int schedule)
+{
+ bool read = !!(bp->b_flags & XBF_READ);
+
+ trace_xfs_buf_iodone(bp, _RET_IP_);
+
+ if (bp->b_error == 0)
+ bp->b_flags |= XBF_DONE;
+
+ if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
+ if (schedule) {
+ INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
+ queue_work(xfslogd_workqueue, &bp->b_iodone_work);
+ } else {
+ xfs_buf_iodone_work(&bp->b_iodone_work);
+ }
+ } else {
+ bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
+ complete(&bp->b_iowait);
+ }
+}
+
+void
+xfs_buf_ioerror(
+ xfs_buf_t *bp,
+ int error)
+{
+ ASSERT(error >= 0 && error <= 0xffff);
+ bp->b_error = (unsigned short)error;
+ trace_xfs_buf_ioerror(bp, error, _RET_IP_);
+}
+
+void
+xfs_buf_ioerror_alert(
+ struct xfs_buf *bp,
+ const char *func)
+{
+ xfs_alert(bp->b_target->bt_mount,
+"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
+ (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length);
+}
+
+/*
+ * Called when we want to stop a buffer from getting written or read.
+ * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
+ * so that the proper iodone callbacks get called.
+ */
+STATIC int
+xfs_bioerror(
+ xfs_buf_t *bp)
+{
+#ifdef XFSERRORDEBUG
+ ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
+#endif
+
+ /*
+ * No need to wait until the buffer is unpinned, we aren't flushing it.
+ */
+ xfs_buf_ioerror(bp, EIO);
+
+ /*
+ * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
+ */
+ XFS_BUF_UNREAD(bp);
+ XFS_BUF_UNDONE(bp);
+ xfs_buf_stale(bp);
+
+ xfs_buf_ioend(bp, 0);
+
+ return EIO;
+}
+
+/*
+ * Same as xfs_bioerror, except that we are releasing the buffer
+ * here ourselves, and avoiding the xfs_buf_ioend call.
+ * This is meant for userdata errors; metadata bufs come with
+ * iodone functions attached, so that we can track down errors.
+ */
+int
+xfs_bioerror_relse(
+ struct xfs_buf *bp)
+{
+ int64_t fl = bp->b_flags;
+ /*
+ * No need to wait until the buffer is unpinned.
+ * We aren't flushing it.
+ *
+ * chunkhold expects B_DONE to be set, whether
+ * we actually finish the I/O or not. We don't want to
+ * change that interface.
+ */
+ XFS_BUF_UNREAD(bp);
+ XFS_BUF_DONE(bp);
+ xfs_buf_stale(bp);
+ bp->b_iodone = NULL;
+ if (!(fl & XBF_ASYNC)) {
+ /*
+ * Mark b_error and B_ERROR _both_.
+ * Lot's of chunkcache code assumes that.
+ * There's no reason to mark error for
+ * ASYNC buffers.
+ */
+ xfs_buf_ioerror(bp, EIO);
+ complete(&bp->b_iowait);
+ } else {
+ xfs_buf_relse(bp);
+ }
+
+ return EIO;
+}
+
+STATIC int
+xfs_bdstrat_cb(
+ struct xfs_buf *bp)
+{
+ if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+ trace_xfs_bdstrat_shut(bp, _RET_IP_);
+ /*
+ * Metadata write that didn't get logged but
+ * written delayed anyway. These aren't associated
+ * with a transaction, and can be ignored.
+ */
+ if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
+ return xfs_bioerror_relse(bp);
+ else
+ return xfs_bioerror(bp);
+ }
+
+ xfs_buf_iorequest(bp);
+ return 0;
+}
+
+int
+xfs_bwrite(
+ struct xfs_buf *bp)
+{
+ int error;
+
+ ASSERT(xfs_buf_islocked(bp));
+
+ bp->b_flags |= XBF_WRITE;
+ bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL);
+
+ xfs_bdstrat_cb(bp);
+
+ error = xfs_buf_iowait(bp);
+ if (error) {
+ xfs_force_shutdown(bp->b_target->bt_mount,
+ SHUTDOWN_META_IO_ERROR);
+ }
+ return error;
+}
+
+STATIC void
+_xfs_buf_ioend(
+ xfs_buf_t *bp,
+ int schedule)
+{
+ if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+ xfs_buf_ioend(bp, schedule);
+}
+
+STATIC void
+xfs_buf_bio_end_io(
+ struct bio *bio,
+ int error)
+{
+ xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
+
+ /*
+ * don't overwrite existing errors - otherwise we can lose errors on
+ * buffers that require multiple bios to complete.
+ */
+ if (!bp->b_error)
+ xfs_buf_ioerror(bp, -error);
+
+ if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
+ invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
+
+ _xfs_buf_ioend(bp, 1);
+ bio_put(bio);
+}
+
+static void
+xfs_buf_ioapply_map(
+ struct xfs_buf *bp,
+ int map,
+ int *buf_offset,
+ int *count,
+ int rw)
+{
+ int page_index;
+ int total_nr_pages = bp->b_page_count;
+ int nr_pages;
+ struct bio *bio;
+ sector_t sector = bp->b_maps[map].bm_bn;
+ int size;
+ int offset;
+
+ total_nr_pages = bp->b_page_count;
+
+ /* skip the pages in the buffer before the start offset */
+ page_index = 0;
+ offset = *buf_offset;
+ while (offset >= PAGE_SIZE) {
+ page_index++;
+ offset -= PAGE_SIZE;
+ }
+
+ /*
+ * Limit the IO size to the length of the current vector, and update the
+ * remaining IO count for the next time around.
+ */
+ size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
+ *count -= size;
+ *buf_offset += size;
+
+next_chunk:
+ atomic_inc(&bp->b_io_remaining);
+ nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
+ if (nr_pages > total_nr_pages)
+ nr_pages = total_nr_pages;
+
+ bio = bio_alloc(GFP_NOIO, nr_pages);
+ bio->bi_bdev = bp->b_target->bt_bdev;
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_end_io = xfs_buf_bio_end_io;
+ bio->bi_private = bp;
+
+
+ for (; size && nr_pages; nr_pages--, page_index++) {
+ int rbytes, nbytes = PAGE_SIZE - offset;
+
+ if (nbytes > size)
+ nbytes = size;
+
+ rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
+ offset);
+ if (rbytes < nbytes)
+ break;
+
+ offset = 0;
+ sector += BTOBB(nbytes);
+ size -= nbytes;
+ total_nr_pages--;
+ }
+
+ if (likely(bio->bi_iter.bi_size)) {
+ if (xfs_buf_is_vmapped(bp)) {
+ flush_kernel_vmap_range(bp->b_addr,
+ xfs_buf_vmap_len(bp));
+ }
+ submit_bio(rw, bio);
+ if (size)
+ goto next_chunk;
+ } else {
+ /*
+ * This is guaranteed not to be the last io reference count
+ * because the caller (xfs_buf_iorequest) holds a count itself.
+ */
+ atomic_dec(&bp->b_io_remaining);
+ xfs_buf_ioerror(bp, EIO);
+ bio_put(bio);
+ }
+
+}
+
+STATIC void
+_xfs_buf_ioapply(
+ struct xfs_buf *bp)
+{
+ struct blk_plug plug;
+ int rw;
+ int offset;
+ int size;
+ int i;
+
+ /*
+ * Make sure we capture only current IO errors rather than stale errors
+ * left over from previous use of the buffer (e.g. failed readahead).
+ */
+ bp->b_error = 0;
+
+ if (bp->b_flags & XBF_WRITE) {
+ if (bp->b_flags & XBF_SYNCIO)
+ rw = WRITE_SYNC;
+ else
+ rw = WRITE;
+ if (bp->b_flags & XBF_FUA)
+ rw |= REQ_FUA;
+ if (bp->b_flags & XBF_FLUSH)
+ rw |= REQ_FLUSH;
+
+ /*
+ * Run the write verifier callback function if it exists. If
+ * this function fails it will mark the buffer with an error and
+ * the IO should not be dispatched.
+ */
+ if (bp->b_ops) {
+ bp->b_ops->verify_write(bp);
+ if (bp->b_error) {
+ xfs_force_shutdown(bp->b_target->bt_mount,
+ SHUTDOWN_CORRUPT_INCORE);
+ return;
+ }
+ }
+ } else if (bp->b_flags & XBF_READ_AHEAD) {
+ rw = READA;
+ } else {
+ rw = READ;
+ }
+
+ /* we only use the buffer cache for meta-data */
+ rw |= REQ_META;
+
+ /*
+ * Walk all the vectors issuing IO on them. Set up the initial offset
+ * into the buffer and the desired IO size before we start -
+ * _xfs_buf_ioapply_vec() will modify them appropriately for each
+ * subsequent call.
+ */
+ offset = bp->b_offset;
+ size = BBTOB(bp->b_io_length);
+ blk_start_plug(&plug);
+ for (i = 0; i < bp->b_map_count; i++) {
+ xfs_buf_ioapply_map(bp, i, &offset, &size, rw);
+ if (bp->b_error)
+ break;
+ if (size <= 0)
+ break; /* all done */
+ }
+ blk_finish_plug(&plug);
+}
+
+void
+xfs_buf_iorequest(
+ xfs_buf_t *bp)
+{
+ trace_xfs_buf_iorequest(bp, _RET_IP_);
+
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+
+ if (bp->b_flags & XBF_WRITE)
+ xfs_buf_wait_unpin(bp);
+ xfs_buf_hold(bp);
+
+ /*
+ * Set the count to 1 initially, this will stop an I/O
+ * completion callout which happens before we have started
+ * all the I/O from calling xfs_buf_ioend too early.
+ */
+ atomic_set(&bp->b_io_remaining, 1);
+ _xfs_buf_ioapply(bp);
+ /*
+ * If _xfs_buf_ioapply failed, we'll get back here with
+ * only the reference we took above. _xfs_buf_ioend will
+ * drop it to zero, so we'd better not queue it for later,
+ * or we'll free it before it's done.
+ */
+ _xfs_buf_ioend(bp, bp->b_error ? 0 : 1);
+
+ xfs_buf_rele(bp);
+}
+
+/*
+ * Waits for I/O to complete on the buffer supplied. It returns immediately if
+ * no I/O is pending or there is already a pending error on the buffer, in which
+ * case nothing will ever complete. It returns the I/O error code, if any, or
+ * 0 if there was no error.
+ */
+int
+xfs_buf_iowait(
+ xfs_buf_t *bp)
+{
+ trace_xfs_buf_iowait(bp, _RET_IP_);
+
+ if (!bp->b_error)
+ wait_for_completion(&bp->b_iowait);
+
+ trace_xfs_buf_iowait_done(bp, _RET_IP_);
+ return bp->b_error;
+}
+
+xfs_caddr_t
+xfs_buf_offset(
+ xfs_buf_t *bp,
+ size_t offset)
+{
+ struct page *page;
+
+ if (bp->b_addr)
+ return bp->b_addr + offset;
+
+ offset += bp->b_offset;
+ page = bp->b_pages[offset >> PAGE_SHIFT];
+ return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
+}
+
+/*
+ * Move data into or out of a buffer.
+ */
+void
+xfs_buf_iomove(
+ xfs_buf_t *bp, /* buffer to process */
+ size_t boff, /* starting buffer offset */
+ size_t bsize, /* length to copy */
+ void *data, /* data address */
+ xfs_buf_rw_t mode) /* read/write/zero flag */
+{
+ size_t bend;
+
+ bend = boff + bsize;
+ while (boff < bend) {
+ struct page *page;
+ int page_index, page_offset, csize;
+
+ page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
+ page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
+ page = bp->b_pages[page_index];
+ csize = min_t(size_t, PAGE_SIZE - page_offset,
+ BBTOB(bp->b_io_length) - boff);
+
+ ASSERT((csize + page_offset) <= PAGE_SIZE);
+
+ switch (mode) {
+ case XBRW_ZERO:
+ memset(page_address(page) + page_offset, 0, csize);
+ break;
+ case XBRW_READ:
+ memcpy(data, page_address(page) + page_offset, csize);
+ break;
+ case XBRW_WRITE:
+ memcpy(page_address(page) + page_offset, data, csize);
+ }
+
+ boff += csize;
+ data += csize;
+ }
+}
+
+/*
+ * Handling of buffer targets (buftargs).
+ */
+
+/*
+ * Wait for any bufs with callbacks that have been submitted but have not yet
+ * returned. These buffers will have an elevated hold count, so wait on those
+ * while freeing all the buffers only held by the LRU.
+ */
+static enum lru_status
+xfs_buftarg_wait_rele(
+ struct list_head *item,
+ spinlock_t *lru_lock,
+ void *arg)
+
+{
+ struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
+ struct list_head *dispose = arg;
+
+ if (atomic_read(&bp->b_hold) > 1) {
+ /* need to wait, so skip it this pass */
+ trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
+ return LRU_SKIP;
+ }
+ if (!spin_trylock(&bp->b_lock))
+ return LRU_SKIP;
+
+ /*
+ * clear the LRU reference count so the buffer doesn't get
+ * ignored in xfs_buf_rele().
+ */
+ atomic_set(&bp->b_lru_ref, 0);
+ bp->b_state |= XFS_BSTATE_DISPOSE;
+ list_move(item, dispose);
+ spin_unlock(&bp->b_lock);
+ return LRU_REMOVED;
+}
+
+void
+xfs_wait_buftarg(
+ struct xfs_buftarg *btp)
+{
+ LIST_HEAD(dispose);
+ int loop = 0;
+
+ /* loop until there is nothing left on the lru list. */
+ while (list_lru_count(&btp->bt_lru)) {
+ list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
+ &dispose, LONG_MAX);
+
+ while (!list_empty(&dispose)) {
+ struct xfs_buf *bp;
+ bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+ list_del_init(&bp->b_lru);
+ if (bp->b_flags & XBF_WRITE_FAIL) {
+ xfs_alert(btp->bt_mount,
+"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n"
+"Please run xfs_repair to determine the extent of the problem.",
+ (long long)bp->b_bn);
+ }
+ xfs_buf_rele(bp);
+ }
+ if (loop++ != 0)
+ delay(100);
+ }
+}
+
+static enum lru_status
+xfs_buftarg_isolate(
+ struct list_head *item,
+ spinlock_t *lru_lock,
+ void *arg)
+{
+ struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
+ struct list_head *dispose = arg;
+
+ /*
+ * we are inverting the lru lock/bp->b_lock here, so use a trylock.
+ * If we fail to get the lock, just skip it.
+ */
+ if (!spin_trylock(&bp->b_lock))
+ return LRU_SKIP;
+ /*
+ * Decrement the b_lru_ref count unless the value is already
+ * zero. If the value is already zero, we need to reclaim the
+ * buffer, otherwise it gets another trip through the LRU.
+ */
+ if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+ spin_unlock(&bp->b_lock);
+ return LRU_ROTATE;
+ }
+
+ bp->b_state |= XFS_BSTATE_DISPOSE;
+ list_move(item, dispose);
+ spin_unlock(&bp->b_lock);
+ return LRU_REMOVED;
+}
+
+static unsigned long
+xfs_buftarg_shrink_scan(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_buftarg *btp = container_of(shrink,
+ struct xfs_buftarg, bt_shrinker);
+ LIST_HEAD(dispose);
+ unsigned long freed;
+ unsigned long nr_to_scan = sc->nr_to_scan;
+
+ freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate,
+ &dispose, &nr_to_scan);
+
+ while (!list_empty(&dispose)) {
+ struct xfs_buf *bp;
+ bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+ list_del_init(&bp->b_lru);
+ xfs_buf_rele(bp);
+ }
+
+ return freed;
+}
+
+static unsigned long
+xfs_buftarg_shrink_count(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_buftarg *btp = container_of(shrink,
+ struct xfs_buftarg, bt_shrinker);
+ return list_lru_count_node(&btp->bt_lru, sc->nid);
+}
+
+void
+xfs_free_buftarg(
+ struct xfs_mount *mp,
+ struct xfs_buftarg *btp)
+{
+ unregister_shrinker(&btp->bt_shrinker);
+ list_lru_destroy(&btp->bt_lru);
+
+ if (mp->m_flags & XFS_MOUNT_BARRIER)
+ xfs_blkdev_issue_flush(btp);
+
+ kmem_free(btp);
+}
+
+int
+xfs_setsize_buftarg(
+ xfs_buftarg_t *btp,
+ unsigned int sectorsize)
+{
+ /* Set up metadata sector size info */
+ btp->bt_meta_sectorsize = sectorsize;
+ btp->bt_meta_sectormask = sectorsize - 1;
+
+ if (set_blocksize(btp->bt_bdev, sectorsize)) {
+ char name[BDEVNAME_SIZE];
+
+ bdevname(btp->bt_bdev, name);
+
+ xfs_warn(btp->bt_mount,
+ "Cannot set_blocksize to %u on device %s",
+ sectorsize, name);
+ return EINVAL;
+ }
+
+ /* Set up device logical sector size mask */
+ btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
+ btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
+
+ return 0;
+}
+
+/*
+ * When allocating the initial buffer target we have not yet
+ * read in the superblock, so don't know what sized sectors
+ * are being used at this early stage. Play safe.
+ */
+STATIC int
+xfs_setsize_buftarg_early(
+ xfs_buftarg_t *btp,
+ struct block_device *bdev)
+{
+ return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
+}
+
+xfs_buftarg_t *
+xfs_alloc_buftarg(
+ struct xfs_mount *mp,
+ struct block_device *bdev)
+{
+ xfs_buftarg_t *btp;
+
+ btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);
+
+ btp->bt_mount = mp;
+ btp->bt_dev = bdev->bd_dev;
+ btp->bt_bdev = bdev;
+ btp->bt_bdi = blk_get_backing_dev_info(bdev);
+ if (!btp->bt_bdi)
+ goto error;
+
+ if (xfs_setsize_buftarg_early(btp, bdev))
+ goto error;
+
+ if (list_lru_init(&btp->bt_lru))
+ goto error;
+
+ btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
+ btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
+ btp->bt_shrinker.seeks = DEFAULT_SEEKS;
+ btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
+ register_shrinker(&btp->bt_shrinker);
+ return btp;
+
+error:
+ kmem_free(btp);
+ return NULL;
+}
+
+/*
+ * Add a buffer to the delayed write list.
+ *
+ * This queues a buffer for writeout if it hasn't already been. Note that
+ * neither this routine nor the buffer list submission functions perform
+ * any internal synchronization. It is expected that the lists are thread-local
+ * to the callers.
+ *
+ * Returns true if we queued up the buffer, or false if it already had
+ * been on the buffer list.
+ */
+bool
+xfs_buf_delwri_queue(
+ struct xfs_buf *bp,
+ struct list_head *list)
+{
+ ASSERT(xfs_buf_islocked(bp));
+ ASSERT(!(bp->b_flags & XBF_READ));
+
+ /*
+ * If the buffer is already marked delwri it already is queued up
+ * by someone else for imediate writeout. Just ignore it in that
+ * case.
+ */
+ if (bp->b_flags & _XBF_DELWRI_Q) {
+ trace_xfs_buf_delwri_queued(bp, _RET_IP_);
+ return false;
+ }
+
+ trace_xfs_buf_delwri_queue(bp, _RET_IP_);
+
+ /*
+ * If a buffer gets written out synchronously or marked stale while it
+ * is on a delwri list we lazily remove it. To do this, the other party
+ * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
+ * It remains referenced and on the list. In a rare corner case it
+ * might get readded to a delwri list after the synchronous writeout, in
+ * which case we need just need to re-add the flag here.
+ */
+ bp->b_flags |= _XBF_DELWRI_Q;
+ if (list_empty(&bp->b_list)) {
+ atomic_inc(&bp->b_hold);
+ list_add_tail(&bp->b_list, list);
+ }
+
+ return true;
+}
+
+/*
+ * Compare function is more complex than it needs to be because
+ * the return value is only 32 bits and we are doing comparisons
+ * on 64 bit values
+ */
+static int
+xfs_buf_cmp(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
+ struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
+ xfs_daddr_t diff;
+
+ diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
+ if (diff < 0)
+ return -1;
+ if (diff > 0)
+ return 1;
+ return 0;
+}
+
+static int
+__xfs_buf_delwri_submit(
+ struct list_head *buffer_list,
+ struct list_head *io_list,
+ bool wait)
+{
+ struct blk_plug plug;
+ struct xfs_buf *bp, *n;
+ int pinned = 0;
+
+ list_for_each_entry_safe(bp, n, buffer_list, b_list) {
+ if (!wait) {
+ if (xfs_buf_ispinned(bp)) {
+ pinned++;
+ continue;
+ }
+ if (!xfs_buf_trylock(bp))
+ continue;
+ } else {
+ xfs_buf_lock(bp);
+ }
+
+ /*
+ * Someone else might have written the buffer synchronously or
+ * marked it stale in the meantime. In that case only the
+ * _XBF_DELWRI_Q flag got cleared, and we have to drop the
+ * reference and remove it from the list here.
+ */
+ if (!(bp->b_flags & _XBF_DELWRI_Q)) {
+ list_del_init(&bp->b_list);
+ xfs_buf_relse(bp);
+ continue;
+ }
+
+ list_move_tail(&bp->b_list, io_list);
+ trace_xfs_buf_delwri_split(bp, _RET_IP_);
+ }
+
+ list_sort(NULL, io_list, xfs_buf_cmp);
+
+ blk_start_plug(&plug);
+ list_for_each_entry_safe(bp, n, io_list, b_list) {
+ bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
+ bp->b_flags |= XBF_WRITE;
+
+ if (!wait) {
+ bp->b_flags |= XBF_ASYNC;
+ list_del_init(&bp->b_list);
+ }
+ xfs_bdstrat_cb(bp);
+ }
+ blk_finish_plug(&plug);
+
+ return pinned;
+}
+
+/*
+ * Write out a buffer list asynchronously.
+ *
+ * This will take the @buffer_list, write all non-locked and non-pinned buffers
+ * out and not wait for I/O completion on any of the buffers. This interface
+ * is only safely useable for callers that can track I/O completion by higher
+ * level means, e.g. AIL pushing as the @buffer_list is consumed in this
+ * function.
+ */
+int
+xfs_buf_delwri_submit_nowait(
+ struct list_head *buffer_list)
+{
+ LIST_HEAD (io_list);
+ return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
+}
+
+/*
+ * Write out a buffer list synchronously.
+ *
+ * This will take the @buffer_list, write all buffers out and wait for I/O
+ * completion on all of the buffers. @buffer_list is consumed by the function,
+ * so callers must have some other way of tracking buffers if they require such
+ * functionality.
+ */
+int
+xfs_buf_delwri_submit(
+ struct list_head *buffer_list)
+{
+ LIST_HEAD (io_list);
+ int error = 0, error2;
+ struct xfs_buf *bp;
+
+ __xfs_buf_delwri_submit(buffer_list, &io_list, true);
+
+ /* Wait for IO to complete. */
+ while (!list_empty(&io_list)) {
+ bp = list_first_entry(&io_list, struct xfs_buf, b_list);
+
+ list_del_init(&bp->b_list);
+ error2 = xfs_buf_iowait(bp);
+ xfs_buf_relse(bp);
+ if (!error)
+ error = error2;
+ }
+
+ return error;
+}
+
+int __init
+xfs_buf_init(void)
+{
+ xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
+ KM_ZONE_HWALIGN, NULL);
+ if (!xfs_buf_zone)
+ goto out;
+
+ xfslogd_workqueue = alloc_workqueue("xfslogd",
+ WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
+ if (!xfslogd_workqueue)
+ goto out_free_buf_zone;
+
+ return 0;
+
+ out_free_buf_zone:
+ kmem_zone_destroy(xfs_buf_zone);
+ out:
+ return -ENOMEM;
+}
+
+void
+xfs_buf_terminate(void)
+{
+ destroy_workqueue(xfslogd_workqueue);
+ kmem_zone_destroy(xfs_buf_zone);
+}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
new file mode 100644
index 00000000000..3a7a5523d3d
--- /dev/null
+++ b/fs/xfs/xfs_buf.h
@@ -0,0 +1,393 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_BUF_H__
+#define __XFS_BUF_H__
+
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/uio.h>
+#include <linux/list_lru.h>
+
+/*
+ * Base types
+ */
+
+#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
+
+typedef enum {
+ XBRW_READ = 1, /* transfer into target memory */
+ XBRW_WRITE = 2, /* transfer from target memory */
+ XBRW_ZERO = 3, /* Zero target memory */
+} xfs_buf_rw_t;
+
+#define XBF_READ (1 << 0) /* buffer intended for reading from device */
+#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
+#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
+#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
+#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
+#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
+#define XBF_WRITE_FAIL (1 << 24)/* async writes have failed on this buffer */
+
+/* I/O hints for the BIO layer */
+#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */
+#define XBF_FUA (1 << 11)/* force cache write through mode */
+#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */
+
+/* flags used only as arguments to access routines */
+#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */
+#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */
+
+/* flags used only internally */
+#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
+#define _XBF_KMEM (1 << 21)/* backed by heap memory */
+#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
+#define _XBF_COMPOUND (1 << 23)/* compound buffer */
+
+typedef unsigned int xfs_buf_flags_t;
+
+#define XFS_BUF_FLAGS \
+ { XBF_READ, "READ" }, \
+ { XBF_WRITE, "WRITE" }, \
+ { XBF_READ_AHEAD, "READ_AHEAD" }, \
+ { XBF_ASYNC, "ASYNC" }, \
+ { XBF_DONE, "DONE" }, \
+ { XBF_STALE, "STALE" }, \
+ { XBF_WRITE_FAIL, "WRITE_FAIL" }, \
+ { XBF_SYNCIO, "SYNCIO" }, \
+ { XBF_FUA, "FUA" }, \
+ { XBF_FLUSH, "FLUSH" }, \
+ { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\
+ { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\
+ { _XBF_PAGES, "PAGES" }, \
+ { _XBF_KMEM, "KMEM" }, \
+ { _XBF_DELWRI_Q, "DELWRI_Q" }, \
+ { _XBF_COMPOUND, "COMPOUND" }
+
+
+/*
+ * Internal state flags.
+ */
+#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
+
+/*
+ * The xfs_buftarg contains 2 notions of "sector size" -
+ *
+ * 1) The metadata sector size, which is the minimum unit and
+ * alignment of IO which will be performed by metadata operations.
+ * 2) The device logical sector size
+ *
+ * The first is specified at mkfs time, and is stored on-disk in the
+ * superblock's sb_sectsize.
+ *
+ * The latter is derived from the underlying device, and controls direct IO
+ * alignment constraints.
+ */
+typedef struct xfs_buftarg {
+ dev_t bt_dev;
+ struct block_device *bt_bdev;
+ struct backing_dev_info *bt_bdi;
+ struct xfs_mount *bt_mount;
+ unsigned int bt_meta_sectorsize;
+ size_t bt_meta_sectormask;
+ size_t bt_logical_sectorsize;
+ size_t bt_logical_sectormask;
+
+ /* LRU control structures */
+ struct shrinker bt_shrinker;
+ struct list_lru bt_lru;
+} xfs_buftarg_t;
+
+struct xfs_buf;
+typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
+
+
+#define XB_PAGES 2
+
+struct xfs_buf_map {
+ xfs_daddr_t bm_bn; /* block number for I/O */
+ int bm_len; /* size of I/O */
+};
+
+#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \
+ struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
+
+struct xfs_buf_ops {
+ void (*verify_read)(struct xfs_buf *);
+ void (*verify_write)(struct xfs_buf *);
+};
+
+typedef struct xfs_buf {
+ /*
+ * first cacheline holds all the fields needed for an uncontended cache
+ * hit to be fully processed. The semaphore straddles the cacheline
+ * boundary, but the counter and lock sits on the first cacheline,
+ * which is the only bit that is touched if we hit the semaphore
+ * fast-path on locking.
+ */
+ struct rb_node b_rbnode; /* rbtree node */
+ xfs_daddr_t b_bn; /* block number of buffer */
+ int b_length; /* size of buffer in BBs */
+ atomic_t b_hold; /* reference count */
+ atomic_t b_lru_ref; /* lru reclaim ref count */
+ xfs_buf_flags_t b_flags; /* status flags */
+ struct semaphore b_sema; /* semaphore for lockables */
+
+ /*
+ * concurrent access to b_lru and b_lru_flags are protected by
+ * bt_lru_lock and not by b_sema
+ */
+ struct list_head b_lru; /* lru list */
+ spinlock_t b_lock; /* internal state lock */
+ unsigned int b_state; /* internal state flags */
+ wait_queue_head_t b_waiters; /* unpin waiters */
+ struct list_head b_list;
+ struct xfs_perag *b_pag; /* contains rbtree root */
+ xfs_buftarg_t *b_target; /* buffer target (device) */
+ void *b_addr; /* virtual address of buffer */
+ struct work_struct b_iodone_work;
+ xfs_buf_iodone_t b_iodone; /* I/O completion function */
+ struct completion b_iowait; /* queue for I/O waiters */
+ void *b_fspriv;
+ struct xfs_trans *b_transp;
+ struct page **b_pages; /* array of page pointers */
+ struct page *b_page_array[XB_PAGES]; /* inline pages */
+ struct xfs_buf_map *b_maps; /* compound buffer map */
+ struct xfs_buf_map __b_map; /* inline compound buffer map */
+ int b_map_count;
+ int b_io_length; /* IO size in BBs */
+ atomic_t b_pin_count; /* pin count */
+ atomic_t b_io_remaining; /* #outstanding I/O requests */
+ unsigned int b_page_count; /* size of page array */
+ unsigned int b_offset; /* page offset in first page */
+ unsigned short b_error; /* error code on I/O */
+ const struct xfs_buf_ops *b_ops;
+
+#ifdef XFS_BUF_LOCK_TRACKING
+ int b_last_holder;
+#endif
+} xfs_buf_t;
+
+/* Finding and Reading Buffers */
+struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target,
+ struct xfs_buf_map *map, int nmaps,
+ xfs_buf_flags_t flags, struct xfs_buf *new_bp);
+
+static inline struct xfs_buf *
+xfs_incore(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ xfs_buf_flags_t flags)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+ return _xfs_buf_find(target, &map, 1, flags, NULL);
+}
+
+struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target,
+ struct xfs_buf_map *map, int nmaps,
+ xfs_buf_flags_t flags);
+
+static inline struct xfs_buf *
+xfs_buf_alloc(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ xfs_buf_flags_t flags)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+ return _xfs_buf_alloc(target, &map, 1, flags);
+}
+
+struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target,
+ struct xfs_buf_map *map, int nmaps,
+ xfs_buf_flags_t flags);
+struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target,
+ struct xfs_buf_map *map, int nmaps,
+ xfs_buf_flags_t flags,
+ const struct xfs_buf_ops *ops);
+void xfs_buf_readahead_map(struct xfs_buftarg *target,
+ struct xfs_buf_map *map, int nmaps,
+ const struct xfs_buf_ops *ops);
+
+static inline struct xfs_buf *
+xfs_buf_get(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ xfs_buf_flags_t flags)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+ return xfs_buf_get_map(target, &map, 1, flags);
+}
+
+static inline struct xfs_buf *
+xfs_buf_read(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ xfs_buf_flags_t flags,
+ const struct xfs_buf_ops *ops)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+ return xfs_buf_read_map(target, &map, 1, flags, ops);
+}
+
+static inline void
+xfs_buf_readahead(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ const struct xfs_buf_ops *ops)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+ return xfs_buf_readahead_map(target, &map, 1, ops);
+}
+
+struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
+void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks);
+int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
+
+struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
+ int flags);
+struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target,
+ xfs_daddr_t daddr, size_t numblks, int flags,
+ const struct xfs_buf_ops *ops);
+void xfs_buf_hold(struct xfs_buf *bp);
+
+/* Releasing Buffers */
+extern void xfs_buf_free(xfs_buf_t *);
+extern void xfs_buf_rele(xfs_buf_t *);
+
+/* Locking and Unlocking Buffers */
+extern int xfs_buf_trylock(xfs_buf_t *);
+extern void xfs_buf_lock(xfs_buf_t *);
+extern void xfs_buf_unlock(xfs_buf_t *);
+#define xfs_buf_islocked(bp) \
+ ((bp)->b_sema.count <= 0)
+
+/* Buffer Read and Write Routines */
+extern int xfs_bwrite(struct xfs_buf *bp);
+extern void xfs_buf_ioend(xfs_buf_t *, int);
+extern void xfs_buf_ioerror(xfs_buf_t *, int);
+extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
+extern void xfs_buf_iorequest(xfs_buf_t *);
+extern int xfs_buf_iowait(xfs_buf_t *);
+extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
+ xfs_buf_rw_t);
+#define xfs_buf_zero(bp, off, len) \
+ xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
+
+extern int xfs_bioerror_relse(struct xfs_buf *);
+
+/* Buffer Utility Routines */
+extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
+
+/* Delayed Write Buffer Routines */
+extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
+extern int xfs_buf_delwri_submit(struct list_head *);
+extern int xfs_buf_delwri_submit_nowait(struct list_head *);
+
+/* Buffer Daemon Setup Routines */
+extern int xfs_buf_init(void);
+extern void xfs_buf_terminate(void);
+
+#define XFS_BUF_ZEROFLAGS(bp) \
+ ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \
+ XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \
+ XBF_WRITE_FAIL))
+
+void xfs_buf_stale(struct xfs_buf *bp);
+#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
+#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
+
+#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
+#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE)
+#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE)
+
+#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC)
+#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC)
+#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC)
+
+#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ)
+#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ)
+#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ)
+
+#define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE)
+#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE)
+#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE)
+
+/*
+ * These macros use the IO block map rather than b_bn. b_bn is now really
+ * just for the buffer cache index for cached buffers. As IO does not use b_bn
+ * anymore, uncached buffers do not use b_bn at all and hence must modify the IO
+ * map directly. Uncached buffers are not allowed to be discontiguous, so this
+ * is safe to do.
+ *
+ * In future, uncached buffers will pass the block number directly to the io
+ * request function and hence these macros will go away at that point.
+ */
+#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn)
+#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno))
+
+static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
+{
+ atomic_set(&bp->b_lru_ref, lru_ref);
+}
+
+static inline int xfs_buf_ispinned(struct xfs_buf *bp)
+{
+ return atomic_read(&bp->b_pin_count);
+}
+
+static inline void xfs_buf_relse(xfs_buf_t *bp)
+{
+ xfs_buf_unlock(bp);
+ xfs_buf_rele(bp);
+}
+
+static inline int
+xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
+{
+ return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+ cksum_offset);
+}
+
+static inline void
+xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
+{
+ xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+ cksum_offset);
+}
+
+/*
+ * Handling of buftargs.
+ */
+extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
+ struct block_device *);
+extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
+extern void xfs_wait_buftarg(xfs_buftarg_t *);
+extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
+
+#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
+#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
+
+#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 07e2324152b..4654338b03f 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -17,126 +17,36 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
-#include "xfs_dir.h"
-#include "xfs_dmapi.h"
+#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
kmem_zone_t *xfs_buf_item_zone;
-#ifdef XFS_TRANS_DEBUG
-/*
- * This function uses an alternate strategy for tracking the bytes
- * that the user requests to be logged. This can then be used
- * in conjunction with the bli_orig array in the buf log item to
- * catch bugs in our callers' code.
- *
- * We also double check the bits set in xfs_buf_item_log using a
- * simple algorithm to check that every byte is accounted for.
- */
-STATIC void
-xfs_buf_item_log_debug(
- xfs_buf_log_item_t *bip,
- uint first,
- uint last)
+static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
{
- uint x;
- uint byte;
- uint nbytes;
- uint chunk_num;
- uint word_num;
- uint bit_num;
- uint bit_set;
- uint *wordp;
-
- ASSERT(bip->bli_logged != NULL);
- byte = first;
- nbytes = last - first + 1;
- bfset(bip->bli_logged, first, nbytes);
- for (x = 0; x < nbytes; x++) {
- chunk_num = byte >> XFS_BLI_SHIFT;
- word_num = chunk_num >> BIT_TO_WORD_SHIFT;
- bit_num = chunk_num & (NBWORD - 1);
- wordp = &(bip->bli_format.blf_data_map[word_num]);
- bit_set = *wordp & (1 << bit_num);
- ASSERT(bit_set);
- byte++;
- }
+ return container_of(lip, struct xfs_buf_log_item, bli_item);
}
-/*
- * This function is called when we flush something into a buffer without
- * logging it. This happens for things like inodes which are logged
- * separately from the buffer.
- */
-void
-xfs_buf_item_flush_log_debug(
- xfs_buf_t *bp,
- uint first,
- uint last)
-{
- xfs_buf_log_item_t *bip;
- uint nbytes;
-
- bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
- if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) {
- return;
- }
-
- ASSERT(bip->bli_logged != NULL);
- nbytes = last - first + 1;
- bfset(bip->bli_logged, first, nbytes);
-}
+STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
-/*
- * This function is called to verify that our caller's have logged
- * all the bytes that they changed.
- *
- * It does this by comparing the original copy of the buffer stored in
- * the buf log item's bli_orig array to the current copy of the buffer
- * and ensuring that all bytes which miscompare are set in the bli_logged
- * array of the buf log item.
- */
-STATIC void
-xfs_buf_item_log_check(
- xfs_buf_log_item_t *bip)
+static inline int
+xfs_buf_log_format_size(
+ struct xfs_buf_log_format *blfp)
{
- char *orig;
- char *buffer;
- int x;
- xfs_buf_t *bp;
-
- ASSERT(bip->bli_orig != NULL);
- ASSERT(bip->bli_logged != NULL);
-
- bp = bip->bli_buf;
- ASSERT(XFS_BUF_COUNT(bp) > 0);
- ASSERT(XFS_BUF_PTR(bp) != NULL);
- orig = bip->bli_orig;
- buffer = XFS_BUF_PTR(bp);
- for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
- if (orig[x] != buffer[x] && !btst(bip->bli_logged, x))
- cmn_err(CE_PANIC,
- "xfs_buf_item_log_check bip %x buffer %x orig %x index %d",
- bip, bp, orig, x);
- }
+ return offsetof(struct xfs_buf_log_format, blf_data_map) +
+ (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
}
-#else
-#define xfs_buf_item_log_debug(x,y,z)
-#define xfs_buf_item_log_check(x)
-#endif
-
-STATIC void xfs_buf_error_relse(xfs_buf_t *bp);
-STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
/*
* This returns the number of log iovecs needed to log the
@@ -148,34 +58,28 @@ STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
*
* If the XFS_BLI_STALE flag has been set, then log nothing.
*/
-STATIC uint
-xfs_buf_item_size(
- xfs_buf_log_item_t *bip)
+STATIC void
+xfs_buf_item_size_segment(
+ struct xfs_buf_log_item *bip,
+ struct xfs_buf_log_format *blfp,
+ int *nvecs,
+ int *nbytes)
{
- uint nvecs;
- int next_bit;
- int last_bit;
- xfs_buf_t *bp;
+ struct xfs_buf *bp = bip->bli_buf;
+ int next_bit;
+ int last_bit;
- ASSERT(atomic_read(&bip->bli_refcount) > 0);
- if (bip->bli_flags & XFS_BLI_STALE) {
- /*
- * The buffer is stale, so all we need to log
- * is the buf log format structure with the
- * cancel flag in it.
- */
- xfs_buf_item_trace("SIZE STALE", bip);
- ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
- return 1;
- }
+ last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
+ if (last_bit == -1)
+ return;
+
+ /*
+ * initial count for a dirty buffer is 2 vectors - the format structure
+ * and the first dirty region.
+ */
+ *nvecs += 2;
+ *nbytes += xfs_buf_log_format_size(blfp) + XFS_BLF_CHUNK;
- bp = bip->bli_buf;
- ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
- nvecs = 1;
- last_bit = xfs_next_bit(bip->bli_format.blf_data_map,
- bip->bli_format.blf_map_size, 0);
- ASSERT(last_bit != -1);
- nvecs++;
while (last_bit != -1) {
/*
* This takes the bit number to start looking from and
@@ -183,76 +87,164 @@ xfs_buf_item_size(
* if there are no more bits set or the start bit is
* beyond the end of the bitmap.
*/
- next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
- bip->bli_format.blf_map_size,
- last_bit + 1);
+ next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
+ last_bit + 1);
/*
* If we run out of bits, leave the loop,
* else if we find a new set of bits bump the number of vecs,
* else keep scanning the current set of bits.
*/
if (next_bit == -1) {
- last_bit = -1;
+ break;
} else if (next_bit != last_bit + 1) {
last_bit = next_bit;
- nvecs++;
- } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) !=
- (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) +
- XFS_BLI_CHUNK)) {
+ (*nvecs)++;
+ } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
+ (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
+ XFS_BLF_CHUNK)) {
last_bit = next_bit;
- nvecs++;
+ (*nvecs)++;
} else {
last_bit++;
}
+ *nbytes += XFS_BLF_CHUNK;
}
-
- xfs_buf_item_trace("SIZE NORM", bip);
- return nvecs;
}
/*
- * This is called to fill in the vector of log iovecs for the
- * given log buf item. It fills the first entry with a buf log
- * format structure, and the rest point to contiguous chunks
- * within the buffer.
+ * This returns the number of log iovecs needed to log the given buf log item.
+ *
+ * It calculates this as 1 iovec for the buf log format structure and 1 for each
+ * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
+ * in a single iovec.
+ *
+ * Discontiguous buffers need a format structure per region that that is being
+ * logged. This makes the changes in the buffer appear to log recovery as though
+ * they came from separate buffers, just like would occur if multiple buffers
+ * were used instead of a single discontiguous buffer. This enables
+ * discontiguous buffers to be in-memory constructs, completely transparent to
+ * what ends up on disk.
+ *
+ * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
+ * format structures.
*/
STATIC void
-xfs_buf_item_format(
- xfs_buf_log_item_t *bip,
- xfs_log_iovec_t *log_vector)
+xfs_buf_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ int i;
+
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ if (bip->bli_flags & XFS_BLI_STALE) {
+ /*
+ * The buffer is stale, so all we need to log
+ * is the buf log format structure with the
+ * cancel flag in it.
+ */
+ trace_xfs_buf_item_size_stale(bip);
+ ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+ *nvecs += bip->bli_format_count;
+ for (i = 0; i < bip->bli_format_count; i++) {
+ *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
+ }
+ return;
+ }
+
+ ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
+
+ if (bip->bli_flags & XFS_BLI_ORDERED) {
+ /*
+ * The buffer has been logged just to order it.
+ * It is not being included in the transaction
+ * commit, so no vectors are used at all.
+ */
+ trace_xfs_buf_item_size_ordered(bip);
+ *nvecs = XFS_LOG_VEC_ORDERED;
+ return;
+ }
+
+ /*
+ * the vector count is based on the number of buffer vectors we have
+ * dirty bits in. This will only be greater than one when we have a
+ * compound buffer with more than one segment dirty. Hence for compound
+ * buffers we need to track which segment the dirty bits correspond to,
+ * and when we move from one segment to the next increment the vector
+ * count for the extra buf log format structure that will need to be
+ * written.
+ */
+ for (i = 0; i < bip->bli_format_count; i++) {
+ xfs_buf_item_size_segment(bip, &bip->bli_formats[i],
+ nvecs, nbytes);
+ }
+ trace_xfs_buf_item_size(bip);
+}
+
+static inline void
+xfs_buf_item_copy_iovec(
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp,
+ struct xfs_buf *bp,
+ uint offset,
+ int first_bit,
+ uint nbits)
+{
+ offset += first_bit * XFS_BLF_CHUNK;
+ xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
+ xfs_buf_offset(bp, offset),
+ nbits * XFS_BLF_CHUNK);
+}
+
+static inline bool
+xfs_buf_item_straddle(
+ struct xfs_buf *bp,
+ uint offset,
+ int next_bit,
+ int last_bit)
{
+ return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) !=
+ (xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) +
+ XFS_BLF_CHUNK);
+}
+
+static void
+xfs_buf_item_format_segment(
+ struct xfs_buf_log_item *bip,
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp,
+ uint offset,
+ struct xfs_buf_log_format *blfp)
+{
+ struct xfs_buf *bp = bip->bli_buf;
uint base_size;
- uint nvecs;
- xfs_log_iovec_t *vecp;
- xfs_buf_t *bp;
int first_bit;
int last_bit;
int next_bit;
uint nbits;
- uint buffer_offset;
- ASSERT(atomic_read(&bip->bli_refcount) > 0);
- ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
- (bip->bli_flags & XFS_BLI_STALE));
- bp = bip->bli_buf;
- ASSERT(XFS_BUF_BP_ISMAPPED(bp));
- vecp = log_vector;
+ /* copy the flags across from the base format item */
+ blfp->blf_flags = bip->__bli_format.blf_flags;
/*
- * The size of the base structure is the size of the
- * declared structure plus the space for the extra words
- * of the bitmap. We subtract one from the map size, because
- * the first element of the bitmap is accounted for in the
- * size of the base structure.
+ * Base size is the actual size of the ondisk structure - it reflects
+ * the actual size of the dirty bitmap rather than the size of the in
+ * memory structure.
*/
- base_size =
- (uint)(sizeof(xfs_buf_log_format_t) +
- ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
- vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
- vecp->i_len = base_size;
- XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT);
- vecp++;
- nvecs = 1;
+ base_size = xfs_buf_log_format_size(blfp);
+
+ first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
+ if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
+ /*
+ * If the map is not be dirty in the transaction, mark
+ * the size as zero and do not advance the vector pointer.
+ */
+ return;
+ }
+
+ blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
+ blfp->blf_size = 1;
if (bip->bli_flags & XFS_BLI_STALE) {
/*
@@ -260,18 +252,15 @@ xfs_buf_item_format(
* is the buf log format structure with the
* cancel flag in it.
*/
- xfs_buf_item_trace("FORMAT STALE", bip);
- ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
- bip->bli_format.blf_size = nvecs;
+ trace_xfs_buf_item_format_stale(bip);
+ ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
return;
}
+
/*
* Fill in an iovec for each set of contiguous chunks.
*/
- first_bit = xfs_next_bit(bip->bli_format.blf_data_map,
- bip->bli_format.blf_map_size, 0);
- ASSERT(first_bit != -1);
last_bit = first_bit;
nbits = 1;
for (;;) {
@@ -281,48 +270,25 @@ xfs_buf_item_format(
* if there are no more bits set or the start bit is
* beyond the end of the bitmap.
*/
- next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
- bip->bli_format.blf_map_size,
- (uint)last_bit + 1);
+ next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
+ (uint)last_bit + 1);
/*
- * If we run out of bits fill in the last iovec and get
- * out of the loop.
- * Else if we start a new set of bits then fill in the
- * iovec for the series we were looking at and start
- * counting the bits in the new one.
- * Else we're still in the same set of bits so just
- * keep counting and scanning.
+ * If we run out of bits fill in the last iovec and get out of
+ * the loop. Else if we start a new set of bits then fill in
+ * the iovec for the series we were looking at and start
+ * counting the bits in the new one. Else we're still in the
+ * same set of bits so just keep counting and scanning.
*/
if (next_bit == -1) {
- buffer_offset = first_bit * XFS_BLI_CHUNK;
- vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
- vecp->i_len = nbits * XFS_BLI_CHUNK;
- XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
- nvecs++;
+ xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
+ first_bit, nbits);
+ blfp->blf_size++;
break;
- } else if (next_bit != last_bit + 1) {
- buffer_offset = first_bit * XFS_BLI_CHUNK;
- vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
- vecp->i_len = nbits * XFS_BLI_CHUNK;
- XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
- nvecs++;
- vecp++;
- first_bit = next_bit;
- last_bit = next_bit;
- nbits = 1;
- } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) !=
- (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) +
- XFS_BLI_CHUNK)) {
- buffer_offset = first_bit * XFS_BLI_CHUNK;
- vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
- vecp->i_len = nbits * XFS_BLI_CHUNK;
- XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
-/* You would think we need to bump the nvecs here too, but we do not
- * this number is used by recovery, and it gets confused by the boundary
- * split here
- * nvecs++;
- */
- vecp++;
+ } else if (next_bit != last_bit + 1 ||
+ xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) {
+ xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
+ first_bit, nbits);
+ blfp->blf_size++;
first_bit = next_bit;
last_bit = next_bit;
nbits = 1;
@@ -331,270 +297,363 @@ xfs_buf_item_format(
nbits++;
}
}
- bip->bli_format.blf_size = nvecs;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given log buf item. It fills the first entry with a buf log
+ * format structure, and the rest point to contiguous chunks
+ * within the buffer.
+ */
+STATIC void
+xfs_buf_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+ struct xfs_log_iovec *vecp = NULL;
+ uint offset = 0;
+ int i;
+
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+ (bip->bli_flags & XFS_BLI_STALE));
+
+ /*
+ * If it is an inode buffer, transfer the in-memory state to the
+ * format flags and clear the in-memory state.
+ *
+ * For buffer based inode allocation, we do not transfer
+ * this state if the inode buffer allocation has not yet been committed
+ * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
+ * correct replay of the inode allocation.
+ *
+ * For icreate item based inode allocation, the buffers aren't written
+ * to the journal during allocation, and hence we should always tag the
+ * buffer as an inode buffer so that the correct unlinked list replay
+ * occurs during recovery.
+ */
+ if (bip->bli_flags & XFS_BLI_INODE_BUF) {
+ if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) ||
+ !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
+ xfs_log_item_in_current_chkpt(lip)))
+ bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
+ bip->bli_flags &= ~XFS_BLI_INODE_BUF;
+ }
+
+ if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
+ XFS_BLI_ORDERED) {
+ /*
+ * The buffer has been logged just to order it. It is not being
+ * included in the transaction commit, so don't format it.
+ */
+ trace_xfs_buf_item_format_ordered(bip);
+ return;
+ }
+
+ for (i = 0; i < bip->bli_format_count; i++) {
+ xfs_buf_item_format_segment(bip, lv, &vecp, offset,
+ &bip->bli_formats[i]);
+ offset += bp->b_maps[i].bm_len;
+ }
/*
* Check to make sure everything is consistent.
*/
- xfs_buf_item_trace("FORMAT NORM", bip);
- xfs_buf_item_log_check(bip);
+ trace_xfs_buf_item_format(bip);
}
/*
- * This is called to pin the buffer associated with the buf log
- * item in memory so it cannot be written out. Simply call bpin()
- * on the buffer to do this.
+ * This is called to pin the buffer associated with the buf log item in memory
+ * so it cannot be written out.
+ *
+ * We also always take a reference to the buffer log item here so that the bli
+ * is held while the item is pinned in memory. This means that we can
+ * unconditionally drop the reference count a transaction holds when the
+ * transaction is completed.
*/
STATIC void
xfs_buf_item_pin(
- xfs_buf_log_item_t *bip)
+ struct xfs_log_item *lip)
{
- xfs_buf_t *bp;
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
- bp = bip->bli_buf;
- ASSERT(XFS_BUF_ISBUSY(bp));
ASSERT(atomic_read(&bip->bli_refcount) > 0);
ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+ (bip->bli_flags & XFS_BLI_ORDERED) ||
(bip->bli_flags & XFS_BLI_STALE));
- xfs_buf_item_trace("PIN", bip);
- xfs_buftrace("XFS_PIN", bp);
- xfs_bpin(bp);
-}
+ trace_xfs_buf_item_pin(bip);
+
+ atomic_inc(&bip->bli_refcount);
+ atomic_inc(&bip->bli_buf->b_pin_count);
+}
/*
* This is called to unpin the buffer associated with the buf log
* item which was previously pinned with a call to xfs_buf_item_pin().
- * Just call bunpin() on the buffer to do this.
*
* Also drop the reference to the buf item for the current transaction.
* If the XFS_BLI_STALE flag is set and we are the last reference,
* then free up the buf log item and unlock the buffer.
+ *
+ * If the remove flag is set we are called from uncommit in the
+ * forced-shutdown path. If that is true and the reference count on
+ * the log item is going to drop to zero we need to free the item's
+ * descriptor in the transaction.
*/
STATIC void
xfs_buf_item_unpin(
- xfs_buf_log_item_t *bip,
- int stale)
+ struct xfs_log_item *lip,
+ int remove)
{
- xfs_mount_t *mp;
- xfs_buf_t *bp;
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ xfs_buf_t *bp = bip->bli_buf;
+ struct xfs_ail *ailp = lip->li_ailp;
+ int stale = bip->bli_flags & XFS_BLI_STALE;
int freed;
- SPLDECL(s);
- bp = bip->bli_buf;
- ASSERT(bp != NULL);
- ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
+ ASSERT(bp->b_fspriv == bip);
ASSERT(atomic_read(&bip->bli_refcount) > 0);
- xfs_buf_item_trace("UNPIN", bip);
- xfs_buftrace("XFS_UNPIN", bp);
+
+ trace_xfs_buf_item_unpin(bip);
freed = atomic_dec_and_test(&bip->bli_refcount);
- mp = bip->bli_item.li_mountp;
- xfs_bunpin(bp);
+
+ if (atomic_dec_and_test(&bp->b_pin_count))
+ wake_up_all(&bp->b_waiters);
+
if (freed && stale) {
ASSERT(bip->bli_flags & XFS_BLI_STALE);
- ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
- ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
+ ASSERT(xfs_buf_islocked(bp));
ASSERT(XFS_BUF_ISSTALE(bp));
- ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
- xfs_buf_item_trace("UNPIN STALE", bip);
- xfs_buftrace("XFS_UNPIN STALE", bp);
+ ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+
+ trace_xfs_buf_item_unpin_stale(bip);
+
+ if (remove) {
+ /*
+ * If we are in a transaction context, we have to
+ * remove the log item from the transaction as we are
+ * about to release our reference to the buffer. If we
+ * don't, the unlock that occurs later in
+ * xfs_trans_uncommit() will try to reference the
+ * buffer which we no longer have a hold on.
+ */
+ if (lip->li_desc)
+ xfs_trans_del_item(lip);
+
+ /*
+ * Since the transaction no longer refers to the buffer,
+ * the buffer should no longer refer to the transaction.
+ */
+ bp->b_transp = NULL;
+ }
+
/*
* If we get called here because of an IO error, we may
- * or may not have the item on the AIL. xfs_trans_delete_ail()
+ * or may not have the item on the AIL. xfs_trans_ail_delete()
* will take care of that situation.
- * xfs_trans_delete_ail() drops the AIL lock.
+ * xfs_trans_ail_delete() drops the AIL lock.
*/
if (bip->bli_flags & XFS_BLI_STALE_INODE) {
- xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
- XFS_BUF_SET_FSPRIVATE(bp, NULL);
- XFS_BUF_CLR_IODONE_FUNC(bp);
+ xfs_buf_do_callbacks(bp);
+ bp->b_fspriv = NULL;
+ bp->b_iodone = NULL;
} else {
- AIL_LOCK(mp,s);
- xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s);
+ spin_lock(&ailp->xa_lock);
+ xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR);
xfs_buf_item_relse(bp);
- ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL);
+ ASSERT(bp->b_fspriv == NULL);
}
xfs_buf_relse(bp);
- }
-}
-
-/*
- * this is called from uncommit in the forced-shutdown path.
- * we need to check to see if the reference count on the log item
- * is going to drop to zero. If so, unpin will free the log item
- * so we need to free the item's descriptor (that points to the item)
- * in the transaction.
- */
-STATIC void
-xfs_buf_item_unpin_remove(
- xfs_buf_log_item_t *bip,
- xfs_trans_t *tp)
-{
- xfs_buf_t *bp;
- xfs_log_item_desc_t *lidp;
- int stale = 0;
-
- bp = bip->bli_buf;
- /*
- * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
- */
- if ((atomic_read(&bip->bli_refcount) == 1) &&
- (bip->bli_flags & XFS_BLI_STALE)) {
- ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
- xfs_buf_item_trace("UNPIN REMOVE", bip);
- xfs_buftrace("XFS_UNPIN_REMOVE", bp);
+ } else if (freed && remove) {
/*
- * yes -- clear the xaction descriptor in-use flag
- * and free the chunk if required. We can safely
- * do some work here and then call buf_item_unpin
- * to do the rest because if the if is true, then
- * we are holding the buffer locked so no one else
- * will be able to bump up the refcount.
+ * There are currently two references to the buffer - the active
+ * LRU reference and the buf log item. What we are about to do
+ * here - simulate a failed IO completion - requires 3
+ * references.
+ *
+ * The LRU reference is removed by the xfs_buf_stale() call. The
+ * buf item reference is removed by the xfs_buf_iodone()
+ * callback that is run by xfs_buf_do_callbacks() during ioend
+ * processing (via the bp->b_iodone callback), and then finally
+ * the ioend processing will drop the IO reference if the buffer
+ * is marked XBF_ASYNC.
+ *
+ * Hence we need to take an additional reference here so that IO
+ * completion processing doesn't free the buffer prematurely.
*/
- lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
- stale = lidp->lid_flags & XFS_LID_BUF_STALE;
- xfs_trans_free_item(tp, lidp);
- /*
- * Since the transaction no longer refers to the buffer,
- * the buffer should no longer refer to the transaction.
- */
- XFS_BUF_SET_FSPRIVATE2(bp, NULL);
+ xfs_buf_lock(bp);
+ xfs_buf_hold(bp);
+ bp->b_flags |= XBF_ASYNC;
+ xfs_buf_ioerror(bp, EIO);
+ XFS_BUF_UNDONE(bp);
+ xfs_buf_stale(bp);
+ xfs_buf_ioend(bp, 0);
}
-
- xfs_buf_item_unpin(bip, stale);
-
- return;
}
/*
- * This is called to attempt to lock the buffer associated with this
- * buf log item. Don't sleep on the buffer lock. If we can't get
- * the lock right away, return 0. If we can get the lock, pull the
- * buffer from the free list, mark it busy, and return 1.
+ * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30
+ * seconds so as to not spam logs too much on repeated detection of the same
+ * buffer being bad..
*/
+
+DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
+
STATIC uint
-xfs_buf_item_trylock(
- xfs_buf_log_item_t *bip)
+xfs_buf_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
{
- xfs_buf_t *bp;
-
- bp = bip->bli_buf;
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+ uint rval = XFS_ITEM_SUCCESS;
- if (XFS_BUF_ISPINNED(bp)) {
+ if (xfs_buf_ispinned(bp))
return XFS_ITEM_PINNED;
- }
-
- if (!XFS_BUF_CPSEMA(bp)) {
+ if (!xfs_buf_trylock(bp)) {
+ /*
+ * If we have just raced with a buffer being pinned and it has
+ * been marked stale, we could end up stalling until someone else
+ * issues a log force to unpin the stale buffer. Check for the
+ * race condition here so xfsaild recognizes the buffer is pinned
+ * and queues a log force to move it along.
+ */
+ if (xfs_buf_ispinned(bp))
+ return XFS_ITEM_PINNED;
return XFS_ITEM_LOCKED;
}
- /*
- * Remove the buffer from the free list. Only do this
- * if it's on the free list. Private buffers like the
- * superblock buffer are not.
- */
- XFS_BUF_HOLD(bp);
-
ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- xfs_buf_item_trace("TRYLOCK SUCCESS", bip);
- return XFS_ITEM_SUCCESS;
+
+ trace_xfs_buf_item_push(bip);
+
+ /* has a previous flush failed due to IO errors? */
+ if ((bp->b_flags & XBF_WRITE_FAIL) &&
+ ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) {
+ xfs_warn(bp->b_target->bt_mount,
+"Detected failing async write on buffer block 0x%llx. Retrying async write.\n",
+ (long long)bp->b_bn);
+ }
+
+ if (!xfs_buf_delwri_queue(bp, buffer_list))
+ rval = XFS_ITEM_FLUSHING;
+ xfs_buf_unlock(bp);
+ return rval;
}
/*
- * Release the buffer associated with the buf log item.
- * If there is no dirty logged data associated with the
- * buffer recorded in the buf log item, then free the
- * buf log item and remove the reference to it in the
- * buffer.
+ * Release the buffer associated with the buf log item. If there is no dirty
+ * logged data associated with the buffer recorded in the buf log item, then
+ * free the buf log item and remove the reference to it in the buffer.
*
- * This call ignores the recursion count. It is only called
- * when the buffer should REALLY be unlocked, regardless
- * of the recursion count.
+ * This call ignores the recursion count. It is only called when the buffer
+ * should REALLY be unlocked, regardless of the recursion count.
*
- * If the XFS_BLI_HOLD flag is set in the buf log item, then
- * free the log item if necessary but do not unlock the buffer.
- * This is for support of xfs_trans_bhold(). Make sure the
- * XFS_BLI_HOLD field is cleared if we don't free the item.
+ * We unconditionally drop the transaction's reference to the log item. If the
+ * item was logged, then another reference was taken when it was pinned, so we
+ * can safely drop the transaction reference now. This also allows us to avoid
+ * potential races with the unpin code freeing the bli by not referencing the
+ * bli after we've dropped the reference count.
+ *
+ * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
+ * if necessary but do not unlock the buffer. This is for support of
+ * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
+ * free the item.
*/
STATIC void
xfs_buf_item_unlock(
- xfs_buf_log_item_t *bip)
+ struct xfs_log_item *lip)
{
- int aborted;
- xfs_buf_t *bp;
- uint hold;
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+ bool clean;
+ bool aborted;
+ int flags;
- bp = bip->bli_buf;
- xfs_buftrace("XFS_UNLOCK", bp);
+ /* Clear the buffer's association with this transaction. */
+ bp->b_transp = NULL;
/*
- * Clear the buffer's association with this transaction.
+ * If this is a transaction abort, don't return early. Instead, allow
+ * the brelse to happen. Normally it would be done for stale
+ * (cancelled) buffers at unpin time, but we'll never go through the
+ * pin/unpin cycle if we abort inside commit.
*/
- XFS_BUF_SET_FSPRIVATE2(bp, NULL);
-
+ aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
/*
- * If this is a transaction abort, don't return early.
- * Instead, allow the brelse to happen.
- * Normally it would be done for stale (cancelled) buffers
- * at unpin time, but we'll never go through the pin/unpin
- * cycle if we abort inside commit.
+ * Before possibly freeing the buf item, copy the per-transaction state
+ * so we can reference it safely later after clearing it from the
+ * buffer log item.
*/
- aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
+ flags = bip->bli_flags;
+ bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
/*
- * If the buf item is marked stale, then don't do anything.
- * We'll unlock the buffer and free the buf item when the
- * buffer is unpinned for the last time.
+ * If the buf item is marked stale, then don't do anything. We'll
+ * unlock the buffer and free the buf item when the buffer is unpinned
+ * for the last time.
*/
- if (bip->bli_flags & XFS_BLI_STALE) {
- bip->bli_flags &= ~XFS_BLI_LOGGED;
- xfs_buf_item_trace("UNLOCK STALE", bip);
- ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
- if (!aborted)
+ if (flags & XFS_BLI_STALE) {
+ trace_xfs_buf_item_unlock_stale(bip);
+ ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+ if (!aborted) {
+ atomic_dec(&bip->bli_refcount);
return;
+ }
}
- /*
- * Drop the transaction's reference to the log item if
- * it was not logged as part of the transaction. Otherwise
- * we'll drop the reference in xfs_buf_item_unpin() when
- * the transaction is really through with the buffer.
- */
- if (!(bip->bli_flags & XFS_BLI_LOGGED)) {
- atomic_dec(&bip->bli_refcount);
- } else {
- /*
- * Clear the logged flag since this is per
- * transaction state.
- */
- bip->bli_flags &= ~XFS_BLI_LOGGED;
- }
+ trace_xfs_buf_item_unlock(bip);
/*
- * Before possibly freeing the buf item, determine if we should
- * release the buffer at the end of this routine.
+ * If the buf item isn't tracking any data, free it, otherwise drop the
+ * reference we hold to it. If we are aborting the transaction, this may
+ * be the only reference to the buf item, so we free it anyway
+ * regardless of whether it is dirty or not. A dirty abort implies a
+ * shutdown, anyway.
+ *
+ * Ordered buffers are dirty but may have no recorded changes, so ensure
+ * we only release clean items here.
*/
- hold = bip->bli_flags & XFS_BLI_HOLD;
- xfs_buf_item_trace("UNLOCK", bip);
+ clean = (flags & XFS_BLI_DIRTY) ? false : true;
+ if (clean) {
+ int i;
+ for (i = 0; i < bip->bli_format_count; i++) {
+ if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
+ bip->bli_formats[i].blf_map_size)) {
+ clean = false;
+ break;
+ }
+ }
+ }
/*
- * If the buf item isn't tracking any data, free it.
- * Otherwise, if XFS_BLI_HOLD is set clear it.
+ * Clean buffers, by definition, cannot be in the AIL. However, aborted
+ * buffers may be dirty and hence in the AIL. Therefore if we are
+ * aborting a buffer and we've just taken the last refernce away, we
+ * have to check if it is in the AIL before freeing it. We need to free
+ * it in this case, because an aborted transaction has already shut the
+ * filesystem down and this is the last chance we will have to do so.
*/
- if (xfs_count_bits(bip->bli_format.blf_data_map,
- bip->bli_format.blf_map_size, 0) == 0) {
- xfs_buf_item_relse(bp);
- } else if (hold) {
- bip->bli_flags &= ~XFS_BLI_HOLD;
+ if (atomic_dec_and_test(&bip->bli_refcount)) {
+ if (clean)
+ xfs_buf_item_relse(bp);
+ else if (aborted) {
+ ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
+ if (lip->li_flags & XFS_LI_IN_AIL) {
+ spin_lock(&lip->li_ailp->xa_lock);
+ xfs_trans_ail_delete(lip->li_ailp, lip,
+ SHUTDOWN_LOG_IO_ERROR);
+ }
+ xfs_buf_item_relse(bp);
+ }
}
- /*
- * Release the buffer if XFS_BLI_HOLD was not set.
- */
- if (!hold) {
+ if (!(flags & XFS_BLI_HOLD))
xfs_buf_relse(bp);
- }
}
/*
@@ -617,91 +676,70 @@ xfs_buf_item_unlock(
*/
STATIC xfs_lsn_t
xfs_buf_item_committed(
- xfs_buf_log_item_t *bip,
+ struct xfs_log_item *lip,
xfs_lsn_t lsn)
{
- xfs_buf_item_trace("COMMITTED", bip);
- if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
- (bip->bli_item.li_lsn != 0)) {
- return bip->bli_item.li_lsn;
- }
- return (lsn);
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+
+ trace_xfs_buf_item_committed(bip);
+
+ if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
+ return lip->li_lsn;
+ return lsn;
}
-/*
- * This is called when the transaction holding the buffer is aborted.
- * Just behave as if the transaction had been cancelled. If we're shutting down
- * and have aborted this transaction, we'll trap this buffer when it tries to
- * get written out.
- */
STATIC void
-xfs_buf_item_abort(
- xfs_buf_log_item_t *bip)
+xfs_buf_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t commit_lsn)
{
- xfs_buf_t *bp;
-
- bp = bip->bli_buf;
- xfs_buftrace("XFS_ABORT", bp);
- XFS_BUF_SUPER_STALE(bp);
- xfs_buf_item_unlock(bip);
- return;
}
/*
- * This is called to asynchronously write the buffer associated with this
- * buf log item out to disk. The buffer will already have been locked by
- * a successful call to xfs_buf_item_trylock(). If the buffer still has
- * B_DELWRI set, then get it going out to disk with a call to bawrite().
- * If not, then just release the buffer.
+ * This is the ops vector shared by all buf log items.
*/
-STATIC void
-xfs_buf_item_push(
- xfs_buf_log_item_t *bip)
-{
- xfs_buf_t *bp;
-
- ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- xfs_buf_item_trace("PUSH", bip);
+static const struct xfs_item_ops xfs_buf_item_ops = {
+ .iop_size = xfs_buf_item_size,
+ .iop_format = xfs_buf_item_format,
+ .iop_pin = xfs_buf_item_pin,
+ .iop_unpin = xfs_buf_item_unpin,
+ .iop_unlock = xfs_buf_item_unlock,
+ .iop_committed = xfs_buf_item_committed,
+ .iop_push = xfs_buf_item_push,
+ .iop_committing = xfs_buf_item_committing
+};
- bp = bip->bli_buf;
+STATIC int
+xfs_buf_item_get_format(
+ struct xfs_buf_log_item *bip,
+ int count)
+{
+ ASSERT(bip->bli_formats == NULL);
+ bip->bli_format_count = count;
- if (XFS_BUF_ISDELAYWRITE(bp)) {
- xfs_bawrite(bip->bli_item.li_mountp, bp);
- } else {
- xfs_buf_relse(bp);
+ if (count == 1) {
+ bip->bli_formats = &bip->__bli_format;
+ return 0;
}
+
+ bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
+ KM_SLEEP);
+ if (!bip->bli_formats)
+ return ENOMEM;
+ return 0;
}
-/* ARGSUSED */
STATIC void
-xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn)
+xfs_buf_item_free_format(
+ struct xfs_buf_log_item *bip)
{
+ if (bip->bli_formats != &bip->__bli_format) {
+ kmem_free(bip->bli_formats);
+ bip->bli_formats = NULL;
+ }
}
/*
- * This is the ops vector shared by all buf log items.
- */
-STATIC struct xfs_item_ops xfs_buf_item_ops = {
- .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size,
- .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
- xfs_buf_item_format,
- .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
- .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin,
- .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
- xfs_buf_item_unpin_remove,
- .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
- .iop_unlock = (void(*)(xfs_log_item_t*))xfs_buf_item_unlock,
- .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_buf_item_committed,
- .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push,
- .iop_abort = (void(*)(xfs_log_item_t*))xfs_buf_item_abort,
- .iop_pushbuf = NULL,
- .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_buf_item_committing
-};
-
-
-/*
* Allocate a new buf log item to go with the given buffer.
* Set the buffer's b_fsprivate field to point to the new
* buf log item. If there are other item's attached to the
@@ -713,10 +751,12 @@ xfs_buf_item_init(
xfs_buf_t *bp,
xfs_mount_t *mp)
{
- xfs_log_item_t *lip;
+ xfs_log_item_t *lip = bp->b_fspriv;
xfs_buf_log_item_t *bip;
int chunks;
int map_size;
+ int error;
+ int i;
/*
* Check to see if there is already a buf log item for
@@ -724,62 +764,45 @@ xfs_buf_item_init(
* the first. If we do already have one, there is
* nothing to do here so return.
*/
- if (XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *) != mp)
- XFS_BUF_SET_FSPRIVATE3(bp, mp);
- XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
- if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
- lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
- if (lip->li_type == XFS_LI_BUF) {
- return;
- }
- }
+ ASSERT(bp->b_target->bt_mount == mp);
+ if (lip != NULL && lip->li_type == XFS_LI_BUF)
+ return;
- /*
- * chunks is the number of XFS_BLI_CHUNK size pieces
- * the buffer can be divided into. Make sure not to
- * truncate any pieces. map_size is the size of the
- * bitmap needed to describe the chunks of the buffer.
- */
- chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT);
- map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
-
- bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
- KM_SLEEP);
- bip->bli_item.li_type = XFS_LI_BUF;
- bip->bli_item.li_ops = &xfs_buf_item_ops;
- bip->bli_item.li_mountp = mp;
+ bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP);
+ xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
bip->bli_buf = bp;
- bip->bli_format.blf_type = XFS_LI_BUF;
- bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
- bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
- bip->bli_format.blf_map_size = map_size;
-#ifdef XFS_BLI_TRACE
- bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_SLEEP);
-#endif
-
-#ifdef XFS_TRANS_DEBUG
+ xfs_buf_hold(bp);
+
/*
- * Allocate the arrays for tracking what needs to be logged
- * and what our callers request to be logged. bli_orig
- * holds a copy of the original, clean buffer for comparison
- * against, and bli_logged keeps a 1 bit flag per byte in
- * the buffer to indicate which bytes the callers have asked
- * to have logged.
+ * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
+ * can be divided into. Make sure not to truncate any pieces.
+ * map_size is the size of the bitmap needed to describe the
+ * chunks of the buffer.
+ *
+ * Discontiguous buffer support follows the layout of the underlying
+ * buffer. This makes the implementation as simple as possible.
*/
- bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP);
- memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp));
- bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP);
-#endif
+ error = xfs_buf_item_get_format(bip, bp->b_map_count);
+ ASSERT(error == 0);
+
+ for (i = 0; i < bip->bli_format_count; i++) {
+ chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
+ XFS_BLF_CHUNK);
+ map_size = DIV_ROUND_UP(chunks, NBWORD);
+
+ bip->bli_formats[i].blf_type = XFS_LI_BUF;
+ bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
+ bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
+ bip->bli_formats[i].blf_map_size = map_size;
+ }
/*
* Put the buf item into the list of items attached to the
* buffer at the front.
*/
- if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
- bip->bli_item.li_bio_list =
- XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
- }
- XFS_BUF_SET_FSPRIVATE(bp, bip);
+ if (bp->b_fspriv)
+ bip->bli_item.li_bio_list = bp->b_fspriv;
+ bp->b_fspriv = bip;
}
@@ -787,11 +810,11 @@ xfs_buf_item_init(
* Mark bytes first through last inclusive as dirty in the buf
* item's bitmap.
*/
-void
-xfs_buf_item_log(
- xfs_buf_log_item_t *bip,
+static void
+xfs_buf_item_log_segment(
uint first,
- uint last)
+ uint last,
+ uint *map)
{
uint first_bit;
uint last_bit;
@@ -804,16 +827,10 @@ xfs_buf_item_log(
uint mask;
/*
- * Mark the item as having some dirty data for
- * quick reference in xfs_buf_item_dirty.
- */
- bip->bli_flags |= XFS_BLI_DIRTY;
-
- /*
* Convert byte offsets to bit numbers.
*/
- first_bit = first >> XFS_BLI_SHIFT;
- last_bit = last >> XFS_BLI_SHIFT;
+ first_bit = first >> XFS_BLF_SHIFT;
+ last_bit = last >> XFS_BLF_SHIFT;
/*
* Calculate the total number of bits to be set.
@@ -825,7 +842,7 @@ xfs_buf_item_log(
* to set a bit in.
*/
word_num = first_bit >> BIT_TO_WORD_SHIFT;
- wordp = &(bip->bli_format.blf_data_map[word_num]);
+ wordp = &map[word_num];
/*
* Calculate the starting bit in the first word.
@@ -868,13 +885,50 @@ xfs_buf_item_log(
mask = (1 << end_bit) - 1;
*wordp |= mask;
}
+}
- xfs_buf_item_log_debug(bip, first, last);
+/*
+ * Mark bytes first through last inclusive as dirty in the buf
+ * item's bitmap.
+ */
+void
+xfs_buf_item_log(
+ xfs_buf_log_item_t *bip,
+ uint first,
+ uint last)
+{
+ int i;
+ uint start;
+ uint end;
+ struct xfs_buf *bp = bip->bli_buf;
+
+ /*
+ * walk each buffer segment and mark them dirty appropriately.
+ */
+ start = 0;
+ for (i = 0; i < bip->bli_format_count; i++) {
+ if (start > last)
+ break;
+ end = start + BBTOB(bp->b_maps[i].bm_len);
+ if (first > end) {
+ start += BBTOB(bp->b_maps[i].bm_len);
+ continue;
+ }
+ if (first < start)
+ first = start;
+ if (end > last)
+ end = last;
+
+ xfs_buf_item_log_segment(first, end,
+ &bip->bli_formats[i].blf_data_map[0]);
+
+ start += bp->b_maps[i].bm_len;
+ }
}
/*
- * Return 1 if the buffer has some data that has been logged (at any
+ * Return 1 if the buffer has been logged or ordered in a transaction (at any
* point, not just the current transaction) and 0 if not.
*/
uint
@@ -884,6 +938,14 @@ xfs_buf_item_dirty(
return (bip->bli_flags & XFS_BLI_DIRTY);
}
+STATIC void
+xfs_buf_item_free(
+ xfs_buf_log_item_t *bip)
+{
+ xfs_buf_item_free_format(bip);
+ kmem_zone_free(xfs_buf_item_zone, bip);
+}
+
/*
* This is called when the buf log item is no longer needed. It should
* free the buf log item associated with the given buffer and clear
@@ -895,28 +957,17 @@ void
xfs_buf_item_relse(
xfs_buf_t *bp)
{
- xfs_buf_log_item_t *bip;
+ xfs_buf_log_item_t *bip = bp->b_fspriv;
- xfs_buftrace("XFS_RELSE", bp);
- bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
- XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list);
- if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) &&
- (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
- ASSERT((XFS_BUF_ISUNINITIAL(bp)) == 0);
- XFS_BUF_CLR_IODONE_FUNC(bp);
- }
+ trace_xfs_buf_item_relse(bp, _RET_IP_);
+ ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
-#ifdef XFS_TRANS_DEBUG
- kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp));
- bip->bli_orig = NULL;
- kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY);
- bip->bli_logged = NULL;
-#endif /* XFS_TRANS_DEBUG */
+ bp->b_fspriv = bip->bli_item.li_bio_list;
+ if (bp->b_fspriv == NULL)
+ bp->b_iodone = NULL;
-#ifdef XFS_BLI_TRACE
- ktrace_free(bip->bli_trace);
-#endif
- kmem_zone_free(xfs_buf_item_zone, bip);
+ xfs_buf_rele(bp);
+ xfs_buf_item_free(bip);
}
@@ -937,32 +988,42 @@ xfs_buf_attach_iodone(
{
xfs_log_item_t *head_lip;
- ASSERT(XFS_BUF_ISBUSY(bp));
- ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+ ASSERT(xfs_buf_islocked(bp));
lip->li_cb = cb;
- if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
- head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+ head_lip = bp->b_fspriv;
+ if (head_lip) {
lip->li_bio_list = head_lip->li_bio_list;
head_lip->li_bio_list = lip;
} else {
- XFS_BUF_SET_FSPRIVATE(bp, lip);
+ bp->b_fspriv = lip;
}
- ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) ||
- (XFS_BUF_IODONE_FUNC(bp) == NULL));
- XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
+ ASSERT(bp->b_iodone == NULL ||
+ bp->b_iodone == xfs_buf_iodone_callbacks);
+ bp->b_iodone = xfs_buf_iodone_callbacks;
}
+/*
+ * We can have many callbacks on a buffer. Running the callbacks individually
+ * can cause a lot of contention on the AIL lock, so we allow for a single
+ * callback to be able to scan the remaining lip->li_bio_list for other items
+ * of the same type and callback to be processed in the first call.
+ *
+ * As a result, the loop walking the callback list below will also modify the
+ * list. it removes the first item from the list and then runs the callback.
+ * The loop then restarts from the new head of the list. This allows the
+ * callback to scan and modify the list attached to the buffer and we don't
+ * have to care about maintaining a next item pointer.
+ */
STATIC void
xfs_buf_do_callbacks(
- xfs_buf_t *bp,
- xfs_log_item_t *lip)
+ struct xfs_buf *bp)
{
- xfs_log_item_t *nlip;
+ struct xfs_log_item *lip;
- while (lip != NULL) {
- nlip = lip->li_bio_list;
+ while ((lip = bp->b_fspriv) != NULL) {
+ bp->b_fspriv = lip->li_bio_list;
ASSERT(lip->li_cb != NULL);
/*
* Clear the next pointer so we don't have any
@@ -972,7 +1033,6 @@ xfs_buf_do_callbacks(
*/
lip->li_bio_list = NULL;
lip->li_cb(bp, lip);
- lip = nlip;
}
}
@@ -985,141 +1045,78 @@ xfs_buf_do_callbacks(
*/
void
xfs_buf_iodone_callbacks(
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
- xfs_log_item_t *lip;
- static ulong lasttime;
- static xfs_buftarg_t *lasttarg;
- xfs_mount_t *mp;
+ struct xfs_log_item *lip = bp->b_fspriv;
+ struct xfs_mount *mp = lip->li_mountp;
+ static ulong lasttime;
+ static xfs_buftarg_t *lasttarg;
- ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
- lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+ if (likely(!bp->b_error))
+ goto do_callbacks;
- if (XFS_BUF_GETERROR(bp) != 0) {
- /*
- * If we've already decided to shutdown the filesystem
- * because of IO errors, there's no point in giving this
- * a retry.
- */
- mp = lip->li_mountp;
- if (XFS_FORCED_SHUTDOWN(mp)) {
- ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
- XFS_BUF_SUPER_STALE(bp);
- xfs_buftrace("BUF_IODONE_CB", bp);
- xfs_buf_do_callbacks(bp, lip);
- XFS_BUF_SET_FSPRIVATE(bp, NULL);
- XFS_BUF_CLR_IODONE_FUNC(bp);
+ /*
+ * If we've already decided to shutdown the filesystem because of
+ * I/O errors, there's no point in giving this a retry.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_buf_stale(bp);
+ XFS_BUF_DONE(bp);
+ trace_xfs_buf_item_iodone(bp, _RET_IP_);
+ goto do_callbacks;
+ }
- /*
- * XFS_SHUT flag gets set when we go thru the
- * entire buffer cache and deliberately start
- * throwing away delayed write buffers.
- * Since there's no biowait done on those,
- * we should just brelse them.
- */
- if (XFS_BUF_ISSHUT(bp)) {
- XFS_BUF_UNSHUT(bp);
- xfs_buf_relse(bp);
- } else {
- xfs_biodone(bp);
- }
+ if (bp->b_target != lasttarg ||
+ time_after(jiffies, (lasttime + 5*HZ))) {
+ lasttime = jiffies;
+ xfs_buf_ioerror_alert(bp, __func__);
+ }
+ lasttarg = bp->b_target;
- return;
- }
+ /*
+ * If the write was asynchronous then no one will be looking for the
+ * error. Clear the error state and write the buffer out again.
+ *
+ * XXX: This helps against transient write errors, but we need to find
+ * a way to shut the filesystem down if the writes keep failing.
+ *
+ * In practice we'll shut the filesystem down soon as non-transient
+ * erorrs tend to affect the whole device and a failing log write
+ * will make us give up. But we really ought to do better here.
+ */
+ if (XFS_BUF_ISASYNC(bp)) {
+ ASSERT(bp->b_iodone != NULL);
- if ((XFS_BUF_TARGET(bp) != lasttarg) ||
- (time_after(jiffies, (lasttime + 5*HZ)))) {
- lasttime = jiffies;
- prdev("XFS write error in file system meta-data "
- "block 0x%llx in %s",
- XFS_BUF_TARGET(bp),
- (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
- }
- lasttarg = XFS_BUF_TARGET(bp);
+ trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
- if (XFS_BUF_ISASYNC(bp)) {
- /*
- * If the write was asynchronous then noone will be
- * looking for the error. Clear the error state
- * and write the buffer out again delayed write.
- *
- * XXXsup This is OK, so long as we catch these
- * before we start the umount; we don't want these
- * DELWRI metadata bufs to be hanging around.
- */
- XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */
+ xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
- if (!(XFS_BUF_ISSTALE(bp))) {
- XFS_BUF_DELAYWRITE(bp);
- XFS_BUF_DONE(bp);
- XFS_BUF_SET_START(bp);
- }
- ASSERT(XFS_BUF_IODONE_FUNC(bp));
- xfs_buftrace("BUF_IODONE ASYNC", bp);
- xfs_buf_relse(bp);
+ if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
+ bp->b_flags |= XBF_WRITE | XBF_ASYNC |
+ XBF_DONE | XBF_WRITE_FAIL;
+ xfs_buf_iorequest(bp);
} else {
- /*
- * If the write of the buffer was not asynchronous,
- * then we want to make sure to return the error
- * to the caller of bwrite(). Because of this we
- * cannot clear the B_ERROR state at this point.
- * Instead we install a callback function that
- * will be called when the buffer is released, and
- * that routine will clear the error state and
- * set the buffer to be written out again after
- * some delay.
- */
- /* We actually overwrite the existing b-relse
- function at times, but we're gonna be shutting down
- anyway. */
- XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
- XFS_BUF_DONE(bp);
- XFS_BUF_V_IODONESEMA(bp);
+ xfs_buf_relse(bp);
}
+
return;
}
-#ifdef XFSERRORDEBUG
- xfs_buftrace("XFS BUFCB NOERR", bp);
-#endif
- xfs_buf_do_callbacks(bp, lip);
- XFS_BUF_SET_FSPRIVATE(bp, NULL);
- XFS_BUF_CLR_IODONE_FUNC(bp);
- xfs_biodone(bp);
-}
-/*
- * This is a callback routine attached to a buffer which gets an error
- * when being written out synchronously.
- */
-STATIC void
-xfs_buf_error_relse(
- xfs_buf_t *bp)
-{
- xfs_log_item_t *lip;
- xfs_mount_t *mp;
-
- lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
- mp = (xfs_mount_t *)lip->li_mountp;
- ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
-
- XFS_BUF_STALE(bp);
- XFS_BUF_DONE(bp);
- XFS_BUF_UNDELAYWRITE(bp);
- XFS_BUF_ERROR(bp,0);
- xfs_buftrace("BUF_ERROR_RELSE", bp);
- if (! XFS_FORCED_SHUTDOWN(mp))
- xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
/*
- * We have to unpin the pinned buffers so do the
- * callbacks.
+ * If the write of the buffer was synchronous, we want to make
+ * sure to return the error to the caller of xfs_bwrite().
*/
- xfs_buf_do_callbacks(bp, lip);
- XFS_BUF_SET_FSPRIVATE(bp, NULL);
- XFS_BUF_CLR_IODONE_FUNC(bp);
- XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
- xfs_buf_relse(bp);
-}
+ xfs_buf_stale(bp);
+ XFS_BUF_DONE(bp);
+
+ trace_xfs_buf_error_relse(bp, _RET_IP_);
+do_callbacks:
+ xfs_buf_do_callbacks(bp);
+ bp->b_fspriv = NULL;
+ bp->b_iodone = NULL;
+ xfs_buf_ioend(bp, 0);
+}
/*
* This is the iodone() function for buffers which have been
@@ -1128,74 +1125,27 @@ xfs_buf_error_relse(
* It is called by xfs_buf_iodone_callbacks() above which will take
* care of cleaning up the buffer itself.
*/
-/* ARGSUSED */
void
xfs_buf_iodone(
- xfs_buf_t *bp,
- xfs_buf_log_item_t *bip)
+ struct xfs_buf *bp,
+ struct xfs_log_item *lip)
{
- struct xfs_mount *mp;
- SPLDECL(s);
+ struct xfs_ail *ailp = lip->li_ailp;
- ASSERT(bip->bli_buf == bp);
+ ASSERT(BUF_ITEM(lip)->bli_buf == bp);
- mp = bip->bli_item.li_mountp;
+ xfs_buf_rele(bp);
/*
* If we are forcibly shutting down, this may well be
* off the AIL already. That's because we simulate the
* log-committed callbacks to unpin these buffers. Or we may never
* have put this item on AIL because of the transaction was
- * aborted forcibly. xfs_trans_delete_ail() takes care of these.
+ * aborted forcibly. xfs_trans_ail_delete() takes care of these.
*
* Either way, AIL is useless if we're forcing a shutdown.
*/
- AIL_LOCK(mp,s);
- /*
- * xfs_trans_delete_ail() drops the AIL lock.
- */
- xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s);
-
-#ifdef XFS_TRANS_DEBUG
- kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp));
- bip->bli_orig = NULL;
- kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY);
- bip->bli_logged = NULL;
-#endif /* XFS_TRANS_DEBUG */
-
-#ifdef XFS_BLI_TRACE
- ktrace_free(bip->bli_trace);
-#endif
- kmem_zone_free(xfs_buf_item_zone, bip);
-}
-
-#if defined(XFS_BLI_TRACE)
-void
-xfs_buf_item_trace(
- char *id,
- xfs_buf_log_item_t *bip)
-{
- xfs_buf_t *bp;
- ASSERT(bip->bli_trace != NULL);
-
- bp = bip->bli_buf;
- ktrace_enter(bip->bli_trace,
- (void *)id,
- (void *)bip->bli_buf,
- (void *)((unsigned long)bip->bli_flags),
- (void *)((unsigned long)bip->bli_recur),
- (void *)((unsigned long)atomic_read(&bip->bli_refcount)),
- (void *)((unsigned long)
- (0xFFFFFFFF & XFS_BUF_ADDR(bp) >> 32)),
- (void *)((unsigned long)(0xFFFFFFFF & XFS_BUF_ADDR(bp))),
- (void *)((unsigned long)XFS_BUF_COUNT(bp)),
- (void *)((unsigned long)XFS_BUF_BFLAGS(bp)),
- XFS_BUF_FSPRIVATE(bp, void *),
- XFS_BUF_FSPRIVATE2(bp, void *),
- (void *)(unsigned long)XFS_BUF_ISPINNED(bp),
- (void *)XFS_BUF_IODONE_FUNC(bp),
- (void *)((unsigned long)(XFS_BUF_VALUSEMA(bp))),
- (void *)bip->bli_item.li_desc,
- (void *)((unsigned long)bip->bli_item.li_flags));
+ spin_lock(&ailp->xa_lock);
+ xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
+ xfs_buf_item_free(BUF_ITEM(lip));
}
-#endif /* XFS_BLI_TRACE */
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 07c708c2b52..3f3455a4151 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -18,88 +18,33 @@
#ifndef __XFS_BUF_ITEM_H__
#define __XFS_BUF_ITEM_H__
-/*
- * This is the structure used to lay out a buf log item in the
- * log. The data map describes which 128 byte chunks of the buffer
- * have been logged. This structure works only on buffers that
- * reside up to the first TB in the filesystem. These buffers are
- * generated only by pre-6.2 systems and are known as XFS_LI_6_1_BUF.
- */
-typedef struct xfs_buf_log_format_v1 {
- unsigned short blf_type; /* buf log item type indicator */
- unsigned short blf_size; /* size of this item */
- __int32_t blf_blkno; /* starting blkno of this buf */
- ushort blf_flags; /* misc state */
- ushort blf_len; /* number of blocks in this buf */
- unsigned int blf_map_size; /* size of data bitmap in words */
- unsigned int blf_data_map[1];/* variable size bitmap of */
- /* regions of buffer in this item */
-} xfs_buf_log_format_v1_t;
-
-/*
- * This is a form of the above structure with a 64 bit blkno field.
- * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything.
- */
-typedef struct xfs_buf_log_format_t {
- unsigned short blf_type; /* buf log item type indicator */
- unsigned short blf_size; /* size of this item */
- ushort blf_flags; /* misc state */
- ushort blf_len; /* number of blocks in this buf */
- __int64_t blf_blkno; /* starting blkno of this buf */
- unsigned int blf_map_size; /* size of data bitmap in words */
- unsigned int blf_data_map[1];/* variable size bitmap of */
- /* regions of buffer in this item */
-} xfs_buf_log_format_t;
+/* kernel only definitions */
-/*
- * This flag indicates that the buffer contains on disk inodes
- * and requires special recovery handling.
- */
-#define XFS_BLI_INODE_BUF 0x1
-/*
- * This flag indicates that the buffer should not be replayed
- * during recovery because its blocks are being freed.
- */
-#define XFS_BLI_CANCEL 0x2
-/*
- * This flag indicates that the buffer contains on disk
- * user or group dquots and may require special recovery handling.
- */
-#define XFS_BLI_UDQUOT_BUF 0x4
-#define XFS_BLI_PDQUOT_BUF 0x8
-#define XFS_BLI_GDQUOT_BUF 0x10
-
-#define XFS_BLI_CHUNK 128
-#define XFS_BLI_SHIFT 7
-#define BIT_TO_WORD_SHIFT 5
-#define NBWORD (NBBY * sizeof(unsigned int))
-
-/*
- * buf log item flags
- */
+/* buf log item flags */
#define XFS_BLI_HOLD 0x01
#define XFS_BLI_DIRTY 0x02
#define XFS_BLI_STALE 0x04
#define XFS_BLI_LOGGED 0x08
#define XFS_BLI_INODE_ALLOC_BUF 0x10
#define XFS_BLI_STALE_INODE 0x20
+#define XFS_BLI_INODE_BUF 0x40
+#define XFS_BLI_ORDERED 0x80
+#define XFS_BLI_FLAGS \
+ { XFS_BLI_HOLD, "HOLD" }, \
+ { XFS_BLI_DIRTY, "DIRTY" }, \
+ { XFS_BLI_STALE, "STALE" }, \
+ { XFS_BLI_LOGGED, "LOGGED" }, \
+ { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
+ { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
+ { XFS_BLI_INODE_BUF, "INODE_BUF" }, \
+ { XFS_BLI_ORDERED, "ORDERED" }
-#ifdef __KERNEL__
struct xfs_buf;
-struct ktrace;
struct xfs_mount;
struct xfs_buf_log_item;
-#if defined(XFS_BLI_TRACE)
-#define XFS_BLI_TRACE_SIZE 32
-
-void xfs_buf_item_trace(char *, struct xfs_buf_log_item *);
-#else
-#define xfs_buf_item_trace(id, bip)
-#endif
-
/*
* This is the in core log item structure used to track information
* needed to log buffers. It tracks how many times the lock has been
@@ -111,27 +56,11 @@ typedef struct xfs_buf_log_item {
unsigned int bli_flags; /* misc flags */
unsigned int bli_recur; /* lock recursion count */
atomic_t bli_refcount; /* cnt of tp refs */
-#ifdef XFS_BLI_TRACE
- struct ktrace *bli_trace; /* event trace buf */
-#endif
-#ifdef XFS_TRANS_DEBUG
- char *bli_orig; /* original buffer copy */
- char *bli_logged; /* bytes logged (bitmap) */
-#endif
- xfs_buf_log_format_t bli_format; /* in-log header */
+ int bli_format_count; /* count of headers */
+ struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */
+ struct xfs_buf_log_format __bli_format; /* embedded in-log header */
} xfs_buf_log_item_t;
-/*
- * This structure is used during recovery to record the buf log
- * items which have been canceled and should not be replayed.
- */
-typedef struct xfs_buf_cancel {
- xfs_daddr_t bc_blkno;
- uint bc_len;
- int bc_refcount;
- struct xfs_buf_cancel *bc_next;
-} xfs_buf_cancel_t;
-
void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
void xfs_buf_item_relse(struct xfs_buf *);
void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
@@ -140,18 +69,8 @@ void xfs_buf_attach_iodone(struct xfs_buf *,
void(*)(struct xfs_buf *, xfs_log_item_t *),
xfs_log_item_t *);
void xfs_buf_iodone_callbacks(struct xfs_buf *);
-void xfs_buf_iodone(struct xfs_buf *, xfs_buf_log_item_t *);
-
-#ifdef XFS_TRANS_DEBUG
-void
-xfs_buf_item_flush_log_debug(
- struct xfs_buf *bp,
- uint first,
- uint last);
-#else
-#define xfs_buf_item_flush_log_debug(bp, first, last)
-#endif
+void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
-#endif /* __KERNEL__ */
+extern kmem_zone_t *xfs_buf_item_zone;
#endif /* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_cap.h b/fs/xfs/xfs_cap.h
deleted file mode 100644
index 433ec537f9b..00000000000
--- a/fs/xfs/xfs_cap.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_CAP_H__
-#define __XFS_CAP_H__
-
-/*
- * Capabilities
- */
-typedef __uint64_t xfs_cap_value_t;
-
-typedef struct xfs_cap_set {
- xfs_cap_value_t cap_effective; /* use in capability checks */
- xfs_cap_value_t cap_permitted; /* combined with file attrs */
- xfs_cap_value_t cap_inheritable;/* pass through exec */
-} xfs_cap_set_t;
-
-/* On-disk XFS extended attribute names */
-#define SGI_CAP_FILE "SGI_CAP_FILE"
-#define SGI_CAP_FILE_SIZE (sizeof(SGI_CAP_FILE)-1)
-#define SGI_CAP_LINUX "SGI_CAP_LINUX"
-#define SGI_CAP_LINUX_SIZE (sizeof(SGI_CAP_LINUX)-1)
-
-/*
- * For Linux, we take the bitfields directly from capability.h
- * and no longer attempt to keep this attribute ondisk compatible
- * with IRIX. Since this attribute is only set on exectuables,
- * it just doesn't make much sense to try. We do use a different
- * named attribute though, to avoid confusion.
- */
-
-#ifdef __KERNEL__
-
-#ifdef CONFIG_FS_POSIX_CAP
-
-#include <linux/posix_cap_xattr.h>
-
-struct vnode;
-
-extern int xfs_cap_vhascap(struct vnode *);
-extern int xfs_cap_vset(struct vnode *, void *, size_t);
-extern int xfs_cap_vget(struct vnode *, void *, size_t);
-extern int xfs_cap_vremove(struct vnode *vp);
-
-#define _CAP_EXISTS xfs_cap_vhascap
-
-#else
-#define xfs_cap_vset(v,p,sz) (-EOPNOTSUPP)
-#define xfs_cap_vget(v,p,sz) (-EOPNOTSUPP)
-#define xfs_cap_vremove(v) (-EOPNOTSUPP)
-#define _CAP_EXISTS (NULL)
-#endif
-
-#endif /* __KERNEL__ */
-
-#endif /* __XFS_CAP_H__ */
diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/xfs_cksum.h
new file mode 100644
index 00000000000..fad1676ad8c
--- /dev/null
+++ b/fs/xfs/xfs_cksum.h
@@ -0,0 +1,63 @@
+#ifndef _XFS_CKSUM_H
+#define _XFS_CKSUM_H 1
+
+#define XFS_CRC_SEED (~(__uint32_t)0)
+
+/*
+ * Calculate the intermediate checksum for a buffer that has the CRC field
+ * inside it. The offset of the 32bit crc fields is passed as the
+ * cksum_offset parameter.
+ */
+static inline __uint32_t
+xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+ __uint32_t zero = 0;
+ __uint32_t crc;
+
+ /* Calculate CRC up to the checksum. */
+ crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
+
+ /* Skip checksum field */
+ crc = crc32c(crc, &zero, sizeof(__u32));
+
+ /* Calculate the rest of the CRC. */
+ return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)],
+ length - (cksum_offset + sizeof(__be32)));
+}
+
+/*
+ * Convert the intermediate checksum to the final ondisk format.
+ *
+ * The CRC32c calculation uses LE format even on BE machines, but returns the
+ * result in host endian format. Hence we need to byte swap it back to LE format
+ * so that it is consistent on disk.
+ */
+static inline __le32
+xfs_end_cksum(__uint32_t crc)
+{
+ return ~cpu_to_le32(crc);
+}
+
+/*
+ * Helper to generate the checksum for a buffer.
+ */
+static inline void
+xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+ __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+
+ *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
+}
+
+/*
+ * Helper to verify the checksum for a buffer.
+ */
+static inline int
+xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+ __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+
+ return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
+}
+
+#endif /* _XFS_CKSUM_H */
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
deleted file mode 100644
index f57cc9ac875..00000000000
--- a/fs/xfs/xfs_clnt.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_CLNT_H__
-#define __XFS_CLNT_H__
-
-/*
- * XFS arguments structure, constructed from the arguments we
- * are passed via the mount system call.
- *
- * NOTE: The mount system call is handled differently between
- * Linux and IRIX. In IRIX we worked work with a binary data
- * structure coming in across the syscall interface from user
- * space (the mount userspace knows about each filesystem type
- * and the set of valid options for it, and converts the users
- * argument string into a binary structure _before_ making the
- * system call), and the ABI issues that this implies.
- *
- * In Linux, we are passed a comma separated set of options;
- * ie. a NULL terminated string of characters. Userspace mount
- * code does not have any knowledge of mount options expected by
- * each filesystem type and so each filesystem parses its mount
- * options in kernel space.
- *
- * For the Linux port, we kept this structure pretty much intact
- * and use it internally (because the existing code groks it).
- */
-struct xfs_mount_args {
- int flags; /* flags -> see XFSMNT_... macros below */
- int flags2; /* flags -> see XFSMNT2_... macros below */
- int logbufs; /* Number of log buffers, -1 to default */
- int logbufsize; /* Size of log buffers, -1 to default */
- char fsname[MAXNAMELEN+1]; /* data device name */
- char rtname[MAXNAMELEN+1]; /* realtime device filename */
- char logname[MAXNAMELEN+1]; /* journal device filename */
- char mtpt[MAXNAMELEN+1]; /* filesystem mount point */
- int sunit; /* stripe unit (BBs) */
- int swidth; /* stripe width (BBs), multiple of sunit */
- uchar_t iosizelog; /* log2 of the preferred I/O size */
- int ihashsize; /* inode hash table size (buckets) */
-};
-
-/*
- * XFS mount option flags -- args->flags1
- */
-#define XFSMNT_ATTR2 0x00000001 /* allow ATTR2 EA format */
-#define XFSMNT_WSYNC 0x00000002 /* safe mode nfs mount
- * compatible */
-#define XFSMNT_INO64 0x00000004 /* move inode numbers up
- * past 2^32 */
-#define XFSMNT_UQUOTA 0x00000008 /* user quota accounting */
-#define XFSMNT_PQUOTA 0x00000010 /* IRIX prj quota accounting */
-#define XFSMNT_UQUOTAENF 0x00000020 /* user quota limit
- * enforcement */
-#define XFSMNT_PQUOTAENF 0x00000040 /* IRIX project quota limit
- * enforcement */
-#define XFSMNT_NOATIME 0x00000100 /* don't modify access
- * times on reads */
-#define XFSMNT_NOALIGN 0x00000200 /* don't allocate at
- * stripe boundaries*/
-#define XFSMNT_RETERR 0x00000400 /* return error to user */
-#define XFSMNT_NORECOVERY 0x00000800 /* no recovery, implies
- * read-only mount */
-#define XFSMNT_SHARED 0x00001000 /* shared XFS mount */
-#define XFSMNT_IOSIZE 0x00002000 /* optimize for I/O size */
-#define XFSMNT_OSYNCISOSYNC 0x00004000 /* o_sync is REALLY o_sync */
- /* (osyncisdsync is default) */
-#define XFSMNT_32BITINODES 0x00200000 /* restrict inodes to 32
- * bits of address space */
-#define XFSMNT_GQUOTA 0x00400000 /* group quota accounting */
-#define XFSMNT_GQUOTAENF 0x00800000 /* group quota limit
- * enforcement */
-#define XFSMNT_NOUUID 0x01000000 /* Ignore fs uuid */
-#define XFSMNT_DMAPI 0x02000000 /* enable dmapi/xdsm */
-#define XFSMNT_BARRIER 0x04000000 /* use write barriers */
-#define XFSMNT_IDELETE 0x08000000 /* inode cluster delete */
-#define XFSMNT_SWALLOC 0x10000000 /* turn on stripe width
- * allocation */
-#define XFSMNT_IHASHSIZE 0x20000000 /* inode hash table size */
-#define XFSMNT_DIRSYNC 0x40000000 /* sync creat,link,unlink,rename
- * symlink,mkdir,rmdir,mknod */
-#define XFSMNT_FLAGS2 0x80000000 /* more flags set in flags2 */
-
-/*
- * XFS mount option flags -- args->flags2
- */
-#define XFSMNT2_COMPAT_IOSIZE 0x00000001 /* don't report large preferred
- * I/O size in stat(2) */
-
-#endif /* __XFS_CLNT_H__ */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 473671fa5c1..a514ab61665 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -17,38 +18,29 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
-#include "xfs_btree.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
-#include "xfs_dir_leaf.h"
-#include "xfs_dir2_data.h"
-#include "xfs_dir2_leaf.h"
-#include "xfs_dir2_block.h"
-#include "xfs_dir2_node.h"
#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
/*
* xfs_da_btree.c
@@ -63,44 +55,237 @@
/*
* Routines used for growing the Btree.
*/
-STATIC int xfs_da_root_split(xfs_da_state_t *state,
+STATIC int xfs_da3_root_split(xfs_da_state_t *state,
xfs_da_state_blk_t *existing_root,
xfs_da_state_blk_t *new_child);
-STATIC int xfs_da_node_split(xfs_da_state_t *state,
+STATIC int xfs_da3_node_split(xfs_da_state_t *state,
xfs_da_state_blk_t *existing_blk,
xfs_da_state_blk_t *split_blk,
xfs_da_state_blk_t *blk_to_add,
int treelevel,
int *result);
-STATIC void xfs_da_node_rebalance(xfs_da_state_t *state,
+STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state,
xfs_da_state_blk_t *node_blk_1,
xfs_da_state_blk_t *node_blk_2);
-STATIC void xfs_da_node_add(xfs_da_state_t *state,
+STATIC void xfs_da3_node_add(xfs_da_state_t *state,
xfs_da_state_blk_t *old_node_blk,
xfs_da_state_blk_t *new_node_blk);
/*
* Routines used for shrinking the Btree.
*/
-STATIC int xfs_da_root_join(xfs_da_state_t *state,
+STATIC int xfs_da3_root_join(xfs_da_state_t *state,
xfs_da_state_blk_t *root_blk);
-STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval);
-STATIC void xfs_da_node_remove(xfs_da_state_t *state,
+STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval);
+STATIC void xfs_da3_node_remove(xfs_da_state_t *state,
xfs_da_state_blk_t *drop_blk);
-STATIC void xfs_da_node_unbalance(xfs_da_state_t *state,
+STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state,
xfs_da_state_blk_t *src_node_blk,
xfs_da_state_blk_t *dst_node_blk);
/*
* Utility routines.
*/
-STATIC uint xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count);
-STATIC int xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp);
-STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra);
-STATIC int xfs_da_blk_unlink(xfs_da_state_t *state,
+STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state,
xfs_da_state_blk_t *drop_blk,
xfs_da_state_blk_t *save_blk);
-STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state);
+
+
+kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
+
+/*
+ * Allocate a dir-state structure.
+ * We don't put them on the stack since they're large.
+ */
+xfs_da_state_t *
+xfs_da_state_alloc(void)
+{
+ return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
+}
+
+/*
+ * Kill the altpath contents of a da-state structure.
+ */
+STATIC void
+xfs_da_state_kill_altpath(xfs_da_state_t *state)
+{
+ int i;
+
+ for (i = 0; i < state->altpath.active; i++)
+ state->altpath.blk[i].bp = NULL;
+ state->altpath.active = 0;
+}
+
+/*
+ * Free a da-state structure.
+ */
+void
+xfs_da_state_free(xfs_da_state_t *state)
+{
+ xfs_da_state_kill_altpath(state);
+#ifdef DEBUG
+ memset((char *)state, 0, sizeof(*state));
+#endif /* DEBUG */
+ kmem_zone_free(xfs_da_state_zone, state);
+}
+
+static bool
+xfs_da3_node_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_da_intnode *hdr = bp->b_addr;
+ struct xfs_da3_icnode_hdr ichdr;
+ const struct xfs_dir_ops *ops;
+
+ ops = xfs_dir_get_ops(mp, NULL);
+
+ ops->node_hdr_from_disk(&ichdr, hdr);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+ if (ichdr.magic != XFS_DA3_NODE_MAGIC)
+ return false;
+
+ if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
+ return false;
+ } else {
+ if (ichdr.magic != XFS_DA_NODE_MAGIC)
+ return false;
+ }
+ if (ichdr.level == 0)
+ return false;
+ if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
+ return false;
+ if (ichdr.count == 0)
+ return false;
+
+ /*
+ * we don't know if the node is for and attribute or directory tree,
+ * so only fail if the count is outside both bounds
+ */
+ if (ichdr.count > mp->m_dir_geo->node_ents &&
+ ichdr.count > mp->m_attr_geo->node_ents)
+ return false;
+
+ /* XXX: hash order check? */
+
+ return true;
+}
+
+static void
+xfs_da3_node_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_da3_node_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF);
+}
+
+/*
+ * leaf/node format detection on trees is sketchy, so a node read can be done on
+ * leaf level blocks when detection identifies the tree as a node format tree
+ * incorrectly. In this case, we need to swap the verifier to match the correct
+ * format of the block being read.
+ */
+static void
+xfs_da3_node_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DA3_NODE_MAGIC:
+ if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ break;
+ }
+ /* fall through */
+ case XFS_DA_NODE_MAGIC:
+ if (!xfs_da3_node_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ break;
+ }
+ return;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ default:
+ break;
+ }
+
+ /* corrupt block */
+ xfs_verifier_error(bp);
+}
+
+const struct xfs_buf_ops xfs_da3_node_buf_ops = {
+ .verify_read = xfs_da3_node_read_verify,
+ .verify_write = xfs_da3_node_write_verify,
+};
+
+int
+xfs_da3_node_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp,
+ int which_fork)
+{
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+ which_fork, &xfs_da3_node_buf_ops);
+ if (!err && tp) {
+ struct xfs_da_blkinfo *info = (*bpp)->b_addr;
+ int type;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ type = XFS_BLFT_DA_NODE_BUF;
+ break;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ type = XFS_BLFT_ATTR_LEAF_BUF;
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ type = XFS_BLFT_DIR_LEAFN_BUF;
+ break;
+ default:
+ type = 0;
+ ASSERT(0);
+ break;
+ }
+ xfs_trans_buf_set_type(tp, *bpp, type);
+ }
+ return err;
+}
/*========================================================================
* Routines used for growing the Btree.
@@ -110,29 +295,46 @@ STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state);
* Create the initial contents of an intermediate node.
*/
int
-xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
- xfs_dabuf_t **bpp, int whichfork)
+xfs_da3_node_create(
+ struct xfs_da_args *args,
+ xfs_dablk_t blkno,
+ int level,
+ struct xfs_buf **bpp,
+ int whichfork)
{
- xfs_da_intnode_t *node;
- xfs_dabuf_t *bp;
- int error;
- xfs_trans_t *tp;
-
- tp = args->trans;
- error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
+ struct xfs_da_intnode *node;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_da3_icnode_hdr ichdr = {0};
+ struct xfs_buf *bp;
+ int error;
+ struct xfs_inode *dp = args->dp;
+
+ trace_xfs_da_node_create(args);
+ ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
+
+ error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
if (error)
return(error);
- ASSERT(bp != NULL);
- node = bp->data;
- node->hdr.info.forw = 0;
- node->hdr.info.back = 0;
- INT_SET(node->hdr.info.magic, ARCH_CONVERT, XFS_DA_NODE_MAGIC);
- node->hdr.info.pad = 0;
- node->hdr.count = 0;
- INT_SET(node->hdr.level, ARCH_CONVERT, level);
-
- xfs_da_log_buf(tp, bp,
- XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
+ node = bp->b_addr;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+ ichdr.magic = XFS_DA3_NODE_MAGIC;
+ hdr3->info.blkno = cpu_to_be64(bp->b_bn);
+ hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
+ uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid);
+ } else {
+ ichdr.magic = XFS_DA_NODE_MAGIC;
+ }
+ ichdr.level = level;
+
+ dp->d_ops->node_hdr_to_disk(node, &ichdr);
+ xfs_trans_log_buf(tp, bp,
+ XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
*bpp = bp;
return(0);
@@ -143,12 +345,20 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
* intermediate nodes, rebalance, etc.
*/
int /* error */
-xfs_da_split(xfs_da_state_t *state)
+xfs_da3_split(
+ struct xfs_da_state *state)
{
- xfs_da_state_blk_t *oldblk, *newblk, *addblk;
- xfs_da_intnode_t *node;
- xfs_dabuf_t *bp;
- int max, action, error, i;
+ struct xfs_da_state_blk *oldblk;
+ struct xfs_da_state_blk *newblk;
+ struct xfs_da_state_blk *addblk;
+ struct xfs_da_intnode *node;
+ struct xfs_buf *bp;
+ int max;
+ int action = 0;
+ int error;
+ int i;
+
+ trace_xfs_da_split(state->args);
/*
* Walk back up the tree splitting/inserting/adjusting as necessary.
@@ -159,7 +369,7 @@ xfs_da_split(xfs_da_state_t *state)
max = state->path.active - 1;
ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
- state->path.blk[max].magic == XFS_DIRX_LEAF_MAGIC(state->mp));
+ state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
addblk = &state->path.blk[max]; /* initial dummy value */
for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
@@ -174,7 +384,7 @@ xfs_da_split(xfs_da_state_t *state)
*/
switch (oldblk->magic) {
case XFS_ATTR_LEAF_MAGIC:
- error = xfs_attr_leaf_split(state, oldblk, newblk);
+ error = xfs_attr3_leaf_split(state, oldblk, newblk);
if ((error != 0) && (error != ENOSPC)) {
return(error); /* GROT: attr is inconsistent */
}
@@ -188,58 +398,28 @@ xfs_da_split(xfs_da_state_t *state)
state->extravalid = 1;
if (state->inleaf) {
state->extraafter = 0; /* before newblk */
- error = xfs_attr_leaf_split(state, oldblk,
+ trace_xfs_attr_leaf_split_before(state->args);
+ error = xfs_attr3_leaf_split(state, oldblk,
&state->extrablk);
} else {
state->extraafter = 1; /* after newblk */
- error = xfs_attr_leaf_split(state, newblk,
+ trace_xfs_attr_leaf_split_after(state->args);
+ error = xfs_attr3_leaf_split(state, newblk,
&state->extrablk);
}
if (error)
return(error); /* GROT: attr inconsistent */
addblk = newblk;
break;
- case XFS_DIR_LEAF_MAGIC:
- ASSERT(XFS_DIR_IS_V1(state->mp));
- error = xfs_dir_leaf_split(state, oldblk, newblk);
- if ((error != 0) && (error != ENOSPC)) {
- return(error); /* GROT: dir is inconsistent */
- }
- if (!error) {
- addblk = newblk;
- break;
- }
- /*
- * Entry wouldn't fit, split the leaf again.
- */
- state->extravalid = 1;
- if (state->inleaf) {
- state->extraafter = 0; /* before newblk */
- error = xfs_dir_leaf_split(state, oldblk,
- &state->extrablk);
- if (error)
- return(error); /* GROT: dir incon. */
- addblk = newblk;
- } else {
- state->extraafter = 1; /* after newblk */
- error = xfs_dir_leaf_split(state, newblk,
- &state->extrablk);
- if (error)
- return(error); /* GROT: dir incon. */
- addblk = newblk;
- }
- break;
case XFS_DIR2_LEAFN_MAGIC:
- ASSERT(XFS_DIR_IS_V2(state->mp));
error = xfs_dir2_leafn_split(state, oldblk, newblk);
if (error)
return error;
addblk = newblk;
break;
case XFS_DA_NODE_MAGIC:
- error = xfs_da_node_split(state, oldblk, newblk, addblk,
+ error = xfs_da3_node_split(state, oldblk, newblk, addblk,
max - i, &action);
- xfs_da_buf_done(addblk->bp);
addblk->bp = NULL;
if (error)
return(error); /* GROT: dir is inconsistent */
@@ -256,14 +436,7 @@ xfs_da_split(xfs_da_state_t *state)
/*
* Update the btree to show the new hashval for this child.
*/
- xfs_da_fixhashpath(state, &state->path);
- /*
- * If we won't need this block again, it's getting dropped
- * from the active path by the loop control, so we need
- * to mark it done now.
- */
- if (i > 0 || !addblk)
- xfs_da_buf_done(oldblk->bp);
+ xfs_da3_fixhashpath(state, &state->path);
}
if (!addblk)
return(0);
@@ -273,10 +446,8 @@ xfs_da_split(xfs_da_state_t *state)
*/
ASSERT(state->path.active == 0);
oldblk = &state->path.blk[0];
- error = xfs_da_root_split(state, oldblk, addblk);
+ error = xfs_da3_root_split(state, oldblk, addblk);
if (error) {
- xfs_da_buf_done(oldblk->bp);
- xfs_da_buf_done(addblk->bp);
addblk->bp = NULL;
return(error); /* GROT: dir is inconsistent */
}
@@ -286,38 +457,40 @@ xfs_da_split(xfs_da_state_t *state)
* just got bumped because of the addition of a new root node.
* There might be three blocks involved if a double split occurred,
* and the original block 0 could be at any position in the list.
+ *
+ * Note: the magic numbers and sibling pointers are in the same
+ * physical place for both v2 and v3 headers (by design). Hence it
+ * doesn't matter which version of the xfs_da_intnode structure we use
+ * here as the result will be the same using either structure.
*/
-
- node = oldblk->bp->data;
+ node = oldblk->bp->b_addr;
if (node->hdr.info.forw) {
- if (INT_GET(node->hdr.info.forw, ARCH_CONVERT) == addblk->blkno) {
+ if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
bp = addblk->bp;
} else {
ASSERT(state->extravalid);
bp = state->extrablk.bp;
}
- node = bp->data;
- INT_SET(node->hdr.info.back, ARCH_CONVERT, oldblk->blkno);
- xfs_da_log_buf(state->args->trans, bp,
+ node = bp->b_addr;
+ node->hdr.info.back = cpu_to_be32(oldblk->blkno);
+ xfs_trans_log_buf(state->args->trans, bp,
XFS_DA_LOGRANGE(node, &node->hdr.info,
sizeof(node->hdr.info)));
}
- node = oldblk->bp->data;
- if (INT_GET(node->hdr.info.back, ARCH_CONVERT)) {
- if (INT_GET(node->hdr.info.back, ARCH_CONVERT) == addblk->blkno) {
+ node = oldblk->bp->b_addr;
+ if (node->hdr.info.back) {
+ if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
bp = addblk->bp;
} else {
ASSERT(state->extravalid);
bp = state->extrablk.bp;
}
- node = bp->data;
- INT_SET(node->hdr.info.forw, ARCH_CONVERT, oldblk->blkno);
- xfs_da_log_buf(state->args->trans, bp,
+ node = bp->b_addr;
+ node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
+ xfs_trans_log_buf(state->args->trans, bp,
XFS_DA_LOGRANGE(node, &node->hdr.info,
sizeof(node->hdr.info)));
}
- xfs_da_buf_done(oldblk->bp);
- xfs_da_buf_done(addblk->bp);
addblk->bp = NULL;
return(0);
}
@@ -328,114 +501,169 @@ xfs_da_split(xfs_da_state_t *state)
* the EOF, extending the inode in process.
*/
STATIC int /* error */
-xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
- xfs_da_state_blk_t *blk2)
+xfs_da3_root_split(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_da_state_blk *blk2)
{
- xfs_da_intnode_t *node, *oldroot;
- xfs_da_args_t *args;
- xfs_dablk_t blkno;
- xfs_dabuf_t *bp;
- int error, size;
- xfs_inode_t *dp;
- xfs_trans_t *tp;
- xfs_mount_t *mp;
- xfs_dir2_leaf_t *leaf;
+ struct xfs_da_intnode *node;
+ struct xfs_da_intnode *oldroot;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_args *args;
+ struct xfs_buf *bp;
+ struct xfs_inode *dp;
+ struct xfs_trans *tp;
+ struct xfs_mount *mp;
+ struct xfs_dir2_leaf *leaf;
+ xfs_dablk_t blkno;
+ int level;
+ int error;
+ int size;
+
+ trace_xfs_da_root_split(state->args);
/*
* Copy the existing (incorrect) block from the root node position
* to a free space somewhere.
*/
args = state->args;
- ASSERT(args != NULL);
error = xfs_da_grow_inode(args, &blkno);
if (error)
- return(error);
+ return error;
+
dp = args->dp;
tp = args->trans;
mp = state->mp;
error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
if (error)
- return(error);
- ASSERT(bp != NULL);
- node = bp->data;
- oldroot = blk1->bp->data;
- if (INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
- size = (int)((char *)&oldroot->btree[INT_GET(oldroot->hdr.count, ARCH_CONVERT)] -
- (char *)oldroot);
+ return error;
+ node = bp->b_addr;
+ oldroot = blk1->bp->b_addr;
+ if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+ oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
+ struct xfs_da3_icnode_hdr nodehdr;
+
+ dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot);
+ btree = dp->d_ops->node_tree_p(oldroot);
+ size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot);
+ level = nodehdr.level;
+
+ /*
+ * we are about to copy oldroot to bp, so set up the type
+ * of bp while we know exactly what it will be.
+ */
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
} else {
- ASSERT(XFS_DIR_IS_V2(mp));
- ASSERT(INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
+
leaf = (xfs_dir2_leaf_t *)oldroot;
- size = (int)((char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] -
- (char *)leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+ size = (int)((char *)&ents[leafhdr.count] - (char *)leaf);
+ level = 0;
+
+ /*
+ * we are about to copy oldroot to bp, so set up the type
+ * of bp while we know exactly what it will be.
+ */
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
}
+
+ /*
+ * we can copy most of the information in the node from one block to
+ * another, but for CRC enabled headers we have to make sure that the
+ * block specific identifiers are kept intact. We update the buffer
+ * directly for this.
+ */
memcpy(node, oldroot, size);
- xfs_da_log_buf(tp, bp, 0, size - 1);
- xfs_da_buf_done(blk1->bp);
+ if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
+ oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+ struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
+
+ node3->hdr.info.blkno = cpu_to_be64(bp->b_bn);
+ }
+ xfs_trans_log_buf(tp, bp, 0, size - 1);
+
+ bp->b_ops = blk1->bp->b_ops;
+ xfs_trans_buf_copy_type(bp, blk1->bp);
blk1->bp = bp;
blk1->blkno = blkno;
/*
* Set up the new root node.
*/
- error = xfs_da_node_create(args,
- args->whichfork == XFS_DATA_FORK &&
- XFS_DIR_IS_V2(mp) ? mp->m_dirleafblk : 0,
- INT_GET(node->hdr.level, ARCH_CONVERT) + 1, &bp, args->whichfork);
+ error = xfs_da3_node_create(args,
+ (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0,
+ level + 1, &bp, args->whichfork);
if (error)
- return(error);
- node = bp->data;
- INT_SET(node->btree[0].hashval, ARCH_CONVERT, blk1->hashval);
- INT_SET(node->btree[0].before, ARCH_CONVERT, blk1->blkno);
- INT_SET(node->btree[1].hashval, ARCH_CONVERT, blk2->hashval);
- INT_SET(node->btree[1].before, ARCH_CONVERT, blk2->blkno);
- INT_SET(node->hdr.count, ARCH_CONVERT, 2);
+ return error;
+
+ node = bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
+ btree[0].hashval = cpu_to_be32(blk1->hashval);
+ btree[0].before = cpu_to_be32(blk1->blkno);
+ btree[1].hashval = cpu_to_be32(blk2->hashval);
+ btree[1].before = cpu_to_be32(blk2->blkno);
+ nodehdr.count = 2;
+ dp->d_ops->node_hdr_to_disk(node, &nodehdr);
#ifdef DEBUG
- if (INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) {
- ASSERT(blk1->blkno >= mp->m_dirleafblk &&
- blk1->blkno < mp->m_dirfreeblk);
- ASSERT(blk2->blkno >= mp->m_dirleafblk &&
- blk2->blkno < mp->m_dirfreeblk);
+ if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+ ASSERT(blk1->blkno >= args->geo->leafblk &&
+ blk1->blkno < args->geo->freeblk);
+ ASSERT(blk2->blkno >= args->geo->leafblk &&
+ blk2->blkno < args->geo->freeblk);
}
#endif
/* Header is already logged by xfs_da_node_create */
- xfs_da_log_buf(tp, bp,
- XFS_DA_LOGRANGE(node, node->btree,
- sizeof(xfs_da_node_entry_t) * 2));
- xfs_da_buf_done(bp);
+ xfs_trans_log_buf(tp, bp,
+ XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2));
- return(0);
+ return 0;
}
/*
* Split the node, rebalance, then add the new entry.
*/
STATIC int /* error */
-xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
- xfs_da_state_blk_t *newblk,
- xfs_da_state_blk_t *addblk,
- int treelevel, int *result)
+xfs_da3_node_split(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk,
+ struct xfs_da_state_blk *addblk,
+ int treelevel,
+ int *result)
{
- xfs_da_intnode_t *node;
- xfs_dablk_t blkno;
- int newcount, error;
- int useextra;
+ struct xfs_da_intnode *node;
+ struct xfs_da3_icnode_hdr nodehdr;
+ xfs_dablk_t blkno;
+ int newcount;
+ int error;
+ int useextra;
+ struct xfs_inode *dp = state->args->dp;
- node = oldblk->bp->data;
- ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+ trace_xfs_da_node_split(state->args);
+
+ node = oldblk->bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
/*
- * With V2 the extra block is data or freespace.
+ * With V2 dirs the extra block is data or freespace.
*/
- useextra = state->extravalid && (XFS_DIR_IS_V1(state->mp) ||
- state->args->whichfork == XFS_ATTR_FORK);
+ useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
newcount = 1 + useextra;
/*
* Do we have to split the node?
*/
- if ((INT_GET(node->hdr.count, ARCH_CONVERT) + newcount) > state->node_ents) {
+ if (nodehdr.count + newcount > state->args->geo->node_ents) {
/*
* Allocate a new node, add to the doubly linked chain of
* nodes, then move some of our excess entries into it.
@@ -444,14 +672,14 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
if (error)
return(error); /* GROT: dir is inconsistent */
- error = xfs_da_node_create(state->args, blkno, treelevel,
+ error = xfs_da3_node_create(state->args, blkno, treelevel,
&newblk->bp, state->args->whichfork);
if (error)
return(error); /* GROT: dir is inconsistent */
newblk->blkno = blkno;
newblk->magic = XFS_DA_NODE_MAGIC;
- xfs_da_node_rebalance(state, oldblk, newblk);
- error = xfs_da_blk_link(state, oldblk, newblk);
+ xfs_da3_node_rebalance(state, oldblk, newblk);
+ error = xfs_da3_blk_link(state, oldblk, newblk);
if (error)
return(error);
*result = 1;
@@ -463,7 +691,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* Insert the new entry(s) into the correct block
* (updating last hashval in the process).
*
- * xfs_da_node_add() inserts BEFORE the given index,
+ * xfs_da3_node_add() inserts BEFORE the given index,
* and as a result of using node_lookup_int() we always
* point to a valid entry (not after one), but a split
* operation always results in a new block whose hashvals
@@ -471,23 +699,24 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
*
* If we had double-split op below us, then add the extra block too.
*/
- node = oldblk->bp->data;
- if (oldblk->index <= INT_GET(node->hdr.count, ARCH_CONVERT)) {
+ node = oldblk->bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ if (oldblk->index <= nodehdr.count) {
oldblk->index++;
- xfs_da_node_add(state, oldblk, addblk);
+ xfs_da3_node_add(state, oldblk, addblk);
if (useextra) {
if (state->extraafter)
oldblk->index++;
- xfs_da_node_add(state, oldblk, &state->extrablk);
+ xfs_da3_node_add(state, oldblk, &state->extrablk);
state->extravalid = 0;
}
} else {
newblk->index++;
- xfs_da_node_add(state, newblk, addblk);
+ xfs_da3_node_add(state, newblk, addblk);
if (useextra) {
if (state->extraafter)
newblk->index++;
- xfs_da_node_add(state, newblk, &state->extrablk);
+ xfs_da3_node_add(state, newblk, &state->extrablk);
state->extravalid = 0;
}
}
@@ -502,31 +731,54 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* NOTE: if blk2 is empty, then it will get the upper half of blk1.
*/
STATIC void
-xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
- xfs_da_state_blk_t *blk2)
+xfs_da3_node_rebalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_da_state_blk *blk2)
{
- xfs_da_intnode_t *node1, *node2, *tmpnode;
- xfs_da_node_entry_t *btree_s, *btree_d;
- int count, tmp;
- xfs_trans_t *tp;
+ struct xfs_da_intnode *node1;
+ struct xfs_da_intnode *node2;
+ struct xfs_da_intnode *tmpnode;
+ struct xfs_da_node_entry *btree1;
+ struct xfs_da_node_entry *btree2;
+ struct xfs_da_node_entry *btree_s;
+ struct xfs_da_node_entry *btree_d;
+ struct xfs_da3_icnode_hdr nodehdr1;
+ struct xfs_da3_icnode_hdr nodehdr2;
+ struct xfs_trans *tp;
+ int count;
+ int tmp;
+ int swap = 0;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_node_rebalance(state->args);
+
+ node1 = blk1->bp->b_addr;
+ node2 = blk2->bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+ dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+ btree1 = dp->d_ops->node_tree_p(node1);
+ btree2 = dp->d_ops->node_tree_p(node2);
- node1 = blk1->bp->data;
- node2 = blk2->bp->data;
/*
* Figure out how many entries need to move, and in which direction.
* Swap the nodes around if that makes it simpler.
*/
- if ((INT_GET(node1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(node2->hdr.count, ARCH_CONVERT) > 0) &&
- ((INT_GET(node2->btree[ 0 ].hashval, ARCH_CONVERT) < INT_GET(node1->btree[ 0 ].hashval, ARCH_CONVERT)) ||
- (INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
- INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
+ if (nodehdr1.count > 0 && nodehdr2.count > 0 &&
+ ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+ (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) <
+ be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) {
tmpnode = node1;
node1 = node2;
node2 = tmpnode;
+ dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+ dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+ btree1 = dp->d_ops->node_tree_p(node1);
+ btree2 = dp->d_ops->node_tree_p(node2);
+ swap = 1;
}
- ASSERT(INT_GET(node1->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
- ASSERT(INT_GET(node2->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
- count = (INT_GET(node1->hdr.count, ARCH_CONVERT) - INT_GET(node2->hdr.count, ARCH_CONVERT)) / 2;
+
+ count = (nodehdr1.count - nodehdr2.count) / 2;
if (count == 0)
return;
tp = state->args->trans;
@@ -537,10 +789,11 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
/*
* Move elements in node2 up to make a hole.
*/
- if ((tmp = INT_GET(node2->hdr.count, ARCH_CONVERT)) > 0) {
+ tmp = nodehdr2.count;
+ if (tmp > 0) {
tmp *= (uint)sizeof(xfs_da_node_entry_t);
- btree_s = &node2->btree[0];
- btree_d = &node2->btree[count];
+ btree_s = &btree2[0];
+ btree_d = &btree2[count];
memmove(btree_d, btree_s, tmp);
}
@@ -548,13 +801,12 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* Move the req'd B-tree elements from high in node1 to
* low in node2.
*/
- INT_MOD(node2->hdr.count, ARCH_CONVERT, count);
+ nodehdr2.count += count;
tmp = count * (uint)sizeof(xfs_da_node_entry_t);
- btree_s = &node1->btree[INT_GET(node1->hdr.count, ARCH_CONVERT) - count];
- btree_d = &node2->btree[0];
+ btree_s = &btree1[nodehdr1.count - count];
+ btree_d = &btree2[0];
memcpy(btree_d, btree_s, tmp);
- INT_MOD(node1->hdr.count, ARCH_CONVERT, -(count));
-
+ nodehdr1.count -= count;
} else {
/*
* Move the req'd B-tree elements from low in node2 to
@@ -562,49 +814,59 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
*/
count = -count;
tmp = count * (uint)sizeof(xfs_da_node_entry_t);
- btree_s = &node2->btree[0];
- btree_d = &node1->btree[INT_GET(node1->hdr.count, ARCH_CONVERT)];
+ btree_s = &btree2[0];
+ btree_d = &btree1[nodehdr1.count];
memcpy(btree_d, btree_s, tmp);
- INT_MOD(node1->hdr.count, ARCH_CONVERT, count);
- xfs_da_log_buf(tp, blk1->bp,
+ nodehdr1.count += count;
+
+ xfs_trans_log_buf(tp, blk1->bp,
XFS_DA_LOGRANGE(node1, btree_d, tmp));
/*
* Move elements in node2 down to fill the hole.
*/
- tmp = INT_GET(node2->hdr.count, ARCH_CONVERT) - count;
+ tmp = nodehdr2.count - count;
tmp *= (uint)sizeof(xfs_da_node_entry_t);
- btree_s = &node2->btree[count];
- btree_d = &node2->btree[0];
+ btree_s = &btree2[count];
+ btree_d = &btree2[0];
memmove(btree_d, btree_s, tmp);
- INT_MOD(node2->hdr.count, ARCH_CONVERT, -(count));
+ nodehdr2.count -= count;
}
/*
* Log header of node 1 and all current bits of node 2.
*/
- xfs_da_log_buf(tp, blk1->bp,
- XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr)));
- xfs_da_log_buf(tp, blk2->bp,
+ dp->d_ops->node_hdr_to_disk(node1, &nodehdr1);
+ xfs_trans_log_buf(tp, blk1->bp,
+ XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size));
+
+ dp->d_ops->node_hdr_to_disk(node2, &nodehdr2);
+ xfs_trans_log_buf(tp, blk2->bp,
XFS_DA_LOGRANGE(node2, &node2->hdr,
- sizeof(node2->hdr) +
- sizeof(node2->btree[0]) * INT_GET(node2->hdr.count, ARCH_CONVERT)));
+ dp->d_ops->node_hdr_size +
+ (sizeof(btree2[0]) * nodehdr2.count)));
/*
* Record the last hashval from each block for upward propagation.
* (note: don't use the swapped node pointers)
*/
- node1 = blk1->bp->data;
- node2 = blk2->bp->data;
- blk1->hashval = INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
- blk2->hashval = INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+ if (swap) {
+ node1 = blk1->bp->b_addr;
+ node2 = blk2->bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+ dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+ btree1 = dp->d_ops->node_tree_p(node1);
+ btree2 = dp->d_ops->node_tree_p(node2);
+ }
+ blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
+ blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
/*
* Adjust the expected index for insertion.
*/
- if (blk1->index >= INT_GET(node1->hdr.count, ARCH_CONVERT)) {
- blk2->index = blk1->index - INT_GET(node1->hdr.count, ARCH_CONVERT);
- blk1->index = INT_GET(node1->hdr.count, ARCH_CONVERT) + 1; /* make it invalid */
+ if (blk1->index >= nodehdr1.count) {
+ blk2->index = blk1->index - nodehdr1.count;
+ blk1->index = nodehdr1.count + 1; /* make it invalid */
}
}
@@ -612,44 +874,52 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* Add a new entry to an intermediate node.
*/
STATIC void
-xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
- xfs_da_state_blk_t *newblk)
+xfs_da3_node_add(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk)
{
- xfs_da_intnode_t *node;
- xfs_da_node_entry_t *btree;
- int tmp;
- xfs_mount_t *mp;
+ struct xfs_da_intnode *node;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_node_entry *btree;
+ int tmp;
+ struct xfs_inode *dp = state->args->dp;
- node = oldblk->bp->data;
- mp = state->mp;
- ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
- ASSERT((oldblk->index >= 0) && (oldblk->index <= INT_GET(node->hdr.count, ARCH_CONVERT)));
+ trace_xfs_da_node_add(state->args);
+
+ node = oldblk->bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
+
+ ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
ASSERT(newblk->blkno != 0);
- if (state->args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
- ASSERT(newblk->blkno >= mp->m_dirleafblk &&
- newblk->blkno < mp->m_dirfreeblk);
+ if (state->args->whichfork == XFS_DATA_FORK)
+ ASSERT(newblk->blkno >= state->args->geo->leafblk &&
+ newblk->blkno < state->args->geo->freeblk);
/*
* We may need to make some room before we insert the new node.
*/
tmp = 0;
- btree = &node->btree[ oldblk->index ];
- if (oldblk->index < INT_GET(node->hdr.count, ARCH_CONVERT)) {
- tmp = (INT_GET(node->hdr.count, ARCH_CONVERT) - oldblk->index) * (uint)sizeof(*btree);
- memmove(btree + 1, btree, tmp);
+ if (oldblk->index < nodehdr.count) {
+ tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree);
+ memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp);
}
- INT_SET(btree->hashval, ARCH_CONVERT, newblk->hashval);
- INT_SET(btree->before, ARCH_CONVERT, newblk->blkno);
- xfs_da_log_buf(state->args->trans, oldblk->bp,
- XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree)));
- INT_MOD(node->hdr.count, ARCH_CONVERT, +1);
- xfs_da_log_buf(state->args->trans, oldblk->bp,
- XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+ btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval);
+ btree[oldblk->index].before = cpu_to_be32(newblk->blkno);
+ xfs_trans_log_buf(state->args->trans, oldblk->bp,
+ XFS_DA_LOGRANGE(node, &btree[oldblk->index],
+ tmp + sizeof(*btree)));
+
+ nodehdr.count += 1;
+ dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+ xfs_trans_log_buf(state->args->trans, oldblk->bp,
+ XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
/*
* Copy the last hash value from the oldblk to propagate upwards.
*/
- oldblk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+ oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
}
/*========================================================================
@@ -661,17 +931,21 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* possibly deallocating that block, etc...
*/
int
-xfs_da_join(xfs_da_state_t *state)
+xfs_da3_join(
+ struct xfs_da_state *state)
{
- xfs_da_state_blk_t *drop_blk, *save_blk;
- int action, error;
+ struct xfs_da_state_blk *drop_blk;
+ struct xfs_da_state_blk *save_blk;
+ int action = 0;
+ int error;
+
+ trace_xfs_da_join(state->args);
- action = 0;
drop_blk = &state->path.blk[ state->path.active-1 ];
save_blk = &state->altpath.blk[ state->path.active-1 ];
ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
- drop_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp));
+ drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
/*
* Walk back up the tree joining/deallocating as necessary.
@@ -687,24 +961,14 @@ xfs_da_join(xfs_da_state_t *state)
*/
switch (drop_blk->magic) {
case XFS_ATTR_LEAF_MAGIC:
- error = xfs_attr_leaf_toosmall(state, &action);
+ error = xfs_attr3_leaf_toosmall(state, &action);
if (error)
return(error);
if (action == 0)
return(0);
- xfs_attr_leaf_unbalance(state, drop_blk, save_blk);
- break;
- case XFS_DIR_LEAF_MAGIC:
- ASSERT(XFS_DIR_IS_V1(state->mp));
- error = xfs_dir_leaf_toosmall(state, &action);
- if (error)
- return(error);
- if (action == 0)
- return(0);
- xfs_dir_leaf_unbalance(state, drop_blk, save_blk);
+ xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
break;
case XFS_DIR2_LEAFN_MAGIC:
- ASSERT(XFS_DIR_IS_V2(state->mp));
error = xfs_dir2_leafn_toosmall(state, &action);
if (error)
return error;
@@ -717,18 +981,18 @@ xfs_da_join(xfs_da_state_t *state)
* Remove the offending node, fixup hashvals,
* check for a toosmall neighbor.
*/
- xfs_da_node_remove(state, drop_blk);
- xfs_da_fixhashpath(state, &state->path);
- error = xfs_da_node_toosmall(state, &action);
+ xfs_da3_node_remove(state, drop_blk);
+ xfs_da3_fixhashpath(state, &state->path);
+ error = xfs_da3_node_toosmall(state, &action);
if (error)
return(error);
if (action == 0)
return 0;
- xfs_da_node_unbalance(state, drop_blk, save_blk);
+ xfs_da3_node_unbalance(state, drop_blk, save_blk);
break;
}
- xfs_da_fixhashpath(state, &state->altpath);
- error = xfs_da_blk_unlink(state, drop_blk, save_blk);
+ xfs_da3_fixhashpath(state, &state->altpath);
+ error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
xfs_da_state_kill_altpath(state);
if (error)
return(error);
@@ -743,63 +1007,97 @@ xfs_da_join(xfs_da_state_t *state)
* we only have one entry in the root, make the child block
* the new root.
*/
- xfs_da_node_remove(state, drop_blk);
- xfs_da_fixhashpath(state, &state->path);
- error = xfs_da_root_join(state, &state->path.blk[0]);
+ xfs_da3_node_remove(state, drop_blk);
+ xfs_da3_fixhashpath(state, &state->path);
+ error = xfs_da3_root_join(state, &state->path.blk[0]);
return(error);
}
+#ifdef DEBUG
+static void
+xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
+{
+ __be16 magic = blkinfo->magic;
+
+ if (level == 1) {
+ ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+ magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+ magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+ } else {
+ ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+ magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+ }
+ ASSERT(!blkinfo->forw);
+ ASSERT(!blkinfo->back);
+}
+#else /* !DEBUG */
+#define xfs_da_blkinfo_onlychild_validate(blkinfo, level)
+#endif /* !DEBUG */
+
/*
* We have only one entry in the root. Copy the only remaining child of
* the old root to block 0 as the new root node.
*/
STATIC int
-xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
+xfs_da3_root_join(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *root_blk)
{
- xfs_da_intnode_t *oldroot;
- /* REFERENCED */
- xfs_da_blkinfo_t *blkinfo;
- xfs_da_args_t *args;
- xfs_dablk_t child;
- xfs_dabuf_t *bp;
- int error;
+ struct xfs_da_intnode *oldroot;
+ struct xfs_da_args *args;
+ xfs_dablk_t child;
+ struct xfs_buf *bp;
+ struct xfs_da3_icnode_hdr oldroothdr;
+ struct xfs_da_node_entry *btree;
+ int error;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_root_join(state->args);
- args = state->args;
- ASSERT(args != NULL);
ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
- oldroot = root_blk->bp->data;
- ASSERT(INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
- ASSERT(!oldroot->hdr.info.forw);
- ASSERT(!oldroot->hdr.info.back);
+
+ args = state->args;
+ oldroot = root_blk->bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot);
+ ASSERT(oldroothdr.forw == 0);
+ ASSERT(oldroothdr.back == 0);
/*
* If the root has more than one child, then don't do anything.
*/
- if (INT_GET(oldroot->hdr.count, ARCH_CONVERT) > 1)
- return(0);
+ if (oldroothdr.count > 1)
+ return 0;
/*
* Read in the (only) child block, then copy those bytes into
* the root block's buffer and free the original child block.
*/
- child = INT_GET(oldroot->btree[ 0 ].before, ARCH_CONVERT);
+ btree = dp->d_ops->node_tree_p(oldroot);
+ child = be32_to_cpu(btree[0].before);
ASSERT(child != 0);
- error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp,
+ error = xfs_da3_node_read(args->trans, dp, child, -1, &bp,
args->whichfork);
if (error)
- return(error);
- ASSERT(bp != NULL);
- blkinfo = bp->data;
- if (INT_GET(oldroot->hdr.level, ARCH_CONVERT) == 1) {
- ASSERT(INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
- INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
- } else {
- ASSERT(INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
- }
- ASSERT(!blkinfo->forw);
- ASSERT(!blkinfo->back);
- memcpy(root_blk->bp->data, bp->data, state->blocksize);
- xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
+ return error;
+ xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
+
+ /*
+ * This could be copying a leaf back into the root block in the case of
+ * there only being a single leaf block left in the tree. Hence we have
+ * to update the b_ops pointer as well to match the buffer type change
+ * that could occur. For dir3 blocks we also need to update the block
+ * number in the buffer header.
+ */
+ memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize);
+ root_blk->bp->b_ops = bp->b_ops;
+ xfs_trans_buf_copy_type(root_blk->bp, bp);
+ if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
+ struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
+ da3->blkno = cpu_to_be64(root_blk->bp->b_bn);
+ }
+ xfs_trans_log_buf(args->trans, root_blk->bp, 0,
+ args->geo->blksize - 1);
error = xfs_da_shrink_inode(args, child, bp);
return(error);
}
@@ -814,14 +1112,24 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
* If nothing can be done, return 0.
*/
STATIC int
-xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
+xfs_da3_node_toosmall(
+ struct xfs_da_state *state,
+ int *action)
{
- xfs_da_intnode_t *node;
- xfs_da_state_blk_t *blk;
- xfs_da_blkinfo_t *info;
- int count, forward, error, retval, i;
- xfs_dablk_t blkno;
- xfs_dabuf_t *bp;
+ struct xfs_da_intnode *node;
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_blkinfo *info;
+ xfs_dablk_t blkno;
+ struct xfs_buf *bp;
+ struct xfs_da3_icnode_hdr nodehdr;
+ int count;
+ int forward;
+ int error;
+ int retval;
+ int i;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_node_toosmall(state->args);
/*
* Check for the degenerate case of the block being over 50% full.
@@ -829,11 +1137,10 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
* to coalesce with a sibling.
*/
blk = &state->path.blk[ state->path.active-1 ];
- info = blk->bp->data;
- ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+ info = blk->bp->b_addr;
node = (xfs_da_intnode_t *)info;
- count = INT_GET(node->hdr.count, ARCH_CONVERT);
- if (count > (state->node_ents >> 1)) {
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
*action = 0; /* blk over 50%, don't try to join */
return(0); /* blk over 50%, don't try to join */
}
@@ -841,17 +1148,17 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
/*
* Check for the degenerate case of the block being empty.
* If the block is empty, we'll simply delete it, no need to
- * coalesce it with a sibling block. We choose (aribtrarily)
+ * coalesce it with a sibling block. We choose (arbitrarily)
* to merge with the forward block unless it is NULL.
*/
- if (count == 0) {
+ if (nodehdr.count == 0) {
/*
* Make altpath point to the block we want to keep and
* path point to the block we want to drop (this one).
*/
- forward = info->forw;
+ forward = (info->forw != 0);
memcpy(&state->altpath, &state->path, sizeof(state->path));
- error = xfs_da_path_shift(state, &state->altpath, forward,
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
0, &retval);
if (error)
return(error);
@@ -870,36 +1177,35 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
* We prefer coalescing with the lower numbered sibling so as
* to shrink a directory over time.
*/
+ count = state->args->geo->node_ents;
+ count -= state->args->geo->node_ents >> 2;
+ count -= nodehdr.count;
+
/* start with smaller blk num */
- forward = (INT_GET(info->forw, ARCH_CONVERT)
- < INT_GET(info->back, ARCH_CONVERT));
+ forward = nodehdr.forw < nodehdr.back;
for (i = 0; i < 2; forward = !forward, i++) {
+ struct xfs_da3_icnode_hdr thdr;
if (forward)
- blkno = INT_GET(info->forw, ARCH_CONVERT);
+ blkno = nodehdr.forw;
else
- blkno = INT_GET(info->back, ARCH_CONVERT);
+ blkno = nodehdr.back;
if (blkno == 0)
continue;
- error = xfs_da_read_buf(state->args->trans, state->args->dp,
+ error = xfs_da3_node_read(state->args->trans, dp,
blkno, -1, &bp, state->args->whichfork);
if (error)
return(error);
- ASSERT(bp != NULL);
-
- node = (xfs_da_intnode_t *)info;
- count = state->node_ents;
- count -= state->node_ents >> 2;
- count -= INT_GET(node->hdr.count, ARCH_CONVERT);
- node = bp->data;
- ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
- count -= INT_GET(node->hdr.count, ARCH_CONVERT);
- xfs_da_brelse(state->args->trans, bp);
- if (count >= 0)
+
+ node = bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&thdr, node);
+ xfs_trans_brelse(state->args->trans, bp);
+
+ if (count - thdr.count >= 0)
break; /* fits with at least 25% to spare */
}
if (i >= 2) {
*action = 0;
- return(0);
+ return 0;
}
/*
@@ -908,28 +1214,43 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
*/
memcpy(&state->altpath, &state->path, sizeof(state->path));
if (blkno < blk->blkno) {
- error = xfs_da_path_shift(state, &state->altpath, forward,
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
0, &retval);
- if (error) {
- return(error);
- }
- if (retval) {
- *action = 0;
- return(0);
- }
} else {
- error = xfs_da_path_shift(state, &state->path, forward,
+ error = xfs_da3_path_shift(state, &state->path, forward,
0, &retval);
- if (error) {
- return(error);
- }
- if (retval) {
- *action = 0;
- return(0);
- }
+ }
+ if (error)
+ return error;
+ if (retval) {
+ *action = 0;
+ return 0;
}
*action = 1;
- return(0);
+ return 0;
+}
+
+/*
+ * Pick up the last hashvalue from an intermediate node.
+ */
+STATIC uint
+xfs_da3_node_lasthash(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp,
+ int *count)
+{
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr nodehdr;
+
+ node = bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ if (count)
+ *count = nodehdr.count;
+ if (!nodehdr.count)
+ return 0;
+ btree = dp->d_ops->node_tree_p(node);
+ return be32_to_cpu(btree[nodehdr.count - 1].hashval);
}
/*
@@ -937,13 +1258,19 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
* when we stop making changes, return.
*/
void
-xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
+xfs_da3_fixhashpath(
+ struct xfs_da_state *state,
+ struct xfs_da_state_path *path)
{
- xfs_da_state_blk_t *blk;
- xfs_da_intnode_t *node;
- xfs_da_node_entry_t *btree;
- xfs_dahash_t lasthash=0;
- int level, count;
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ xfs_dahash_t lasthash=0;
+ int level;
+ int count;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_fixhashpath(state->args);
level = path->active-1;
blk = &path->blk[ level ];
@@ -953,36 +1280,32 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
if (count == 0)
return;
break;
- case XFS_DIR_LEAF_MAGIC:
- ASSERT(XFS_DIR_IS_V1(state->mp));
- lasthash = xfs_dir_leaf_lasthash(blk->bp, &count);
- if (count == 0)
- return;
- break;
case XFS_DIR2_LEAFN_MAGIC:
- ASSERT(XFS_DIR_IS_V2(state->mp));
- lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count);
+ lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count);
if (count == 0)
return;
break;
case XFS_DA_NODE_MAGIC:
- lasthash = xfs_da_node_lasthash(blk->bp, &count);
+ lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count);
if (count == 0)
return;
break;
}
for (blk--, level--; level >= 0; blk--, level--) {
- node = blk->bp->data;
- ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
- btree = &node->btree[ blk->index ];
- if (INT_GET(btree->hashval, ARCH_CONVERT) == lasthash)
+ struct xfs_da3_icnode_hdr nodehdr;
+
+ node = blk->bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
+ if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
break;
blk->hashval = lasthash;
- INT_SET(btree->hashval, ARCH_CONVERT, lasthash);
- xfs_da_log_buf(state->args->trans, blk->bp,
- XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
+ btree[blk->index].hashval = cpu_to_be32(lasthash);
+ xfs_trans_log_buf(state->args->trans, blk->bp,
+ XFS_DA_LOGRANGE(node, &btree[blk->index],
+ sizeof(*btree)));
- lasthash = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+ lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval);
}
}
@@ -990,100 +1313,122 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
* Remove an entry from an intermediate node.
*/
STATIC void
-xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk)
+xfs_da3_node_remove(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk)
{
- xfs_da_intnode_t *node;
- xfs_da_node_entry_t *btree;
- int tmp;
-
- node = drop_blk->bp->data;
- ASSERT(drop_blk->index < INT_GET(node->hdr.count, ARCH_CONVERT));
+ struct xfs_da_intnode *node;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_node_entry *btree;
+ int index;
+ int tmp;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_node_remove(state->args);
+
+ node = drop_blk->bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ ASSERT(drop_blk->index < nodehdr.count);
ASSERT(drop_blk->index >= 0);
/*
* Copy over the offending entry, or just zero it out.
*/
- btree = &node->btree[drop_blk->index];
- if (drop_blk->index < (INT_GET(node->hdr.count, ARCH_CONVERT)-1)) {
- tmp = INT_GET(node->hdr.count, ARCH_CONVERT) - drop_blk->index - 1;
+ index = drop_blk->index;
+ btree = dp->d_ops->node_tree_p(node);
+ if (index < nodehdr.count - 1) {
+ tmp = nodehdr.count - index - 1;
tmp *= (uint)sizeof(xfs_da_node_entry_t);
- memmove(btree, btree + 1, tmp);
- xfs_da_log_buf(state->args->trans, drop_blk->bp,
- XFS_DA_LOGRANGE(node, btree, tmp));
- btree = &node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ];
- }
- memset((char *)btree, 0, sizeof(xfs_da_node_entry_t));
- xfs_da_log_buf(state->args->trans, drop_blk->bp,
- XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
- INT_MOD(node->hdr.count, ARCH_CONVERT, -1);
- xfs_da_log_buf(state->args->trans, drop_blk->bp,
- XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+ memmove(&btree[index], &btree[index + 1], tmp);
+ xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+ XFS_DA_LOGRANGE(node, &btree[index], tmp));
+ index = nodehdr.count - 1;
+ }
+ memset(&btree[index], 0, sizeof(xfs_da_node_entry_t));
+ xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+ XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
+ nodehdr.count -= 1;
+ dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+ xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+ XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
/*
* Copy the last hash value from the block to propagate upwards.
*/
- btree--;
- drop_blk->hashval = INT_GET(btree->hashval, ARCH_CONVERT);
+ drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval);
}
/*
- * Unbalance the btree elements between two intermediate nodes,
+ * Unbalance the elements between two intermediate nodes,
* move all Btree elements from one node into another.
*/
STATIC void
-xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
- xfs_da_state_blk_t *save_blk)
+xfs_da3_node_unbalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk)
{
- xfs_da_intnode_t *drop_node, *save_node;
- xfs_da_node_entry_t *btree;
- int tmp;
- xfs_trans_t *tp;
-
- drop_node = drop_blk->bp->data;
- save_node = save_blk->bp->data;
- ASSERT(INT_GET(drop_node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
- ASSERT(INT_GET(save_node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+ struct xfs_da_intnode *drop_node;
+ struct xfs_da_intnode *save_node;
+ struct xfs_da_node_entry *drop_btree;
+ struct xfs_da_node_entry *save_btree;
+ struct xfs_da3_icnode_hdr drop_hdr;
+ struct xfs_da3_icnode_hdr save_hdr;
+ struct xfs_trans *tp;
+ int sindex;
+ int tmp;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_node_unbalance(state->args);
+
+ drop_node = drop_blk->bp->b_addr;
+ save_node = save_blk->bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node);
+ dp->d_ops->node_hdr_from_disk(&save_hdr, save_node);
+ drop_btree = dp->d_ops->node_tree_p(drop_node);
+ save_btree = dp->d_ops->node_tree_p(save_node);
tp = state->args->trans;
/*
* If the dying block has lower hashvals, then move all the
* elements in the remaining block up to make a hole.
*/
- if ((INT_GET(drop_node->btree[ 0 ].hashval, ARCH_CONVERT) < INT_GET(save_node->btree[ 0 ].hashval, ARCH_CONVERT)) ||
- (INT_GET(drop_node->btree[ INT_GET(drop_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
- INT_GET(save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))
- {
- btree = &save_node->btree[ INT_GET(drop_node->hdr.count, ARCH_CONVERT) ];
- tmp = INT_GET(save_node->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_da_node_entry_t);
- memmove(btree, &save_node->btree[0], tmp);
- btree = &save_node->btree[0];
- xfs_da_log_buf(tp, save_blk->bp,
- XFS_DA_LOGRANGE(save_node, btree,
- (INT_GET(save_node->hdr.count, ARCH_CONVERT) + INT_GET(drop_node->hdr.count, ARCH_CONVERT)) *
- sizeof(xfs_da_node_entry_t)));
+ if ((be32_to_cpu(drop_btree[0].hashval) <
+ be32_to_cpu(save_btree[0].hashval)) ||
+ (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) <
+ be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) {
+ /* XXX: check this - is memmove dst correct? */
+ tmp = save_hdr.count * sizeof(xfs_da_node_entry_t);
+ memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp);
+
+ sindex = 0;
+ xfs_trans_log_buf(tp, save_blk->bp,
+ XFS_DA_LOGRANGE(save_node, &save_btree[0],
+ (save_hdr.count + drop_hdr.count) *
+ sizeof(xfs_da_node_entry_t)));
} else {
- btree = &save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT) ];
- xfs_da_log_buf(tp, save_blk->bp,
- XFS_DA_LOGRANGE(save_node, btree,
- INT_GET(drop_node->hdr.count, ARCH_CONVERT) *
- sizeof(xfs_da_node_entry_t)));
+ sindex = save_hdr.count;
+ xfs_trans_log_buf(tp, save_blk->bp,
+ XFS_DA_LOGRANGE(save_node, &save_btree[sindex],
+ drop_hdr.count * sizeof(xfs_da_node_entry_t)));
}
/*
* Move all the B-tree elements from drop_blk to save_blk.
*/
- tmp = INT_GET(drop_node->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_da_node_entry_t);
- memcpy(btree, &drop_node->btree[0], tmp);
- INT_MOD(save_node->hdr.count, ARCH_CONVERT, INT_GET(drop_node->hdr.count, ARCH_CONVERT));
+ tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t);
+ memcpy(&save_btree[sindex], &drop_btree[0], tmp);
+ save_hdr.count += drop_hdr.count;
- xfs_da_log_buf(tp, save_blk->bp,
+ dp->d_ops->node_hdr_to_disk(save_node, &save_hdr);
+ xfs_trans_log_buf(tp, save_blk->bp,
XFS_DA_LOGRANGE(save_node, &save_node->hdr,
- sizeof(save_node->hdr)));
+ dp->d_ops->node_hdr_size));
/*
* Save the last hashval in the remaining block for upward propagation.
*/
- save_blk->hashval = INT_GET(save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+ save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval);
}
/*========================================================================
@@ -1102,16 +1447,25 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
* pruned depth-first tree search.
*/
int /* error */
-xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
+xfs_da3_node_lookup_int(
+ struct xfs_da_state *state,
+ int *result)
{
- xfs_da_state_blk_t *blk;
- xfs_da_blkinfo_t *curr;
- xfs_da_intnode_t *node;
- xfs_da_node_entry_t *btree;
- xfs_dablk_t blkno;
- int probe, span, max, error, retval;
- xfs_dahash_t hashval;
- xfs_da_args_t *args;
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_blkinfo *curr;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_args *args;
+ xfs_dablk_t blkno;
+ xfs_dahash_t hashval;
+ xfs_dahash_t btreehashval;
+ int probe;
+ int span;
+ int max;
+ int error;
+ int retval;
+ struct xfs_inode *dp = state->args->dp;
args = state->args;
@@ -1119,10 +1473,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
* Descend thru the B-tree searching each level for the right
* node to use, until the right hashval is found.
*/
- if (args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(state->mp))
- blkno = state->mp->m_dirleafblk;
- else
- blkno = 0;
+ blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0;
for (blk = &state->path.blk[0], state->path.active = 1;
state->path.active <= XFS_DA_NODE_MAXDEPTH;
blk++, state->path.active++) {
@@ -1130,80 +1481,85 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
* Read the next node down in the tree.
*/
blk->blkno = blkno;
- error = xfs_da_read_buf(args->trans, args->dp, blkno,
+ error = xfs_da3_node_read(args->trans, args->dp, blkno,
-1, &blk->bp, args->whichfork);
if (error) {
blk->blkno = 0;
state->path.active--;
return(error);
}
- curr = blk->bp->data;
- ASSERT(INT_GET(curr->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC ||
- INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
- INT_GET(curr->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
+ curr = blk->bp->b_addr;
+ blk->magic = be16_to_cpu(curr->magic);
+
+ if (blk->magic == XFS_ATTR_LEAF_MAGIC ||
+ blk->magic == XFS_ATTR3_LEAF_MAGIC) {
+ blk->magic = XFS_ATTR_LEAF_MAGIC;
+ blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+ break;
+ }
+
+ if (blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+ blk->magic == XFS_DIR3_LEAFN_MAGIC) {
+ blk->magic = XFS_DIR2_LEAFN_MAGIC;
+ blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
+ blk->bp, NULL);
+ break;
+ }
+
+ blk->magic = XFS_DA_NODE_MAGIC;
+
/*
* Search an intermediate node for a match.
*/
- blk->magic = INT_GET(curr->magic, ARCH_CONVERT);
- if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
- node = blk->bp->data;
- blk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+ node = blk->bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
- /*
- * Binary search. (note: small blocks will skip loop)
- */
- max = INT_GET(node->hdr.count, ARCH_CONVERT);
- probe = span = max / 2;
- hashval = args->hashval;
- for (btree = &node->btree[probe]; span > 4;
- btree = &node->btree[probe]) {
- span /= 2;
- if (INT_GET(btree->hashval, ARCH_CONVERT) < hashval)
- probe += span;
- else if (INT_GET(btree->hashval, ARCH_CONVERT) > hashval)
- probe -= span;
- else
- break;
- }
- ASSERT((probe >= 0) && (probe < max));
- ASSERT((span <= 4) || (INT_GET(btree->hashval, ARCH_CONVERT) == hashval));
-
- /*
- * Since we may have duplicate hashval's, find the first
- * matching hashval in the node.
- */
- while ((probe > 0) && (INT_GET(btree->hashval, ARCH_CONVERT) >= hashval)) {
- btree--;
- probe--;
- }
- while ((probe < max) && (INT_GET(btree->hashval, ARCH_CONVERT) < hashval)) {
- btree++;
- probe++;
- }
+ max = nodehdr.count;
+ blk->hashval = be32_to_cpu(btree[max - 1].hashval);
- /*
- * Pick the right block to descend on.
- */
- if (probe == max) {
- blk->index = max-1;
- blkno = INT_GET(node->btree[ max-1 ].before, ARCH_CONVERT);
- } else {
- blk->index = probe;
- blkno = INT_GET(btree->before, ARCH_CONVERT);
- }
+ /*
+ * Binary search. (note: small blocks will skip loop)
+ */
+ probe = span = max / 2;
+ hashval = args->hashval;
+ while (span > 4) {
+ span /= 2;
+ btreehashval = be32_to_cpu(btree[probe].hashval);
+ if (btreehashval < hashval)
+ probe += span;
+ else if (btreehashval > hashval)
+ probe -= span;
+ else
+ break;
}
- else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC) {
- blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
- break;
+ ASSERT((probe >= 0) && (probe < max));
+ ASSERT((span <= 4) ||
+ (be32_to_cpu(btree[probe].hashval) == hashval));
+
+ /*
+ * Since we may have duplicate hashval's, find the first
+ * matching hashval in the node.
+ */
+ while (probe > 0 &&
+ be32_to_cpu(btree[probe].hashval) >= hashval) {
+ probe--;
}
- else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) {
- blk->hashval = xfs_dir_leaf_lasthash(blk->bp, NULL);
- break;
+ while (probe < max &&
+ be32_to_cpu(btree[probe].hashval) < hashval) {
+ probe++;
}
- else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) {
- blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
- break;
+
+ /*
+ * Pick the right block to descend on.
+ */
+ if (probe == max) {
+ blk->index = max - 1;
+ blkno = be32_to_cpu(btree[max - 1].before);
+ } else {
+ blk->index = probe;
+ blkno = be32_to_cpu(btree[probe].before);
}
}
@@ -1214,30 +1570,26 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
* next leaf and keep searching.
*/
for (;;) {
- if (blk->magic == XFS_DIR_LEAF_MAGIC) {
- ASSERT(XFS_DIR_IS_V1(state->mp));
- retval = xfs_dir_leaf_lookup_int(blk->bp, args,
- &blk->index);
- } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
- ASSERT(XFS_DIR_IS_V2(state->mp));
+ if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
&blk->index, state);
- }
- else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
- retval = xfs_attr_leaf_lookup_int(blk->bp, args);
+ } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+ retval = xfs_attr3_leaf_lookup_int(blk->bp, args);
blk->index = args->index;
args->blkno = blk->blkno;
+ } else {
+ ASSERT(0);
+ return XFS_ERROR(EFSCORRUPTED);
}
if (((retval == ENOENT) || (retval == ENOATTR)) &&
(blk->hashval == args->hashval)) {
- error = xfs_da_path_shift(state, &state->path, 1, 1,
+ error = xfs_da3_path_shift(state, &state->path, 1, 1,
&retval);
if (error)
return(error);
if (retval == 0) {
continue;
- }
- else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+ } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
/* path_shift() gives ENOENT */
retval = XFS_ERROR(ENOATTR);
}
@@ -1253,45 +1605,75 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
*========================================================================*/
/*
+ * Compare two intermediate nodes for "order".
+ */
+STATIC int
+xfs_da3_node_order(
+ struct xfs_inode *dp,
+ struct xfs_buf *node1_bp,
+ struct xfs_buf *node2_bp)
+{
+ struct xfs_da_intnode *node1;
+ struct xfs_da_intnode *node2;
+ struct xfs_da_node_entry *btree1;
+ struct xfs_da_node_entry *btree2;
+ struct xfs_da3_icnode_hdr node1hdr;
+ struct xfs_da3_icnode_hdr node2hdr;
+
+ node1 = node1_bp->b_addr;
+ node2 = node2_bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&node1hdr, node1);
+ dp->d_ops->node_hdr_from_disk(&node2hdr, node2);
+ btree1 = dp->d_ops->node_tree_p(node1);
+ btree2 = dp->d_ops->node_tree_p(node2);
+
+ if (node1hdr.count > 0 && node2hdr.count > 0 &&
+ ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+ (be32_to_cpu(btree2[node2hdr.count - 1].hashval) <
+ be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) {
+ return 1;
+ }
+ return 0;
+}
+
+/*
* Link a new block into a doubly linked list of blocks (of whatever type).
*/
int /* error */
-xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
- xfs_da_state_blk_t *new_blk)
+xfs_da3_blk_link(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *old_blk,
+ struct xfs_da_state_blk *new_blk)
{
- xfs_da_blkinfo_t *old_info, *new_info, *tmp_info;
- xfs_da_args_t *args;
- int before=0, error;
- xfs_dabuf_t *bp;
+ struct xfs_da_blkinfo *old_info;
+ struct xfs_da_blkinfo *new_info;
+ struct xfs_da_blkinfo *tmp_info;
+ struct xfs_da_args *args;
+ struct xfs_buf *bp;
+ int before = 0;
+ int error;
+ struct xfs_inode *dp = state->args->dp;
/*
* Set up environment.
*/
args = state->args;
ASSERT(args != NULL);
- old_info = old_blk->bp->data;
- new_info = new_blk->bp->data;
+ old_info = old_blk->bp->b_addr;
+ new_info = new_blk->bp->b_addr;
ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
- old_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+ old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
old_blk->magic == XFS_ATTR_LEAF_MAGIC);
- ASSERT(old_blk->magic == INT_GET(old_info->magic, ARCH_CONVERT));
- ASSERT(new_blk->magic == INT_GET(new_info->magic, ARCH_CONVERT));
- ASSERT(old_blk->magic == new_blk->magic);
switch (old_blk->magic) {
case XFS_ATTR_LEAF_MAGIC:
before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
break;
- case XFS_DIR_LEAF_MAGIC:
- ASSERT(XFS_DIR_IS_V1(state->mp));
- before = xfs_dir_leaf_order(old_blk->bp, new_blk->bp);
- break;
case XFS_DIR2_LEAFN_MAGIC:
- ASSERT(XFS_DIR_IS_V2(state->mp));
- before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp);
+ before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp);
break;
case XFS_DA_NODE_MAGIC:
- before = xfs_da_node_order(old_blk->bp, new_blk->bp);
+ before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp);
break;
}
@@ -1302,166 +1684,123 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
/*
* Link new block in before existing block.
*/
- INT_SET(new_info->forw, ARCH_CONVERT, old_blk->blkno);
- new_info->back = old_info->back; /* INT_: direct copy */
- if (INT_GET(old_info->back, ARCH_CONVERT)) {
- error = xfs_da_read_buf(args->trans, args->dp,
- INT_GET(old_info->back,
- ARCH_CONVERT), -1, &bp,
- args->whichfork);
+ trace_xfs_da_link_before(args);
+ new_info->forw = cpu_to_be32(old_blk->blkno);
+ new_info->back = old_info->back;
+ if (old_info->back) {
+ error = xfs_da3_node_read(args->trans, dp,
+ be32_to_cpu(old_info->back),
+ -1, &bp, args->whichfork);
if (error)
return(error);
ASSERT(bp != NULL);
- tmp_info = bp->data;
- ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT) == INT_GET(old_info->magic, ARCH_CONVERT));
- ASSERT(INT_GET(tmp_info->forw, ARCH_CONVERT) == old_blk->blkno);
- INT_SET(tmp_info->forw, ARCH_CONVERT, new_blk->blkno);
- xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
- xfs_da_buf_done(bp);
+ tmp_info = bp->b_addr;
+ ASSERT(tmp_info->magic == old_info->magic);
+ ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
+ tmp_info->forw = cpu_to_be32(new_blk->blkno);
+ xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
}
- INT_SET(old_info->back, ARCH_CONVERT, new_blk->blkno);
+ old_info->back = cpu_to_be32(new_blk->blkno);
} else {
/*
* Link new block in after existing block.
*/
- new_info->forw = old_info->forw; /* INT_: direct copy */
- INT_SET(new_info->back, ARCH_CONVERT, old_blk->blkno);
- if (INT_GET(old_info->forw, ARCH_CONVERT)) {
- error = xfs_da_read_buf(args->trans, args->dp,
- INT_GET(old_info->forw, ARCH_CONVERT), -1, &bp,
- args->whichfork);
+ trace_xfs_da_link_after(args);
+ new_info->forw = old_info->forw;
+ new_info->back = cpu_to_be32(old_blk->blkno);
+ if (old_info->forw) {
+ error = xfs_da3_node_read(args->trans, dp,
+ be32_to_cpu(old_info->forw),
+ -1, &bp, args->whichfork);
if (error)
return(error);
ASSERT(bp != NULL);
- tmp_info = bp->data;
- ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT)
- == INT_GET(old_info->magic, ARCH_CONVERT));
- ASSERT(INT_GET(tmp_info->back, ARCH_CONVERT)
- == old_blk->blkno);
- INT_SET(tmp_info->back, ARCH_CONVERT, new_blk->blkno);
- xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
- xfs_da_buf_done(bp);
+ tmp_info = bp->b_addr;
+ ASSERT(tmp_info->magic == old_info->magic);
+ ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno);
+ tmp_info->back = cpu_to_be32(new_blk->blkno);
+ xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
}
- INT_SET(old_info->forw, ARCH_CONVERT, new_blk->blkno);
+ old_info->forw = cpu_to_be32(new_blk->blkno);
}
- xfs_da_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
- xfs_da_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
+ xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
+ xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
return(0);
}
/*
- * Compare two intermediate nodes for "order".
- */
-STATIC int
-xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp)
-{
- xfs_da_intnode_t *node1, *node2;
-
- node1 = node1_bp->data;
- node2 = node2_bp->data;
- ASSERT((INT_GET(node1->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) &&
- (INT_GET(node2->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC));
- if ((INT_GET(node1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(node2->hdr.count, ARCH_CONVERT) > 0) &&
- ((INT_GET(node2->btree[ 0 ].hashval, ARCH_CONVERT) <
- INT_GET(node1->btree[ 0 ].hashval, ARCH_CONVERT)) ||
- (INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
- INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
- return(1);
- }
- return(0);
-}
-
-/*
- * Pick up the last hashvalue from an intermediate node.
- */
-STATIC uint
-xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count)
-{
- xfs_da_intnode_t *node;
-
- node = bp->data;
- ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
- if (count)
- *count = INT_GET(node->hdr.count, ARCH_CONVERT);
- if (!node->hdr.count)
- return(0);
- return(INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT));
-}
-
-/*
* Unlink a block from a doubly linked list of blocks.
*/
STATIC int /* error */
-xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
- xfs_da_state_blk_t *save_blk)
+xfs_da3_blk_unlink(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk)
{
- xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info;
- xfs_da_args_t *args;
- xfs_dabuf_t *bp;
- int error;
+ struct xfs_da_blkinfo *drop_info;
+ struct xfs_da_blkinfo *save_info;
+ struct xfs_da_blkinfo *tmp_info;
+ struct xfs_da_args *args;
+ struct xfs_buf *bp;
+ int error;
/*
* Set up environment.
*/
args = state->args;
ASSERT(args != NULL);
- save_info = save_blk->bp->data;
- drop_info = drop_blk->bp->data;
+ save_info = save_blk->bp->b_addr;
+ drop_info = drop_blk->bp->b_addr;
ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
- save_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+ save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
save_blk->magic == XFS_ATTR_LEAF_MAGIC);
- ASSERT(save_blk->magic == INT_GET(save_info->magic, ARCH_CONVERT));
- ASSERT(drop_blk->magic == INT_GET(drop_info->magic, ARCH_CONVERT));
ASSERT(save_blk->magic == drop_blk->magic);
- ASSERT((INT_GET(save_info->forw, ARCH_CONVERT) == drop_blk->blkno) ||
- (INT_GET(save_info->back, ARCH_CONVERT) == drop_blk->blkno));
- ASSERT((INT_GET(drop_info->forw, ARCH_CONVERT) == save_blk->blkno) ||
- (INT_GET(drop_info->back, ARCH_CONVERT) == save_blk->blkno));
+ ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) ||
+ (be32_to_cpu(save_info->back) == drop_blk->blkno));
+ ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) ||
+ (be32_to_cpu(drop_info->back) == save_blk->blkno));
/*
* Unlink the leaf block from the doubly linked chain of leaves.
*/
- if (INT_GET(save_info->back, ARCH_CONVERT) == drop_blk->blkno) {
- save_info->back = drop_info->back; /* INT_: direct copy */
- if (INT_GET(drop_info->back, ARCH_CONVERT)) {
- error = xfs_da_read_buf(args->trans, args->dp,
- INT_GET(drop_info->back,
- ARCH_CONVERT), -1, &bp,
- args->whichfork);
+ if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
+ trace_xfs_da_unlink_back(args);
+ save_info->back = drop_info->back;
+ if (drop_info->back) {
+ error = xfs_da3_node_read(args->trans, args->dp,
+ be32_to_cpu(drop_info->back),
+ -1, &bp, args->whichfork);
if (error)
return(error);
ASSERT(bp != NULL);
- tmp_info = bp->data;
- ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT) == INT_GET(save_info->magic, ARCH_CONVERT));
- ASSERT(INT_GET(tmp_info->forw, ARCH_CONVERT) == drop_blk->blkno);
- INT_SET(tmp_info->forw, ARCH_CONVERT, save_blk->blkno);
- xfs_da_log_buf(args->trans, bp, 0,
+ tmp_info = bp->b_addr;
+ ASSERT(tmp_info->magic == save_info->magic);
+ ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno);
+ tmp_info->forw = cpu_to_be32(save_blk->blkno);
+ xfs_trans_log_buf(args->trans, bp, 0,
sizeof(*tmp_info) - 1);
- xfs_da_buf_done(bp);
}
} else {
- save_info->forw = drop_info->forw; /* INT_: direct copy */
- if (INT_GET(drop_info->forw, ARCH_CONVERT)) {
- error = xfs_da_read_buf(args->trans, args->dp,
- INT_GET(drop_info->forw, ARCH_CONVERT), -1, &bp,
- args->whichfork);
+ trace_xfs_da_unlink_forward(args);
+ save_info->forw = drop_info->forw;
+ if (drop_info->forw) {
+ error = xfs_da3_node_read(args->trans, args->dp,
+ be32_to_cpu(drop_info->forw),
+ -1, &bp, args->whichfork);
if (error)
return(error);
ASSERT(bp != NULL);
- tmp_info = bp->data;
- ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT)
- == INT_GET(save_info->magic, ARCH_CONVERT));
- ASSERT(INT_GET(tmp_info->back, ARCH_CONVERT)
- == drop_blk->blkno);
- INT_SET(tmp_info->back, ARCH_CONVERT, save_blk->blkno);
- xfs_da_log_buf(args->trans, bp, 0,
+ tmp_info = bp->b_addr;
+ ASSERT(tmp_info->magic == save_info->magic);
+ ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno);
+ tmp_info->back = cpu_to_be32(save_blk->blkno);
+ xfs_trans_log_buf(args->trans, bp, 0,
sizeof(*tmp_info) - 1);
- xfs_da_buf_done(bp);
}
}
- xfs_da_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
+ xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
return(0);
}
@@ -1474,15 +1813,25 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
* the new bottom and the root.
*/
int /* error */
-xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
- int forward, int release, int *result)
+xfs_da3_path_shift(
+ struct xfs_da_state *state,
+ struct xfs_da_state_path *path,
+ int forward,
+ int release,
+ int *result)
{
- xfs_da_state_blk_t *blk;
- xfs_da_blkinfo_t *info;
- xfs_da_intnode_t *node;
- xfs_da_args_t *args;
- xfs_dablk_t blkno=0;
- int level, error;
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_blkinfo *info;
+ struct xfs_da_intnode *node;
+ struct xfs_da_args *args;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr nodehdr;
+ xfs_dablk_t blkno = 0;
+ int level;
+ int error;
+ struct xfs_inode *dp = state->args->dp;
+
+ trace_xfs_da_path_shift(state->args);
/*
* Roll up the Btree looking for the first block where our
@@ -1495,22 +1844,23 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
level = (path->active-1) - 1; /* skip bottom layer in path */
for (blk = &path->blk[level]; level >= 0; blk--, level--) {
- ASSERT(blk->bp != NULL);
- node = blk->bp->data;
- ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
- if (forward && (blk->index < INT_GET(node->hdr.count, ARCH_CONVERT)-1)) {
+ node = blk->bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
+
+ if (forward && (blk->index < nodehdr.count - 1)) {
blk->index++;
- blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT);
+ blkno = be32_to_cpu(btree[blk->index].before);
break;
} else if (!forward && (blk->index > 0)) {
blk->index--;
- blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT);
+ blkno = be32_to_cpu(btree[blk->index].before);
break;
}
}
if (level < 0) {
*result = XFS_ERROR(ENOENT); /* we're out of our tree */
- ASSERT(args->oknoent);
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
return(0);
}
@@ -1524,58 +1874,65 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
* (if it's dirty, trans won't actually let go)
*/
if (release)
- xfs_da_brelse(args->trans, blk->bp);
+ xfs_trans_brelse(args->trans, blk->bp);
/*
* Read the next child block.
*/
blk->blkno = blkno;
- error = xfs_da_read_buf(args->trans, args->dp, blkno, -1,
- &blk->bp, args->whichfork);
+ error = xfs_da3_node_read(args->trans, dp, blkno, -1,
+ &blk->bp, args->whichfork);
if (error)
return(error);
- ASSERT(blk->bp != NULL);
- info = blk->bp->data;
- ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC ||
- INT_GET(info->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
- INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
- blk->magic = INT_GET(info->magic, ARCH_CONVERT);
- if (INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
+ info = blk->bp->b_addr;
+ ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+
+ /*
+ * Note: we flatten the magic number to a single type so we
+ * don't have to compare against crc/non-crc types elsewhere.
+ */
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ blk->magic = XFS_DA_NODE_MAGIC;
node = (xfs_da_intnode_t *)info;
- blk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
+ blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
if (forward)
blk->index = 0;
else
- blk->index = INT_GET(node->hdr.count, ARCH_CONVERT)-1;
- blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT);
- } else {
+ blk->index = nodehdr.count - 1;
+ blkno = be32_to_cpu(btree[blk->index].before);
+ break;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ blk->magic = XFS_ATTR_LEAF_MAGIC;
ASSERT(level == path->active-1);
blk->index = 0;
- switch(blk->magic) {
- case XFS_ATTR_LEAF_MAGIC:
- blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
- NULL);
- break;
- case XFS_DIR_LEAF_MAGIC:
- ASSERT(XFS_DIR_IS_V1(state->mp));
- blk->hashval = xfs_dir_leaf_lasthash(blk->bp,
- NULL);
- break;
- case XFS_DIR2_LEAFN_MAGIC:
- ASSERT(XFS_DIR_IS_V2(state->mp));
- blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
- NULL);
- break;
- default:
- ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC ||
- blk->magic ==
- XFS_DIRX_LEAF_MAGIC(state->mp));
- break;
- }
+ blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ blk->magic = XFS_DIR2_LEAFN_MAGIC;
+ ASSERT(level == path->active-1);
+ blk->index = 0;
+ blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
+ blk->bp, NULL);
+ break;
+ default:
+ ASSERT(0);
+ break;
}
}
*result = 0;
- return(0);
+ return 0;
}
@@ -1589,7 +1946,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
* This is implemented with some source-level loop unrolling.
*/
xfs_dahash_t
-xfs_da_hashname(const uchar_t *name, int namelen)
+xfs_da_hashname(const __uint8_t *name, int namelen)
{
xfs_dahash_t hash;
@@ -1616,78 +1973,82 @@ xfs_da_hashname(const uchar_t *name, int namelen)
}
}
-/*
- * Add a block to the btree ahead of the file.
- * Return the new block number to the caller.
- */
+enum xfs_dacmp
+xfs_da_compname(
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
+{
+ return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
+ XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
+}
+
+static xfs_dahash_t
+xfs_default_hashname(
+ struct xfs_name *name)
+{
+ return xfs_da_hashname(name->name, name->len);
+}
+
+const struct xfs_nameops xfs_default_nameops = {
+ .hashname = xfs_default_hashname,
+ .compname = xfs_da_compname
+};
+
int
-xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
+xfs_da_grow_inode_int(
+ struct xfs_da_args *args,
+ xfs_fileoff_t *bno,
+ int count)
{
- xfs_fileoff_t bno, b;
- xfs_bmbt_irec_t map;
- xfs_bmbt_irec_t *mapp;
- xfs_inode_t *dp;
- int nmap, error, w, count, c, got, i, mapi;
- xfs_fsize_t size;
- xfs_trans_t *tp;
- xfs_mount_t *mp;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ int w = args->whichfork;
+ xfs_drfsbno_t nblks = dp->i_d.di_nblocks;
+ struct xfs_bmbt_irec map, *mapp;
+ int nmap, error, got, i, mapi;
- dp = args->dp;
- mp = dp->i_mount;
- w = args->whichfork;
- tp = args->trans;
- /*
- * For new directories adjust the file offset and block count.
- */
- if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp)) {
- bno = mp->m_dirleafblk;
- count = mp->m_dirblkfsbs;
- } else {
- bno = 0;
- count = 1;
- }
/*
* Find a spot in the file space to put the new block.
*/
- if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w))) {
+ error = xfs_bmap_first_unused(tp, dp, count, bno, w);
+ if (error)
return error;
- }
- if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
- ASSERT(bno >= mp->m_dirleafblk && bno < mp->m_dirfreeblk);
+
/*
* Try mapping it in one filesystem block.
*/
nmap = 1;
ASSERT(args->firstblock != NULL);
- if ((error = xfs_bmapi(tp, dp, bno, count,
- XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
- XFS_BMAPI_CONTIG,
+ error = xfs_bmapi_write(tp, dp, *bno, count,
+ xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
args->firstblock, args->total, &map, &nmap,
- args->flist))) {
+ args->flist);
+ if (error)
return error;
- }
+
ASSERT(nmap <= 1);
if (nmap == 1) {
mapp = &map;
mapi = 1;
- }
- /*
- * If we didn't get it and the block might work if fragmented,
- * try without the CONTIG flag. Loop until we get it all.
- */
- else if (nmap == 0 && count > 1) {
+ } else if (nmap == 0 && count > 1) {
+ xfs_fileoff_t b;
+ int c;
+
+ /*
+ * If we didn't get it and the block might work if fragmented,
+ * try without the CONTIG flag. Loop until we get it all.
+ */
mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
- for (b = bno, mapi = 0; b < bno + count; ) {
+ for (b = *bno, mapi = 0; b < *bno + count; ) {
nmap = MIN(XFS_BMAP_MAX_NMAP, count);
- c = (int)(bno + count - b);
- if ((error = xfs_bmapi(tp, dp, b, c,
- XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|
- XFS_BMAPI_METADATA,
+ c = (int)(*bno + count - b);
+ error = xfs_bmapi_write(tp, dp, b, c,
+ xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
args->firstblock, args->total,
- &mapp[mapi], &nmap, args->flist))) {
- kmem_free(mapp, sizeof(*mapp) * count);
- return error;
- }
+ &mapp[mapi], &nmap, args->flist);
+ if (error)
+ goto out_free_map;
if (nmap < 1)
break;
mapi += nmap;
@@ -1698,35 +2059,47 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
mapi = 0;
mapp = NULL;
}
+
/*
* Count the blocks we got, make sure it matches the total.
*/
for (i = 0, got = 0; i < mapi; i++)
got += mapp[i].br_blockcount;
- if (got != count || mapp[0].br_startoff != bno ||
+ if (got != count || mapp[0].br_startoff != *bno ||
mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
- bno + count) {
- if (mapp != &map)
- kmem_free(mapp, sizeof(*mapp) * count);
- return XFS_ERROR(ENOSPC);
+ *bno + count) {
+ error = XFS_ERROR(ENOSPC);
+ goto out_free_map;
}
+
+ /* account for newly allocated blocks in reserved blocks total */
+ args->total -= dp->i_d.di_nblocks - nblks;
+
+out_free_map:
if (mapp != &map)
- kmem_free(mapp, sizeof(*mapp) * count);
- *new_blkno = (xfs_dablk_t)bno;
- /*
- * For version 1 directories, adjust the file size if it changed.
- */
- if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) {
- ASSERT(mapi == 1);
- if ((error = xfs_bmap_last_offset(tp, dp, &bno, w)))
- return error;
- size = XFS_FSB_TO_B(mp, bno);
- if (size != dp->i_d.di_size) {
- dp->i_d.di_size = size;
- xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
- }
- }
- return 0;
+ kmem_free(mapp);
+ return error;
+}
+
+/*
+ * Add a block to the btree ahead of the file.
+ * Return the new block number to the caller.
+ */
+int
+xfs_da_grow_inode(
+ struct xfs_da_args *args,
+ xfs_dablk_t *new_blkno)
+{
+ xfs_fileoff_t bno;
+ int error;
+
+ trace_xfs_da_grow_inode(args);
+
+ bno = args->geo->leafblk;
+ error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount);
+ if (!error)
+ *new_blkno = (xfs_dablk_t)bno;
+ return error;
}
/*
@@ -1738,34 +2111,48 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
* a bmap btree split to do that.
*/
STATIC int
-xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
- xfs_dabuf_t **dead_bufp)
+xfs_da3_swap_lastblock(
+ struct xfs_da_args *args,
+ xfs_dablk_t *dead_blknop,
+ struct xfs_buf **dead_bufp)
{
- xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno;
- xfs_dabuf_t *dead_buf, *last_buf, *sib_buf, *par_buf;
- xfs_fileoff_t lastoff;
- xfs_inode_t *ip;
- xfs_trans_t *tp;
- xfs_mount_t *mp;
- int error, w, entno, level, dead_level;
- xfs_da_blkinfo_t *dead_info, *sib_info;
- xfs_da_intnode_t *par_node, *dead_node;
- xfs_dir_leafblock_t *dead_leaf;
- xfs_dir2_leaf_t *dead_leaf2;
- xfs_dahash_t dead_hash;
+ struct xfs_da_blkinfo *dead_info;
+ struct xfs_da_blkinfo *sib_info;
+ struct xfs_da_intnode *par_node;
+ struct xfs_da_intnode *dead_node;
+ struct xfs_dir2_leaf *dead_leaf2;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr par_hdr;
+ struct xfs_inode *dp;
+ struct xfs_trans *tp;
+ struct xfs_mount *mp;
+ struct xfs_buf *dead_buf;
+ struct xfs_buf *last_buf;
+ struct xfs_buf *sib_buf;
+ struct xfs_buf *par_buf;
+ xfs_dahash_t dead_hash;
+ xfs_fileoff_t lastoff;
+ xfs_dablk_t dead_blkno;
+ xfs_dablk_t last_blkno;
+ xfs_dablk_t sib_blkno;
+ xfs_dablk_t par_blkno;
+ int error;
+ int w;
+ int entno;
+ int level;
+ int dead_level;
+
+ trace_xfs_da_swap_lastblock(args);
dead_buf = *dead_bufp;
dead_blkno = *dead_blknop;
tp = args->trans;
- ip = args->dp;
+ dp = args->dp;
w = args->whichfork;
ASSERT(w == XFS_DATA_FORK);
- mp = ip->i_mount;
- if (XFS_DIR_IS_V2(mp)) {
- lastoff = mp->m_dirfreeblk;
- error = xfs_bmap_last_before(tp, ip, &lastoff, w);
- } else
- error = xfs_bmap_last_offset(tp, ip, &lastoff, w);
+ mp = dp->i_mount;
+ lastoff = args->geo->freeblk;
+ error = xfs_bmap_last_before(tp, dp, &lastoff, w);
if (error)
return error;
if (unlikely(lastoff == 0)) {
@@ -1776,114 +2163,117 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
/*
* Read the last block in the btree space.
*/
- last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
- if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w)))
+ last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount;
+ error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w);
+ if (error)
return error;
/*
* Copy the last block into the dead buffer and log it.
*/
- memcpy(dead_buf->data, last_buf->data, mp->m_dirblksize);
- xfs_da_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1);
- dead_info = dead_buf->data;
+ memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
+ xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
+ dead_info = dead_buf->b_addr;
/*
* Get values from the moved block.
*/
- if (INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) {
- ASSERT(XFS_DIR_IS_V1(mp));
- dead_leaf = (xfs_dir_leafblock_t *)dead_info;
- dead_level = 0;
- dead_hash =
- INT_GET(dead_leaf->entries[INT_GET(dead_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
- } else if (INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) {
- ASSERT(XFS_DIR_IS_V2(mp));
+ if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
+
dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2);
+ ents = dp->d_ops->leaf_ents_p(dead_leaf2);
dead_level = 0;
- dead_hash = INT_GET(dead_leaf2->ents[INT_GET(dead_leaf2->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+ dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
} else {
- ASSERT(INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+ struct xfs_da3_icnode_hdr deadhdr;
+
dead_node = (xfs_da_intnode_t *)dead_info;
- dead_level = INT_GET(dead_node->hdr.level, ARCH_CONVERT);
- dead_hash = INT_GET(dead_node->btree[INT_GET(dead_node->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+ dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node);
+ btree = dp->d_ops->node_tree_p(dead_node);
+ dead_level = deadhdr.level;
+ dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
}
sib_buf = par_buf = NULL;
/*
* If the moved block has a left sibling, fix up the pointers.
*/
- if ((sib_blkno = INT_GET(dead_info->back, ARCH_CONVERT))) {
- if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
+ if ((sib_blkno = be32_to_cpu(dead_info->back))) {
+ error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
+ if (error)
goto done;
- sib_info = sib_buf->data;
+ sib_info = sib_buf->b_addr;
if (unlikely(
- INT_GET(sib_info->forw, ARCH_CONVERT) != last_blkno ||
- INT_GET(sib_info->magic, ARCH_CONVERT) != INT_GET(dead_info->magic, ARCH_CONVERT))) {
+ be32_to_cpu(sib_info->forw) != last_blkno ||
+ sib_info->magic != dead_info->magic)) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
XFS_ERRLEVEL_LOW, mp);
error = XFS_ERROR(EFSCORRUPTED);
goto done;
}
- INT_SET(sib_info->forw, ARCH_CONVERT, dead_blkno);
- xfs_da_log_buf(tp, sib_buf,
+ sib_info->forw = cpu_to_be32(dead_blkno);
+ xfs_trans_log_buf(tp, sib_buf,
XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
sizeof(sib_info->forw)));
- xfs_da_buf_done(sib_buf);
sib_buf = NULL;
}
/*
* If the moved block has a right sibling, fix up the pointers.
*/
- if ((sib_blkno = INT_GET(dead_info->forw, ARCH_CONVERT))) {
- if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
+ if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
+ error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
+ if (error)
goto done;
- sib_info = sib_buf->data;
+ sib_info = sib_buf->b_addr;
if (unlikely(
- INT_GET(sib_info->back, ARCH_CONVERT) != last_blkno
- || INT_GET(sib_info->magic, ARCH_CONVERT)
- != INT_GET(dead_info->magic, ARCH_CONVERT))) {
+ be32_to_cpu(sib_info->back) != last_blkno ||
+ sib_info->magic != dead_info->magic)) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
XFS_ERRLEVEL_LOW, mp);
error = XFS_ERROR(EFSCORRUPTED);
goto done;
}
- INT_SET(sib_info->back, ARCH_CONVERT, dead_blkno);
- xfs_da_log_buf(tp, sib_buf,
+ sib_info->back = cpu_to_be32(dead_blkno);
+ xfs_trans_log_buf(tp, sib_buf,
XFS_DA_LOGRANGE(sib_info, &sib_info->back,
sizeof(sib_info->back)));
- xfs_da_buf_done(sib_buf);
sib_buf = NULL;
}
- par_blkno = XFS_DIR_IS_V1(mp) ? 0 : mp->m_dirleafblk;
+ par_blkno = args->geo->leafblk;
level = -1;
/*
* Walk down the tree looking for the parent of the moved block.
*/
for (;;) {
- if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
+ error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
+ if (error)
goto done;
- par_node = par_buf->data;
- if (unlikely(
- INT_GET(par_node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC ||
- (level >= 0 && level != INT_GET(par_node->hdr.level, ARCH_CONVERT) + 1))) {
+ par_node = par_buf->b_addr;
+ dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
+ if (level >= 0 && level != par_hdr.level + 1) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
XFS_ERRLEVEL_LOW, mp);
error = XFS_ERROR(EFSCORRUPTED);
goto done;
}
- level = INT_GET(par_node->hdr.level, ARCH_CONVERT);
+ level = par_hdr.level;
+ btree = dp->d_ops->node_tree_p(par_node);
for (entno = 0;
- entno < INT_GET(par_node->hdr.count, ARCH_CONVERT) &&
- INT_GET(par_node->btree[entno].hashval, ARCH_CONVERT) < dead_hash;
+ entno < par_hdr.count &&
+ be32_to_cpu(btree[entno].hashval) < dead_hash;
entno++)
continue;
- if (unlikely(entno == INT_GET(par_node->hdr.count, ARCH_CONVERT))) {
+ if (entno == par_hdr.count) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
XFS_ERRLEVEL_LOW, mp);
error = XFS_ERROR(EFSCORRUPTED);
goto done;
}
- par_blkno = INT_GET(par_node->btree[entno].before, ARCH_CONVERT);
+ par_blkno = be32_to_cpu(btree[entno].before);
if (level == dead_level + 1)
break;
- xfs_da_brelse(tp, par_buf);
+ xfs_trans_brelse(tp, par_buf);
par_buf = NULL;
}
/*
@@ -1892,14 +2282,14 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
*/
for (;;) {
for (;
- entno < INT_GET(par_node->hdr.count, ARCH_CONVERT) &&
- INT_GET(par_node->btree[entno].before, ARCH_CONVERT) != last_blkno;
+ entno < par_hdr.count &&
+ be32_to_cpu(btree[entno].before) != last_blkno;
entno++)
continue;
- if (entno < INT_GET(par_node->hdr.count, ARCH_CONVERT))
+ if (entno < par_hdr.count)
break;
- par_blkno = INT_GET(par_node->hdr.info.forw, ARCH_CONVERT);
- xfs_da_brelse(tp, par_buf);
+ par_blkno = par_hdr.forw;
+ xfs_trans_brelse(tp, par_buf);
par_buf = NULL;
if (unlikely(par_blkno == 0)) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
@@ -1907,37 +2297,36 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
error = XFS_ERROR(EFSCORRUPTED);
goto done;
}
- if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
+ error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
+ if (error)
goto done;
- par_node = par_buf->data;
- if (unlikely(
- INT_GET(par_node->hdr.level, ARCH_CONVERT) != level ||
- INT_GET(par_node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC)) {
+ par_node = par_buf->b_addr;
+ dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
+ if (par_hdr.level != level) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
XFS_ERRLEVEL_LOW, mp);
error = XFS_ERROR(EFSCORRUPTED);
goto done;
}
+ btree = dp->d_ops->node_tree_p(par_node);
entno = 0;
}
/*
* Update the parent entry pointing to the moved block.
*/
- INT_SET(par_node->btree[entno].before, ARCH_CONVERT, dead_blkno);
- xfs_da_log_buf(tp, par_buf,
- XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before,
- sizeof(par_node->btree[entno].before)));
- xfs_da_buf_done(par_buf);
- xfs_da_buf_done(dead_buf);
+ btree[entno].before = cpu_to_be32(dead_blkno);
+ xfs_trans_log_buf(tp, par_buf,
+ XFS_DA_LOGRANGE(par_node, &btree[entno].before,
+ sizeof(btree[entno].before)));
*dead_blknop = last_blkno;
*dead_bufp = last_buf;
return 0;
done:
if (par_buf)
- xfs_da_brelse(tp, par_buf);
+ xfs_trans_brelse(tp, par_buf);
if (sib_buf)
- xfs_da_brelse(tp, sib_buf);
- xfs_da_brelse(tp, last_buf);
+ xfs_trans_brelse(tp, sib_buf);
+ xfs_trans_brelse(tp, last_buf);
return error;
}
@@ -1945,60 +2334,43 @@ done:
* Remove a btree block from a directory or attribute.
*/
int
-xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
- xfs_dabuf_t *dead_buf)
+xfs_da_shrink_inode(
+ xfs_da_args_t *args,
+ xfs_dablk_t dead_blkno,
+ struct xfs_buf *dead_buf)
{
xfs_inode_t *dp;
int done, error, w, count;
- xfs_fileoff_t bno;
- xfs_fsize_t size;
xfs_trans_t *tp;
xfs_mount_t *mp;
+ trace_xfs_da_shrink_inode(args);
+
dp = args->dp;
w = args->whichfork;
tp = args->trans;
mp = dp->i_mount;
- if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
- count = mp->m_dirblkfsbs;
- else
- count = 1;
+ count = args->geo->fsbcount;
for (;;) {
/*
* Remove extents. If we get ENOSPC for a dir we have to move
* the last block to the place we want to kill.
*/
- if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
- XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA,
- 0, args->firstblock, args->flist,
- &done)) == ENOSPC) {
+ error = xfs_bunmapi(tp, dp, dead_blkno, count,
+ xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
+ 0, args->firstblock, args->flist, &done);
+ if (error == ENOSPC) {
if (w != XFS_DATA_FORK)
- goto done;
- if ((error = xfs_da_swap_lastblock(args, &dead_blkno,
- &dead_buf)))
- goto done;
- } else if (error)
- goto done;
- else
+ break;
+ error = xfs_da3_swap_lastblock(args, &dead_blkno,
+ &dead_buf);
+ if (error)
+ break;
+ } else {
break;
- }
- ASSERT(done);
- xfs_da_binval(tp, dead_buf);
- /*
- * Adjust the directory size for version 1.
- */
- if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) {
- if ((error = xfs_bmap_last_offset(tp, dp, &bno, w)))
- return error;
- size = XFS_FSB_TO_B(dp->i_mount, bno);
- if (size != dp->i_d.di_size) {
- dp->i_d.di_size = size;
- xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
}
}
- return 0;
-done:
- xfs_da_binval(tp, dead_buf);
+ xfs_trans_binval(tp, dead_buf);
return error;
}
@@ -2030,39 +2402,78 @@ xfs_da_map_covers_blocks(
}
/*
- * Make a dabuf.
- * Used for get_buf, read_buf, read_bufr, and reada_buf.
+ * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map.
+ *
+ * For the single map case, it is assumed that the caller has provided a pointer
+ * to a valid xfs_buf_map. For the multiple map case, this function will
+ * allocate the xfs_buf_map to hold all the maps and replace the caller's single
+ * map pointer with the allocated map.
*/
-STATIC int
-xfs_da_do_buf(
- xfs_trans_t *trans,
- xfs_inode_t *dp,
- xfs_dablk_t bno,
- xfs_daddr_t *mappedbnop,
- xfs_dabuf_t **bpp,
- int whichfork,
- int caller,
- inst_t *ra)
+static int
+xfs_buf_map_from_irec(
+ struct xfs_mount *mp,
+ struct xfs_buf_map **mapp,
+ int *nmaps,
+ struct xfs_bmbt_irec *irecs,
+ int nirecs)
{
- xfs_buf_t *bp = NULL;
- xfs_buf_t **bplist;
- int error=0;
- int i;
- xfs_bmbt_irec_t map;
- xfs_bmbt_irec_t *mapp;
- xfs_daddr_t mappedbno;
- xfs_mount_t *mp;
- int nbplist=0;
- int nfsb;
- int nmap;
- xfs_dabuf_t *rbp;
+ struct xfs_buf_map *map;
+ int i;
- mp = dp->i_mount;
- if (whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
- nfsb = mp->m_dirblkfsbs;
+ ASSERT(*nmaps == 1);
+ ASSERT(nirecs >= 1);
+
+ if (nirecs > 1) {
+ map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
+ KM_SLEEP | KM_NOFS);
+ if (!map)
+ return ENOMEM;
+ *mapp = map;
+ }
+
+ *nmaps = nirecs;
+ map = *mapp;
+ for (i = 0; i < *nmaps; i++) {
+ ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK &&
+ irecs[i].br_startblock != HOLESTARTBLOCK);
+ map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock);
+ map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount);
+ }
+ return 0;
+}
+
+/*
+ * Map the block we are given ready for reading. There are three possible return
+ * values:
+ * -1 - will be returned if we land in a hole and mappedbno == -2 so the
+ * caller knows not to execute a subsequent read.
+ * 0 - if we mapped the block successfully
+ * >0 - positive error number if there was an error.
+ */
+static int
+xfs_dabuf_map(
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mappedbno,
+ int whichfork,
+ struct xfs_buf_map **map,
+ int *nmaps)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ int nfsb;
+ int error = 0;
+ struct xfs_bmbt_irec irec;
+ struct xfs_bmbt_irec *irecs = &irec;
+ int nirecs;
+
+ ASSERT(map && *map);
+ ASSERT(*nmaps == 1);
+
+ if (whichfork == XFS_DATA_FORK)
+ nfsb = mp->m_dir_geo->fsbcount;
else
- nfsb = 1;
- mappedbno = *mappedbnop;
+ nfsb = mp->m_attr_geo->fsbcount;
+
/*
* Caller doesn't have a mapping. -2 means don't complain
* if we land in a hole.
@@ -2071,180 +2482,50 @@ xfs_da_do_buf(
/*
* Optimize the one-block case.
*/
- if (nfsb == 1) {
- xfs_fsblock_t fsb;
+ if (nfsb != 1)
+ irecs = kmem_zalloc(sizeof(irec) * nfsb,
+ KM_SLEEP | KM_NOFS);
- if ((error =
- xfs_bmapi_single(trans, dp, whichfork, &fsb,
- (xfs_fileoff_t)bno))) {
- return error;
- }
- mapp = &map;
- if (fsb == NULLFSBLOCK) {
- nmap = 0;
- } else {
- map.br_startblock = fsb;
- map.br_startoff = (xfs_fileoff_t)bno;
- map.br_blockcount = 1;
- nmap = 1;
- }
- } else {
- mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP);
- nmap = nfsb;
- if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno,
- nfsb,
- XFS_BMAPI_METADATA |
- XFS_BMAPI_AFLAG(whichfork),
- NULL, 0, mapp, &nmap, NULL)))
- goto exit0;
- }
+ nirecs = nfsb;
+ error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
+ &nirecs, xfs_bmapi_aflag(whichfork));
+ if (error)
+ goto out;
} else {
- map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
- map.br_startoff = (xfs_fileoff_t)bno;
- map.br_blockcount = nfsb;
- mapp = &map;
- nmap = 1;
+ irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
+ irecs->br_startoff = (xfs_fileoff_t)bno;
+ irecs->br_blockcount = nfsb;
+ irecs->br_state = 0;
+ nirecs = 1;
}
- if (!xfs_da_map_covers_blocks(nmap, mapp, bno, nfsb)) {
- error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED);
+
+ if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
+ error = mappedbno == -2 ? -1 : XFS_ERROR(EFSCORRUPTED);
if (unlikely(error == EFSCORRUPTED)) {
if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
- int i;
- cmn_err(CE_ALERT, "xfs_da_do_buf: bno %lld\n",
- (long long)bno);
- cmn_err(CE_ALERT, "dir: inode %lld\n",
+ int i;
+ xfs_alert(mp, "%s: bno %lld dir: inode %lld",
+ __func__, (long long)bno,
(long long)dp->i_ino);
- for (i = 0; i < nmap; i++) {
- cmn_err(CE_ALERT,
- "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d\n",
+ for (i = 0; i < *nmaps; i++) {
+ xfs_alert(mp,
+"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
i,
- (long long)mapp[i].br_startoff,
- (long long)mapp[i].br_startblock,
- (long long)mapp[i].br_blockcount,
- mapp[i].br_state);
+ (long long)irecs[i].br_startoff,
+ (long long)irecs[i].br_startblock,
+ (long long)irecs[i].br_blockcount,
+ irecs[i].br_state);
}
}
XFS_ERROR_REPORT("xfs_da_do_buf(1)",
XFS_ERRLEVEL_LOW, mp);
}
- goto exit0;
- }
- if (caller != 3 && nmap > 1) {
- bplist = kmem_alloc(sizeof(*bplist) * nmap, KM_SLEEP);
- nbplist = 0;
- } else
- bplist = NULL;
- /*
- * Turn the mapping(s) into buffer(s).
- */
- for (i = 0; i < nmap; i++) {
- int nmapped;
-
- mappedbno = XFS_FSB_TO_DADDR(mp, mapp[i].br_startblock);
- if (i == 0)
- *mappedbnop = mappedbno;
- nmapped = (int)XFS_FSB_TO_BB(mp, mapp[i].br_blockcount);
- switch (caller) {
- case 0:
- bp = xfs_trans_get_buf(trans, mp->m_ddev_targp,
- mappedbno, nmapped, 0);
- error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO);
- break;
- case 1:
- case 2:
- bp = NULL;
- error = xfs_trans_read_buf(mp, trans, mp->m_ddev_targp,
- mappedbno, nmapped, 0, &bp);
- break;
- case 3:
- xfs_baread(mp->m_ddev_targp, mappedbno, nmapped);
- error = 0;
- bp = NULL;
- break;
- }
- if (error) {
- if (bp)
- xfs_trans_brelse(trans, bp);
- goto exit1;
- }
- if (!bp)
- continue;
- if (caller == 1) {
- if (whichfork == XFS_ATTR_FORK) {
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE,
- XFS_ATTR_BTREE_REF);
- } else {
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE,
- XFS_DIR_BTREE_REF);
- }
- }
- if (bplist) {
- bplist[nbplist++] = bp;
- }
- }
- /*
- * Build a dabuf structure.
- */
- if (bplist) {
- rbp = xfs_da_buf_make(nbplist, bplist, ra);
- } else if (bp)
- rbp = xfs_da_buf_make(1, &bp, ra);
- else
- rbp = NULL;
- /*
- * For read_buf, check the magic number.
- */
- if (caller == 1) {
- xfs_dir2_data_t *data;
- xfs_dir2_free_t *free;
- xfs_da_blkinfo_t *info;
- uint magic, magic1;
-
- info = rbp->data;
- data = rbp->data;
- free = rbp->data;
- magic = INT_GET(info->magic, ARCH_CONVERT);
- magic1 = INT_GET(data->hdr.magic, ARCH_CONVERT);
- if (unlikely(
- XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) &&
- (magic != XFS_DIR_LEAF_MAGIC) &&
- (magic != XFS_ATTR_LEAF_MAGIC) &&
- (magic != XFS_DIR2_LEAF1_MAGIC) &&
- (magic != XFS_DIR2_LEAFN_MAGIC) &&
- (magic1 != XFS_DIR2_BLOCK_MAGIC) &&
- (magic1 != XFS_DIR2_DATA_MAGIC) &&
- (INT_GET(free->hdr.magic, ARCH_CONVERT) != XFS_DIR2_FREE_MAGIC),
- mp, XFS_ERRTAG_DA_READ_BUF,
- XFS_RANDOM_DA_READ_BUF))) {
- xfs_buftrace("DA READ ERROR", rbp->bps[0]);
- XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)",
- XFS_ERRLEVEL_LOW, mp, info);
- error = XFS_ERROR(EFSCORRUPTED);
- xfs_da_brelse(trans, rbp);
- nbplist = 0;
- goto exit1;
- }
+ goto out;
}
- if (bplist) {
- kmem_free(bplist, sizeof(*bplist) * nmap);
- }
- if (mapp != &map) {
- kmem_free(mapp, sizeof(*mapp) * nfsb);
- }
- if (bpp)
- *bpp = rbp;
- return 0;
-exit1:
- if (bplist) {
- for (i = 0; i < nbplist; i++)
- xfs_trans_brelse(trans, bplist[i]);
- kmem_free(bplist, sizeof(*bplist) * nmap);
- }
-exit0:
- if (mapp != &map)
- kmem_free(mapp, sizeof(*mapp) * nfsb);
- if (bpp)
- *bpp = NULL;
+ error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs);
+out:
+ if (irecs != &irec)
+ kmem_free(irecs);
return error;
}
@@ -2253,346 +2534,132 @@ exit0:
*/
int
xfs_da_get_buf(
- xfs_trans_t *trans,
- xfs_inode_t *dp,
- xfs_dablk_t bno,
- xfs_daddr_t mappedbno,
- xfs_dabuf_t **bpp,
- int whichfork)
-{
- return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 0,
- (inst_t *)__return_address);
-}
-
-/*
- * Get a buffer for the dir/attr block, fill in the contents.
- */
-int
-xfs_da_read_buf(
- xfs_trans_t *trans,
- xfs_inode_t *dp,
- xfs_dablk_t bno,
+ struct xfs_trans *trans,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
xfs_daddr_t mappedbno,
- xfs_dabuf_t **bpp,
- int whichfork)
-{
- return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 1,
- (inst_t *)__return_address);
-}
-
-/*
- * Readahead the dir/attr block.
- */
-xfs_daddr_t
-xfs_da_reada_buf(
- xfs_trans_t *trans,
- xfs_inode_t *dp,
- xfs_dablk_t bno,
- int whichfork)
-{
- xfs_daddr_t rval;
-
- rval = -1;
- if (xfs_da_do_buf(trans, dp, bno, &rval, NULL, whichfork, 3,
- (inst_t *)__return_address))
- return -1;
- else
- return rval;
-}
-
-/*
- * Calculate the number of bits needed to hold i different values.
- */
-uint
-xfs_da_log2_roundup(uint i)
-{
- uint rval;
-
- for (rval = 0; rval < NBBY * sizeof(i); rval++) {
- if ((1 << rval) >= i)
- break;
- }
- return(rval);
-}
-
-kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
-kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */
-
-/*
- * Allocate a dir-state structure.
- * We don't put them on the stack since they're large.
- */
-xfs_da_state_t *
-xfs_da_state_alloc(void)
-{
- return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP);
-}
-
-/*
- * Kill the altpath contents of a da-state structure.
- */
-STATIC void
-xfs_da_state_kill_altpath(xfs_da_state_t *state)
+ struct xfs_buf **bpp,
+ int whichfork)
{
- int i;
-
- for (i = 0; i < state->altpath.active; i++) {
- if (state->altpath.blk[i].bp) {
- if (state->altpath.blk[i].bp != state->path.blk[i].bp)
- xfs_da_buf_done(state->altpath.blk[i].bp);
- state->altpath.blk[i].bp = NULL;
- }
+ struct xfs_buf *bp;
+ struct xfs_buf_map map;
+ struct xfs_buf_map *mapp;
+ int nmap;
+ int error;
+
+ *bpp = NULL;
+ mapp = &map;
+ nmap = 1;
+ error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
+ &mapp, &nmap);
+ if (error) {
+ /* mapping a hole is not an error, but we don't continue */
+ if (error == -1)
+ error = 0;
+ goto out_free;
}
- state->altpath.active = 0;
-}
-/*
- * Free a da-state structure.
- */
-void
-xfs_da_state_free(xfs_da_state_t *state)
-{
- int i;
-
- xfs_da_state_kill_altpath(state);
- for (i = 0; i < state->path.active; i++) {
- if (state->path.blk[i].bp)
- xfs_da_buf_done(state->path.blk[i].bp);
+ bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
+ mapp, nmap, 0);
+ error = bp ? bp->b_error : XFS_ERROR(EIO);
+ if (error) {
+ xfs_trans_brelse(trans, bp);
+ goto out_free;
}
- if (state->extravalid && state->extrablk.bp)
- xfs_da_buf_done(state->extrablk.bp);
-#ifdef DEBUG
- memset((char *)state, 0, sizeof(*state));
-#endif /* DEBUG */
- kmem_zone_free(xfs_da_state_zone, state);
-}
-#ifdef XFS_DABUF_DEBUG
-xfs_dabuf_t *xfs_dabuf_global_list;
-lock_t xfs_dabuf_global_lock;
-#endif
+ *bpp = bp;
-/*
- * Create a dabuf.
- */
-/* ARGSUSED */
-STATIC xfs_dabuf_t *
-xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra)
-{
- xfs_buf_t *bp;
- xfs_dabuf_t *dabuf;
- int i;
- int off;
+out_free:
+ if (mapp != &map)
+ kmem_free(mapp);
- if (nbuf == 1)
- dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP);
- else
- dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP);
- dabuf->dirty = 0;
-#ifdef XFS_DABUF_DEBUG
- dabuf->ra = ra;
- dabuf->target = XFS_BUF_TARGET(bps[0]);
- dabuf->blkno = XFS_BUF_ADDR(bps[0]);
-#endif
- if (nbuf == 1) {
- dabuf->nbuf = 1;
- bp = bps[0];
- dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp));
- dabuf->data = XFS_BUF_PTR(bp);
- dabuf->bps[0] = bp;
- } else {
- dabuf->nbuf = nbuf;
- for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) {
- dabuf->bps[i] = bp = bps[i];
- dabuf->bbcount += BTOBB(XFS_BUF_COUNT(bp));
- }
- dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP);
- for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) {
- bp = bps[i];
- memcpy((char *)dabuf->data + off, XFS_BUF_PTR(bp),
- XFS_BUF_COUNT(bp));
- }
- }
-#ifdef XFS_DABUF_DEBUG
- {
- SPLDECL(s);
- xfs_dabuf_t *p;
-
- s = mutex_spinlock(&xfs_dabuf_global_lock);
- for (p = xfs_dabuf_global_list; p; p = p->next) {
- ASSERT(p->blkno != dabuf->blkno ||
- p->target != dabuf->target);
- }
- dabuf->prev = NULL;
- if (xfs_dabuf_global_list)
- xfs_dabuf_global_list->prev = dabuf;
- dabuf->next = xfs_dabuf_global_list;
- xfs_dabuf_global_list = dabuf;
- mutex_spinunlock(&xfs_dabuf_global_lock, s);
- }
-#endif
- return dabuf;
+ return error;
}
/*
- * Un-dirty a dabuf.
+ * Get a buffer for the dir/attr block, fill in the contents.
*/
-STATIC void
-xfs_da_buf_clean(xfs_dabuf_t *dabuf)
+int
+xfs_da_read_buf(
+ struct xfs_trans *trans,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp,
+ int whichfork,
+ const struct xfs_buf_ops *ops)
{
- xfs_buf_t *bp;
- int i;
- int off;
-
- if (dabuf->dirty) {
- ASSERT(dabuf->nbuf > 1);
- dabuf->dirty = 0;
- for (i = off = 0; i < dabuf->nbuf;
- i++, off += XFS_BUF_COUNT(bp)) {
- bp = dabuf->bps[i];
- memcpy(XFS_BUF_PTR(bp), (char *)dabuf->data + off,
- XFS_BUF_COUNT(bp));
- }
+ struct xfs_buf *bp;
+ struct xfs_buf_map map;
+ struct xfs_buf_map *mapp;
+ int nmap;
+ int error;
+
+ *bpp = NULL;
+ mapp = &map;
+ nmap = 1;
+ error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
+ &mapp, &nmap);
+ if (error) {
+ /* mapping a hole is not an error, but we don't continue */
+ if (error == -1)
+ error = 0;
+ goto out_free;
}
-}
-/*
- * Release a dabuf.
- */
-void
-xfs_da_buf_done(xfs_dabuf_t *dabuf)
-{
- ASSERT(dabuf);
- ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
- if (dabuf->dirty)
- xfs_da_buf_clean(dabuf);
- if (dabuf->nbuf > 1)
- kmem_free(dabuf->data, BBTOB(dabuf->bbcount));
-#ifdef XFS_DABUF_DEBUG
- {
- SPLDECL(s);
-
- s = mutex_spinlock(&xfs_dabuf_global_lock);
- if (dabuf->prev)
- dabuf->prev->next = dabuf->next;
- else
- xfs_dabuf_global_list = dabuf->next;
- if (dabuf->next)
- dabuf->next->prev = dabuf->prev;
- mutex_spinunlock(&xfs_dabuf_global_lock, s);
- }
- memset(dabuf, 0, XFS_DA_BUF_SIZE(dabuf->nbuf));
-#endif
- if (dabuf->nbuf == 1)
- kmem_zone_free(xfs_dabuf_zone, dabuf);
- else
- kmem_free(dabuf, XFS_DA_BUF_SIZE(dabuf->nbuf));
-}
+ error = xfs_trans_read_buf_map(dp->i_mount, trans,
+ dp->i_mount->m_ddev_targp,
+ mapp, nmap, 0, &bp, ops);
+ if (error)
+ goto out_free;
-/*
- * Log transaction from a dabuf.
- */
-void
-xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last)
-{
- xfs_buf_t *bp;
- uint f;
- int i;
- uint l;
- int off;
+ if (whichfork == XFS_ATTR_FORK)
+ xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
+ else
+ xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
+ *bpp = bp;
+out_free:
+ if (mapp != &map)
+ kmem_free(mapp);
- ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
- if (dabuf->nbuf == 1) {
- ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0]));
- xfs_trans_log_buf(tp, dabuf->bps[0], first, last);
- return;
- }
- dabuf->dirty = 1;
- ASSERT(first <= last);
- for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) {
- bp = dabuf->bps[i];
- f = off;
- l = f + XFS_BUF_COUNT(bp) - 1;
- if (f < first)
- f = first;
- if (l > last)
- l = last;
- if (f <= l)
- xfs_trans_log_buf(tp, bp, f - off, l - off);
- /*
- * B_DONE is set by xfs_trans_log buf.
- * If we don't set it on a new buffer (get not read)
- * then if we don't put anything in the buffer it won't
- * be set, and at commit it it released into the cache,
- * and then a read will fail.
- */
- else if (!(XFS_BUF_ISDONE(bp)))
- XFS_BUF_DONE(bp);
- }
- ASSERT(last < off);
+ return error;
}
/*
- * Release dabuf from a transaction.
- * Have to free up the dabuf before the buffers are released,
- * since the synchronization on the dabuf is really the lock on the buffer.
+ * Readahead the dir/attr block.
*/
-void
-xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
+xfs_daddr_t
+xfs_da_reada_buf(
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mappedbno,
+ int whichfork,
+ const struct xfs_buf_ops *ops)
{
- xfs_buf_t *bp;
- xfs_buf_t **bplist;
- int i;
- int nbuf;
+ struct xfs_buf_map map;
+ struct xfs_buf_map *mapp;
+ int nmap;
+ int error;
- ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
- if ((nbuf = dabuf->nbuf) == 1) {
- bplist = &bp;
- bp = dabuf->bps[0];
- } else {
- bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP);
- memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist));
+ mapp = &map;
+ nmap = 1;
+ error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
+ &mapp, &nmap);
+ if (error) {
+ /* mapping a hole is not an error, but we don't continue */
+ if (error == -1)
+ error = 0;
+ goto out_free;
}
- xfs_da_buf_done(dabuf);
- for (i = 0; i < nbuf; i++)
- xfs_trans_brelse(tp, bplist[i]);
- if (bplist != &bp)
- kmem_free(bplist, nbuf * sizeof(*bplist));
-}
-/*
- * Invalidate dabuf from a transaction.
- */
-void
-xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
-{
- xfs_buf_t *bp;
- xfs_buf_t **bplist;
- int i;
- int nbuf;
+ mappedbno = mapp[0].bm_bn;
+ xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
- ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
- if ((nbuf = dabuf->nbuf) == 1) {
- bplist = &bp;
- bp = dabuf->bps[0];
- } else {
- bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP);
- memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist));
- }
- xfs_da_buf_done(dabuf);
- for (i = 0; i < nbuf; i++)
- xfs_trans_binval(tp, bplist[i]);
- if (bplist != &bp)
- kmem_free(bplist, nbuf * sizeof(*bplist));
-}
+out_free:
+ if (mapp != &map)
+ kmem_free(mapp);
-/*
- * Get the first daddr from a dabuf.
- */
-xfs_daddr_t
-xfs_da_blkno(xfs_dabuf_t *dabuf)
-{
- ASSERT(dabuf->nbuf);
- ASSERT(dabuf->data);
- return XFS_BUF_ADDR(dabuf->bps[0]);
+ if (error)
+ return -1;
+ return mappedbno;
}
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 41352113721..6e153e399a7 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -18,97 +19,51 @@
#ifndef __XFS_DA_BTREE_H__
#define __XFS_DA_BTREE_H__
-struct xfs_buf;
struct xfs_bmap_free;
struct xfs_inode;
-struct xfs_mount;
struct xfs_trans;
struct zone;
-
-/*========================================================================
- * Directory Structure when greater than XFS_LBSIZE(mp) bytes.
- *========================================================================*/
+struct xfs_dir_ops;
/*
- * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
- *
- * Is is used to manage a doubly linked list of all blocks at the same
- * level in the Btree, and to identify which type of block this is.
+ * Directory/attribute geometry information. There will be one of these for each
+ * data fork type, and it will be passed around via the xfs_da_args. Global
+ * structures will be attached to the xfs_mount.
*/
-#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */
-#define XFS_DIR_LEAF_MAGIC 0xfeeb /* magic number: directory leaf blks */
-#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */
-#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
-#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
-
-#define XFS_DIRX_LEAF_MAGIC(mp) \
- (XFS_DIR_IS_V1(mp) ? XFS_DIR_LEAF_MAGIC : XFS_DIR2_LEAFN_MAGIC)
-
-typedef struct xfs_da_blkinfo {
- xfs_dablk_t forw; /* previous block in list */
- xfs_dablk_t back; /* following block in list */
- __uint16_t magic; /* validity check on block */
- __uint16_t pad; /* unused */
-} xfs_da_blkinfo_t;
-
-/*
- * This is the structure of the root and intermediate nodes in the Btree.
- * The leaf nodes are defined above.
- *
- * Entries are not packed.
- *
- * Since we have duplicate keys, use a binary search but always follow
- * all match in the block, not just the first match found.
- */
-#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
-
-typedef struct xfs_da_intnode {
- struct xfs_da_node_hdr { /* constant-structure header block */
- xfs_da_blkinfo_t info; /* block type, links, etc. */
- __uint16_t count; /* count of active entries */
- __uint16_t level; /* level above leaves (leaf == 0) */
- } hdr;
- struct xfs_da_node_entry {
- xfs_dahash_t hashval; /* hash value for this descendant */
- xfs_dablk_t before; /* Btree block before this key */
- } btree[1]; /* variable sized array of keys */
-} xfs_da_intnode_t;
-typedef struct xfs_da_node_hdr xfs_da_node_hdr_t;
-typedef struct xfs_da_node_entry xfs_da_node_entry_t;
-
-#define XFS_DA_MAXHASH ((xfs_dahash_t)-1) /* largest valid hash value */
-
-#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize
-#define XFS_LBLOG(mp) (mp)->m_sb.sb_blocklog
-
-#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry) \
- (((bno) << (mp)->m_dircook_elog) | (entry))
-#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash) \
- (((xfs_off_t)XFS_DA_MAKE_BNOENTRY(mp, bno, entry) << 32) | (hash))
-#define XFS_DA_COOKIE_HASH(mp,cookie) ((xfs_dahash_t)cookie)
-#define XFS_DA_COOKIE_BNO(mp,cookie) \
- ((((xfs_off_t)(cookie) >> 31) == -1LL ? \
- (xfs_dablk_t)0 : \
- (xfs_dablk_t)((xfs_off_t)(cookie) >> \
- ((mp)->m_dircook_elog + 32))))
-#define XFS_DA_COOKIE_ENTRY(mp,cookie) \
- ((((xfs_off_t)(cookie) >> 31) == -1LL ? \
- (xfs_dablk_t)0 : \
- (xfs_dablk_t)(((xfs_off_t)(cookie) >> 32) & \
- ((1 << (mp)->m_dircook_elog) - 1))))
-
+struct xfs_da_geometry {
+ int blksize; /* da block size in bytes */
+ int fsbcount; /* da block size in filesystem blocks */
+ uint8_t fsblog; /* log2 of _filesystem_ block size */
+ uint8_t blklog; /* log2 of da block size */
+ uint node_ents; /* # of entries in a danode */
+ int magicpct; /* 37% of block size in bytes */
+ xfs_dablk_t datablk; /* blockno of dir data v2 */
+ xfs_dablk_t leafblk; /* blockno of leaf data v2 */
+ xfs_dablk_t freeblk; /* blockno of free data v2 */
+};
/*========================================================================
* Btree searching and modification structure definitions.
*========================================================================*/
/*
+ * Search comparison results
+ */
+enum xfs_dacmp {
+ XFS_CMP_DIFFERENT, /* names are completely different */
+ XFS_CMP_EXACT, /* names are exactly the same */
+ XFS_CMP_CASE /* names are same but differ in case */
+};
+
+/*
* Structure to ease passing around component names.
*/
typedef struct xfs_da_args {
- const uchar_t *name; /* string (maybe not NULL terminated) */
+ struct xfs_da_geometry *geo; /* da block geometry */
+ const __uint8_t *name; /* string (maybe not NULL terminated) */
int namelen; /* length of string (maybe no NULL) */
- uchar_t *value; /* set of bytes (maybe contain NULLs) */
+ __uint8_t filetype; /* filetype of inode for directories */
+ __uint8_t *value; /* set of bytes (maybe contain NULLs) */
int valuelen; /* length of value */
int flags; /* argument flags (eg: ATTR_NOCREATE) */
xfs_dahash_t hashval; /* hash value of name */
@@ -123,44 +78,31 @@ typedef struct xfs_da_args {
int index; /* index of attr of interest in blk */
xfs_dablk_t rmtblkno; /* remote attr value starting blkno */
int rmtblkcnt; /* remote attr value block count */
+ int rmtvaluelen; /* remote attr value length in bytes */
xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */
int index2; /* index of 2nd attr in blk */
xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
int rmtblkcnt2; /* remote attr value block count */
- unsigned char justcheck; /* T/F: check for ok with no space */
- unsigned char rename; /* T/F: this is an atomic rename op */
- unsigned char addname; /* T/F: this is an add operation */
- unsigned char oknoent; /* T/F: ok to return ENOENT, else die */
+ int rmtvaluelen2; /* remote attr value length in bytes */
+ int op_flags; /* operation flags */
+ enum xfs_dacmp cmpresult; /* name compare result for lookups */
} xfs_da_args_t;
/*
- * Structure to describe buffer(s) for a block.
- * This is needed in the directory version 2 format case, when
- * multiple non-contiguous fsblocks might be needed to cover one
- * logical directory block.
- * If the buffer count is 1 then the data pointer points to the
- * same place as the b_addr field for the buffer, else to kmem_alloced memory.
+ * Operation flags:
*/
-typedef struct xfs_dabuf {
- int nbuf; /* number of buffer pointers present */
- short dirty; /* data needs to be copied back */
- short bbcount; /* how large is data in bbs */
- void *data; /* pointer for buffers' data */
-#ifdef XFS_DABUF_DEBUG
- inst_t *ra; /* return address of caller to make */
- struct xfs_dabuf *next; /* next in global chain */
- struct xfs_dabuf *prev; /* previous in global chain */
- struct xfs_buftarg *target; /* device for buffer */
- xfs_daddr_t blkno; /* daddr first in bps[0] */
-#endif
- struct xfs_buf *bps[1]; /* actually nbuf of these */
-} xfs_dabuf_t;
-#define XFS_DA_BUF_SIZE(n) \
- (sizeof(xfs_dabuf_t) + sizeof(struct xfs_buf *) * ((n) - 1))
-
-#ifdef XFS_DABUF_DEBUG
-extern xfs_dabuf_t *xfs_dabuf_global_list;
-#endif
+#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */
+#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */
+#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */
+#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
+#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
+
+#define XFS_DA_OP_FLAGS \
+ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
+ { XFS_DA_OP_RENAME, "RENAME" }, \
+ { XFS_DA_OP_ADDNAME, "ADDNAME" }, \
+ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \
+ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }
/*
* Storage for holding state during Btree searches and split/join ops.
@@ -170,7 +112,7 @@ extern xfs_dabuf_t *xfs_dabuf_global_list;
* which is slightly more than enough.
*/
typedef struct xfs_da_state_blk {
- xfs_dabuf_t *bp; /* buffer containing block */
+ struct xfs_buf *bp; /* buffer containing block */
xfs_dablk_t blkno; /* filesystem blkno of buffer */
xfs_daddr_t disk_blkno; /* on-disk blkno (in BBs) of buffer */
int index; /* relevant index into block */
@@ -186,14 +128,12 @@ typedef struct xfs_da_state_path {
typedef struct xfs_da_state {
xfs_da_args_t *args; /* filename arguments */
struct xfs_mount *mp; /* filesystem mount point */
- unsigned int blocksize; /* logical block size */
- unsigned int node_ents; /* how many entries in danode */
xfs_da_state_path_t path; /* search/split paths */
xfs_da_state_path_t altpath; /* alternate path for join */
unsigned char inleaf; /* insert into 1->lf, 0->splf */
unsigned char extravalid; /* T/F: extrablk is in use */
unsigned char extraafter; /* T/F: extrablk is after new */
- xfs_da_state_blk_t extrablk; /* for double-splits on leafs */
+ xfs_da_state_blk_t extrablk; /* for double-splits on leaves */
/* for dirv2 extrablk is data */
} xfs_da_state_t;
@@ -205,66 +145,77 @@ typedef struct xfs_da_state {
(uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
(uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
+/*
+ * Name ops for directory and/or attr name operations
+ */
+struct xfs_nameops {
+ xfs_dahash_t (*hashname)(struct xfs_name *);
+ enum xfs_dacmp (*compname)(struct xfs_da_args *,
+ const unsigned char *, int);
+};
+
-#ifdef __KERNEL__
/*========================================================================
- * Function prototypes for the kernel.
+ * Function prototypes.
*========================================================================*/
/*
* Routines used for growing the Btree.
*/
-int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
- xfs_dabuf_t **bpp, int whichfork);
-int xfs_da_split(xfs_da_state_t *state);
+int xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno,
+ int level, struct xfs_buf **bpp, int whichfork);
+int xfs_da3_split(xfs_da_state_t *state);
/*
* Routines used for shrinking the Btree.
*/
-int xfs_da_join(xfs_da_state_t *state);
-void xfs_da_fixhashpath(xfs_da_state_t *state,
- xfs_da_state_path_t *path_to_to_fix);
+int xfs_da3_join(xfs_da_state_t *state);
+void xfs_da3_fixhashpath(struct xfs_da_state *state,
+ struct xfs_da_state_path *path_to_to_fix);
/*
* Routines used for finding things in the Btree.
*/
-int xfs_da_node_lookup_int(xfs_da_state_t *state, int *result);
-int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
+int xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result);
+int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
int forward, int release, int *result);
/*
* Utility routines.
*/
-int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
+int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
xfs_da_state_blk_t *new_blk);
+int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp, int which_fork);
/*
* Utility routines.
*/
int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
+int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
+ int count);
int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mappedbno,
- xfs_dabuf_t **bp, int whichfork);
+ struct xfs_buf **bp, int whichfork);
int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mappedbno,
- xfs_dabuf_t **bpp, int whichfork);
-xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
- xfs_dablk_t bno, int whichfork);
+ struct xfs_buf **bpp, int whichfork,
+ const struct xfs_buf_ops *ops);
+xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
+ xfs_daddr_t mapped_bno, int whichfork,
+ const struct xfs_buf_ops *ops);
int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
- xfs_dabuf_t *dead_buf);
+ struct xfs_buf *dead_buf);
+
+uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
+enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
+ const unsigned char *name, int len);
+
-uint xfs_da_hashname(const uchar_t *name_string, int name_length);
-uint xfs_da_log2_roundup(uint i);
xfs_da_state_t *xfs_da_state_alloc(void);
void xfs_da_state_free(xfs_da_state_t *state);
-void xfs_da_buf_done(xfs_dabuf_t *dabuf);
-void xfs_da_log_buf(struct xfs_trans *tp, xfs_dabuf_t *dabuf, uint first,
- uint last);
-void xfs_da_brelse(struct xfs_trans *tp, xfs_dabuf_t *dabuf);
-void xfs_da_binval(struct xfs_trans *tp, xfs_dabuf_t *dabuf);
-xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
-
extern struct kmem_zone *xfs_da_state_zone;
-#endif /* __KERNEL__ */
+extern const struct xfs_nameops xfs_default_nameops;
#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c
new file mode 100644
index 00000000000..c9aee52a37e
--- /dev/null
+++ b/fs/xfs/xfs_da_format.c
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+
+/*
+ * Shortform directory ops
+ */
+static int
+xfs_dir2_sf_entsize(
+ struct xfs_dir2_sf_hdr *hdr,
+ int len)
+{
+ int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
+
+ count += len; /* name */
+ count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
+ sizeof(xfs_dir2_ino4_t); /* ino # */
+ return count;
+}
+
+static int
+xfs_dir3_sf_entsize(
+ struct xfs_dir2_sf_hdr *hdr,
+ int len)
+{
+ return xfs_dir2_sf_entsize(hdr, len) + sizeof(__uint8_t);
+}
+
+static struct xfs_dir2_sf_entry *
+xfs_dir2_sf_nextentry(
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return (struct xfs_dir2_sf_entry *)
+ ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen));
+}
+
+static struct xfs_dir2_sf_entry *
+xfs_dir3_sf_nextentry(
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return (struct xfs_dir2_sf_entry *)
+ ((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen));
+}
+
+
+/*
+ * For filetype enabled shortform directories, the file type field is stored at
+ * the end of the name. Because it's only a single byte, endian conversion is
+ * not necessary. For non-filetype enable directories, the type is always
+ * unknown and we never store the value.
+ */
+static __uint8_t
+xfs_dir2_sfe_get_ftype(
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return XFS_DIR3_FT_UNKNOWN;
+}
+
+static void
+xfs_dir2_sfe_put_ftype(
+ struct xfs_dir2_sf_entry *sfep,
+ __uint8_t ftype)
+{
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+}
+
+static __uint8_t
+xfs_dir3_sfe_get_ftype(
+ struct xfs_dir2_sf_entry *sfep)
+{
+ __uint8_t ftype;
+
+ ftype = sfep->name[sfep->namelen];
+ if (ftype >= XFS_DIR3_FT_MAX)
+ return XFS_DIR3_FT_UNKNOWN;
+ return ftype;
+}
+
+static void
+xfs_dir3_sfe_put_ftype(
+ struct xfs_dir2_sf_entry *sfep,
+ __uint8_t ftype)
+{
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+
+ sfep->name[sfep->namelen] = ftype;
+}
+
+/*
+ * Inode numbers in short-form directories can come in two versions,
+ * either 4 bytes or 8 bytes wide. These helpers deal with the
+ * two forms transparently by looking at the headers i8count field.
+ *
+ * For 64-bit inode number the most significant byte must be zero.
+ */
+static xfs_ino_t
+xfs_dir2_sf_get_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ xfs_dir2_inou_t *from)
+{
+ if (hdr->i8count)
+ return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
+ else
+ return get_unaligned_be32(&from->i4.i);
+}
+
+static void
+xfs_dir2_sf_put_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ xfs_dir2_inou_t *to,
+ xfs_ino_t ino)
+{
+ ASSERT((ino & 0xff00000000000000ULL) == 0);
+
+ if (hdr->i8count)
+ put_unaligned_be64(ino, &to->i8.i);
+ else
+ put_unaligned_be32(ino, &to->i4.i);
+}
+
+static xfs_ino_t
+xfs_dir2_sf_get_parent_ino(
+ struct xfs_dir2_sf_hdr *hdr)
+{
+ return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
+}
+
+static void
+xfs_dir2_sf_put_parent_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ xfs_ino_t ino)
+{
+ xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino);
+}
+
+/*
+ * In short-form directory entries the inode numbers are stored at variable
+ * offset behind the entry name. If the entry stores a filetype value, then it
+ * sits between the name and the inode number. Hence the inode numbers may only
+ * be accessed through the helpers below.
+ */
+static xfs_ino_t
+xfs_dir2_sfe_get_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return xfs_dir2_sf_get_ino(hdr,
+ (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]);
+}
+
+static void
+xfs_dir2_sfe_put_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep,
+ xfs_ino_t ino)
+{
+ xfs_dir2_sf_put_ino(hdr,
+ (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino);
+}
+
+static xfs_ino_t
+xfs_dir3_sfe_get_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return xfs_dir2_sf_get_ino(hdr,
+ (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]);
+}
+
+static void
+xfs_dir3_sfe_put_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep,
+ xfs_ino_t ino)
+{
+ xfs_dir2_sf_put_ino(hdr,
+ (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
+}
+
+
+/*
+ * Directory data block operations
+ */
+
+/*
+ * For special situations, the dirent size ends up fixed because we always know
+ * what the size of the entry is. That's true for the "." and "..", and
+ * therefore we know that they are a fixed size and hence their offsets are
+ * constant, as is the first entry.
+ *
+ * Hence, this calculation is written as a macro to be able to be calculated at
+ * compile time and so certain offsets can be calculated directly in the
+ * structure initaliser via the macro. There are two macros - one for dirents
+ * with ftype and without so there are no unresolvable conditionals in the
+ * calculations. We also use round_up() as XFS_DIR2_DATA_ALIGN is always a power
+ * of 2 and the compiler doesn't reject it (unlike roundup()).
+ */
+#define XFS_DIR2_DATA_ENTSIZE(n) \
+ round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \
+ sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN)
+
+#define XFS_DIR3_DATA_ENTSIZE(n) \
+ round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \
+ sizeof(xfs_dir2_data_off_t) + sizeof(__uint8_t)), \
+ XFS_DIR2_DATA_ALIGN)
+
+static int
+xfs_dir2_data_entsize(
+ int n)
+{
+ return XFS_DIR2_DATA_ENTSIZE(n);
+}
+
+static int
+xfs_dir3_data_entsize(
+ int n)
+{
+ return XFS_DIR3_DATA_ENTSIZE(n);
+}
+
+static __uint8_t
+xfs_dir2_data_get_ftype(
+ struct xfs_dir2_data_entry *dep)
+{
+ return XFS_DIR3_FT_UNKNOWN;
+}
+
+static void
+xfs_dir2_data_put_ftype(
+ struct xfs_dir2_data_entry *dep,
+ __uint8_t ftype)
+{
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+}
+
+static __uint8_t
+xfs_dir3_data_get_ftype(
+ struct xfs_dir2_data_entry *dep)
+{
+ __uint8_t ftype = dep->name[dep->namelen];
+
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+ if (ftype >= XFS_DIR3_FT_MAX)
+ return XFS_DIR3_FT_UNKNOWN;
+ return ftype;
+}
+
+static void
+xfs_dir3_data_put_ftype(
+ struct xfs_dir2_data_entry *dep,
+ __uint8_t type)
+{
+ ASSERT(type < XFS_DIR3_FT_MAX);
+ ASSERT(dep->namelen != 0);
+
+ dep->name[dep->namelen] = type;
+}
+
+/*
+ * Pointer to an entry's tag word.
+ */
+static __be16 *
+xfs_dir2_data_entry_tag_p(
+ struct xfs_dir2_data_entry *dep)
+{
+ return (__be16 *)((char *)dep +
+ xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16));
+}
+
+static __be16 *
+xfs_dir3_data_entry_tag_p(
+ struct xfs_dir2_data_entry *dep)
+{
+ return (__be16 *)((char *)dep +
+ xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16));
+}
+
+/*
+ * location of . and .. in data space (always block 0)
+ */
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_dot_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_dotdot_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR2_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_first_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR2_DATA_ENTSIZE(1) +
+ XFS_DIR2_DATA_ENTSIZE(2));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_ftype_data_dotdot_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_ftype_data_first_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1) +
+ XFS_DIR3_DATA_ENTSIZE(2));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_dot_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_dotdot_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_first_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1) +
+ XFS_DIR3_DATA_ENTSIZE(2));
+}
+
+static struct xfs_dir2_data_free *
+xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return hdr->bestfree;
+}
+
+static struct xfs_dir2_data_free *
+xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
+}
+
+static struct xfs_dir2_data_unused *
+xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_unused *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
+}
+
+static struct xfs_dir2_data_unused *
+xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_unused *)
+ ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
+}
+
+
+/*
+ * Directory Leaf block operations
+ */
+static int
+xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo)
+{
+ return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) /
+ (uint)sizeof(struct xfs_dir2_leaf_entry);
+}
+
+static struct xfs_dir2_leaf_entry *
+xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+ return lp->__ents;
+}
+
+static int
+xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo)
+{
+ return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) /
+ (uint)sizeof(struct xfs_dir2_leaf_entry);
+}
+
+static struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+ return ((struct xfs_dir3_leaf *)lp)->__ents;
+}
+
+static void
+xfs_dir2_leaf_hdr_from_disk(
+ struct xfs_dir3_icleaf_hdr *to,
+ struct xfs_dir2_leaf *from)
+{
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.count);
+ to->stale = be16_to_cpu(from->hdr.stale);
+
+ ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
+ to->magic == XFS_DIR2_LEAFN_MAGIC);
+}
+
+static void
+xfs_dir2_leaf_hdr_to_disk(
+ struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from)
+{
+ ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
+ from->magic == XFS_DIR2_LEAFN_MAGIC);
+
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.count = cpu_to_be16(from->count);
+ to->hdr.stale = cpu_to_be16(from->stale);
+}
+
+static void
+xfs_dir3_leaf_hdr_from_disk(
+ struct xfs_dir3_icleaf_hdr *to,
+ struct xfs_dir2_leaf *from)
+{
+ struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from;
+
+ to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+ to->back = be32_to_cpu(hdr3->info.hdr.back);
+ to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+ to->count = be16_to_cpu(hdr3->count);
+ to->stale = be16_to_cpu(hdr3->stale);
+
+ ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC ||
+ to->magic == XFS_DIR3_LEAFN_MAGIC);
+}
+
+static void
+xfs_dir3_leaf_hdr_to_disk(
+ struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from)
+{
+ struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to;
+
+ ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
+ from->magic == XFS_DIR3_LEAFN_MAGIC);
+
+ hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+ hdr3->info.hdr.back = cpu_to_be32(from->back);
+ hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+ hdr3->count = cpu_to_be16(from->count);
+ hdr3->stale = cpu_to_be16(from->stale);
+}
+
+
+/*
+ * Directory/Attribute Node block operations
+ */
+static struct xfs_da_node_entry *
+xfs_da2_node_tree_p(struct xfs_da_intnode *dap)
+{
+ return dap->__btree;
+}
+
+static struct xfs_da_node_entry *
+xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
+{
+ return ((struct xfs_da3_intnode *)dap)->__btree;
+}
+
+static void
+xfs_da2_node_hdr_from_disk(
+ struct xfs_da3_icnode_hdr *to,
+ struct xfs_da_intnode *from)
+{
+ ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.__count);
+ to->level = be16_to_cpu(from->hdr.__level);
+}
+
+static void
+xfs_da2_node_hdr_to_disk(
+ struct xfs_da_intnode *to,
+ struct xfs_da3_icnode_hdr *from)
+{
+ ASSERT(from->magic == XFS_DA_NODE_MAGIC);
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.__count = cpu_to_be16(from->count);
+ to->hdr.__level = cpu_to_be16(from->level);
+}
+
+static void
+xfs_da3_node_hdr_from_disk(
+ struct xfs_da3_icnode_hdr *to,
+ struct xfs_da_intnode *from)
+{
+ struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from;
+
+ ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+ to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+ to->back = be32_to_cpu(hdr3->info.hdr.back);
+ to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+ to->count = be16_to_cpu(hdr3->__count);
+ to->level = be16_to_cpu(hdr3->__level);
+}
+
+static void
+xfs_da3_node_hdr_to_disk(
+ struct xfs_da_intnode *to,
+ struct xfs_da3_icnode_hdr *from)
+{
+ struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to;
+
+ ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
+ hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+ hdr3->info.hdr.back = cpu_to_be32(from->back);
+ hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+ hdr3->__count = cpu_to_be16(from->count);
+ hdr3->__level = cpu_to_be16(from->level);
+}
+
+
+/*
+ * Directory free space block operations
+ */
+static int
+xfs_dir2_free_max_bests(struct xfs_da_geometry *geo)
+{
+ return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) /
+ sizeof(xfs_dir2_data_off_t);
+}
+
+static __be16 *
+xfs_dir2_free_bests_p(struct xfs_dir2_free *free)
+{
+ return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr));
+}
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+static xfs_dir2_db_t
+xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
+ (db / xfs_dir2_free_max_bests(geo));
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static int
+xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return db % xfs_dir2_free_max_bests(geo);
+}
+
+static int
+xfs_dir3_free_max_bests(struct xfs_da_geometry *geo)
+{
+ return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) /
+ sizeof(xfs_dir2_data_off_t);
+}
+
+static __be16 *
+xfs_dir3_free_bests_p(struct xfs_dir2_free *free)
+{
+ return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr));
+}
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+static xfs_dir2_db_t
+xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
+ (db / xfs_dir3_free_max_bests(geo));
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static int
+xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return db % xfs_dir3_free_max_bests(geo);
+}
+
+static void
+xfs_dir2_free_hdr_from_disk(
+ struct xfs_dir3_icfree_hdr *to,
+ struct xfs_dir2_free *from)
+{
+ to->magic = be32_to_cpu(from->hdr.magic);
+ to->firstdb = be32_to_cpu(from->hdr.firstdb);
+ to->nvalid = be32_to_cpu(from->hdr.nvalid);
+ to->nused = be32_to_cpu(from->hdr.nused);
+ ASSERT(to->magic == XFS_DIR2_FREE_MAGIC);
+}
+
+static void
+xfs_dir2_free_hdr_to_disk(
+ struct xfs_dir2_free *to,
+ struct xfs_dir3_icfree_hdr *from)
+{
+ ASSERT(from->magic == XFS_DIR2_FREE_MAGIC);
+
+ to->hdr.magic = cpu_to_be32(from->magic);
+ to->hdr.firstdb = cpu_to_be32(from->firstdb);
+ to->hdr.nvalid = cpu_to_be32(from->nvalid);
+ to->hdr.nused = cpu_to_be32(from->nused);
+}
+
+static void
+xfs_dir3_free_hdr_from_disk(
+ struct xfs_dir3_icfree_hdr *to,
+ struct xfs_dir2_free *from)
+{
+ struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from;
+
+ to->magic = be32_to_cpu(hdr3->hdr.magic);
+ to->firstdb = be32_to_cpu(hdr3->firstdb);
+ to->nvalid = be32_to_cpu(hdr3->nvalid);
+ to->nused = be32_to_cpu(hdr3->nused);
+
+ ASSERT(to->magic == XFS_DIR3_FREE_MAGIC);
+}
+
+static void
+xfs_dir3_free_hdr_to_disk(
+ struct xfs_dir2_free *to,
+ struct xfs_dir3_icfree_hdr *from)
+{
+ struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to;
+
+ ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
+
+ hdr3->hdr.magic = cpu_to_be32(from->magic);
+ hdr3->firstdb = cpu_to_be32(from->firstdb);
+ hdr3->nvalid = cpu_to_be32(from->nvalid);
+ hdr3->nused = cpu_to_be32(from->nused);
+}
+
+static const struct xfs_dir_ops xfs_dir2_ops = {
+ .sf_entsize = xfs_dir2_sf_entsize,
+ .sf_nextentry = xfs_dir2_sf_nextentry,
+ .sf_get_ftype = xfs_dir2_sfe_get_ftype,
+ .sf_put_ftype = xfs_dir2_sfe_put_ftype,
+ .sf_get_ino = xfs_dir2_sfe_get_ino,
+ .sf_put_ino = xfs_dir2_sfe_put_ino,
+ .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+ .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+ .data_entsize = xfs_dir2_data_entsize,
+ .data_get_ftype = xfs_dir2_data_get_ftype,
+ .data_put_ftype = xfs_dir2_data_put_ftype,
+ .data_entry_tag_p = xfs_dir2_data_entry_tag_p,
+ .data_bestfree_p = xfs_dir2_data_bestfree_p,
+
+ .data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
+ .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR2_DATA_ENTSIZE(1),
+ .data_first_offset = sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR2_DATA_ENTSIZE(1) +
+ XFS_DIR2_DATA_ENTSIZE(2),
+ .data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
+
+ .data_dot_entry_p = xfs_dir2_data_dot_entry_p,
+ .data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
+ .data_first_entry_p = xfs_dir2_data_first_entry_p,
+ .data_entry_p = xfs_dir2_data_entry_p,
+ .data_unused_p = xfs_dir2_data_unused_p,
+
+ .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
+ .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
+ .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
+ .leaf_max_ents = xfs_dir2_max_leaf_ents,
+ .leaf_ents_p = xfs_dir2_leaf_ents_p,
+
+ .node_hdr_size = sizeof(struct xfs_da_node_hdr),
+ .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+ .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+ .node_tree_p = xfs_da2_node_tree_p,
+
+ .free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
+ .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
+ .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
+ .free_max_bests = xfs_dir2_free_max_bests,
+ .free_bests_p = xfs_dir2_free_bests_p,
+ .db_to_fdb = xfs_dir2_db_to_fdb,
+ .db_to_fdindex = xfs_dir2_db_to_fdindex,
+};
+
+static const struct xfs_dir_ops xfs_dir2_ftype_ops = {
+ .sf_entsize = xfs_dir3_sf_entsize,
+ .sf_nextentry = xfs_dir3_sf_nextentry,
+ .sf_get_ftype = xfs_dir3_sfe_get_ftype,
+ .sf_put_ftype = xfs_dir3_sfe_put_ftype,
+ .sf_get_ino = xfs_dir3_sfe_get_ino,
+ .sf_put_ino = xfs_dir3_sfe_put_ino,
+ .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+ .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+ .data_entsize = xfs_dir3_data_entsize,
+ .data_get_ftype = xfs_dir3_data_get_ftype,
+ .data_put_ftype = xfs_dir3_data_put_ftype,
+ .data_entry_tag_p = xfs_dir3_data_entry_tag_p,
+ .data_bestfree_p = xfs_dir2_data_bestfree_p,
+
+ .data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
+ .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1),
+ .data_first_offset = sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1) +
+ XFS_DIR3_DATA_ENTSIZE(2),
+ .data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
+
+ .data_dot_entry_p = xfs_dir2_data_dot_entry_p,
+ .data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p,
+ .data_first_entry_p = xfs_dir2_ftype_data_first_entry_p,
+ .data_entry_p = xfs_dir2_data_entry_p,
+ .data_unused_p = xfs_dir2_data_unused_p,
+
+ .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
+ .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
+ .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
+ .leaf_max_ents = xfs_dir2_max_leaf_ents,
+ .leaf_ents_p = xfs_dir2_leaf_ents_p,
+
+ .node_hdr_size = sizeof(struct xfs_da_node_hdr),
+ .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+ .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+ .node_tree_p = xfs_da2_node_tree_p,
+
+ .free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
+ .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
+ .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
+ .free_max_bests = xfs_dir2_free_max_bests,
+ .free_bests_p = xfs_dir2_free_bests_p,
+ .db_to_fdb = xfs_dir2_db_to_fdb,
+ .db_to_fdindex = xfs_dir2_db_to_fdindex,
+};
+
+static const struct xfs_dir_ops xfs_dir3_ops = {
+ .sf_entsize = xfs_dir3_sf_entsize,
+ .sf_nextentry = xfs_dir3_sf_nextentry,
+ .sf_get_ftype = xfs_dir3_sfe_get_ftype,
+ .sf_put_ftype = xfs_dir3_sfe_put_ftype,
+ .sf_get_ino = xfs_dir3_sfe_get_ino,
+ .sf_put_ino = xfs_dir3_sfe_put_ino,
+ .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+ .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+ .data_entsize = xfs_dir3_data_entsize,
+ .data_get_ftype = xfs_dir3_data_get_ftype,
+ .data_put_ftype = xfs_dir3_data_put_ftype,
+ .data_entry_tag_p = xfs_dir3_data_entry_tag_p,
+ .data_bestfree_p = xfs_dir3_data_bestfree_p,
+
+ .data_dot_offset = sizeof(struct xfs_dir3_data_hdr),
+ .data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1),
+ .data_first_offset = sizeof(struct xfs_dir3_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1) +
+ XFS_DIR3_DATA_ENTSIZE(2),
+ .data_entry_offset = sizeof(struct xfs_dir3_data_hdr),
+
+ .data_dot_entry_p = xfs_dir3_data_dot_entry_p,
+ .data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p,
+ .data_first_entry_p = xfs_dir3_data_first_entry_p,
+ .data_entry_p = xfs_dir3_data_entry_p,
+ .data_unused_p = xfs_dir3_data_unused_p,
+
+ .leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr),
+ .leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk,
+ .leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk,
+ .leaf_max_ents = xfs_dir3_max_leaf_ents,
+ .leaf_ents_p = xfs_dir3_leaf_ents_p,
+
+ .node_hdr_size = sizeof(struct xfs_da3_node_hdr),
+ .node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
+ .node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
+ .node_tree_p = xfs_da3_node_tree_p,
+
+ .free_hdr_size = sizeof(struct xfs_dir3_free_hdr),
+ .free_hdr_to_disk = xfs_dir3_free_hdr_to_disk,
+ .free_hdr_from_disk = xfs_dir3_free_hdr_from_disk,
+ .free_max_bests = xfs_dir3_free_max_bests,
+ .free_bests_p = xfs_dir3_free_bests_p,
+ .db_to_fdb = xfs_dir3_db_to_fdb,
+ .db_to_fdindex = xfs_dir3_db_to_fdindex,
+};
+
+static const struct xfs_dir_ops xfs_dir2_nondir_ops = {
+ .node_hdr_size = sizeof(struct xfs_da_node_hdr),
+ .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+ .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+ .node_tree_p = xfs_da2_node_tree_p,
+};
+
+static const struct xfs_dir_ops xfs_dir3_nondir_ops = {
+ .node_hdr_size = sizeof(struct xfs_da3_node_hdr),
+ .node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
+ .node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
+ .node_tree_p = xfs_da3_node_tree_p,
+};
+
+/*
+ * Return the ops structure according to the current config. If we are passed
+ * an inode, then that overrides the default config we use which is based on
+ * feature bits.
+ */
+const struct xfs_dir_ops *
+xfs_dir_get_ops(
+ struct xfs_mount *mp,
+ struct xfs_inode *dp)
+{
+ if (dp)
+ return dp->d_ops;
+ if (mp->m_dir_inode_ops)
+ return mp->m_dir_inode_ops;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ return &xfs_dir3_ops;
+ if (xfs_sb_version_hasftype(&mp->m_sb))
+ return &xfs_dir2_ftype_ops;
+ return &xfs_dir2_ops;
+}
+
+const struct xfs_dir_ops *
+xfs_nondir_get_ops(
+ struct xfs_mount *mp,
+ struct xfs_inode *dp)
+{
+ if (dp)
+ return dp->d_ops;
+ if (mp->m_nondir_inode_ops)
+ return mp->m_nondir_inode_ops;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ return &xfs_dir3_nondir_ops;
+ return &xfs_dir2_nondir_ops;
+}
diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/xfs_da_format.h
new file mode 100644
index 00000000000..0a49b028637
--- /dev/null
+++ b/fs/xfs/xfs_da_format.h
@@ -0,0 +1,861 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_DA_FORMAT_H__
+#define __XFS_DA_FORMAT_H__
+
+/*
+ * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
+ *
+ * It is used to manage a doubly linked list of all blocks at the same
+ * level in the Btree, and to identify which type of block this is.
+ */
+#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */
+#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */
+#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
+#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
+
+typedef struct xfs_da_blkinfo {
+ __be32 forw; /* previous block in list */
+ __be32 back; /* following block in list */
+ __be16 magic; /* validity check on block */
+ __be16 pad; /* unused */
+} xfs_da_blkinfo_t;
+
+/*
+ * CRC enabled directory structure types
+ *
+ * The headers change size for the additional verification information, but
+ * otherwise the tree layouts and contents are unchanged. Hence the da btree
+ * code can use the struct xfs_da_blkinfo for manipulating the tree links and
+ * magic numbers without modification for both v2 and v3 nodes.
+ */
+#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */
+#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */
+#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */
+#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */
+
+struct xfs_da3_blkinfo {
+ /*
+ * the node link manipulation code relies on the fact that the first
+ * element of this structure is the struct xfs_da_blkinfo so it can
+ * ignore the differences in the rest of the structures.
+ */
+ struct xfs_da_blkinfo hdr;
+ __be32 crc; /* CRC of block */
+ __be64 blkno; /* first block of the buffer */
+ __be64 lsn; /* sequence number of last write */
+ uuid_t uuid; /* filesystem we belong to */
+ __be64 owner; /* inode that owns the block */
+};
+
+/*
+ * This is the structure of the root and intermediate nodes in the Btree.
+ * The leaf nodes are defined above.
+ *
+ * Entries are not packed.
+ *
+ * Since we have duplicate keys, use a binary search but always follow
+ * all match in the block, not just the first match found.
+ */
+#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
+
+typedef struct xfs_da_node_hdr {
+ struct xfs_da_blkinfo info; /* block type, links, etc. */
+ __be16 __count; /* count of active entries */
+ __be16 __level; /* level above leaves (leaf == 0) */
+} xfs_da_node_hdr_t;
+
+struct xfs_da3_node_hdr {
+ struct xfs_da3_blkinfo info; /* block type, links, etc. */
+ __be16 __count; /* count of active entries */
+ __be16 __level; /* level above leaves (leaf == 0) */
+ __be32 __pad32;
+};
+
+#define XFS_DA3_NODE_CRC_OFF (offsetof(struct xfs_da3_node_hdr, info.crc))
+
+typedef struct xfs_da_node_entry {
+ __be32 hashval; /* hash value for this descendant */
+ __be32 before; /* Btree block before this key */
+} xfs_da_node_entry_t;
+
+typedef struct xfs_da_intnode {
+ struct xfs_da_node_hdr hdr;
+ struct xfs_da_node_entry __btree[];
+} xfs_da_intnode_t;
+
+struct xfs_da3_intnode {
+ struct xfs_da3_node_hdr hdr;
+ struct xfs_da_node_entry __btree[];
+};
+
+/*
+ * In-core version of the node header to abstract the differences in the v2 and
+ * v3 disk format of the headers. Callers need to convert to/from disk format as
+ * appropriate.
+ */
+struct xfs_da3_icnode_hdr {
+ __uint32_t forw;
+ __uint32_t back;
+ __uint16_t magic;
+ __uint16_t count;
+ __uint16_t level;
+};
+
+/*
+ * Directory version 2.
+ *
+ * There are 4 possible formats:
+ * - shortform - embedded into the inode
+ * - single block - data with embedded leaf at the end
+ * - multiple data blocks, single leaf+freeindex block
+ * - data blocks, node and leaf blocks (btree), freeindex blocks
+ *
+ * Note: many node blocks structures and constants are shared with the attr
+ * code and defined in xfs_da_btree.h.
+ */
+
+#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: single block dirs */
+#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: multiblock dirs */
+#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F: free index blocks */
+
+/*
+ * Directory Version 3 With CRCs.
+ *
+ * The tree formats are the same as for version 2 directories. The difference
+ * is in the block header and dirent formats. In many cases the v3 structures
+ * use v2 definitions as they are no different and this makes code sharing much
+ * easier.
+ *
+ * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the
+ * format is v2 then they switch to the existing v2 code, or the format is v3
+ * they implement the v3 functionality. This means the existing dir2 is a mix of
+ * xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3 functions are called
+ * where there is a difference in the formats, otherwise the code is unchanged.
+ *
+ * Where it is possible, the code decides what to do based on the magic numbers
+ * in the blocks rather than feature bits in the superblock. This means the code
+ * is as independent of the external XFS code as possible as doesn't require
+ * passing struct xfs_mount pointers into places where it isn't really
+ * necessary.
+ *
+ * Version 3 includes:
+ *
+ * - a larger block header for CRC and identification purposes and so the
+ * offsets of all the structures inside the blocks are different.
+ *
+ * - new magic numbers to be able to detect the v2/v3 types on the fly.
+ */
+
+#define XFS_DIR3_BLOCK_MAGIC 0x58444233 /* XDB3: single block dirs */
+#define XFS_DIR3_DATA_MAGIC 0x58444433 /* XDD3: multiblock dirs */
+#define XFS_DIR3_FREE_MAGIC 0x58444633 /* XDF3: free index blocks */
+
+/*
+ * Dirents in version 3 directories have a file type field. Additions to this
+ * list are an on-disk format change, requiring feature bits. Valid values
+ * are as follows:
+ */
+#define XFS_DIR3_FT_UNKNOWN 0
+#define XFS_DIR3_FT_REG_FILE 1
+#define XFS_DIR3_FT_DIR 2
+#define XFS_DIR3_FT_CHRDEV 3
+#define XFS_DIR3_FT_BLKDEV 4
+#define XFS_DIR3_FT_FIFO 5
+#define XFS_DIR3_FT_SOCK 6
+#define XFS_DIR3_FT_SYMLINK 7
+#define XFS_DIR3_FT_WHT 8
+
+#define XFS_DIR3_FT_MAX 9
+
+/*
+ * Byte offset in data block and shortform entry.
+ */
+typedef __uint16_t xfs_dir2_data_off_t;
+#define NULLDATAOFF 0xffffU
+typedef uint xfs_dir2_data_aoff_t; /* argument form */
+
+/*
+ * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
+ * Only need 16 bits, this is the byte offset into the single block form.
+ */
+typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
+
+/*
+ * Offset in data space of a data entry.
+ */
+typedef __uint32_t xfs_dir2_dataptr_t;
+#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff)
+#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0)
+
+/*
+ * Byte offset in a directory.
+ */
+typedef xfs_off_t xfs_dir2_off_t;
+
+/*
+ * Directory block number (logical dirblk in file)
+ */
+typedef __uint32_t xfs_dir2_db_t;
+
+/*
+ * Inode number stored as 8 8-bit values.
+ */
+typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
+
+/*
+ * Inode number stored as 4 8-bit values.
+ * Works a lot of the time, when all the inode numbers in a directory
+ * fit in 32 bits.
+ */
+typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
+
+typedef union {
+ xfs_dir2_ino8_t i8;
+ xfs_dir2_ino4_t i4;
+} xfs_dir2_inou_t;
+#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
+
+/*
+ * Directory layout when stored internal to an inode.
+ *
+ * Small directories are packed as tightly as possible so as to fit into the
+ * literal area of the inode. These "shortform" directories consist of a
+ * single xfs_dir2_sf_hdr header followed by zero or more xfs_dir2_sf_entry
+ * structures. Due the different inode number storage size and the variable
+ * length name field in the xfs_dir2_sf_entry all these structure are
+ * variable length, and the accessors in this file should be used to iterate
+ * over them.
+ */
+typedef struct xfs_dir2_sf_hdr {
+ __uint8_t count; /* count of entries */
+ __uint8_t i8count; /* count of 8-byte inode #s */
+ xfs_dir2_inou_t parent; /* parent dir inode number */
+} __arch_pack xfs_dir2_sf_hdr_t;
+
+typedef struct xfs_dir2_sf_entry {
+ __u8 namelen; /* actual name length */
+ xfs_dir2_sf_off_t offset; /* saved offset */
+ __u8 name[]; /* name, variable size */
+ /*
+ * A single byte containing the file type field follows the inode
+ * number for version 3 directory entries.
+ *
+ * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
+ * variable offset after the name.
+ */
+} __arch_pack xfs_dir2_sf_entry_t;
+
+static inline int xfs_dir2_sf_hdr_size(int i8count)
+{
+ return sizeof(struct xfs_dir2_sf_hdr) -
+ (i8count == 0) *
+ (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t));
+}
+
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
+{
+ return get_unaligned_be16(&sfep->offset.i);
+}
+
+static inline void
+xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
+{
+ put_unaligned_be16(off, &sfep->offset.i);
+}
+
+static inline struct xfs_dir2_sf_entry *
+xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
+{
+ return (struct xfs_dir2_sf_entry *)
+ ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count));
+}
+
+/*
+ * Data block structures.
+ *
+ * A pure data block looks like the following drawing on disk:
+ *
+ * +-------------------------------------------------+
+ * | xfs_dir2_data_hdr_t |
+ * +-------------------------------------------------+
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | ... |
+ * +-------------------------------------------------+
+ * | unused space |
+ * +-------------------------------------------------+
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ *
+ * In addition to the pure data blocks for the data and node formats,
+ * most structures are also used for the combined data/freespace "block"
+ * format below.
+ */
+
+#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */
+#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG)
+#define XFS_DIR2_DATA_FREE_TAG 0xffff
+#define XFS_DIR2_DATA_FD_COUNT 3
+
+/*
+ * Directory address space divided into sections,
+ * spaces separated by 32GB.
+ */
+#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
+#define XFS_DIR2_DATA_SPACE 0
+#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
+
+/*
+ * Describe a free area in the data block.
+ *
+ * The freespace will be formatted as a xfs_dir2_data_unused_t.
+ */
+typedef struct xfs_dir2_data_free {
+ __be16 offset; /* start of freespace */
+ __be16 length; /* length of freespace */
+} xfs_dir2_data_free_t;
+
+/*
+ * Header for the data blocks.
+ *
+ * The code knows that XFS_DIR2_DATA_FD_COUNT is 3.
+ */
+typedef struct xfs_dir2_data_hdr {
+ __be32 magic; /* XFS_DIR2_DATA_MAGIC or */
+ /* XFS_DIR2_BLOCK_MAGIC */
+ xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT];
+} xfs_dir2_data_hdr_t;
+
+/*
+ * define a structure for all the verification fields we are adding to the
+ * directory block structures. This will be used in several structures.
+ * The magic number must be the first entry to align with all the dir2
+ * structures so we determine how to decode them just by the magic number.
+ */
+struct xfs_dir3_blk_hdr {
+ __be32 magic; /* magic number */
+ __be32 crc; /* CRC of block */
+ __be64 blkno; /* first block of the buffer */
+ __be64 lsn; /* sequence number of last write */
+ uuid_t uuid; /* filesystem we belong to */
+ __be64 owner; /* inode that owns the block */
+};
+
+struct xfs_dir3_data_hdr {
+ struct xfs_dir3_blk_hdr hdr;
+ xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT];
+ __be32 pad; /* 64 bit alignment */
+};
+
+#define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc)
+
+/*
+ * Active entry in a data block.
+ *
+ * Aligned to 8 bytes. After the variable length name field there is a
+ * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p.
+ *
+ * For dir3 structures, there is file type field between the name and the tag.
+ * This can only be manipulated by helper functions. It is packed hard against
+ * the end of the name so any padding for rounding is between the file type and
+ * the tag.
+ */
+typedef struct xfs_dir2_data_entry {
+ __be64 inumber; /* inode number */
+ __u8 namelen; /* name length */
+ __u8 name[]; /* name bytes, no null */
+ /* __u8 filetype; */ /* type of inode we point to */
+ /* __be16 tag; */ /* starting offset of us */
+} xfs_dir2_data_entry_t;
+
+/*
+ * Unused entry in a data block.
+ *
+ * Aligned to 8 bytes. Tag appears as the last 2 bytes and must be accessed
+ * using xfs_dir2_data_unused_tag_p.
+ */
+typedef struct xfs_dir2_data_unused {
+ __be16 freetag; /* XFS_DIR2_DATA_FREE_TAG */
+ __be16 length; /* total free length */
+ /* variable offset */
+ __be16 tag; /* starting offset of us */
+} xfs_dir2_data_unused_t;
+
+/*
+ * Pointer to a freespace's tag word.
+ */
+static inline __be16 *
+xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup)
+{
+ return (__be16 *)((char *)dup +
+ be16_to_cpu(dup->length) - sizeof(__be16));
+}
+
+/*
+ * Leaf block structures.
+ *
+ * A pure leaf block looks like the following drawing on disk:
+ *
+ * +---------------------------+
+ * | xfs_dir2_leaf_hdr_t |
+ * +---------------------------+
+ * | xfs_dir2_leaf_entry_t |
+ * | xfs_dir2_leaf_entry_t |
+ * | xfs_dir2_leaf_entry_t |
+ * | xfs_dir2_leaf_entry_t |
+ * | ... |
+ * +---------------------------+
+ * | xfs_dir2_data_off_t |
+ * | xfs_dir2_data_off_t |
+ * | xfs_dir2_data_off_t |
+ * | ... |
+ * +---------------------------+
+ * | xfs_dir2_leaf_tail_t |
+ * +---------------------------+
+ *
+ * The xfs_dir2_data_off_t members (bests) and tail are at the end of the block
+ * for single-leaf (magic = XFS_DIR2_LEAF1_MAGIC) blocks only, but not present
+ * for directories with separate leaf nodes and free space blocks
+ * (magic = XFS_DIR2_LEAFN_MAGIC).
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ */
+
+/*
+ * Offset of the leaf/node space. First block in this space
+ * is the btree root.
+ */
+#define XFS_DIR2_LEAF_SPACE 1
+#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
+
+/*
+ * Leaf block header.
+ */
+typedef struct xfs_dir2_leaf_hdr {
+ xfs_da_blkinfo_t info; /* header for da routines */
+ __be16 count; /* count of entries */
+ __be16 stale; /* count of stale entries */
+} xfs_dir2_leaf_hdr_t;
+
+struct xfs_dir3_leaf_hdr {
+ struct xfs_da3_blkinfo info; /* header for da routines */
+ __be16 count; /* count of entries */
+ __be16 stale; /* count of stale entries */
+ __be32 pad; /* 64 bit alignment */
+};
+
+struct xfs_dir3_icleaf_hdr {
+ __uint32_t forw;
+ __uint32_t back;
+ __uint16_t magic;
+ __uint16_t count;
+ __uint16_t stale;
+};
+
+/*
+ * Leaf block entry.
+ */
+typedef struct xfs_dir2_leaf_entry {
+ __be32 hashval; /* hash value of name */
+ __be32 address; /* address of data entry */
+} xfs_dir2_leaf_entry_t;
+
+/*
+ * Leaf block tail.
+ */
+typedef struct xfs_dir2_leaf_tail {
+ __be32 bestcount;
+} xfs_dir2_leaf_tail_t;
+
+/*
+ * Leaf block.
+ */
+typedef struct xfs_dir2_leaf {
+ xfs_dir2_leaf_hdr_t hdr; /* leaf header */
+ xfs_dir2_leaf_entry_t __ents[]; /* entries */
+} xfs_dir2_leaf_t;
+
+struct xfs_dir3_leaf {
+ struct xfs_dir3_leaf_hdr hdr; /* leaf header */
+ struct xfs_dir2_leaf_entry __ents[]; /* entries */
+};
+
+#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc)
+
+/*
+ * Get address of the bests array in the single-leaf block.
+ */
+static inline __be16 *
+xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp)
+{
+ return (__be16 *)ltp - be32_to_cpu(ltp->bestcount);
+}
+
+/*
+ * Free space block defintions for the node format.
+ */
+
+/*
+ * Offset of the freespace index.
+ */
+#define XFS_DIR2_FREE_SPACE 2
+#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
+
+typedef struct xfs_dir2_free_hdr {
+ __be32 magic; /* XFS_DIR2_FREE_MAGIC */
+ __be32 firstdb; /* db of first entry */
+ __be32 nvalid; /* count of valid entries */
+ __be32 nused; /* count of used entries */
+} xfs_dir2_free_hdr_t;
+
+typedef struct xfs_dir2_free {
+ xfs_dir2_free_hdr_t hdr; /* block header */
+ __be16 bests[]; /* best free counts */
+ /* unused entries are -1 */
+} xfs_dir2_free_t;
+
+struct xfs_dir3_free_hdr {
+ struct xfs_dir3_blk_hdr hdr;
+ __be32 firstdb; /* db of first entry */
+ __be32 nvalid; /* count of valid entries */
+ __be32 nused; /* count of used entries */
+ __be32 pad; /* 64 bit alignment */
+};
+
+struct xfs_dir3_free {
+ struct xfs_dir3_free_hdr hdr;
+ __be16 bests[]; /* best free counts */
+ /* unused entries are -1 */
+};
+
+#define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc)
+
+/*
+ * In core version of the free block header, abstracted away from on-disk format
+ * differences. Use this in the code, and convert to/from the disk version using
+ * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk.
+ */
+struct xfs_dir3_icfree_hdr {
+ __uint32_t magic;
+ __uint32_t firstdb;
+ __uint32_t nvalid;
+ __uint32_t nused;
+
+};
+
+/*
+ * Single block format.
+ *
+ * The single block format looks like the following drawing on disk:
+ *
+ * +-------------------------------------------------+
+ * | xfs_dir2_data_hdr_t |
+ * +-------------------------------------------------+
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t :
+ * | ... |
+ * +-------------------------------------------------+
+ * | unused space |
+ * +-------------------------------------------------+
+ * | ... |
+ * | xfs_dir2_leaf_entry_t |
+ * | xfs_dir2_leaf_entry_t |
+ * +-------------------------------------------------+
+ * | xfs_dir2_block_tail_t |
+ * +-------------------------------------------------+
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ */
+
+typedef struct xfs_dir2_block_tail {
+ __be32 count; /* count of leaf entries */
+ __be32 stale; /* count of stale lf entries */
+} xfs_dir2_block_tail_t;
+
+/*
+ * Pointer to the leaf entries embedded in a data block (1-block format)
+ */
+static inline struct xfs_dir2_leaf_entry *
+xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
+{
+ return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count);
+}
+
+
+/*
+ * Attribute storage layout
+ *
+ * Attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes. Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree. Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys. The
+ * internal links in the Btree are logical block offsets into the file.
+ *
+ * Struct leaf_entry's are packed from the top. Name/values grow from the
+ * bottom but are not packed. The freemap contains run-length-encoded entries
+ * for the free bytes after the leaf_entry's, but only the N largest such,
+ * smaller runs are dropped. When the freemap doesn't show enough space
+ * for an allocation, we compact the name/value area and try again. If we
+ * still don't have enough space, then we have to split the block. The
+ * name/value structs (both local and remote versions) must be 32bit aligned.
+ *
+ * Since we have duplicate hash keys, for each key that matches, compare
+ * the actual name string. The root and intermediate node search always
+ * takes the first-in-the-block key match found, so we should only have
+ * to work "forw"ard. If none matches, continue with the "forw"ard leaf
+ * nodes until the hash key changes or the attribute name is found.
+ *
+ * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
+ * the leaf_entry. The namespaces are independent only because we also look
+ * at the namespace bit when we are looking for a matching attribute name.
+ *
+ * We also store an "incomplete" bit in the leaf_entry. It shows that an
+ * attribute is in the middle of being created and should not be shown to
+ * the user if we crash during the time that the bit is set. We clear the
+ * bit when we have finished setting up the attribute. We do this because
+ * we cannot create some large attributes inside a single transaction, and we
+ * need some indication that we weren't finished if we crash in the middle.
+ */
+#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
+
+typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
+ __be16 base; /* base of free region */
+ __be16 size; /* length of free region */
+} xfs_attr_leaf_map_t;
+
+typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */
+ xfs_da_blkinfo_t info; /* block type, links, etc. */
+ __be16 count; /* count of active leaf_entry's */
+ __be16 usedbytes; /* num bytes of names/values stored */
+ __be16 firstused; /* first used byte in name area */
+ __u8 holes; /* != 0 if blk needs compaction */
+ __u8 pad1;
+ xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
+ /* N largest free regions */
+} xfs_attr_leaf_hdr_t;
+
+typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */
+ __be32 hashval; /* hash value of name */
+ __be16 nameidx; /* index into buffer of name/value */
+ __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
+ __u8 pad2; /* unused pad byte */
+} xfs_attr_leaf_entry_t;
+
+typedef struct xfs_attr_leaf_name_local {
+ __be16 valuelen; /* number of bytes in value */
+ __u8 namelen; /* length of name bytes */
+ __u8 nameval[1]; /* name/value bytes */
+} xfs_attr_leaf_name_local_t;
+
+typedef struct xfs_attr_leaf_name_remote {
+ __be32 valueblk; /* block number of value bytes */
+ __be32 valuelen; /* number of bytes in value */
+ __u8 namelen; /* length of name bytes */
+ __u8 name[1]; /* name bytes */
+} xfs_attr_leaf_name_remote_t;
+
+typedef struct xfs_attr_leafblock {
+ xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */
+ xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */
+ xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */
+ xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */
+} xfs_attr_leafblock_t;
+
+/*
+ * CRC enabled leaf structures. Called "version 3" structures to match the
+ * version number of the directory and dablk structures for this feature, and
+ * attr2 is already taken by the variable inode attribute fork size feature.
+ */
+struct xfs_attr3_leaf_hdr {
+ struct xfs_da3_blkinfo info;
+ __be16 count;
+ __be16 usedbytes;
+ __be16 firstused;
+ __u8 holes;
+ __u8 pad1;
+ struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
+ __be32 pad2; /* 64 bit alignment */
+};
+
+#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc))
+
+struct xfs_attr3_leafblock {
+ struct xfs_attr3_leaf_hdr hdr;
+ struct xfs_attr_leaf_entry entries[1];
+
+ /*
+ * The rest of the block contains the following structures after the
+ * leaf entries, growing from the bottom up. The variables are never
+ * referenced, the locations accessed purely from helper functions.
+ *
+ * struct xfs_attr_leaf_name_local
+ * struct xfs_attr_leaf_name_remote
+ */
+};
+
+/*
+ * incore, neutral version of the attribute leaf header
+ */
+struct xfs_attr3_icleaf_hdr {
+ __uint32_t forw;
+ __uint32_t back;
+ __uint16_t magic;
+ __uint16_t count;
+ __uint16_t usedbytes;
+ __uint16_t firstused;
+ __u8 holes;
+ struct {
+ __uint16_t base;
+ __uint16_t size;
+ } freemap[XFS_ATTR_LEAF_MAPSIZE];
+};
+
+/*
+ * Flags used in the leaf_entry[i].flags field.
+ * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
+ * on the system call, they are "or"ed together for various operations.
+ */
+#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
+#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
+#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
+#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
+#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
+
+/*
+ * Conversion macros for converting namespace bits from argument flags
+ * to ondisk flags.
+ */
+#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK)
+#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK)
+#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
+ ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
+#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
+ ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
+
+/*
+ * Alignment for namelist and valuelist entries (since they are mixed
+ * there can be only one alignment value)
+ */
+#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t))
+
+static inline int
+xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp)
+{
+ if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+ return sizeof(struct xfs_attr3_leaf_hdr);
+ return sizeof(struct xfs_attr_leaf_hdr);
+}
+
+static inline struct xfs_attr_leaf_entry *
+xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp)
+{
+ if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+ return &((struct xfs_attr3_leafblock *)leafp)->entries[0];
+ return &leafp->entries[0];
+}
+
+/*
+ * Cast typed pointers for "local" and "remote" name/value structs.
+ */
+static inline char *
+xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
+{
+ struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp);
+
+ return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)];
+}
+
+static inline xfs_attr_leaf_name_remote_t *
+xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
+{
+ return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx);
+}
+
+static inline xfs_attr_leaf_name_local_t *
+xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
+{
+ return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx);
+}
+
+/*
+ * Calculate total bytes used (including trailing pad for alignment) for
+ * a "local" name/value structure, a "remote" name/value structure, and
+ * a pointer which might be either.
+ */
+static inline int xfs_attr_leaf_entsize_remote(int nlen)
+{
+ return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
+ XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+}
+
+static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
+{
+ return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
+ XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+}
+
+static inline int xfs_attr_leaf_entsize_local_max(int bsize)
+{
+ return (((bsize) >> 1) + ((bsize) >> 2));
+}
+
+
+
+/*
+ * Remote attribute block format definition
+ *
+ * There is one of these headers per filesystem block in a remote attribute.
+ * This is done to ensure there is a 1:1 mapping between the attribute value
+ * length and the number of blocks needed to store the attribute. This makes the
+ * verification of a buffer a little more complex, but greatly simplifies the
+ * allocation, reading and writing of these attributes as we don't have to guess
+ * the number of blocks needed to store the attribute data.
+ */
+#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */
+
+struct xfs_attr3_rmt_hdr {
+ __be32 rm_magic;
+ __be32 rm_offset;
+ __be32 rm_bytes;
+ __be32 rm_crc;
+ uuid_t rm_uuid;
+ __be64 rm_owner;
+ __be64 rm_blkno;
+ __be64 rm_lsn;
+};
+
+#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
+
+#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \
+ ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+ sizeof(struct xfs_attr3_rmt_hdr) : 0))
+
+#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
deleted file mode 100644
index c6191d00ad2..00000000000
--- a/fs/xfs/xfs_dfrag.c
+++ /dev/null
@@ -1,375 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
-#include "xfs_dfrag.h"
-#include "xfs_error.h"
-#include "xfs_mac.h"
-#include "xfs_rw.h"
-
-/*
- * Syssgi interface for swapext
- */
-int
-xfs_swapext(
- xfs_swapext_t __user *sxu)
-{
- xfs_swapext_t *sxp;
- xfs_inode_t *ip=NULL, *tip=NULL, *ips[2];
- xfs_trans_t *tp;
- xfs_mount_t *mp;
- xfs_bstat_t *sbp;
- struct file *fp = NULL, *tfp = NULL;
- vnode_t *vp, *tvp;
- static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
- int ilf_fields, tilf_fields;
- int error = 0;
- xfs_ifork_t *tempifp, *ifp, *tifp;
- __uint64_t tmp;
- int aforkblks = 0;
- int taforkblks = 0;
- char locked = 0;
-
- sxp = kmem_alloc(sizeof(xfs_swapext_t), KM_MAYFAIL);
- tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
- if (!sxp || !tempifp) {
- error = XFS_ERROR(ENOMEM);
- goto error0;
- }
-
- if (copy_from_user(sxp, sxu, sizeof(xfs_swapext_t))) {
- error = XFS_ERROR(EFAULT);
- goto error0;
- }
-
- /* Pull information for the target fd */
- if (((fp = fget((int)sxp->sx_fdtarget)) == NULL) ||
- ((vp = LINVFS_GET_VP(fp->f_dentry->d_inode)) == NULL)) {
- error = XFS_ERROR(EINVAL);
- goto error0;
- }
-
- ip = xfs_vtoi(vp);
- if (ip == NULL) {
- error = XFS_ERROR(EBADF);
- goto error0;
- }
-
- if (((tfp = fget((int)sxp->sx_fdtmp)) == NULL) ||
- ((tvp = LINVFS_GET_VP(tfp->f_dentry->d_inode)) == NULL)) {
- error = XFS_ERROR(EINVAL);
- goto error0;
- }
-
- tip = xfs_vtoi(tvp);
- if (tip == NULL) {
- error = XFS_ERROR(EBADF);
- goto error0;
- }
-
- if (ip->i_mount != tip->i_mount) {
- error = XFS_ERROR(EINVAL);
- goto error0;
- }
-
- if (ip->i_ino == tip->i_ino) {
- error = XFS_ERROR(EINVAL);
- goto error0;
- }
-
- mp = ip->i_mount;
-
- sbp = &sxp->sx_stat;
-
- if (XFS_FORCED_SHUTDOWN(mp)) {
- error = XFS_ERROR(EIO);
- goto error0;
- }
-
- locked = 1;
-
- /* Lock in i_ino order */
- if (ip->i_ino < tip->i_ino) {
- ips[0] = ip;
- ips[1] = tip;
- } else {
- ips[0] = tip;
- ips[1] = ip;
- }
-
- xfs_lock_inodes(ips, 2, 0, lock_flags);
-
- /* Check permissions */
- error = xfs_iaccess(ip, S_IWUSR, NULL);
- if (error)
- goto error0;
-
- error = xfs_iaccess(tip, S_IWUSR, NULL);
- if (error)
- goto error0;
-
- /* Verify that both files have the same format */
- if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
- error = XFS_ERROR(EINVAL);
- goto error0;
- }
-
- /* Verify both files are either real-time or non-realtime */
- if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
- (tip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
- error = XFS_ERROR(EINVAL);
- goto error0;
- }
-
- /* Should never get a local format */
- if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
- tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
- error = XFS_ERROR(EINVAL);
- goto error0;
- }
-
- if (VN_CACHED(tvp) != 0) {
- xfs_inval_cached_trace(&tip->i_iocore, 0, -1, 0, -1);
- VOP_FLUSHINVAL_PAGES(tvp, 0, -1, FI_REMAPF_LOCKED);
- }
-
- /* Verify O_DIRECT for ftmp */
- if (VN_CACHED(tvp) != 0) {
- error = XFS_ERROR(EINVAL);
- goto error0;
- }
-
- /* Verify all data are being swapped */
- if (sxp->sx_offset != 0 ||
- sxp->sx_length != ip->i_d.di_size ||
- sxp->sx_length != tip->i_d.di_size) {
- error = XFS_ERROR(EFAULT);
- goto error0;
- }
-
- /*
- * If the target has extended attributes, the tmp file
- * must also in order to ensure the correct data fork
- * format.
- */
- if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
- error = XFS_ERROR(EINVAL);
- goto error0;
- }
-
- /*
- * Compare the current change & modify times with that
- * passed in. If they differ, we abort this swap.
- * This is the mechanism used to ensure the calling
- * process that the file was not changed out from
- * under it.
- */
- if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) ||
- (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) ||
- (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) ||
- (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) {
- error = XFS_ERROR(EBUSY);
- goto error0;
- }
-
- /* We need to fail if the file is memory mapped. Once we have tossed
- * all existing pages, the page fault will have no option
- * but to go to the filesystem for pages. By making the page fault call
- * VOP_READ (or write in the case of autogrow) they block on the iolock
- * until we have switched the extents.
- */
- if (VN_MAPPED(vp)) {
- error = XFS_ERROR(EBUSY);
- goto error0;
- }
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_iunlock(tip, XFS_ILOCK_EXCL);
-
- /*
- * There is a race condition here since we gave up the
- * ilock. However, the data fork will not change since
- * we have the iolock (locked for truncation too) so we
- * are safe. We don't really care if non-io related
- * fields change.
- */
-
- VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF);
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
- if ((error = xfs_trans_reserve(tp, 0,
- XFS_ICHANGE_LOG_RES(mp), 0,
- 0, 0))) {
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- xfs_iunlock(tip, XFS_IOLOCK_EXCL);
- xfs_trans_cancel(tp, 0);
- locked = 0;
- goto error0;
- }
- xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
-
- /*
- * Count the number of extended attribute blocks
- */
- if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
- (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
- error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
- if (error) {
- xfs_trans_cancel(tp, 0);
- goto error0;
- }
- }
- if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
- (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
- error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
- &taforkblks);
- if (error) {
- xfs_trans_cancel(tp, 0);
- goto error0;
- }
- }
-
- /*
- * Swap the data forks of the inodes
- */
- ifp = &ip->i_df;
- tifp = &tip->i_df;
- *tempifp = *ifp; /* struct copy */
- *ifp = *tifp; /* struct copy */
- *tifp = *tempifp; /* struct copy */
-
- /*
- * Fix the on-disk inode values
- */
- tmp = (__uint64_t)ip->i_d.di_nblocks;
- ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
- tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
-
- tmp = (__uint64_t) ip->i_d.di_nextents;
- ip->i_d.di_nextents = tip->i_d.di_nextents;
- tip->i_d.di_nextents = tmp;
-
- tmp = (__uint64_t) ip->i_d.di_format;
- ip->i_d.di_format = tip->i_d.di_format;
- tip->i_d.di_format = tmp;
-
- ilf_fields = XFS_ILOG_CORE;
-
- switch(ip->i_d.di_format) {
- case XFS_DINODE_FMT_EXTENTS:
- /* If the extents fit in the inode, fix the
- * pointer. Otherwise it's already NULL or
- * pointing to the extent.
- */
- if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
- ifp->if_u1.if_extents =
- ifp->if_u2.if_inline_ext;
- }
- ilf_fields |= XFS_ILOG_DEXT;
- break;
- case XFS_DINODE_FMT_BTREE:
- ilf_fields |= XFS_ILOG_DBROOT;
- break;
- }
-
- tilf_fields = XFS_ILOG_CORE;
-
- switch(tip->i_d.di_format) {
- case XFS_DINODE_FMT_EXTENTS:
- /* If the extents fit in the inode, fix the
- * pointer. Otherwise it's already NULL or
- * pointing to the extent.
- */
- if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
- tifp->if_u1.if_extents =
- tifp->if_u2.if_inline_ext;
- }
- tilf_fields |= XFS_ILOG_DEXT;
- break;
- case XFS_DINODE_FMT_BTREE:
- tilf_fields |= XFS_ILOG_DBROOT;
- break;
- }
-
- /*
- * Increment vnode ref counts since xfs_trans_commit &
- * xfs_trans_cancel will both unlock the inodes and
- * decrement the associated ref counts.
- */
- VN_HOLD(vp);
- VN_HOLD(tvp);
-
- xfs_trans_ijoin(tp, ip, lock_flags);
- xfs_trans_ijoin(tp, tip, lock_flags);
-
- xfs_trans_log_inode(tp, ip, ilf_fields);
- xfs_trans_log_inode(tp, tip, tilf_fields);
-
- /*
- * If this is a synchronous mount, make sure that the
- * transaction goes to disk before returning to the user.
- */
- if (mp->m_flags & XFS_MOUNT_WSYNC) {
- xfs_trans_set_sync(tp);
- }
-
- error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT, NULL);
- locked = 0;
-
- error0:
- if (locked) {
- xfs_iunlock(ip, lock_flags);
- xfs_iunlock(tip, lock_flags);
- }
-
- if (fp != NULL)
- fput(fp);
- if (tfp != NULL)
- fput(tfp);
-
- if (sxp != NULL)
- kmem_free(sxp, sizeof(xfs_swapext_t));
- if (tempifp != NULL)
- kmem_free(tempifp, sizeof(xfs_ifork_t));
-
- return error;
-}
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
deleted file mode 100644
index f678559abc4..00000000000
--- a/fs/xfs/xfs_dfrag.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_DFRAG_H__
-#define __XFS_DFRAG_H__
-
-/*
- * Structure passed to xfs_swapext
- */
-
-typedef struct xfs_swapext
-{
- __int64_t sx_version; /* version */
- __int64_t sx_fdtarget; /* fd of target file */
- __int64_t sx_fdtmp; /* fd of tmp file */
- xfs_off_t sx_offset; /* offset into file */
- xfs_off_t sx_length; /* leng from offset */
- char sx_pad[16]; /* pad space, unused */
- xfs_bstat_t sx_stat; /* stat of target b4 copy */
-} xfs_swapext_t;
-
-/*
- * Version flag
- */
-#define XFS_SX_VERSION 0
-
-#ifdef __KERNEL__
-/*
- * Prototypes for visible xfs_dfrag.c routines.
- */
-
-/*
- * Syscall interface for xfs_swapext
- */
-int xfs_swapext(struct xfs_swapext __user *sx);
-
-#endif /* __KERNEL__ */
-
-#endif /* __XFS_DFRAG_H__ */
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index 79d0d9e1fba..623bbe8fd92 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -18,86 +18,91 @@
#ifndef __XFS_DINODE_H__
#define __XFS_DINODE_H__
-struct xfs_buf;
-struct xfs_mount;
+#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
+#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3)
-#define XFS_DINODE_VERSION_1 1
-#define XFS_DINODE_VERSION_2 2
-#define XFS_DINODE_GOOD_VERSION(v) \
- (((v) == XFS_DINODE_VERSION_1 || (v) == XFS_DINODE_VERSION_2))
-#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
-
-/*
- * Disk inode structure.
- * This is just the header; the inode is expanded to fill a variable size
- * with the last field expanding. It is split into the core and "other"
- * because we only need the core part in the in-core inode.
- */
typedef struct xfs_timestamp {
- __int32_t t_sec; /* timestamp seconds */
- __int32_t t_nsec; /* timestamp nanoseconds */
+ __be32 t_sec; /* timestamp seconds */
+ __be32 t_nsec; /* timestamp nanoseconds */
} xfs_timestamp_t;
/*
- * Note: Coordinate changes to this structure with the XFS_DI_* #defines
- * below and the offsets table in xfs_ialloc_log_di().
+ * On-disk inode structure.
+ *
+ * This is just the header or "dinode core", the inode is expanded to fill a
+ * variable size the leftover area split into a data and an attribute fork.
+ * The format of the data and attribute fork depends on the format of the
+ * inode as indicated by di_format and di_aformat. To access the data and
+ * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros
+ * below.
+ *
+ * There is a very similar struct icdinode in xfs_inode which matches the
+ * layout of the first 96 bytes of this structure, but is kept in native
+ * format instead of big endian.
+ *
+ * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
+ * padding field for v3 inodes.
*/
-typedef struct xfs_dinode_core
-{
- __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
- __uint16_t di_mode; /* mode and type of file */
- __int8_t di_version; /* inode version */
- __int8_t di_format; /* format of di_c data */
- __uint16_t di_onlink; /* old number of links to file */
- __uint32_t di_uid; /* owner's user id */
- __uint32_t di_gid; /* owner's group id */
- __uint32_t di_nlink; /* number of links to file */
- __uint16_t di_projid; /* owner's project id */
- __uint8_t di_pad[8]; /* unused, zeroed space */
- __uint16_t di_flushiter; /* incremented on flush */
+typedef struct xfs_dinode {
+ __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
+ __be16 di_mode; /* mode and type of file */
+ __u8 di_version; /* inode version */
+ __u8 di_format; /* format of di_c data */
+ __be16 di_onlink; /* old number of links to file */
+ __be32 di_uid; /* owner's user id */
+ __be32 di_gid; /* owner's group id */
+ __be32 di_nlink; /* number of links to file */
+ __be16 di_projid_lo; /* lower part of owner's project id */
+ __be16 di_projid_hi; /* higher part owner's project id */
+ __u8 di_pad[6]; /* unused, zeroed space */
+ __be16 di_flushiter; /* incremented on flush */
xfs_timestamp_t di_atime; /* time last accessed */
xfs_timestamp_t di_mtime; /* time last modified */
xfs_timestamp_t di_ctime; /* time created/inode modified */
- xfs_fsize_t di_size; /* number of bytes in file */
- xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */
- xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
- xfs_extnum_t di_nextents; /* number of extents in data fork */
- xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
- __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
- __int8_t di_aformat; /* format of attr fork's data */
- __uint32_t di_dmevmask; /* DMIG event mask */
- __uint16_t di_dmstate; /* DMIG state info */
- __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
- __uint32_t di_gen; /* generation number */
-} xfs_dinode_core_t;
+ __be64 di_size; /* number of bytes in file */
+ __be64 di_nblocks; /* # of direct & btree blocks used */
+ __be32 di_extsize; /* basic/minimum extent size for file */
+ __be32 di_nextents; /* number of extents in data fork */
+ __be16 di_anextents; /* number of extents in attribute fork*/
+ __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */
+ __s8 di_aformat; /* format of attr fork's data */
+ __be32 di_dmevmask; /* DMIG event mask */
+ __be16 di_dmstate; /* DMIG state info */
+ __be16 di_flags; /* random flags, XFS_DIFLAG_... */
+ __be32 di_gen; /* generation number */
+
+ /* di_next_unlinked is the only non-core field in the old dinode */
+ __be32 di_next_unlinked;/* agi unlinked list ptr */
+
+ /* start of the extended dinode, writable fields */
+ __le32 di_crc; /* CRC of the inode */
+ __be64 di_changecount; /* number of attribute changes */
+ __be64 di_lsn; /* flush sequence */
+ __be64 di_flags2; /* more random flags */
+ __u8 di_pad2[16]; /* more padding for future expansion */
+
+ /* fields only written to during inode creation */
+ xfs_timestamp_t di_crtime; /* time created */
+ __be64 di_ino; /* inode number */
+ uuid_t di_uuid; /* UUID of the filesystem */
+
+ /* structure must be padded to 64 bit alignment */
+} xfs_dinode_t;
+
+#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc)
#define DI_MAX_FLUSH 0xffff
-typedef struct xfs_dinode
+/*
+ * Size of the core inode on disk. Version 1 and 2 inodes have
+ * the same size, but version 3 has grown a few additional fields.
+ */
+static inline uint xfs_dinode_size(int version)
{
- xfs_dinode_core_t di_core;
- /*
- * In adding anything between the core and the union, be
- * sure to update the macros like XFS_LITINO below and
- * XFS_BMAP_RBLOCK_DSIZE in xfs_bmap_btree.h.
- */
- xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */
- union {
- xfs_bmdr_block_t di_bmbt; /* btree root block */
- xfs_bmbt_rec_32_t di_bmx[1]; /* extent list */
- xfs_dir_shortform_t di_dirsf; /* shortform directory */
- xfs_dir2_sf_t di_dir2sf; /* shortform directory v2 */
- char di_c[1]; /* local contents */
- xfs_dev_t di_dev; /* device for S_IFCHR/S_IFBLK */
- uuid_t di_muuid; /* mount point value */
- char di_symlink[1]; /* local symbolic link */
- } di_u;
- union {
- xfs_bmdr_block_t di_abmbt; /* btree root block */
- xfs_bmbt_rec_32_t di_abmx[1]; /* extent list */
- xfs_attr_shortform_t di_attrsf; /* shortform attribute list */
- } di_a;
-} xfs_dinode_t;
+ if (version == 3)
+ return sizeof(struct xfs_dinode);
+ return offsetof(struct xfs_dinode, di_crc);
+}
/*
* The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
@@ -108,50 +113,14 @@ typedef struct xfs_dinode
#define XFS_MAXLINK_1 65535U
/*
- * Bit names for logging disk inodes only
- */
-#define XFS_DI_MAGIC 0x0000001
-#define XFS_DI_MODE 0x0000002
-#define XFS_DI_VERSION 0x0000004
-#define XFS_DI_FORMAT 0x0000008
-#define XFS_DI_ONLINK 0x0000010
-#define XFS_DI_UID 0x0000020
-#define XFS_DI_GID 0x0000040
-#define XFS_DI_NLINK 0x0000080
-#define XFS_DI_PROJID 0x0000100
-#define XFS_DI_PAD 0x0000200
-#define XFS_DI_ATIME 0x0000400
-#define XFS_DI_MTIME 0x0000800
-#define XFS_DI_CTIME 0x0001000
-#define XFS_DI_SIZE 0x0002000
-#define XFS_DI_NBLOCKS 0x0004000
-#define XFS_DI_EXTSIZE 0x0008000
-#define XFS_DI_NEXTENTS 0x0010000
-#define XFS_DI_NAEXTENTS 0x0020000
-#define XFS_DI_FORKOFF 0x0040000
-#define XFS_DI_AFORMAT 0x0080000
-#define XFS_DI_DMEVMASK 0x0100000
-#define XFS_DI_DMSTATE 0x0200000
-#define XFS_DI_FLAGS 0x0400000
-#define XFS_DI_GEN 0x0800000
-#define XFS_DI_NEXT_UNLINKED 0x1000000
-#define XFS_DI_U 0x2000000
-#define XFS_DI_A 0x4000000
-#define XFS_DI_NUM_BITS 27
-#define XFS_DI_ALL_BITS ((1 << XFS_DI_NUM_BITS) - 1)
-#define XFS_DI_CORE_BITS (XFS_DI_ALL_BITS & ~(XFS_DI_U|XFS_DI_A))
-
-/*
* Values for di_format
*/
-typedef enum xfs_dinode_fmt
-{
- XFS_DINODE_FMT_DEV, /* CHR, BLK: di_dev */
- XFS_DINODE_FMT_LOCAL, /* DIR, REG: di_c */
- /* LNK: di_symlink */
- XFS_DINODE_FMT_EXTENTS, /* DIR, REG, LNK: di_bmx */
- XFS_DINODE_FMT_BTREE, /* DIR, REG, LNK: di_bmbt */
- XFS_DINODE_FMT_UUID /* MNT: di_uuid */
+typedef enum xfs_dinode_fmt {
+ XFS_DINODE_FMT_DEV, /* xfs_dev_t */
+ XFS_DINODE_FMT_LOCAL, /* bulk data */
+ XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
+ XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
+ XFS_DINODE_FMT_UUID /* uuid_t */
} xfs_dinode_fmt_t;
/*
@@ -165,79 +134,62 @@ typedef enum xfs_dinode_fmt
/*
* Inode size for given fs.
*/
-#define XFS_LITINO(mp) ((mp)->m_litino)
-#define XFS_BROOT_SIZE_ADJ \
- (sizeof(xfs_bmbt_block_t) - sizeof(xfs_bmdr_block_t))
+#define XFS_LITINO(mp, version) \
+ ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
/*
* Inode data & attribute fork sizes, per inode.
*/
-#define XFS_CFORK_Q(dcp) ((dcp)->di_forkoff != 0)
-#define XFS_CFORK_Q_DISK(dcp) ((dcp)->di_forkoff != 0)
-
-#define XFS_CFORK_BOFF(dcp) ((int)((dcp)->di_forkoff << 3))
-#define XFS_CFORK_BOFF_DISK(dcp) \
- ((int)(INT_GET((dcp)->di_forkoff, ARCH_CONVERT) << 3))
-
-#define XFS_CFORK_DSIZE_DISK(dcp,mp) \
- (XFS_CFORK_Q_DISK(dcp) ? XFS_CFORK_BOFF_DISK(dcp) : XFS_LITINO(mp))
-#define XFS_CFORK_DSIZE(dcp,mp) \
- (XFS_CFORK_Q(dcp) ? XFS_CFORK_BOFF(dcp) : XFS_LITINO(mp))
-
-#define XFS_CFORK_ASIZE_DISK(dcp,mp) \
- (XFS_CFORK_Q_DISK(dcp) ? XFS_LITINO(mp) - XFS_CFORK_BOFF_DISK(dcp) : 0)
-#define XFS_CFORK_ASIZE(dcp,mp) \
- (XFS_CFORK_Q(dcp) ? XFS_LITINO(mp) - XFS_CFORK_BOFF(dcp) : 0)
-
-#define XFS_CFORK_SIZE_DISK(dcp,mp,w) \
- ((w) == XFS_DATA_FORK ? \
- XFS_CFORK_DSIZE_DISK(dcp, mp) : \
- XFS_CFORK_ASIZE_DISK(dcp, mp))
-#define XFS_CFORK_SIZE(dcp,mp,w) \
- ((w) == XFS_DATA_FORK ? \
- XFS_CFORK_DSIZE(dcp, mp) : XFS_CFORK_ASIZE(dcp, mp))
+#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0)
+#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3))
#define XFS_DFORK_DSIZE(dip,mp) \
- XFS_CFORK_DSIZE_DISK(&(dip)->di_core, mp)
-#define XFS_DFORK_DSIZE_HOST(dip,mp) \
- XFS_CFORK_DSIZE(&(dip)->di_core, mp)
+ (XFS_DFORK_Q(dip) ? \
+ XFS_DFORK_BOFF(dip) : \
+ XFS_LITINO(mp, (dip)->di_version))
#define XFS_DFORK_ASIZE(dip,mp) \
- XFS_CFORK_ASIZE_DISK(&(dip)->di_core, mp)
-#define XFS_DFORK_ASIZE_HOST(dip,mp) \
- XFS_CFORK_ASIZE(&(dip)->di_core, mp)
-#define XFS_DFORK_SIZE(dip,mp,w) \
- XFS_CFORK_SIZE_DISK(&(dip)->di_core, mp, w)
-#define XFS_DFORK_SIZE_HOST(dip,mp,w) \
- XFS_CFORK_SIZE(&(dip)->di_core, mp, w)
+ (XFS_DFORK_Q(dip) ? \
+ XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
+ 0)
+#define XFS_DFORK_SIZE(dip,mp,w) \
+ ((w) == XFS_DATA_FORK ? \
+ XFS_DFORK_DSIZE(dip, mp) : \
+ XFS_DFORK_ASIZE(dip, mp))
-#define XFS_DFORK_Q(dip) XFS_CFORK_Q_DISK(&(dip)->di_core)
-#define XFS_DFORK_BOFF(dip) XFS_CFORK_BOFF_DISK(&(dip)->di_core)
-#define XFS_DFORK_DPTR(dip) ((dip)->di_u.di_c)
-#define XFS_DFORK_APTR(dip) \
- ((dip)->di_u.di_c + XFS_DFORK_BOFF(dip))
-#define XFS_DFORK_PTR(dip,w) \
+/*
+ * Return pointers to the data or attribute forks.
+ */
+#define XFS_DFORK_DPTR(dip) \
+ ((char *)dip + xfs_dinode_size(dip->di_version))
+#define XFS_DFORK_APTR(dip) \
+ (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
+#define XFS_DFORK_PTR(dip,w) \
((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
-#define XFS_CFORK_FORMAT(dcp,w) \
- ((w) == XFS_DATA_FORK ? (dcp)->di_format : (dcp)->di_aformat)
-#define XFS_CFORK_FMT_SET(dcp,w,n) \
- ((w) == XFS_DATA_FORK ? \
- ((dcp)->di_format = (n)) : ((dcp)->di_aformat = (n)))
-#define XFS_DFORK_FORMAT(dip,w) XFS_CFORK_FORMAT(&(dip)->di_core, w)
-#define XFS_CFORK_NEXTENTS_DISK(dcp,w) \
+#define XFS_DFORK_FORMAT(dip,w) \
((w) == XFS_DATA_FORK ? \
- INT_GET((dcp)->di_nextents, ARCH_CONVERT) : \
- INT_GET((dcp)->di_anextents, ARCH_CONVERT))
-#define XFS_CFORK_NEXTENTS(dcp,w) \
- ((w) == XFS_DATA_FORK ? (dcp)->di_nextents : (dcp)->di_anextents)
-#define XFS_DFORK_NEXTENTS(dip,w) XFS_CFORK_NEXTENTS_DISK(&(dip)->di_core, w)
-#define XFS_DFORK_NEXTENTS_HOST(dip,w) XFS_CFORK_NEXTENTS(&(dip)->di_core, w)
-
-#define XFS_CFORK_NEXT_SET(dcp,w,n) \
+ (dip)->di_format : \
+ (dip)->di_aformat)
+#define XFS_DFORK_NEXTENTS(dip,w) \
((w) == XFS_DATA_FORK ? \
- ((dcp)->di_nextents = (n)) : ((dcp)->di_anextents = (n)))
+ be32_to_cpu((dip)->di_nextents) : \
+ be16_to_cpu((dip)->di_anextents))
+
+#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)((bp)->b_addr))
+
+/*
+ * For block and character special files the 32bit dev_t is stored at the
+ * beginning of the data fork.
+ */
+static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
+{
+ return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
+}
-#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp))
+static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
+{
+ *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
+}
/*
* Values for di_flags
@@ -257,6 +209,8 @@ typedef enum xfs_dinode_fmt
#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */
#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */
#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */
+#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */
+#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */
#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT)
#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT)
#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT)
@@ -270,12 +224,20 @@ typedef enum xfs_dinode_fmt
#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT)
#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
+#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT)
+#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT)
+
+#ifdef CONFIG_XFS_RT
+#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
+#else
+#define XFS_IS_REALTIME_INODE(ip) (0)
+#endif
#define XFS_DIFLAG_ANY \
(XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
- XFS_DIFLAG_EXTSZINHERIT)
+ XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
#endif /* __XFS_DINODE_H__ */
diff --git a/fs/xfs/xfs_dir.c b/fs/xfs/xfs_dir.c
deleted file mode 100644
index bb87d2a700a..00000000000
--- a/fs/xfs/xfs_dir.c
+++ /dev/null
@@ -1,1219 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_dir_leaf.h"
-#include "xfs_error.h"
-
-/*
- * xfs_dir.c
- *
- * Provide the external interfaces to manage directories.
- */
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-/*
- * Functions for the dirops interfaces.
- */
-static void xfs_dir_mount(struct xfs_mount *mp);
-
-static int xfs_dir_isempty(struct xfs_inode *dp);
-
-static int xfs_dir_init(struct xfs_trans *trans,
- struct xfs_inode *dir,
- struct xfs_inode *parent_dir);
-
-static int xfs_dir_createname(struct xfs_trans *trans,
- struct xfs_inode *dp,
- char *name_string,
- int name_len,
- xfs_ino_t inode_number,
- xfs_fsblock_t *firstblock,
- xfs_bmap_free_t *flist,
- xfs_extlen_t total);
-
-static int xfs_dir_lookup(struct xfs_trans *tp,
- struct xfs_inode *dp,
- char *name_string,
- int name_length,
- xfs_ino_t *inode_number);
-
-static int xfs_dir_removename(struct xfs_trans *trans,
- struct xfs_inode *dp,
- char *name_string,
- int name_length,
- xfs_ino_t ino,
- xfs_fsblock_t *firstblock,
- xfs_bmap_free_t *flist,
- xfs_extlen_t total);
-
-static int xfs_dir_getdents(struct xfs_trans *tp,
- struct xfs_inode *dp,
- struct uio *uiop,
- int *eofp);
-
-static int xfs_dir_replace(struct xfs_trans *tp,
- struct xfs_inode *dp,
- char *name_string,
- int name_length,
- xfs_ino_t inode_number,
- xfs_fsblock_t *firstblock,
- xfs_bmap_free_t *flist,
- xfs_extlen_t total);
-
-static int xfs_dir_canenter(struct xfs_trans *tp,
- struct xfs_inode *dp,
- char *name_string,
- int name_length);
-
-static int xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp,
- xfs_dinode_t *dip);
-
-xfs_dirops_t xfsv1_dirops = {
- .xd_mount = xfs_dir_mount,
- .xd_isempty = xfs_dir_isempty,
- .xd_init = xfs_dir_init,
- .xd_createname = xfs_dir_createname,
- .xd_lookup = xfs_dir_lookup,
- .xd_removename = xfs_dir_removename,
- .xd_getdents = xfs_dir_getdents,
- .xd_replace = xfs_dir_replace,
- .xd_canenter = xfs_dir_canenter,
- .xd_shortform_validate_ondisk = xfs_dir_shortform_validate_ondisk,
- .xd_shortform_to_single = xfs_dir_shortform_to_leaf,
-};
-
-/*
- * Internal routines when dirsize == XFS_LBSIZE(mp).
- */
-STATIC int xfs_dir_leaf_lookup(xfs_da_args_t *args);
-STATIC int xfs_dir_leaf_removename(xfs_da_args_t *args, int *number_entries,
- int *total_namebytes);
-STATIC int xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp,
- uio_t *uio, int *eofp,
- xfs_dirent_t *dbp,
- xfs_dir_put_t put);
-STATIC int xfs_dir_leaf_replace(xfs_da_args_t *args);
-
-/*
- * Internal routines when dirsize > XFS_LBSIZE(mp).
- */
-STATIC int xfs_dir_node_addname(xfs_da_args_t *args);
-STATIC int xfs_dir_node_lookup(xfs_da_args_t *args);
-STATIC int xfs_dir_node_removename(xfs_da_args_t *args);
-STATIC int xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp,
- uio_t *uio, int *eofp,
- xfs_dirent_t *dbp,
- xfs_dir_put_t put);
-STATIC int xfs_dir_node_replace(xfs_da_args_t *args);
-
-#if defined(XFS_DIR_TRACE)
-ktrace_t *xfs_dir_trace_buf;
-#endif
-
-
-/*========================================================================
- * Overall external interface routines.
- *========================================================================*/
-
-xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
-
-/*
- * One-time startup routine called from xfs_init().
- */
-void
-xfs_dir_startup(void)
-{
- xfs_dir_hash_dot = xfs_da_hashname(".", 1);
- xfs_dir_hash_dotdot = xfs_da_hashname("..", 2);
-}
-
-/*
- * Initialize directory-related fields in the mount structure.
- */
-static void
-xfs_dir_mount(xfs_mount_t *mp)
-{
- uint shortcount, leafcount, count;
-
- mp->m_dirversion = 1;
- if (!(mp->m_flags & XFS_MOUNT_ATTR2)) {
- shortcount = (mp->m_attroffset -
- (uint)sizeof(xfs_dir_sf_hdr_t)) /
- (uint)sizeof(xfs_dir_sf_entry_t);
- leafcount = (XFS_LBSIZE(mp) -
- (uint)sizeof(xfs_dir_leaf_hdr_t)) /
- ((uint)sizeof(xfs_dir_leaf_entry_t) +
- (uint)sizeof(xfs_dir_leaf_name_t));
- } else {
- shortcount = (XFS_BMDR_SPACE_CALC(MINABTPTRS) -
- (uint)sizeof(xfs_dir_sf_hdr_t)) /
- (uint)sizeof(xfs_dir_sf_entry_t);
- leafcount = (XFS_LBSIZE(mp) -
- (uint)sizeof(xfs_dir_leaf_hdr_t)) /
- ((uint)sizeof(xfs_dir_leaf_entry_t) +
- (uint)sizeof(xfs_dir_leaf_name_t));
- }
- count = shortcount > leafcount ? shortcount : leafcount;
- mp->m_dircook_elog = xfs_da_log2_roundup(count + 1);
- ASSERT(mp->m_dircook_elog <= mp->m_sb.sb_blocklog);
- mp->m_dir_node_ents = mp->m_attr_node_ents =
- (XFS_LBSIZE(mp) - (uint)sizeof(xfs_da_node_hdr_t)) /
- (uint)sizeof(xfs_da_node_entry_t);
- mp->m_dir_magicpct = (XFS_LBSIZE(mp) * 37) / 100;
- mp->m_dirblksize = mp->m_sb.sb_blocksize;
- mp->m_dirblkfsbs = 1;
-}
-
-/*
- * Return 1 if directory contains only "." and "..".
- */
-static int
-xfs_dir_isempty(xfs_inode_t *dp)
-{
- xfs_dir_sf_hdr_t *hdr;
-
- ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
- if (dp->i_d.di_size == 0)
- return(1);
- if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
- return(0);
- hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data;
- return(hdr->count == 0);
-}
-
-/*
- * Initialize a directory with its "." and ".." entries.
- */
-static int
-xfs_dir_init(xfs_trans_t *trans, xfs_inode_t *dir, xfs_inode_t *parent_dir)
-{
- xfs_da_args_t args;
- int error;
-
- memset((char *)&args, 0, sizeof(args));
- args.dp = dir;
- args.trans = trans;
-
- ASSERT((dir->i_d.di_mode & S_IFMT) == S_IFDIR);
- if ((error = xfs_dir_ino_validate(trans->t_mountp, parent_dir->i_ino)))
- return error;
-
- return(xfs_dir_shortform_create(&args, parent_dir->i_ino));
-}
-
-/*
- * Generic handler routine to add a name to a directory.
- * Transitions directory from shortform to Btree as necessary.
- */
-static int /* error */
-xfs_dir_createname(xfs_trans_t *trans, xfs_inode_t *dp, char *name,
- int namelen, xfs_ino_t inum, xfs_fsblock_t *firstblock,
- xfs_bmap_free_t *flist, xfs_extlen_t total)
-{
- xfs_da_args_t args;
- int retval, newsize, done;
-
- ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-
- if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum)))
- return (retval);
-
- XFS_STATS_INC(xs_dir_create);
- /*
- * Fill in the arg structure for this request.
- */
- args.name = name;
- args.namelen = namelen;
- args.hashval = xfs_da_hashname(name, namelen);
- args.inumber = inum;
- args.dp = dp;
- args.firstblock = firstblock;
- args.flist = flist;
- args.total = total;
- args.whichfork = XFS_DATA_FORK;
- args.trans = trans;
- args.justcheck = 0;
- args.addname = args.oknoent = 1;
-
- /*
- * Decide on what work routines to call based on the inode size.
- */
- done = 0;
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
- newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen);
- if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp)) {
- retval = xfs_dir_shortform_addname(&args);
- done = 1;
- } else {
- if (total == 0)
- return XFS_ERROR(ENOSPC);
- retval = xfs_dir_shortform_to_leaf(&args);
- done = retval != 0;
- }
- }
- if (!done && xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
- retval = xfs_dir_leaf_addname(&args);
- done = retval != ENOSPC;
- if (!done) {
- if (total == 0)
- return XFS_ERROR(ENOSPC);
- retval = xfs_dir_leaf_to_node(&args);
- done = retval != 0;
- }
- }
- if (!done) {
- retval = xfs_dir_node_addname(&args);
- }
- return(retval);
-}
-
-/*
- * Generic handler routine to check if a name can be added to a directory,
- * without adding any blocks to the directory.
- */
-static int /* error */
-xfs_dir_canenter(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen)
-{
- xfs_da_args_t args;
- int retval, newsize;
-
- ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
- /*
- * Fill in the arg structure for this request.
- */
- args.name = name;
- args.namelen = namelen;
- args.hashval = xfs_da_hashname(name, namelen);
- args.inumber = 0;
- args.dp = dp;
- args.firstblock = NULL;
- args.flist = NULL;
- args.total = 0;
- args.whichfork = XFS_DATA_FORK;
- args.trans = trans;
- args.justcheck = args.addname = args.oknoent = 1;
-
- /*
- * Decide on what work routines to call based on the inode size.
- */
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
- newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen);
- if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp))
- retval = 0;
- else
- retval = XFS_ERROR(ENOSPC);
- } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
- retval = xfs_dir_leaf_addname(&args);
- } else {
- retval = xfs_dir_node_addname(&args);
- }
- return(retval);
-}
-
-/*
- * Generic handler routine to remove a name from a directory.
- * Transitions directory from Btree to shortform as necessary.
- */
-static int /* error */
-xfs_dir_removename(xfs_trans_t *trans, xfs_inode_t *dp, char *name,
- int namelen, xfs_ino_t ino, xfs_fsblock_t *firstblock,
- xfs_bmap_free_t *flist, xfs_extlen_t total)
-{
- xfs_da_args_t args;
- int count, totallen, newsize, retval;
-
- ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
- XFS_STATS_INC(xs_dir_remove);
- /*
- * Fill in the arg structure for this request.
- */
- args.name = name;
- args.namelen = namelen;
- args.hashval = xfs_da_hashname(name, namelen);
- args.inumber = ino;
- args.dp = dp;
- args.firstblock = firstblock;
- args.flist = flist;
- args.total = total;
- args.whichfork = XFS_DATA_FORK;
- args.trans = trans;
- args.justcheck = args.addname = args.oknoent = 0;
-
- /*
- * Decide on what work routines to call based on the inode size.
- */
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
- retval = xfs_dir_shortform_removename(&args);
- } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
- retval = xfs_dir_leaf_removename(&args, &count, &totallen);
- if (retval == 0) {
- newsize = XFS_DIR_SF_ALLFIT(count, totallen);
- if (newsize <= XFS_IFORK_DSIZE(dp)) {
- retval = xfs_dir_leaf_to_shortform(&args);
- }
- }
- } else {
- retval = xfs_dir_node_removename(&args);
- }
- return(retval);
-}
-
-static int /* error */
-xfs_dir_lookup(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen,
- xfs_ino_t *inum)
-{
- xfs_da_args_t args;
- int retval;
-
- ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-
- XFS_STATS_INC(xs_dir_lookup);
- /*
- * Fill in the arg structure for this request.
- */
- args.name = name;
- args.namelen = namelen;
- args.hashval = xfs_da_hashname(name, namelen);
- args.inumber = 0;
- args.dp = dp;
- args.firstblock = NULL;
- args.flist = NULL;
- args.total = 0;
- args.whichfork = XFS_DATA_FORK;
- args.trans = trans;
- args.justcheck = args.addname = 0;
- args.oknoent = 1;
-
- /*
- * Decide on what work routines to call based on the inode size.
- */
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
- retval = xfs_dir_shortform_lookup(&args);
- } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
- retval = xfs_dir_leaf_lookup(&args);
- } else {
- retval = xfs_dir_node_lookup(&args);
- }
- if (retval == EEXIST)
- retval = 0;
- *inum = args.inumber;
- return(retval);
-}
-
-/*
- * Implement readdir.
- */
-static int /* error */
-xfs_dir_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio, int *eofp)
-{
- xfs_dirent_t *dbp;
- int alignment, retval;
- xfs_dir_put_t put;
-
- XFS_STATS_INC(xs_dir_getdents);
- ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-
- /*
- * If our caller has given us a single contiguous memory buffer,
- * just work directly within that buffer. If it's in user memory,
- * lock it down first.
- */
- alignment = sizeof(xfs_off_t) - 1;
- if ((uio->uio_iovcnt == 1) &&
- (((__psint_t)uio->uio_iov[0].iov_base & alignment) == 0) &&
- ((uio->uio_iov[0].iov_len & alignment) == 0)) {
- dbp = NULL;
- put = xfs_dir_put_dirent64_direct;
- } else {
- dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP);
- put = xfs_dir_put_dirent64_uio;
- }
-
- /*
- * Decide on what work routines to call based on the inode size.
- */
- *eofp = 0;
-
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
- retval = xfs_dir_shortform_getdents(dp, uio, eofp, dbp, put);
- } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
- retval = xfs_dir_leaf_getdents(trans, dp, uio, eofp, dbp, put);
- } else {
- retval = xfs_dir_node_getdents(trans, dp, uio, eofp, dbp, put);
- }
- if (dbp != NULL)
- kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN);
-
- return(retval);
-}
-
-static int /* error */
-xfs_dir_replace(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen,
- xfs_ino_t inum, xfs_fsblock_t *firstblock,
- xfs_bmap_free_t *flist, xfs_extlen_t total)
-{
- xfs_da_args_t args;
- int retval;
-
- ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-
- if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum)))
- return retval;
-
- /*
- * Fill in the arg structure for this request.
- */
- args.name = name;
- args.namelen = namelen;
- args.hashval = xfs_da_hashname(name, namelen);
- args.inumber = inum;
- args.dp = dp;
- args.firstblock = firstblock;
- args.flist = flist;
- args.total = total;
- args.whichfork = XFS_DATA_FORK;
- args.trans = trans;
- args.justcheck = args.addname = args.oknoent = 0;
-
- /*
- * Decide on what work routines to call based on the inode size.
- */
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
- retval = xfs_dir_shortform_replace(&args);
- } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
- retval = xfs_dir_leaf_replace(&args);
- } else {
- retval = xfs_dir_node_replace(&args);
- }
-
- return(retval);
-}
-
-static int
-xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp, xfs_dinode_t *dp)
-{
- xfs_ino_t ino;
- int namelen_sum;
- int count;
- xfs_dir_shortform_t *sf;
- xfs_dir_sf_entry_t *sfe;
- int i;
-
-
-
- if ((INT_GET(dp->di_core.di_mode, ARCH_CONVERT) & S_IFMT) != S_IFDIR) {
- return 0;
- }
- if (INT_GET(dp->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_LOCAL) {
- return 0;
- }
- if (INT_GET(dp->di_core.di_size, ARCH_CONVERT) < sizeof(sf->hdr)) {
- xfs_fs_cmn_err(CE_WARN, mp, "Invalid shortform size: dp 0x%p",
- dp);
- return 1;
- }
- sf = (xfs_dir_shortform_t *)(&dp->di_u.di_dirsf);
- ino = XFS_GET_DIR_INO8(sf->hdr.parent);
- if (xfs_dir_ino_validate(mp, ino))
- return 1;
-
- count = sf->hdr.count;
- if ((count < 0) || ((count * 10) > XFS_LITINO(mp))) {
- xfs_fs_cmn_err(CE_WARN, mp,
- "Invalid shortform count: dp 0x%p", dp);
- return(1);
- }
-
- if (count == 0) {
- return 0;
- }
-
- namelen_sum = 0;
- sfe = &sf->list[0];
- for (i = sf->hdr.count - 1; i >= 0; i--) {
- ino = XFS_GET_DIR_INO8(sfe->inumber);
- xfs_dir_ino_validate(mp, ino);
- if (sfe->namelen >= XFS_LITINO(mp)) {
- xfs_fs_cmn_err(CE_WARN, mp,
- "Invalid shortform namelen: dp 0x%p", dp);
- return 1;
- }
- namelen_sum += sfe->namelen;
- sfe = XFS_DIR_SF_NEXTENTRY(sfe);
- }
- if (namelen_sum >= XFS_LITINO(mp)) {
- xfs_fs_cmn_err(CE_WARN, mp,
- "Invalid shortform namelen: dp 0x%p", dp);
- return 1;
- }
-
- return 0;
-}
-
-/*========================================================================
- * External routines when dirsize == XFS_LBSIZE(dp->i_mount).
- *========================================================================*/
-
-/*
- * Add a name to the leaf directory structure
- * This is the external routine.
- */
-int
-xfs_dir_leaf_addname(xfs_da_args_t *args)
-{
- int index, retval;
- xfs_dabuf_t *bp;
-
- retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
- XFS_DATA_FORK);
- if (retval)
- return(retval);
- ASSERT(bp != NULL);
-
- retval = xfs_dir_leaf_lookup_int(bp, args, &index);
- if (retval == ENOENT)
- retval = xfs_dir_leaf_add(bp, args, index);
- xfs_da_buf_done(bp);
- return(retval);
-}
-
-/*
- * Remove a name from the leaf directory structure
- * This is the external routine.
- */
-STATIC int
-xfs_dir_leaf_removename(xfs_da_args_t *args, int *count, int *totallen)
-{
- xfs_dir_leafblock_t *leaf;
- int index, retval;
- xfs_dabuf_t *bp;
-
- retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
- XFS_DATA_FORK);
- if (retval)
- return(retval);
- ASSERT(bp != NULL);
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
- retval = xfs_dir_leaf_lookup_int(bp, args, &index);
- if (retval == EEXIST) {
- (void)xfs_dir_leaf_remove(args->trans, bp, index);
- *count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
- *totallen = INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
- retval = 0;
- }
- xfs_da_buf_done(bp);
- return(retval);
-}
-
-/*
- * Look up a name in a leaf directory structure.
- * This is the external routine.
- */
-STATIC int
-xfs_dir_leaf_lookup(xfs_da_args_t *args)
-{
- int index, retval;
- xfs_dabuf_t *bp;
-
- retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
- XFS_DATA_FORK);
- if (retval)
- return(retval);
- ASSERT(bp != NULL);
- retval = xfs_dir_leaf_lookup_int(bp, args, &index);
- xfs_da_brelse(args->trans, bp);
- return(retval);
-}
-
-/*
- * Copy out directory entries for getdents(), for leaf directories.
- */
-STATIC int
-xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
- int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put)
-{
- xfs_dabuf_t *bp;
- int retval, eob;
-
- retval = xfs_da_read_buf(dp->i_transp, dp, 0, -1, &bp, XFS_DATA_FORK);
- if (retval)
- return(retval);
- ASSERT(bp != NULL);
- retval = xfs_dir_leaf_getdents_int(bp, dp, 0, uio, &eob, dbp, put, -1);
- xfs_da_brelse(trans, bp);
- *eofp = (eob == 0);
- return(retval);
-}
-
-/*
- * Look up a name in a leaf directory structure, replace the inode number.
- * This is the external routine.
- */
-STATIC int
-xfs_dir_leaf_replace(xfs_da_args_t *args)
-{
- int index, retval;
- xfs_dabuf_t *bp;
- xfs_ino_t inum;
- xfs_dir_leafblock_t *leaf;
- xfs_dir_leaf_entry_t *entry;
- xfs_dir_leaf_name_t *namest;
-
- inum = args->inumber;
- retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
- XFS_DATA_FORK);
- if (retval)
- return(retval);
- ASSERT(bp != NULL);
- retval = xfs_dir_leaf_lookup_int(bp, args, &index);
- if (retval == EEXIST) {
- leaf = bp->data;
- entry = &leaf->entries[index];
- namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
- /* XXX - replace assert? */
- XFS_DIR_SF_PUT_DIRINO(&inum, &namest->inumber);
- xfs_da_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber)));
- xfs_da_buf_done(bp);
- retval = 0;
- } else
- xfs_da_brelse(args->trans, bp);
- return(retval);
-}
-
-
-/*========================================================================
- * External routines when dirsize > XFS_LBSIZE(mp).
- *========================================================================*/
-
-/*
- * Add a name to a Btree-format directory.
- *
- * This will involve walking down the Btree, and may involve splitting
- * leaf nodes and even splitting intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- */
-STATIC int
-xfs_dir_node_addname(xfs_da_args_t *args)
-{
- xfs_da_state_t *state;
- xfs_da_state_blk_t *blk;
- int retval, error;
-
- /*
- * Fill in bucket of arguments/results/context to carry around.
- */
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_dir_node_ents;
-
- /*
- * Search to see if name already exists, and get back a pointer
- * to where it should go.
- */
- error = xfs_da_node_lookup_int(state, &retval);
- if (error)
- retval = error;
- if (retval != ENOENT)
- goto error;
- blk = &state->path.blk[ state->path.active-1 ];
- ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
- retval = xfs_dir_leaf_add(blk->bp, args, blk->index);
- if (retval == 0) {
- /*
- * Addition succeeded, update Btree hashvals.
- */
- if (!args->justcheck)
- xfs_da_fixhashpath(state, &state->path);
- } else {
- /*
- * Addition failed, split as many Btree elements as required.
- */
- if (args->total == 0) {
- ASSERT(retval == ENOSPC);
- goto error;
- }
- retval = xfs_da_split(state);
- }
-error:
- xfs_da_state_free(state);
-
- return(retval);
-}
-
-/*
- * Remove a name from a B-tree directory.
- *
- * This will involve walking down the Btree, and may involve joining
- * leaf nodes and even joining intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- */
-STATIC int
-xfs_dir_node_removename(xfs_da_args_t *args)
-{
- xfs_da_state_t *state;
- xfs_da_state_blk_t *blk;
- int retval, error;
-
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_dir_node_ents;
-
- /*
- * Search to see if name exists, and get back a pointer to it.
- */
- error = xfs_da_node_lookup_int(state, &retval);
- if (error)
- retval = error;
- if (retval != EEXIST) {
- xfs_da_state_free(state);
- return(retval);
- }
-
- /*
- * Remove the name and update the hashvals in the tree.
- */
- blk = &state->path.blk[ state->path.active-1 ];
- ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
- retval = xfs_dir_leaf_remove(args->trans, blk->bp, blk->index);
- xfs_da_fixhashpath(state, &state->path);
-
- /*
- * Check to see if the tree needs to be collapsed.
- */
- error = 0;
- if (retval) {
- error = xfs_da_join(state);
- }
-
- xfs_da_state_free(state);
- if (error)
- return(error);
- return(0);
-}
-
-/*
- * Look up a filename in a int directory.
- * Use an internal routine to actually do all the work.
- */
-STATIC int
-xfs_dir_node_lookup(xfs_da_args_t *args)
-{
- xfs_da_state_t *state;
- int retval, error, i;
-
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_dir_node_ents;
-
- /*
- * Search to see if name exists,
- * and get back a pointer to it.
- */
- error = xfs_da_node_lookup_int(state, &retval);
- if (error) {
- retval = error;
- }
-
- /*
- * If not in a transaction, we have to release all the buffers.
- */
- for (i = 0; i < state->path.active; i++) {
- xfs_da_brelse(args->trans, state->path.blk[i].bp);
- state->path.blk[i].bp = NULL;
- }
-
- xfs_da_state_free(state);
- return(retval);
-}
-
-STATIC int
-xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
- int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put)
-{
- xfs_da_intnode_t *node;
- xfs_da_node_entry_t *btree;
- xfs_dir_leafblock_t *leaf = NULL;
- xfs_dablk_t bno, nextbno;
- xfs_dahash_t cookhash;
- xfs_mount_t *mp;
- int error, eob, i;
- xfs_dabuf_t *bp;
- xfs_daddr_t nextda;
-
- /*
- * Pick up our context.
- */
- mp = dp->i_mount;
- bp = NULL;
- bno = XFS_DA_COOKIE_BNO(mp, uio->uio_offset);
- cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
-
- xfs_dir_trace_g_du("node: start", dp, uio);
-
- /*
- * Re-find our place, even if we're confused about what our place is.
- *
- * First we check the block number from the magic cookie, it is a
- * cache of where we ended last time. If we find a leaf block, and
- * the starting hashval in that block is less than our desired
- * hashval, then we run with it.
- */
- if (bno > 0) {
- error = xfs_da_read_buf(trans, dp, bno, -2, &bp, XFS_DATA_FORK);
- if ((error != 0) && (error != EFSCORRUPTED))
- return(error);
- if (bp)
- leaf = bp->data;
- if (bp && INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC) {
- xfs_dir_trace_g_dub("node: block not a leaf",
- dp, uio, bno);
- xfs_da_brelse(trans, bp);
- bp = NULL;
- }
- if (bp && INT_GET(leaf->entries[0].hashval, ARCH_CONVERT) > cookhash) {
- xfs_dir_trace_g_dub("node: leaf hash too large",
- dp, uio, bno);
- xfs_da_brelse(trans, bp);
- bp = NULL;
- }
- if (bp &&
- cookhash > INT_GET(leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT)) {
- xfs_dir_trace_g_dub("node: leaf hash too small",
- dp, uio, bno);
- xfs_da_brelse(trans, bp);
- bp = NULL;
- }
- }
-
- /*
- * If we did not find a leaf block from the blockno in the cookie,
- * or we there was no blockno in the cookie (eg: first time thru),
- * the we start at the top of the Btree and re-find our hashval.
- */
- if (bp == NULL) {
- xfs_dir_trace_g_du("node: start at root" , dp, uio);
- bno = 0;
- for (;;) {
- error = xfs_da_read_buf(trans, dp, bno, -1, &bp,
- XFS_DATA_FORK);
- if (error)
- return(error);
- if (bp == NULL)
- return(XFS_ERROR(EFSCORRUPTED));
- node = bp->data;
- if (INT_GET(node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC)
- break;
- btree = &node->btree[0];
- xfs_dir_trace_g_dun("node: node detail", dp, uio, node);
- for (i = 0; i < INT_GET(node->hdr.count, ARCH_CONVERT); btree++, i++) {
- if (INT_GET(btree->hashval, ARCH_CONVERT) >= cookhash) {
- bno = INT_GET(btree->before, ARCH_CONVERT);
- break;
- }
- }
- if (i == INT_GET(node->hdr.count, ARCH_CONVERT)) {
- xfs_da_brelse(trans, bp);
- xfs_dir_trace_g_du("node: hash beyond EOF",
- dp, uio);
- uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0,
- XFS_DA_MAXHASH);
- *eofp = 1;
- return(0);
- }
- xfs_dir_trace_g_dub("node: going to block",
- dp, uio, bno);
- xfs_da_brelse(trans, bp);
- }
- }
- ASSERT(cookhash != XFS_DA_MAXHASH);
-
- /*
- * We've dropped down to the (first) leaf block that contains the
- * hashval we are interested in. Continue rolling upward thru the
- * leaf blocks until we fill up our buffer.
- */
- for (;;) {
- leaf = bp->data;
- if (unlikely(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC)) {
- xfs_dir_trace_g_dul("node: not a leaf", dp, uio, leaf);
- xfs_da_brelse(trans, bp);
- XFS_CORRUPTION_ERROR("xfs_dir_node_getdents(1)",
- XFS_ERRLEVEL_LOW, mp, leaf);
- return XFS_ERROR(EFSCORRUPTED);
- }
- xfs_dir_trace_g_dul("node: leaf detail", dp, uio, leaf);
- if ((nextbno = INT_GET(leaf->hdr.info.forw, ARCH_CONVERT))) {
- nextda = xfs_da_reada_buf(trans, dp, nextbno,
- XFS_DATA_FORK);
- } else
- nextda = -1;
- error = xfs_dir_leaf_getdents_int(bp, dp, bno, uio, &eob, dbp,
- put, nextda);
- xfs_da_brelse(trans, bp);
- bno = nextbno;
- if (eob) {
- xfs_dir_trace_g_dub("node: E-O-B", dp, uio, bno);
- *eofp = 0;
- return(error);
- }
- if (bno == 0)
- break;
- error = xfs_da_read_buf(trans, dp, bno, nextda, &bp,
- XFS_DATA_FORK);
- if (error)
- return(error);
- if (unlikely(bp == NULL)) {
- XFS_ERROR_REPORT("xfs_dir_node_getdents(2)",
- XFS_ERRLEVEL_LOW, mp);
- return(XFS_ERROR(EFSCORRUPTED));
- }
- }
- *eofp = 1;
- xfs_dir_trace_g_du("node: E-O-F", dp, uio);
- return(0);
-}
-
-/*
- * Look up a filename in an int directory, replace the inode number.
- * Use an internal routine to actually do the lookup.
- */
-STATIC int
-xfs_dir_node_replace(xfs_da_args_t *args)
-{
- xfs_da_state_t *state;
- xfs_da_state_blk_t *blk;
- xfs_dir_leafblock_t *leaf;
- xfs_dir_leaf_entry_t *entry;
- xfs_dir_leaf_name_t *namest;
- xfs_ino_t inum;
- int retval, error, i;
- xfs_dabuf_t *bp;
-
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_dir_node_ents;
- inum = args->inumber;
-
- /*
- * Search to see if name exists,
- * and get back a pointer to it.
- */
- error = xfs_da_node_lookup_int(state, &retval);
- if (error) {
- retval = error;
- }
-
- if (retval == EEXIST) {
- blk = &state->path.blk[state->path.active - 1];
- ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
- bp = blk->bp;
- leaf = bp->data;
- entry = &leaf->entries[blk->index];
- namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
- /* XXX - replace assert ? */
- XFS_DIR_SF_PUT_DIRINO(&inum, &namest->inumber);
- xfs_da_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber)));
- xfs_da_buf_done(bp);
- blk->bp = NULL;
- retval = 0;
- } else {
- i = state->path.active - 1;
- xfs_da_brelse(args->trans, state->path.blk[i].bp);
- state->path.blk[i].bp = NULL;
- }
- for (i = 0; i < state->path.active - 1; i++) {
- xfs_da_brelse(args->trans, state->path.blk[i].bp);
- state->path.blk[i].bp = NULL;
- }
-
- xfs_da_state_free(state);
- return(retval);
-}
-
-#if defined(XFS_DIR_TRACE)
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_du(char *where, xfs_inode_t *dp, uio_t *uio)
-{
- xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DU, where,
- (void *)dp, (void *)dp->i_mount,
- (void *)((unsigned long)(uio->uio_offset >> 32)),
- (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
- (void *)(unsigned long)uio->uio_resid,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL);
-}
-
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_dub(char *where, xfs_inode_t *dp, uio_t *uio, xfs_dablk_t bno)
-{
- xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUB, where,
- (void *)dp, (void *)dp->i_mount,
- (void *)((unsigned long)(uio->uio_offset >> 32)),
- (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
- (void *)(unsigned long)uio->uio_resid,
- (void *)(unsigned long)bno,
- NULL, NULL, NULL, NULL, NULL, NULL);
-}
-
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_dun(char *where, xfs_inode_t *dp, uio_t *uio,
- xfs_da_intnode_t *node)
-{
- int last = INT_GET(node->hdr.count, ARCH_CONVERT) - 1;
-
- xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUN, where,
- (void *)dp, (void *)dp->i_mount,
- (void *)((unsigned long)(uio->uio_offset >> 32)),
- (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
- (void *)(unsigned long)uio->uio_resid,
- (void *)(unsigned long)
- INT_GET(node->hdr.info.forw, ARCH_CONVERT),
- (void *)(unsigned long)
- INT_GET(node->hdr.count, ARCH_CONVERT),
- (void *)(unsigned long)
- INT_GET(node->btree[0].hashval, ARCH_CONVERT),
- (void *)(unsigned long)
- INT_GET(node->btree[last].hashval, ARCH_CONVERT),
- NULL, NULL, NULL);
-}
-
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_dul(char *where, xfs_inode_t *dp, uio_t *uio,
- xfs_dir_leafblock_t *leaf)
-{
- int last = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1;
-
- xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUL, where,
- (void *)dp, (void *)dp->i_mount,
- (void *)((unsigned long)(uio->uio_offset >> 32)),
- (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
- (void *)(unsigned long)uio->uio_resid,
- (void *)(unsigned long)
- INT_GET(leaf->hdr.info.forw, ARCH_CONVERT),
- (void *)(unsigned long)
- INT_GET(leaf->hdr.count, ARCH_CONVERT),
- (void *)(unsigned long)
- INT_GET(leaf->entries[0].hashval, ARCH_CONVERT),
- (void *)(unsigned long)
- INT_GET(leaf->entries[last].hashval, ARCH_CONVERT),
- NULL, NULL, NULL);
-}
-
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_due(char *where, xfs_inode_t *dp, uio_t *uio,
- xfs_dir_leaf_entry_t *entry)
-{
- xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUE, where,
- (void *)dp, (void *)dp->i_mount,
- (void *)((unsigned long)(uio->uio_offset >> 32)),
- (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
- (void *)(unsigned long)uio->uio_resid,
- (void *)(unsigned long)
- INT_GET(entry->hashval, ARCH_CONVERT),
- NULL, NULL, NULL, NULL, NULL, NULL);
-}
-
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_duc(char *where, xfs_inode_t *dp, uio_t *uio, xfs_off_t cookie)
-{
- xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUC, where,
- (void *)dp, (void *)dp->i_mount,
- (void *)((unsigned long)(uio->uio_offset >> 32)),
- (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
- (void *)(unsigned long)uio->uio_resid,
- (void *)((unsigned long)(cookie >> 32)),
- (void *)((unsigned long)(cookie & 0xFFFFFFFF)),
- NULL, NULL, NULL, NULL, NULL);
-}
-
-/*
- * Add a trace buffer entry for the arguments given to the routine,
- * generic form.
- */
-void
-xfs_dir_trace_enter(int type, char *where,
- void * a0, void * a1,
- void * a2, void * a3,
- void * a4, void * a5,
- void * a6, void * a7,
- void * a8, void * a9,
- void * a10, void * a11)
-{
- ASSERT(xfs_dir_trace_buf);
- ktrace_enter(xfs_dir_trace_buf, (void *)(unsigned long)type,
- (void *)where,
- (void *)a0, (void *)a1, (void *)a2,
- (void *)a3, (void *)a4, (void *)a5,
- (void *)a6, (void *)a7, (void *)a8,
- (void *)a9, (void *)a10, (void *)a11,
- NULL, NULL);
-}
-#endif /* XFS_DIR_TRACE */
diff --git a/fs/xfs/xfs_dir.h b/fs/xfs/xfs_dir.h
deleted file mode 100644
index 8cc8afb9f6c..00000000000
--- a/fs/xfs/xfs_dir.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_DIR_H__
-#define __XFS_DIR_H__
-
-/*
- * Large directories are structured around Btrees where all the data
- * elements are in the leaf nodes. Filenames are hashed into an int,
- * then that int is used as the index into the Btree. Since the hashval
- * of a filename may not be unique, we may have duplicate keys. The
- * internal links in the Btree are logical block offsets into the file.
- *
- * Small directories use a different format and are packed as tightly
- * as possible so as to fit into the literal area of the inode.
- */
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-struct uio;
-struct xfs_bmap_free;
-struct xfs_da_args;
-struct xfs_dinode;
-struct xfs_inode;
-struct xfs_mount;
-struct xfs_trans;
-
-/*
- * Directory function types.
- * Put in structures (xfs_dirops_t) for v1 and v2 directories.
- */
-typedef void (*xfs_dir_mount_t)(struct xfs_mount *mp);
-typedef int (*xfs_dir_isempty_t)(struct xfs_inode *dp);
-typedef int (*xfs_dir_init_t)(struct xfs_trans *tp,
- struct xfs_inode *dp,
- struct xfs_inode *pdp);
-typedef int (*xfs_dir_createname_t)(struct xfs_trans *tp,
- struct xfs_inode *dp,
- char *name,
- int namelen,
- xfs_ino_t inum,
- xfs_fsblock_t *first,
- struct xfs_bmap_free *flist,
- xfs_extlen_t total);
-typedef int (*xfs_dir_lookup_t)(struct xfs_trans *tp,
- struct xfs_inode *dp,
- char *name,
- int namelen,
- xfs_ino_t *inum);
-typedef int (*xfs_dir_removename_t)(struct xfs_trans *tp,
- struct xfs_inode *dp,
- char *name,
- int namelen,
- xfs_ino_t ino,
- xfs_fsblock_t *first,
- struct xfs_bmap_free *flist,
- xfs_extlen_t total);
-typedef int (*xfs_dir_getdents_t)(struct xfs_trans *tp,
- struct xfs_inode *dp,
- struct uio *uio,
- int *eofp);
-typedef int (*xfs_dir_replace_t)(struct xfs_trans *tp,
- struct xfs_inode *dp,
- char *name,
- int namelen,
- xfs_ino_t inum,
- xfs_fsblock_t *first,
- struct xfs_bmap_free *flist,
- xfs_extlen_t total);
-typedef int (*xfs_dir_canenter_t)(struct xfs_trans *tp,
- struct xfs_inode *dp,
- char *name,
- int namelen);
-typedef int (*xfs_dir_shortform_validate_ondisk_t)(struct xfs_mount *mp,
- struct xfs_dinode *dip);
-typedef int (*xfs_dir_shortform_to_single_t)(struct xfs_da_args *args);
-
-typedef struct xfs_dirops {
- xfs_dir_mount_t xd_mount;
- xfs_dir_isempty_t xd_isempty;
- xfs_dir_init_t xd_init;
- xfs_dir_createname_t xd_createname;
- xfs_dir_lookup_t xd_lookup;
- xfs_dir_removename_t xd_removename;
- xfs_dir_getdents_t xd_getdents;
- xfs_dir_replace_t xd_replace;
- xfs_dir_canenter_t xd_canenter;
- xfs_dir_shortform_validate_ondisk_t xd_shortform_validate_ondisk;
- xfs_dir_shortform_to_single_t xd_shortform_to_single;
-} xfs_dirops_t;
-
-/*
- * Overall external interface routines.
- */
-void xfs_dir_startup(void); /* called exactly once */
-
-#define XFS_DIR_MOUNT(mp) \
- ((mp)->m_dirops.xd_mount(mp))
-#define XFS_DIR_ISEMPTY(mp,dp) \
- ((mp)->m_dirops.xd_isempty(dp))
-#define XFS_DIR_INIT(mp,tp,dp,pdp) \
- ((mp)->m_dirops.xd_init(tp,dp,pdp))
-#define XFS_DIR_CREATENAME(mp,tp,dp,name,namelen,inum,first,flist,total) \
- ((mp)->m_dirops.xd_createname(tp,dp,name,namelen,inum,first,flist,\
- total))
-#define XFS_DIR_LOOKUP(mp,tp,dp,name,namelen,inum) \
- ((mp)->m_dirops.xd_lookup(tp,dp,name,namelen,inum))
-#define XFS_DIR_REMOVENAME(mp,tp,dp,name,namelen,ino,first,flist,total) \
- ((mp)->m_dirops.xd_removename(tp,dp,name,namelen,ino,first,flist,total))
-#define XFS_DIR_GETDENTS(mp,tp,dp,uio,eofp) \
- ((mp)->m_dirops.xd_getdents(tp,dp,uio,eofp))
-#define XFS_DIR_REPLACE(mp,tp,dp,name,namelen,inum,first,flist,total) \
- ((mp)->m_dirops.xd_replace(tp,dp,name,namelen,inum,first,flist,total))
-#define XFS_DIR_CANENTER(mp,tp,dp,name,namelen) \
- ((mp)->m_dirops.xd_canenter(tp,dp,name,namelen))
-#define XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp,dip) \
- ((mp)->m_dirops.xd_shortform_validate_ondisk(mp,dip))
-#define XFS_DIR_SHORTFORM_TO_SINGLE(mp,args) \
- ((mp)->m_dirops.xd_shortform_to_single(args))
-
-#define XFS_DIR_IS_V1(mp) ((mp)->m_dirversion == 1)
-#define XFS_DIR_IS_V2(mp) ((mp)->m_dirversion == 2)
-extern xfs_dirops_t xfsv1_dirops;
-extern xfs_dirops_t xfsv2_dirops;
-
-#endif /* __XFS_DIR_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 022c8398ab6..79670cda48a 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -17,616 +17,618 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
-#include "xfs_dir_leaf.h"
-#include "xfs_dir2_data.h"
-#include "xfs_dir2_leaf.h"
-#include "xfs_dir2_block.h"
-#include "xfs_dir2_node.h"
-#include "xfs_dir2_trace.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_dinode.h"
-/*
- * Declarations for interface routines.
- */
-static void xfs_dir2_mount(xfs_mount_t *mp);
-static int xfs_dir2_isempty(xfs_inode_t *dp);
-static int xfs_dir2_init(xfs_trans_t *tp, xfs_inode_t *dp,
- xfs_inode_t *pdp);
-static int xfs_dir2_createname(xfs_trans_t *tp, xfs_inode_t *dp,
- char *name, int namelen, xfs_ino_t inum,
- xfs_fsblock_t *first,
- xfs_bmap_free_t *flist, xfs_extlen_t total);
-static int xfs_dir2_lookup(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
- int namelen, xfs_ino_t *inum);
-static int xfs_dir2_removename(xfs_trans_t *tp, xfs_inode_t *dp,
- char *name, int namelen, xfs_ino_t ino,
- xfs_fsblock_t *first,
- xfs_bmap_free_t *flist, xfs_extlen_t total);
-static int xfs_dir2_getdents(xfs_trans_t *tp, xfs_inode_t *dp, uio_t *uio,
- int *eofp);
-static int xfs_dir2_replace(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
- int namelen, xfs_ino_t inum,
- xfs_fsblock_t *first, xfs_bmap_free_t *flist,
- xfs_extlen_t total);
-static int xfs_dir2_canenter(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
- int namelen);
-static int xfs_dir2_shortform_validate_ondisk(xfs_mount_t *mp,
- xfs_dinode_t *dip);
+struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
-/*
- * Utility routine declarations.
- */
-static int xfs_dir2_put_dirent64_direct(xfs_dir2_put_args_t *pa);
-static int xfs_dir2_put_dirent64_uio(xfs_dir2_put_args_t *pa);
/*
- * Directory operations vector.
+ * ASCII case-insensitive (ie. A-Z) support for directories that was
+ * used in IRIX.
*/
-xfs_dirops_t xfsv2_dirops = {
- .xd_mount = xfs_dir2_mount,
- .xd_isempty = xfs_dir2_isempty,
- .xd_init = xfs_dir2_init,
- .xd_createname = xfs_dir2_createname,
- .xd_lookup = xfs_dir2_lookup,
- .xd_removename = xfs_dir2_removename,
- .xd_getdents = xfs_dir2_getdents,
- .xd_replace = xfs_dir2_replace,
- .xd_canenter = xfs_dir2_canenter,
- .xd_shortform_validate_ondisk = xfs_dir2_shortform_validate_ondisk,
- .xd_shortform_to_single = xfs_dir2_sf_to_block,
-};
+STATIC xfs_dahash_t
+xfs_ascii_ci_hashname(
+ struct xfs_name *name)
+{
+ xfs_dahash_t hash;
+ int i;
-/*
- * Interface routines.
- */
+ for (i = 0, hash = 0; i < name->len; i++)
+ hash = tolower(name->name[i]) ^ rol32(hash, 7);
-/*
- * Initialize directory-related fields in the mount structure.
- */
-static void
-xfs_dir2_mount(
- xfs_mount_t *mp) /* filesystem mount point */
+ return hash;
+}
+
+STATIC enum xfs_dacmp
+xfs_ascii_ci_compname(
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
+{
+ enum xfs_dacmp result;
+ int i;
+
+ if (args->namelen != len)
+ return XFS_CMP_DIFFERENT;
+
+ result = XFS_CMP_EXACT;
+ for (i = 0; i < len; i++) {
+ if (args->name[i] == name[i])
+ continue;
+ if (tolower(args->name[i]) != tolower(name[i]))
+ return XFS_CMP_DIFFERENT;
+ result = XFS_CMP_CASE;
+ }
+
+ return result;
+}
+
+static struct xfs_nameops xfs_ascii_ci_nameops = {
+ .hashname = xfs_ascii_ci_hashname,
+ .compname = xfs_ascii_ci_compname,
+};
+
+int
+xfs_da_mount(
+ struct xfs_mount *mp)
{
- mp->m_dirversion = 2;
+ struct xfs_da_geometry *dageo;
+ int nodehdr_size;
+
+
+ ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
XFS_MAX_BLOCKSIZE);
- mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog);
- mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog;
- mp->m_dirdatablk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_DATA_FIRSTDB(mp));
- mp->m_dirleafblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_LEAF_FIRSTDB(mp));
- mp->m_dirfreeblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_FREE_FIRSTDB(mp));
- mp->m_attr_node_ents =
- (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) /
- (uint)sizeof(xfs_da_node_entry_t);
- mp->m_dir_node_ents =
- (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) /
- (uint)sizeof(xfs_da_node_entry_t);
- mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100;
+
+ mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL);
+ mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL);
+
+ nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
+ mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+ KM_SLEEP | KM_MAYFAIL);
+ mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+ KM_SLEEP | KM_MAYFAIL);
+ if (!mp->m_dir_geo || !mp->m_attr_geo) {
+ kmem_free(mp->m_dir_geo);
+ kmem_free(mp->m_attr_geo);
+ return ENOMEM;
+ }
+
+ /* set up directory geometry */
+ dageo = mp->m_dir_geo;
+ dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog;
+ dageo->fsblog = mp->m_sb.sb_blocklog;
+ dageo->blksize = 1 << dageo->blklog;
+ dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog;
+
+ /*
+ * Now we've set up the block conversion variables, we can calculate the
+ * segment block constants using the geometry structure.
+ */
+ dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET);
+ dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET);
+ dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
+ dageo->node_ents = (dageo->blksize - nodehdr_size) /
+ (uint)sizeof(xfs_da_node_entry_t);
+ dageo->magicpct = (dageo->blksize * 37) / 100;
+
+ /* set up attribute geometry - single fsb only */
+ dageo = mp->m_attr_geo;
+ dageo->blklog = mp->m_sb.sb_blocklog;
+ dageo->fsblog = mp->m_sb.sb_blocklog;
+ dageo->blksize = 1 << dageo->blklog;
+ dageo->fsbcount = 1;
+ dageo->node_ents = (dageo->blksize - nodehdr_size) /
+ (uint)sizeof(xfs_da_node_entry_t);
+ dageo->magicpct = (dageo->blksize * 37) / 100;
+
+ if (xfs_sb_version_hasasciici(&mp->m_sb))
+ mp->m_dirnameops = &xfs_ascii_ci_nameops;
+ else
+ mp->m_dirnameops = &xfs_default_nameops;
+
+ return 0;
+}
+
+void
+xfs_da_unmount(
+ struct xfs_mount *mp)
+{
+ kmem_free(mp->m_dir_geo);
+ kmem_free(mp->m_attr_geo);
}
/*
* Return 1 if directory contains only "." and "..".
*/
-static int /* return code */
-xfs_dir2_isempty(
- xfs_inode_t *dp) /* incore inode structure */
+int
+xfs_dir_isempty(
+ xfs_inode_t *dp)
{
- xfs_dir2_sf_t *sfp; /* shortform directory structure */
+ xfs_dir2_sf_hdr_t *sfp;
- ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
- /*
- * Might happen during shutdown.
- */
- if (dp->i_d.di_size == 0) {
+ ASSERT(S_ISDIR(dp->i_d.di_mode));
+ if (dp->i_d.di_size == 0) /* might happen during shutdown. */
return 1;
- }
if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
return 0;
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
- return !sfp->hdr.count;
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ return !sfp->count;
+}
+
+/*
+ * Validate a given inode number.
+ */
+int
+xfs_dir_ino_validate(
+ xfs_mount_t *mp,
+ xfs_ino_t ino)
+{
+ xfs_agblock_t agblkno;
+ xfs_agino_t agino;
+ xfs_agnumber_t agno;
+ int ino_ok;
+ int ioff;
+
+ agno = XFS_INO_TO_AGNO(mp, ino);
+ agblkno = XFS_INO_TO_AGBNO(mp, ino);
+ ioff = XFS_INO_TO_OFFSET(mp, ino);
+ agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
+ ino_ok =
+ agno < mp->m_sb.sb_agcount &&
+ agblkno < mp->m_sb.sb_agblocks &&
+ agblkno != 0 &&
+ ioff < (1 << mp->m_sb.sb_inopblog) &&
+ XFS_AGINO_TO_INO(mp, agno, agino) == ino;
+ if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
+ XFS_RANDOM_DIR_INO_VALIDATE))) {
+ xfs_warn(mp, "Invalid inode number 0x%Lx",
+ (unsigned long long) ino);
+ XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ return 0;
}
/*
* Initialize a directory with its "." and ".." entries.
*/
-static int /* error */
-xfs_dir2_init(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *dp, /* incore directory inode */
- xfs_inode_t *pdp) /* incore parent directory inode */
+int
+xfs_dir_init(
+ xfs_trans_t *tp,
+ xfs_inode_t *dp,
+ xfs_inode_t *pdp)
{
- xfs_da_args_t args; /* operation arguments */
- int error; /* error return value */
-
- memset((char *)&args, 0, sizeof(args));
- args.dp = dp;
- args.trans = tp;
- ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
- if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) {
+ struct xfs_da_args *args;
+ int error;
+
+ ASSERT(S_ISDIR(dp->i_d.di_mode));
+ error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
+ if (error)
return error;
- }
- return xfs_dir2_sf_create(&args, pdp->i_ino);
+
+ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ if (!args)
+ return ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->dp = dp;
+ args->trans = tp;
+ error = xfs_dir2_sf_create(args, pdp->i_ino);
+ kmem_free(args);
+ return error;
}
/*
Enter a name in a directory.
*/
-static int /* error */
-xfs_dir2_createname(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *dp, /* incore directory inode */
- char *name, /* new entry name */
- int namelen, /* new entry name length */
+int
+xfs_dir_createname(
+ xfs_trans_t *tp,
+ xfs_inode_t *dp,
+ struct xfs_name *name,
xfs_ino_t inum, /* new entry inode number */
xfs_fsblock_t *first, /* bmap's firstblock */
xfs_bmap_free_t *flist, /* bmap's freeblock list */
xfs_extlen_t total) /* bmap's total block count */
{
- xfs_da_args_t args; /* operation arguments */
- int rval; /* return value */
+ struct xfs_da_args *args;
+ int rval;
int v; /* type-checking value */
- ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
- if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) {
+ ASSERT(S_ISDIR(dp->i_d.di_mode));
+ rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+ if (rval)
return rval;
- }
XFS_STATS_INC(xs_dir_create);
- /*
- * Fill in the arg structure for this request.
- */
- args.name = name;
- args.namelen = namelen;
- args.hashval = xfs_da_hashname(name, namelen);
- args.inumber = inum;
- args.dp = dp;
- args.firstblock = first;
- args.flist = flist;
- args.total = total;
- args.whichfork = XFS_DATA_FORK;
- args.trans = tp;
- args.justcheck = 0;
- args.addname = args.oknoent = 1;
- /*
- * Decide on what work routines to call based on the inode size.
- */
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_addname(&args);
- else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
- return rval;
- } else if (v)
- rval = xfs_dir2_block_addname(&args);
- else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
- return rval;
- } else if (v)
- rval = xfs_dir2_leaf_addname(&args);
+
+ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ if (!args)
+ return ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->inumber = inum;
+ args->dp = dp;
+ args->firstblock = first;
+ args->flist = flist;
+ args->total = total;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+ args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_addname(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_addname(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_addname(args);
else
- rval = xfs_dir2_node_addname(&args);
+ rval = xfs_dir2_node_addname(args);
+
+out_free:
+ kmem_free(args);
return rval;
}
/*
+ * If doing a CI lookup and case-insensitive match, dup actual name into
+ * args.value. Return EEXIST for success (ie. name found) or an error.
+ */
+int
+xfs_dir_cilookup_result(
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
+{
+ if (args->cmpresult == XFS_CMP_DIFFERENT)
+ return ENOENT;
+ if (args->cmpresult != XFS_CMP_CASE ||
+ !(args->op_flags & XFS_DA_OP_CILOOKUP))
+ return EEXIST;
+
+ args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
+ if (!args->value)
+ return ENOMEM;
+
+ memcpy(args->value, name, len);
+ args->valuelen = len;
+ return EEXIST;
+}
+
+/*
* Lookup a name in a directory, give back the inode number.
+ * If ci_name is not NULL, returns the actual name in ci_name if it differs
+ * to name, or ci_name->name is set to NULL for an exact match.
*/
-static int /* error */
-xfs_dir2_lookup(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *dp, /* incore directory inode */
- char *name, /* lookup name */
- int namelen, /* lookup name length */
- xfs_ino_t *inum) /* out: inode number */
+
+int
+xfs_dir_lookup(
+ xfs_trans_t *tp,
+ xfs_inode_t *dp,
+ struct xfs_name *name,
+ xfs_ino_t *inum, /* out: inode number */
+ struct xfs_name *ci_name) /* out: actual name if CI match */
{
- xfs_da_args_t args; /* operation arguments */
- int rval; /* return value */
+ struct xfs_da_args *args;
+ int rval;
int v; /* type-checking value */
- ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
+ ASSERT(S_ISDIR(dp->i_d.di_mode));
XFS_STATS_INC(xs_dir_lookup);
/*
- * Fill in the arg structure for this request.
- */
- args.name = name;
- args.namelen = namelen;
- args.hashval = xfs_da_hashname(name, namelen);
- args.inumber = 0;
- args.dp = dp;
- args.firstblock = NULL;
- args.flist = NULL;
- args.total = 0;
- args.whichfork = XFS_DATA_FORK;
- args.trans = tp;
- args.justcheck = args.addname = 0;
- args.oknoent = 1;
- /*
- * Decide on what work routines to call based on the inode size.
+ * We need to use KM_NOFS here so that lockdep will not throw false
+ * positive deadlock warnings on a non-transactional lookup path. It is
+ * safe to recurse into inode recalim in that case, but lockdep can't
+ * easily be taught about it. Hence KM_NOFS avoids having to add more
+ * lockdep Doing this avoids having to add a bunch of lockdep class
+ * annotations into the reclaim path for the ilock.
*/
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_lookup(&args);
- else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
- return rval;
- } else if (v)
- rval = xfs_dir2_block_lookup(&args);
- else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
- return rval;
- } else if (v)
- rval = xfs_dir2_leaf_lookup(&args);
+ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->dp = dp;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+ args->op_flags = XFS_DA_OP_OKNOENT;
+ if (ci_name)
+ args->op_flags |= XFS_DA_OP_CILOOKUP;
+
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_lookup(args);
+ goto out_check_rval;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_lookup(args);
+ goto out_check_rval;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_lookup(args);
else
- rval = xfs_dir2_node_lookup(&args);
+ rval = xfs_dir2_node_lookup(args);
+
+out_check_rval:
if (rval == EEXIST)
rval = 0;
- if (rval == 0)
- *inum = args.inumber;
+ if (!rval) {
+ *inum = args->inumber;
+ if (ci_name) {
+ ci_name->name = args->value;
+ ci_name->len = args->valuelen;
+ }
+ }
+out_free:
+ kmem_free(args);
return rval;
}
/*
* Remove an entry from a directory.
*/
-static int /* error */
-xfs_dir2_removename(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *dp, /* incore directory inode */
- char *name, /* name of entry to remove */
- int namelen, /* name length of entry to remove */
- xfs_ino_t ino, /* inode number of entry to remove */
+int
+xfs_dir_removename(
+ xfs_trans_t *tp,
+ xfs_inode_t *dp,
+ struct xfs_name *name,
+ xfs_ino_t ino,
xfs_fsblock_t *first, /* bmap's firstblock */
xfs_bmap_free_t *flist, /* bmap's freeblock list */
xfs_extlen_t total) /* bmap's total block count */
{
- xfs_da_args_t args; /* operation arguments */
- int rval; /* return value */
+ struct xfs_da_args *args;
+ int rval;
int v; /* type-checking value */
- ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
+ ASSERT(S_ISDIR(dp->i_d.di_mode));
XFS_STATS_INC(xs_dir_remove);
- /*
- * Fill in the arg structure for this request.
- */
- args.name = name;
- args.namelen = namelen;
- args.hashval = xfs_da_hashname(name, namelen);
- args.inumber = ino;
- args.dp = dp;
- args.firstblock = first;
- args.flist = flist;
- args.total = total;
- args.whichfork = XFS_DATA_FORK;
- args.trans = tp;
- args.justcheck = args.addname = args.oknoent = 0;
- /*
- * Decide on what work routines to call based on the inode size.
- */
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_removename(&args);
- else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
- return rval;
- } else if (v)
- rval = xfs_dir2_block_removename(&args);
- else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
- return rval;
- } else if (v)
- rval = xfs_dir2_leaf_removename(&args);
- else
- rval = xfs_dir2_node_removename(&args);
- return rval;
-}
-/*
- * Read a directory.
- */
-static int /* error */
-xfs_dir2_getdents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *dp, /* incore directory inode */
- uio_t *uio, /* caller's buffer control */
- int *eofp) /* out: eof reached */
-{
- int alignment; /* alignment required for ABI */
- xfs_dirent_t *dbp; /* malloc'ed buffer */
- xfs_dir2_put_t put; /* entry formatting routine */
- int rval; /* return value */
- int v; /* type-checking value */
+ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ if (!args)
+ return ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->inumber = ino;
+ args->dp = dp;
+ args->firstblock = first;
+ args->flist = flist;
+ args->total = total;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_removename(args);
+ goto out_free;
+ }
- ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
- XFS_STATS_INC(xs_dir_getdents);
- /*
- * If our caller has given us a single contiguous aligned memory buffer,
- * just work directly within that buffer. If it's in user memory,
- * lock it down first.
- */
- alignment = sizeof(xfs_off_t) - 1;
- if ((uio->uio_iovcnt == 1) &&
- (((__psint_t)uio->uio_iov[0].iov_base & alignment) == 0) &&
- ((uio->uio_iov[0].iov_len & alignment) == 0)) {
- dbp = NULL;
- put = xfs_dir2_put_dirent64_direct;
- } else {
- dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP);
- put = xfs_dir2_put_dirent64_uio;
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_removename(args);
+ goto out_free;
}
- *eofp = 0;
- /*
- * Decide on what work routines to call based on the inode size.
- */
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_getdents(dp, uio, eofp, dbp, put);
- else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
- ;
- } else if (v)
- rval = xfs_dir2_block_getdents(tp, dp, uio, eofp, dbp, put);
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_removename(args);
else
- rval = xfs_dir2_leaf_getdents(tp, dp, uio, eofp, dbp, put);
- if (dbp != NULL)
- kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN);
+ rval = xfs_dir2_node_removename(args);
+out_free:
+ kmem_free(args);
return rval;
}
/*
* Replace the inode number of a directory entry.
*/
-static int /* error */
-xfs_dir2_replace(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *dp, /* incore directory inode */
- char *name, /* name of entry to replace */
- int namelen, /* name length of entry to replace */
+int
+xfs_dir_replace(
+ xfs_trans_t *tp,
+ xfs_inode_t *dp,
+ struct xfs_name *name, /* name of entry to replace */
xfs_ino_t inum, /* new inode number */
xfs_fsblock_t *first, /* bmap's firstblock */
xfs_bmap_free_t *flist, /* bmap's freeblock list */
xfs_extlen_t total) /* bmap's total block count */
{
- xfs_da_args_t args; /* operation arguments */
- int rval; /* return value */
+ struct xfs_da_args *args;
+ int rval;
int v; /* type-checking value */
- ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
+ ASSERT(S_ISDIR(dp->i_d.di_mode));
- if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) {
+ rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+ if (rval)
return rval;
+
+ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ if (!args)
+ return ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->inumber = inum;
+ args->dp = dp;
+ args->firstblock = first;
+ args->flist = flist;
+ args->total = total;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_replace(args);
+ goto out_free;
}
- /*
- * Fill in the arg structure for this request.
- */
- args.name = name;
- args.namelen = namelen;
- args.hashval = xfs_da_hashname(name, namelen);
- args.inumber = inum;
- args.dp = dp;
- args.firstblock = first;
- args.flist = flist;
- args.total = total;
- args.whichfork = XFS_DATA_FORK;
- args.trans = tp;
- args.justcheck = args.addname = args.oknoent = 0;
- /*
- * Decide on what work routines to call based on the inode size.
- */
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_replace(&args);
- else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
- return rval;
- } else if (v)
- rval = xfs_dir2_block_replace(&args);
- else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
- return rval;
- } else if (v)
- rval = xfs_dir2_leaf_replace(&args);
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_replace(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_replace(args);
else
- rval = xfs_dir2_node_replace(&args);
+ rval = xfs_dir2_node_replace(args);
+out_free:
+ kmem_free(args);
return rval;
}
/*
* See if this entry can be added to the directory without allocating space.
+ * First checks that the caller couldn't reserve enough space (resblks = 0).
*/
-static int /* error */
-xfs_dir2_canenter(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *dp, /* incore directory inode */
- char *name, /* name of entry to add */
- int namelen) /* name length of entry to add */
+int
+xfs_dir_canenter(
+ xfs_trans_t *tp,
+ xfs_inode_t *dp,
+ struct xfs_name *name, /* name of entry to add */
+ uint resblks)
{
- xfs_da_args_t args; /* operation arguments */
- int rval; /* return value */
+ struct xfs_da_args *args;
+ int rval;
int v; /* type-checking value */
- ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
- /*
- * Fill in the arg structure for this request.
- */
- args.name = name;
- args.namelen = namelen;
- args.hashval = xfs_da_hashname(name, namelen);
- args.inumber = 0;
- args.dp = dp;
- args.firstblock = NULL;
- args.flist = NULL;
- args.total = 0;
- args.whichfork = XFS_DATA_FORK;
- args.trans = tp;
- args.justcheck = args.addname = args.oknoent = 1;
- /*
- * Decide on what work routines to call based on the inode size.
- */
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_addname(&args);
- else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
- return rval;
- } else if (v)
- rval = xfs_dir2_block_addname(&args);
- else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
- return rval;
- } else if (v)
- rval = xfs_dir2_leaf_addname(&args);
+ if (resblks)
+ return 0;
+
+ ASSERT(S_ISDIR(dp->i_d.di_mode));
+
+ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ if (!args)
+ return ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->dp = dp;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+ args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
+ XFS_DA_OP_OKNOENT;
+
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_addname(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_addname(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_addname(args);
else
- rval = xfs_dir2_node_addname(&args);
+ rval = xfs_dir2_node_addname(args);
+out_free:
+ kmem_free(args);
return rval;
}
/*
- * Dummy routine for shortform inode validation.
- * Can't really do this.
- */
-/* ARGSUSED */
-static int /* error */
-xfs_dir2_shortform_validate_ondisk(
- xfs_mount_t *mp, /* filesystem mount point */
- xfs_dinode_t *dip) /* ondisk inode */
-{
- return 0;
-}
-
-/*
* Utility routines.
*/
/*
* Add a block to the directory.
- * This routine is for data and free blocks, not leaf/node blocks
- * which are handled by xfs_da_grow_inode.
+ *
+ * This routine is for data and free blocks, not leaf/node blocks which are
+ * handled by xfs_da_grow_inode.
*/
-int /* error */
+int
xfs_dir2_grow_inode(
- xfs_da_args_t *args, /* operation arguments */
- int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */
- xfs_dir2_db_t *dbp) /* out: block number added */
+ struct xfs_da_args *args,
+ int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */
+ xfs_dir2_db_t *dbp) /* out: block number added */
{
- xfs_fileoff_t bno; /* directory offset of new block */
- int count; /* count of filesystem blocks */
- xfs_inode_t *dp; /* incore directory inode */
- int error; /* error return value */
- int got; /* blocks actually mapped */
- int i; /* temp mapping index */
- xfs_bmbt_irec_t map; /* single structure for bmap */
- int mapi; /* mapping index */
- xfs_bmbt_irec_t *mapp; /* bmap mapping structure(s) */
- xfs_mount_t *mp; /* filesystem mount point */
- int nmap; /* number of bmap entries */
- xfs_trans_t *tp; /* transaction pointer */
-
- xfs_dir2_trace_args_s("grow_inode", args, space);
- dp = args->dp;
- tp = args->trans;
- mp = dp->i_mount;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ xfs_fileoff_t bno; /* directory offset of new block */
+ int count; /* count of filesystem blocks */
+ int error;
+
+ trace_xfs_dir2_grow_inode(args, space);
+
/*
* Set lowest possible block in the space requested.
*/
bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
- count = mp->m_dirblkfsbs;
- /*
- * Find the first hole for our block.
- */
- if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, XFS_DATA_FORK))) {
- return error;
- }
- nmap = 1;
- ASSERT(args->firstblock != NULL);
- /*
- * Try mapping the new block contiguously (one extent).
- */
- if ((error = xfs_bmapi(tp, dp, bno, count,
- XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
- args->firstblock, args->total, &map, &nmap,
- args->flist))) {
+ count = args->geo->fsbcount;
+
+ error = xfs_da_grow_inode_int(args, &bno, count);
+ if (error)
return error;
- }
- ASSERT(nmap <= 1);
- /*
- * Got it in 1.
- */
- if (nmap == 1) {
- mapp = &map;
- mapi = 1;
- }
- /*
- * Didn't work and this is a multiple-fsb directory block.
- * Try again with contiguous flag turned on.
- */
- else if (nmap == 0 && count > 1) {
- xfs_fileoff_t b; /* current file offset */
- /*
- * Space for maximum number of mappings.
- */
- mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
- /*
- * Iterate until we get to the end of our block.
- */
- for (b = bno, mapi = 0; b < bno + count; ) {
- int c; /* current fsb count */
-
- /*
- * Can't map more than MAX_NMAP at once.
- */
- nmap = MIN(XFS_BMAP_MAX_NMAP, count);
- c = (int)(bno + count - b);
- if ((error = xfs_bmapi(tp, dp, b, c,
- XFS_BMAPI_WRITE|XFS_BMAPI_METADATA,
- args->firstblock, args->total,
- &mapp[mapi], &nmap, args->flist))) {
- kmem_free(mapp, sizeof(*mapp) * count);
- return error;
- }
- if (nmap < 1)
- break;
- /*
- * Add this bunch into our table, go to the next offset.
- */
- mapi += nmap;
- b = mapp[mapi - 1].br_startoff +
- mapp[mapi - 1].br_blockcount;
- }
- }
- /*
- * Didn't work.
- */
- else {
- mapi = 0;
- mapp = NULL;
- }
- /*
- * See how many fsb's we got.
- */
- for (i = 0, got = 0; i < mapi; i++)
- got += mapp[i].br_blockcount;
- /*
- * Didn't get enough fsb's, or the first/last block's are wrong.
- */
- if (got != count || mapp[0].br_startoff != bno ||
- mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
- bno + count) {
- if (mapp != &map)
- kmem_free(mapp, sizeof(*mapp) * count);
- return XFS_ERROR(ENOSPC);
- }
- /*
- * Done with the temporary mapping table.
- */
- if (mapp != &map)
- kmem_free(mapp, sizeof(*mapp) * count);
- *dbp = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)bno);
+ *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno);
+
/*
* Update file's size if this is the data space and it grew.
*/
@@ -636,7 +638,7 @@ xfs_dir2_grow_inode(
size = XFS_FSB_TO_B(mp, bno + count);
if (size > dp->i_d.di_size) {
dp->i_d.di_size = size;
- xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
}
}
return 0;
@@ -645,22 +647,18 @@ xfs_dir2_grow_inode(
/*
* See if the directory is a single-block form directory.
*/
-int /* error */
+int
xfs_dir2_isblock(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *dp, /* incore directory inode */
- int *vp) /* out: 1 is block, 0 is not block */
+ struct xfs_da_args *args,
+ int *vp) /* out: 1 is block, 0 is not block */
{
- xfs_fileoff_t last; /* last file offset */
- xfs_mount_t *mp; /* filesystem mount point */
- int rval; /* return value */
+ xfs_fileoff_t last; /* last file offset */
+ int rval;
- mp = dp->i_mount;
- if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) {
+ if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
return rval;
- }
- rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize;
- ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize);
+ rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize;
+ ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize);
*vp = rval;
return 0;
}
@@ -668,123 +666,49 @@ xfs_dir2_isblock(
/*
* See if the directory is a single-leaf form directory.
*/
-int /* error */
+int
xfs_dir2_isleaf(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *dp, /* incore directory inode */
- int *vp) /* out: 1 is leaf, 0 is not leaf */
+ struct xfs_da_args *args,
+ int *vp) /* out: 1 is block, 0 is not block */
{
- xfs_fileoff_t last; /* last file offset */
- xfs_mount_t *mp; /* filesystem mount point */
- int rval; /* return value */
+ xfs_fileoff_t last; /* last file offset */
+ int rval;
- mp = dp->i_mount;
- if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) {
+ if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
return rval;
- }
- *vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog);
+ *vp = last == args->geo->leafblk + args->geo->fsbcount;
return 0;
}
/*
- * Getdents put routine for 64-bit ABI, direct form.
- */
-static int /* error */
-xfs_dir2_put_dirent64_direct(
- xfs_dir2_put_args_t *pa) /* argument bundle */
-{
- xfs_dirent_t *idbp; /* dirent pointer */
- iovec_t *iovp; /* io vector */
- int namelen; /* entry name length */
- int reclen; /* entry total length */
- uio_t *uio; /* I/O control */
-
- namelen = pa->namelen;
- reclen = DIRENTSIZE(namelen);
- uio = pa->uio;
- /*
- * Won't fit in the remaining space.
- */
- if (reclen > uio->uio_resid) {
- pa->done = 0;
- return 0;
- }
- iovp = uio->uio_iov;
- idbp = (xfs_dirent_t *)iovp->iov_base;
- iovp->iov_base = (char *)idbp + reclen;
- iovp->iov_len -= reclen;
- uio->uio_resid -= reclen;
- idbp->d_reclen = reclen;
- idbp->d_ino = pa->ino;
- idbp->d_off = pa->cook;
- idbp->d_name[namelen] = '\0';
- pa->done = 1;
- memcpy(idbp->d_name, pa->name, namelen);
- return 0;
-}
-
-/*
- * Getdents put routine for 64-bit ABI, uio form.
- */
-static int /* error */
-xfs_dir2_put_dirent64_uio(
- xfs_dir2_put_args_t *pa) /* argument bundle */
-{
- xfs_dirent_t *idbp; /* dirent pointer */
- int namelen; /* entry name length */
- int reclen; /* entry total length */
- int rval; /* return value */
- uio_t *uio; /* I/O control */
-
- namelen = pa->namelen;
- reclen = DIRENTSIZE(namelen);
- uio = pa->uio;
- /*
- * Won't fit in the remaining space.
- */
- if (reclen > uio->uio_resid) {
- pa->done = 0;
- return 0;
- }
- idbp = pa->dbp;
- idbp->d_reclen = reclen;
- idbp->d_ino = pa->ino;
- idbp->d_off = pa->cook;
- idbp->d_name[namelen] = '\0';
- memcpy(idbp->d_name, pa->name, namelen);
- rval = uio_read((caddr_t)idbp, reclen, uio);
- pa->done = (rval == 0);
- return rval;
-}
-
-/*
* Remove the given block from the directory.
* This routine is used for data and free blocks, leaf/node are done
* by xfs_da_shrink_inode.
*/
int
xfs_dir2_shrink_inode(
- xfs_da_args_t *args, /* operation arguments */
- xfs_dir2_db_t db, /* directory block number */
- xfs_dabuf_t *bp) /* block's buffer */
+ xfs_da_args_t *args,
+ xfs_dir2_db_t db,
+ struct xfs_buf *bp)
{
xfs_fileoff_t bno; /* directory file offset */
xfs_dablk_t da; /* directory file offset */
int done; /* bunmap is finished */
- xfs_inode_t *dp; /* incore directory inode */
- int error; /* error return value */
- xfs_mount_t *mp; /* filesystem mount point */
- xfs_trans_t *tp; /* transaction pointer */
+ xfs_inode_t *dp;
+ int error;
+ xfs_mount_t *mp;
+ xfs_trans_t *tp;
+
+ trace_xfs_dir2_shrink_inode(args, db);
- xfs_dir2_trace_args_db("shrink_inode", args, db, bp);
dp = args->dp;
mp = dp->i_mount;
tp = args->trans;
- da = XFS_DIR2_DB_TO_DA(mp, db);
+ da = xfs_dir2_db_to_da(args->geo, db);
/*
* Unmap the fsblock(s).
*/
- if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
+ if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount,
XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
&done))) {
/*
@@ -807,16 +731,16 @@ xfs_dir2_shrink_inode(
/*
* Invalidate the buffer from the transaction.
*/
- xfs_da_binval(tp, bp);
+ xfs_trans_binval(tp, bp);
/*
* If it's not a data block, we're done.
*/
- if (db >= XFS_DIR2_LEAF_FIRSTDB(mp))
+ if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET))
return 0;
/*
* If the block isn't the last one in the directory, we're done.
*/
- if (dp->i_d.di_size > XFS_DIR2_DB_OFF_TO_BYTE(mp, db + 1, 0))
+ if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0))
return 0;
bno = da;
if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
@@ -825,7 +749,7 @@ xfs_dir2_shrink_inode(
*/
return error;
}
- if (db == mp->m_dirdatablk)
+ if (db == args->geo->datablk)
ASSERT(bno == 0);
else
ASSERT(bno > 0);
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 3158f5dc431..c8e86b0b5e9 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -16,77 +16,165 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef __XFS_DIR2_H__
-#define __XFS_DIR2_H__
+#define __XFS_DIR2_H__
-struct uio;
-struct xfs_dabuf;
+struct xfs_bmap_free;
struct xfs_da_args;
-struct xfs_dir2_put_args;
struct xfs_inode;
+struct xfs_mount;
struct xfs_trans;
+struct xfs_dir2_sf_hdr;
+struct xfs_dir2_sf_entry;
+struct xfs_dir2_data_hdr;
+struct xfs_dir2_data_entry;
+struct xfs_dir2_data_unused;
-/*
- * Directory version 2.
- * There are 4 possible formats:
- * shortform
- * single block - data with embedded leaf at the end
- * multiple data blocks, single leaf+freeindex block
- * data blocks, node&leaf blocks (btree), freeindex blocks
- *
- * The shortform format is in xfs_dir2_sf.h.
- * The single block format is in xfs_dir2_block.h.
- * The data block format is in xfs_dir2_data.h.
- * The leaf and freeindex block formats are in xfs_dir2_leaf.h.
- * Node blocks are the same as the other version, in xfs_da_btree.h.
- */
+extern struct xfs_name xfs_name_dotdot;
/*
- * Byte offset in data block and shortform entry.
+ * directory operations vector for encode/decode routines
*/
-typedef __uint16_t xfs_dir2_data_off_t;
-#define NULLDATAOFF 0xffffU
-typedef uint xfs_dir2_data_aoff_t; /* argument form */
+struct xfs_dir_ops {
+ int (*sf_entsize)(struct xfs_dir2_sf_hdr *hdr, int len);
+ struct xfs_dir2_sf_entry *
+ (*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep);
+ __uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep);
+ void (*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep,
+ __uint8_t ftype);
+ xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep);
+ void (*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep,
+ xfs_ino_t ino);
+ xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr);
+ void (*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr,
+ xfs_ino_t ino);
-/*
- * Directory block number (logical dirblk in file)
- */
-typedef __uint32_t xfs_dir2_db_t;
+ int (*data_entsize)(int len);
+ __uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep);
+ void (*data_put_ftype)(struct xfs_dir2_data_entry *dep,
+ __uint8_t ftype);
+ __be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep);
+ struct xfs_dir2_data_free *
+ (*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr);
+
+ xfs_dir2_data_aoff_t data_dot_offset;
+ xfs_dir2_data_aoff_t data_dotdot_offset;
+ xfs_dir2_data_aoff_t data_first_offset;
+ size_t data_entry_offset;
+
+ struct xfs_dir2_data_entry *
+ (*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr);
+ struct xfs_dir2_data_entry *
+ (*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr);
+ struct xfs_dir2_data_entry *
+ (*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr);
+ struct xfs_dir2_data_entry *
+ (*data_entry_p)(struct xfs_dir2_data_hdr *hdr);
+ struct xfs_dir2_data_unused *
+ (*data_unused_p)(struct xfs_dir2_data_hdr *hdr);
+
+ int leaf_hdr_size;
+ void (*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from);
+ void (*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to,
+ struct xfs_dir2_leaf *from);
+ int (*leaf_max_ents)(struct xfs_da_geometry *geo);
+ struct xfs_dir2_leaf_entry *
+ (*leaf_ents_p)(struct xfs_dir2_leaf *lp);
+
+ int node_hdr_size;
+ void (*node_hdr_to_disk)(struct xfs_da_intnode *to,
+ struct xfs_da3_icnode_hdr *from);
+ void (*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to,
+ struct xfs_da_intnode *from);
+ struct xfs_da_node_entry *
+ (*node_tree_p)(struct xfs_da_intnode *dap);
+
+ int free_hdr_size;
+ void (*free_hdr_to_disk)(struct xfs_dir2_free *to,
+ struct xfs_dir3_icfree_hdr *from);
+ void (*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to,
+ struct xfs_dir2_free *from);
+ int (*free_max_bests)(struct xfs_da_geometry *geo);
+ __be16 * (*free_bests_p)(struct xfs_dir2_free *free);
+ xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo,
+ xfs_dir2_db_t db);
+ int (*db_to_fdindex)(struct xfs_da_geometry *geo,
+ xfs_dir2_db_t db);
+};
+
+extern const struct xfs_dir_ops *
+ xfs_dir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
+extern const struct xfs_dir_ops *
+ xfs_nondir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
/*
- * Byte offset in a directory.
+ * Generic directory interface routines
*/
-typedef xfs_off_t xfs_dir2_off_t;
+extern void xfs_dir_startup(void);
+extern int xfs_da_mount(struct xfs_mount *mp);
+extern void xfs_da_unmount(struct xfs_mount *mp);
+
+extern int xfs_dir_isempty(struct xfs_inode *dp);
+extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_inode *pdp);
+extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_name *name, xfs_ino_t inum,
+ xfs_fsblock_t *first,
+ struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_name *name, xfs_ino_t *inum,
+ struct xfs_name *ci_name);
+extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_name *name, xfs_ino_t ino,
+ xfs_fsblock_t *first,
+ struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_name *name, xfs_ino_t inum,
+ xfs_fsblock_t *first,
+ struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_name *name, uint resblks);
/*
- * For getdents, argument struct for put routines.
+ * Direct call from the bmap code, bypassing the generic directory layer.
*/
-typedef int (*xfs_dir2_put_t)(struct xfs_dir2_put_args *pa);
-typedef struct xfs_dir2_put_args {
- xfs_off_t cook; /* cookie of (next) entry */
- xfs_intino_t ino; /* inode number */
- struct xfs_dirent *dbp; /* buffer pointer */
- char *name; /* directory entry name */
- int namelen; /* length of name */
- int done; /* output: set if value was stored */
- xfs_dir2_put_t put; /* put function ptr (i/o) */
- struct uio *uio; /* uio control structure */
-} xfs_dir2_put_args_t;
+extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
/*
- * Other interfaces used by the rest of the dir v2 code.
+ * Interface routines used by userspace utilities
*/
-extern int
- xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
- xfs_dir2_db_t *dbp);
+extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r);
+extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r);
+extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
+ struct xfs_buf *bp);
-extern int
- xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *vp);
+extern void xfs_dir2_data_freescan(struct xfs_inode *dp,
+ struct xfs_dir2_data_hdr *hdr, int *loghead);
+extern void xfs_dir2_data_log_entry(struct xfs_da_args *args,
+ struct xfs_buf *bp, struct xfs_dir2_data_entry *dep);
+extern void xfs_dir2_data_log_header(struct xfs_da_args *args,
+ struct xfs_buf *bp);
+extern void xfs_dir2_data_log_unused(struct xfs_da_args *args,
+ struct xfs_buf *bp, struct xfs_dir2_data_unused *dup);
+extern void xfs_dir2_data_make_free(struct xfs_da_args *args,
+ struct xfs_buf *bp, xfs_dir2_data_aoff_t offset,
+ xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
+extern void xfs_dir2_data_use_free(struct xfs_da_args *args,
+ struct xfs_buf *bp, struct xfs_dir2_data_unused *dup,
+ xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
+ int *needlogp, int *needscanp);
-extern int
- xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *vp);
+extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
+ struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
+ struct xfs_dir2_data_unused *dup);
-extern int
- xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
- struct xfs_dabuf *bp);
+extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 31bc99faa70..c7cd3154026 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -17,40 +18,315 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
+#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
-#include "xfs_dir_leaf.h"
-#include "xfs_dir2_data.h"
-#include "xfs_dir2_leaf.h"
-#include "xfs_dir2_block.h"
-#include "xfs_dir2_trace.h"
+#include "xfs_bmap.h"
+#include "xfs_buf_item.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_dinode.h"
/*
* Local function prototypes.
*/
-static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, xfs_dabuf_t *bp, int first,
- int last);
-static void xfs_dir2_block_log_tail(xfs_trans_t *tp, xfs_dabuf_t *bp);
-static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **bpp,
+static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp,
+ int first, int last);
+static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp);
+static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp,
int *entno);
static int xfs_dir2_block_sort(const void *a, const void *b);
+static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
+
+/*
+ * One-time startup routine called from xfs_init().
+ */
+void
+xfs_dir_startup(void)
+{
+ xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1);
+ xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
+}
+
+static bool
+xfs_dir3_block_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
+ return false;
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ return false;
+ } else {
+ if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+ return false;
+ }
+ if (__xfs_dir3_data_check(NULL, bp))
+ return false;
+ return true;
+}
+
+static void
+xfs_dir3_block_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_dir3_block_verify(bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
+}
+
+static void
+xfs_dir3_block_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_dir3_block_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
+ .verify_read = xfs_dir3_block_read_verify,
+ .verify_write = xfs_dir3_block_write_verify,
+};
+
+int
+xfs_dir3_block_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
+ XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
+ if (!err && tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
+ return err;
+}
+
+static void
+xfs_dir3_block_init(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ struct xfs_inode *dp)
+{
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ bp->b_ops = &xfs_dir3_block_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ memset(hdr3, 0, sizeof(*hdr3));
+ hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+ hdr3->blkno = cpu_to_be64(bp->b_bn);
+ hdr3->owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+ return;
+
+ }
+ hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+}
+
+static void
+xfs_dir2_block_need_space(
+ struct xfs_inode *dp,
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_block_tail *btp,
+ struct xfs_dir2_leaf_entry *blp,
+ __be16 **tagpp,
+ struct xfs_dir2_data_unused **dupp,
+ struct xfs_dir2_data_unused **enddupp,
+ int *compact,
+ int len)
+{
+ struct xfs_dir2_data_free *bf;
+ __be16 *tagp = NULL;
+ struct xfs_dir2_data_unused *dup = NULL;
+ struct xfs_dir2_data_unused *enddup = NULL;
+
+ *compact = 0;
+ bf = dp->d_ops->data_bestfree_p(hdr);
+
+ /*
+ * If there are stale entries we'll use one for the leaf.
+ */
+ if (btp->stale) {
+ if (be16_to_cpu(bf[0].length) >= len) {
+ /*
+ * The biggest entry enough to avoid compaction.
+ */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
+ goto out;
+ }
+
+ /*
+ * Will need to compact to make this work.
+ * Tag just before the first leaf entry.
+ */
+ *compact = 1;
+ tagp = (__be16 *)blp - 1;
+
+ /* Data object just before the first leaf entry. */
+ dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+ /*
+ * If it's not free then the data will go where the
+ * leaf data starts now, if it works at all.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
+ (uint)sizeof(*blp) < len)
+ dup = NULL;
+ } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
+ dup = NULL;
+ else
+ dup = (xfs_dir2_data_unused_t *)blp;
+ goto out;
+ }
+
+ /*
+ * no stale entries, so just use free space.
+ * Tag just before the first leaf entry.
+ */
+ tagp = (__be16 *)blp - 1;
+
+ /* Data object just before the first leaf entry. */
+ enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+ /*
+ * If it's not free then can't do this add without cleaning up:
+ * the space before the first leaf entry needs to be free so it
+ * can be expanded to hold the pointer to the new entry.
+ */
+ if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ /*
+ * Check out the biggest freespace and see if it's the same one.
+ */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
+ if (dup != enddup) {
+ /*
+ * Not the same free entry, just check its length.
+ */
+ if (be16_to_cpu(dup->length) < len)
+ dup = NULL;
+ goto out;
+ }
+
+ /*
+ * It is the biggest freespace, can it hold the leaf too?
+ */
+ if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
+ /*
+ * Yes, use the second-largest entry instead if it works.
+ */
+ if (be16_to_cpu(bf[1].length) >= len)
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[1].offset));
+ else
+ dup = NULL;
+ }
+ }
+out:
+ *tagpp = tagp;
+ *dupp = dup;
+ *enddupp = enddup;
+}
+
+/*
+ * compact the leaf entries.
+ * Leave the highest-numbered stale entry stale.
+ * XXX should be the one closest to mid but mid is not yet computed.
+ */
+static void
+xfs_dir2_block_compact(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_block_tail *btp,
+ struct xfs_dir2_leaf_entry *blp,
+ int *needlog,
+ int *lfloghigh,
+ int *lfloglow)
+{
+ int fromidx; /* source leaf index */
+ int toidx; /* target leaf index */
+ int needscan = 0;
+ int highstale; /* high stale index */
+
+ fromidx = toidx = be32_to_cpu(btp->count) - 1;
+ highstale = *lfloghigh = -1;
+ for (; fromidx >= 0; fromidx--) {
+ if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+ if (highstale == -1)
+ highstale = toidx;
+ else {
+ if (*lfloghigh == -1)
+ *lfloghigh = toidx;
+ continue;
+ }
+ }
+ if (fromidx < toidx)
+ blp[toidx] = blp[fromidx];
+ toidx--;
+ }
+ *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
+ *lfloghigh -= be32_to_cpu(btp->stale) - 1;
+ be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
+ xfs_dir2_data_make_free(args, bp,
+ (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+ (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
+ needlog, &needscan);
+ btp->stale = cpu_to_be32(1);
+ /*
+ * If we now need to rebuild the bestfree map, do so.
+ * This needs to happen before the next call to use_free.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(args->dp, hdr, needlog);
+}
+
/*
* Add an entry to a block directory.
*/
@@ -58,10 +334,9 @@ int /* error */
xfs_dir2_block_addname(
xfs_da_args_t *args) /* directory op arguments */
{
- xfs_dir2_data_free_t *bf; /* bestfree table in block */
- xfs_dir2_block_t *block; /* directory block structure */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
- xfs_dabuf_t *bp; /* buffer for block */
+ struct xfs_buf *bp; /* buffer for block */
xfs_dir2_block_tail_t *btp; /* block tail */
int compact; /* need to compact leaf ents */
xfs_dir2_data_entry_t *dep; /* block data entry */
@@ -81,225 +356,95 @@ xfs_dir2_block_addname(
xfs_mount_t *mp; /* filesystem mount point */
int needlog; /* need to log header */
int needscan; /* need to rescan freespace */
- xfs_dir2_data_off_t *tagp; /* pointer to tag value */
+ __be16 *tagp; /* pointer to tag value */
xfs_trans_t *tp; /* transaction structure */
- xfs_dir2_trace_args("block_addname", args);
+ trace_xfs_dir2_block_addname(args);
+
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- /*
- * Read the (one and only) directory block into dabuf bp.
- */
- if ((error =
- xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+
+ /* Read the (one and only) directory block into bp. */
+ error = xfs_dir3_block_read(tp, dp, &bp);
+ if (error)
return error;
- }
- ASSERT(bp != NULL);
- block = bp->data;
- /*
- * Check the magic number, corrupted if wrong.
- */
- if (unlikely(INT_GET(block->hdr.magic, ARCH_CONVERT)
- != XFS_DIR2_BLOCK_MAGIC)) {
- XFS_CORRUPTION_ERROR("xfs_dir2_block_addname",
- XFS_ERRLEVEL_LOW, mp, block);
- xfs_da_brelse(tp, bp);
- return XFS_ERROR(EFSCORRUPTED);
- }
- len = XFS_DIR2_DATA_ENTSIZE(args->namelen);
+
+ len = dp->d_ops->data_entsize(args->namelen);
+
/*
* Set up pointers to parts of the block.
*/
- bf = block->hdr.bestfree;
- btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
- blp = XFS_DIR2_BLOCK_LEAF_P(btp);
- /*
- * No stale entries? Need space for entry and new leaf.
- */
- if (!btp->stale) {
- /*
- * Tag just before the first leaf entry.
- */
- tagp = (xfs_dir2_data_off_t *)blp - 1;
- /*
- * Data object just before the first leaf entry.
- */
- enddup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT));
- /*
- * If it's not free then can't do this add without cleaning up:
- * the space before the first leaf entry needs to be free so it
- * can be expanded to hold the pointer to the new entry.
- */
- if (INT_GET(enddup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG)
- dup = enddup = NULL;
- /*
- * Check out the biggest freespace and see if it's the same one.
- */
- else {
- dup = (xfs_dir2_data_unused_t *)
- ((char *)block + INT_GET(bf[0].offset, ARCH_CONVERT));
- if (dup == enddup) {
- /*
- * It is the biggest freespace, is it too small
- * to hold the new leaf too?
- */
- if (INT_GET(dup->length, ARCH_CONVERT) < len + (uint)sizeof(*blp)) {
- /*
- * Yes, we use the second-largest
- * entry instead if it works.
- */
- if (INT_GET(bf[1].length, ARCH_CONVERT) >= len)
- dup = (xfs_dir2_data_unused_t *)
- ((char *)block +
- INT_GET(bf[1].offset, ARCH_CONVERT));
- else
- dup = NULL;
- }
- } else {
- /*
- * Not the same free entry,
- * just check its length.
- */
- if (INT_GET(dup->length, ARCH_CONVERT) < len) {
- dup = NULL;
- }
- }
- }
- compact = 0;
- }
+ hdr = bp->b_addr;
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+
/*
- * If there are stale entries we'll use one for the leaf.
- * Is the biggest entry enough to avoid compaction?
+ * Find out if we can reuse stale entries or whether we need extra
+ * space for entry and new leaf.
*/
- else if (INT_GET(bf[0].length, ARCH_CONVERT) >= len) {
- dup = (xfs_dir2_data_unused_t *)
- ((char *)block + INT_GET(bf[0].offset, ARCH_CONVERT));
- compact = 0;
- }
+ xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup,
+ &enddup, &compact, len);
+
/*
- * Will need to compact to make this work.
+ * Done everything we need for a space check now.
*/
- else {
- /*
- * Tag just before the first leaf entry.
- */
- tagp = (xfs_dir2_data_off_t *)blp - 1;
- /*
- * Data object just before the first leaf entry.
- */
- dup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT));
- /*
- * If it's not free then the data will go where the
- * leaf data starts now, if it works at all.
- */
- if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
- if (INT_GET(dup->length, ARCH_CONVERT) + (INT_GET(btp->stale, ARCH_CONVERT) - 1) *
- (uint)sizeof(*blp) < len)
- dup = NULL;
- } else if ((INT_GET(btp->stale, ARCH_CONVERT) - 1) * (uint)sizeof(*blp) < len)
- dup = NULL;
- else
- dup = (xfs_dir2_data_unused_t *)blp;
- compact = 1;
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+ xfs_trans_brelse(tp, bp);
+ if (!dup)
+ return XFS_ERROR(ENOSPC);
+ return 0;
}
- /*
- * If this isn't a real add, we're done with the buffer.
- */
- if (args->justcheck)
- xfs_da_brelse(tp, bp);
+
/*
* If we don't have space for the new entry & leaf ...
*/
if (!dup) {
- /*
- * Not trying to actually do anything, or don't have
- * a space reservation: return no-space.
- */
- if (args->justcheck || args->total == 0)
+ /* Don't have a space reservation: return no-space. */
+ if (args->total == 0)
return XFS_ERROR(ENOSPC);
/*
* Convert to the next larger format.
* Then add the new entry in that format.
*/
error = xfs_dir2_block_to_leaf(args, bp);
- xfs_da_buf_done(bp);
if (error)
return error;
return xfs_dir2_leaf_addname(args);
}
- /*
- * Just checking, and it would work, so say so.
- */
- if (args->justcheck)
- return 0;
+
needlog = needscan = 0;
+
/*
* If need to compact the leaf entries, do it now.
- * Leave the highest-numbered stale entry stale.
- * XXX should be the one closest to mid but mid is not yet computed.
*/
if (compact) {
- int fromidx; /* source leaf index */
- int toidx; /* target leaf index */
-
- for (fromidx = toidx = INT_GET(btp->count, ARCH_CONVERT) - 1,
- highstale = lfloghigh = -1;
- fromidx >= 0;
- fromidx--) {
- if (INT_GET(blp[fromidx].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) {
- if (highstale == -1)
- highstale = toidx;
- else {
- if (lfloghigh == -1)
- lfloghigh = toidx;
- continue;
- }
- }
- if (fromidx < toidx)
- blp[toidx] = blp[fromidx];
- toidx--;
- }
- lfloglow = toidx + 1 - (INT_GET(btp->stale, ARCH_CONVERT) - 1);
- lfloghigh -= INT_GET(btp->stale, ARCH_CONVERT) - 1;
- INT_MOD(btp->count, ARCH_CONVERT, -(INT_GET(btp->stale, ARCH_CONVERT) - 1));
- xfs_dir2_data_make_free(tp, bp,
- (xfs_dir2_data_aoff_t)((char *)blp - (char *)block),
- (xfs_dir2_data_aoff_t)((INT_GET(btp->stale, ARCH_CONVERT) - 1) * sizeof(*blp)),
- &needlog, &needscan);
- blp += INT_GET(btp->stale, ARCH_CONVERT) - 1;
- INT_SET(btp->stale, ARCH_CONVERT, 1);
+ xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog,
+ &lfloghigh, &lfloglow);
+ /* recalculate blp post-compaction */
+ blp = xfs_dir2_block_leaf_p(btp);
+ } else if (btp->stale) {
/*
- * If we now need to rebuild the bestfree map, do so.
- * This needs to happen before the next call to use_free.
+ * Set leaf logging boundaries to impossible state.
+ * For the no-stale case they're set explicitly.
*/
- if (needscan) {
- xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block,
- &needlog, NULL);
- needscan = 0;
- }
- }
- /*
- * Set leaf logging boundaries to impossible state.
- * For the no-stale case they're set explicitly.
- */
- else if (INT_GET(btp->stale, ARCH_CONVERT)) {
- lfloglow = INT_GET(btp->count, ARCH_CONVERT);
+ lfloglow = be32_to_cpu(btp->count);
lfloghigh = -1;
}
+
/*
* Find the slot that's first lower than our hash value, -1 if none.
*/
- for (low = 0, high = INT_GET(btp->count, ARCH_CONVERT) - 1; low <= high; ) {
+ for (low = 0, high = be32_to_cpu(btp->count) - 1; low <= high; ) {
mid = (low + high) >> 1;
- if ((hash = INT_GET(blp[mid].hashval, ARCH_CONVERT)) == args->hashval)
+ if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
break;
if (hash < args->hashval)
low = mid + 1;
else
high = mid - 1;
}
- while (mid >= 0 && INT_GET(blp[mid].hashval, ARCH_CONVERT) >= args->hashval) {
+ while (mid >= 0 && be32_to_cpu(blp[mid].hashval) >= args->hashval) {
mid--;
}
/*
@@ -309,23 +454,22 @@ xfs_dir2_block_addname(
/*
* Mark the space needed for the new leaf entry, now in use.
*/
- xfs_dir2_data_use_free(tp, bp, enddup,
+ xfs_dir2_data_use_free(args, bp, enddup,
(xfs_dir2_data_aoff_t)
- ((char *)enddup - (char *)block + INT_GET(enddup->length, ARCH_CONVERT) -
+ ((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) -
sizeof(*blp)),
(xfs_dir2_data_aoff_t)sizeof(*blp),
&needlog, &needscan);
/*
* Update the tail (entry count).
*/
- INT_MOD(btp->count, ARCH_CONVERT, +1);
+ be32_add_cpu(&btp->count, 1);
/*
* If we now need to rebuild the bestfree map, do so.
* This needs to happen before the next call to use_free.
*/
if (needscan) {
- xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block,
- &needlog, NULL);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
needscan = 0;
}
/*
@@ -346,12 +490,14 @@ xfs_dir2_block_addname(
else {
for (lowstale = mid;
lowstale >= 0 &&
- INT_GET(blp[lowstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR;
+ blp[lowstale].address !=
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
lowstale--)
continue;
for (highstale = mid + 1;
- highstale < INT_GET(btp->count, ARCH_CONVERT) &&
- INT_GET(blp[highstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR &&
+ highstale < be32_to_cpu(btp->count) &&
+ blp[highstale].address !=
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR) &&
(lowstale < 0 || mid - lowstale > highstale - mid);
highstale++)
continue;
@@ -359,7 +505,7 @@ xfs_dir2_block_addname(
* Move entries toward the low-numbered stale entry.
*/
if (lowstale >= 0 &&
- (highstale == INT_GET(btp->count, ARCH_CONVERT) ||
+ (highstale == be32_to_cpu(btp->count) ||
mid - lowstale <= highstale - mid)) {
if (mid - lowstale)
memmove(&blp[lowstale], &blp[lowstale + 1],
@@ -371,7 +517,7 @@ xfs_dir2_block_addname(
* Move entries toward the high-numbered stale entry.
*/
else {
- ASSERT(highstale < INT_GET(btp->count, ARCH_CONVERT));
+ ASSERT(highstale < be32_to_cpu(btp->count));
mid++;
if (highstale - mid)
memmove(&blp[mid + 1], &blp[mid],
@@ -379,7 +525,7 @@ xfs_dir2_block_addname(
lfloglow = MIN(mid, lfloglow);
lfloghigh = MAX(highstale, lfloghigh);
}
- INT_MOD(btp->stale, ARCH_CONVERT, -1);
+ be32_add_cpu(&btp->stale, -1);
}
/*
* Point to the new data entry.
@@ -388,160 +534,35 @@ xfs_dir2_block_addname(
/*
* Fill in the leaf entry.
*/
- INT_SET(blp[mid].hashval, ARCH_CONVERT, args->hashval);
- INT_SET(blp[mid].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block));
+ blp[mid].hashval = cpu_to_be32(args->hashval);
+ blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+ (char *)dep - (char *)hdr));
xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
/*
* Mark space for the data entry used.
*/
- xfs_dir2_data_use_free(tp, bp, dup,
- (xfs_dir2_data_aoff_t)((char *)dup - (char *)block),
+ xfs_dir2_data_use_free(args, bp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
(xfs_dir2_data_aoff_t)len, &needlog, &needscan);
/*
* Create the new data entry.
*/
- INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+ dep->inumber = cpu_to_be64(args->inumber);
dep->namelen = args->namelen;
memcpy(dep->name, args->name, args->namelen);
- tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
- INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
+ dp->d_ops->data_put_ftype(dep, args->filetype);
+ tagp = dp->d_ops->data_entry_tag_p(dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)hdr);
/*
* Clean up the bestfree array and log the header, tail, and entry.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
- NULL);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, bp);
+ xfs_dir2_data_log_header(args, bp);
xfs_dir2_block_log_tail(tp, bp);
- xfs_dir2_data_log_entry(tp, bp, dep);
- xfs_dir2_data_check(dp, bp);
- xfs_da_buf_done(bp);
- return 0;
-}
-
-/*
- * Readdir for block directories.
- */
-int /* error */
-xfs_dir2_block_getdents(
- xfs_trans_t *tp, /* transaction (NULL) */
- xfs_inode_t *dp, /* incore inode */
- uio_t *uio, /* caller's buffer control */
- int *eofp, /* eof reached? (out) */
- xfs_dirent_t *dbp, /* caller's buffer */
- xfs_dir2_put_t put) /* abi's formatting function */
-{
- xfs_dir2_block_t *block; /* directory block structure */
- xfs_dabuf_t *bp; /* buffer for block */
- xfs_dir2_block_tail_t *btp; /* block tail */
- xfs_dir2_data_entry_t *dep; /* block data entry */
- xfs_dir2_data_unused_t *dup; /* block unused entry */
- char *endptr; /* end of the data entries */
- int error; /* error return value */
- xfs_mount_t *mp; /* filesystem mount point */
- xfs_dir2_put_args_t p; /* arg package for put rtn */
- char *ptr; /* current data entry */
- int wantoff; /* starting block offset */
-
- mp = dp->i_mount;
- /*
- * If the block number in the offset is out of range, we're done.
- */
- if (XFS_DIR2_DATAPTR_TO_DB(mp, uio->uio_offset) > mp->m_dirdatablk) {
- *eofp = 1;
- return 0;
- }
- /*
- * Can't read the block, give up, else get dabuf in bp.
- */
- if ((error =
- xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
- return error;
- }
- ASSERT(bp != NULL);
- /*
- * Extract the byte offset we start at from the seek pointer.
- * We'll skip entries before this.
- */
- wantoff = XFS_DIR2_DATAPTR_TO_OFF(mp, uio->uio_offset);
- block = bp->data;
- xfs_dir2_data_check(dp, bp);
- /*
- * Set up values for the loop.
- */
- btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
- ptr = (char *)block->u;
- endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp);
- p.dbp = dbp;
- p.put = put;
- p.uio = uio;
- /*
- * Loop over the data portion of the block.
- * Each object is a real entry (dep) or an unused one (dup).
- */
- while (ptr < endptr) {
- dup = (xfs_dir2_data_unused_t *)ptr;
- /*
- * Unused, skip it.
- */
- if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
- ptr += INT_GET(dup->length, ARCH_CONVERT);
- continue;
- }
-
- dep = (xfs_dir2_data_entry_t *)ptr;
-
- /*
- * Bump pointer for the next iteration.
- */
- ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
- /*
- * The entry is before the desired starting point, skip it.
- */
- if ((char *)dep - (char *)block < wantoff)
- continue;
- /*
- * Set up argument structure for put routine.
- */
- p.namelen = dep->namelen;
-
- p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
- ptr - (char *)block);
- p.ino = INT_GET(dep->inumber, ARCH_CONVERT);
-#if XFS_BIG_INUMS
- p.ino += mp->m_inoadd;
-#endif
- p.name = (char *)dep->name;
-
- /*
- * Put the entry in the caller's buffer.
- */
- error = p.put(&p);
-
- /*
- * If it didn't fit, set the final offset to here & return.
- */
- if (!p.done) {
- uio->uio_offset =
- XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
- (char *)dep - (char *)block);
- xfs_da_brelse(tp, bp);
- return error;
- }
- }
-
- /*
- * Reached the end of the block.
- * Set the offset to a nonexistent block 1 and return.
- */
- *eofp = 1;
-
- uio->uio_offset =
- XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0);
-
- xfs_da_brelse(tp, bp);
-
+ xfs_dir2_data_log_entry(args, bp, dep);
+ xfs_dir3_data_check(dp, bp);
return 0;
}
@@ -551,21 +572,18 @@ xfs_dir2_block_getdents(
static void
xfs_dir2_block_log_leaf(
xfs_trans_t *tp, /* transaction structure */
- xfs_dabuf_t *bp, /* block buffer */
+ struct xfs_buf *bp, /* block buffer */
int first, /* index of first logged leaf */
int last) /* index of last logged leaf */
{
- xfs_dir2_block_t *block; /* directory block structure */
- xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
- xfs_dir2_block_tail_t *btp; /* block tail */
- xfs_mount_t *mp; /* filesystem mount point */
+ xfs_dir2_data_hdr_t *hdr = bp->b_addr;
+ xfs_dir2_leaf_entry_t *blp;
+ xfs_dir2_block_tail_t *btp;
- mp = tp->t_mountp;
- block = bp->data;
- btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
- blp = XFS_DIR2_BLOCK_LEAF_P(btp);
- xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)block),
- (uint)((char *)&blp[last + 1] - (char *)block - 1));
+ btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+ xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr),
+ (uint)((char *)&blp[last + 1] - (char *)hdr - 1));
}
/*
@@ -574,17 +592,14 @@ xfs_dir2_block_log_leaf(
static void
xfs_dir2_block_log_tail(
xfs_trans_t *tp, /* transaction structure */
- xfs_dabuf_t *bp) /* block buffer */
+ struct xfs_buf *bp) /* block buffer */
{
- xfs_dir2_block_t *block; /* directory block structure */
- xfs_dir2_block_tail_t *btp; /* block tail */
- xfs_mount_t *mp; /* filesystem mount point */
+ xfs_dir2_data_hdr_t *hdr = bp->b_addr;
+ xfs_dir2_block_tail_t *btp;
- mp = tp->t_mountp;
- block = bp->data;
- btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
- xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)block),
- (uint)((char *)(btp + 1) - (char *)block - 1));
+ btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
+ xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr),
+ (uint)((char *)(btp + 1) - (char *)hdr - 1));
}
/*
@@ -595,9 +610,9 @@ int /* error */
xfs_dir2_block_lookup(
xfs_da_args_t *args) /* dir lookup arguments */
{
- xfs_dir2_block_t *block; /* block structure */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
- xfs_dabuf_t *bp; /* block buffer */
+ struct xfs_buf *bp; /* block buffer */
xfs_dir2_block_tail_t *btp; /* block tail */
xfs_dir2_data_entry_t *dep; /* block data entry */
xfs_inode_t *dp; /* incore inode */
@@ -605,7 +620,8 @@ xfs_dir2_block_lookup(
int error; /* error return value */
xfs_mount_t *mp; /* filesystem mount point */
- xfs_dir2_trace_args("block_lookup", args);
+ trace_xfs_dir2_block_lookup(args);
+
/*
* Get the buffer, look up the entry.
* If not found (ENOENT) then return, have no buffer.
@@ -614,21 +630,24 @@ xfs_dir2_block_lookup(
return error;
dp = args->dp;
mp = dp->i_mount;
- block = bp->data;
- xfs_dir2_data_check(dp, bp);
- btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
- blp = XFS_DIR2_BLOCK_LEAF_P(btp);
+ hdr = bp->b_addr;
+ xfs_dir3_data_check(dp, bp);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
/*
* Get the offset from the leaf entry, to point to the data.
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT)));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(blp[ent].address)));
/*
- * Fill in inode number, release the block.
+ * Fill in inode number, CI name if appropriate, release the block.
*/
- args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
- xfs_da_brelse(args->trans, bp);
- return XFS_ERROR(EEXIST);
+ args->inumber = be64_to_cpu(dep->inumber);
+ args->filetype = dp->d_ops->data_get_ftype(dep);
+ error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+ xfs_trans_brelse(args->trans, bp);
+ return XFS_ERROR(error);
}
/*
@@ -637,13 +656,13 @@ xfs_dir2_block_lookup(
static int /* error */
xfs_dir2_block_lookup_int(
xfs_da_args_t *args, /* dir lookup arguments */
- xfs_dabuf_t **bpp, /* returned block buffer */
+ struct xfs_buf **bpp, /* returned block buffer */
int *entno) /* returned entry number */
{
xfs_dir2_dataptr_t addr; /* data entry address */
- xfs_dir2_block_t *block; /* block structure */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
- xfs_dabuf_t *bp; /* block buffer */
+ struct xfs_buf *bp; /* block buffer */
xfs_dir2_block_tail_t *btp; /* block tail */
xfs_dir2_data_entry_t *dep; /* block data entry */
xfs_inode_t *dp; /* incore inode */
@@ -654,45 +673,43 @@ xfs_dir2_block_lookup_int(
int mid; /* binary search current idx */
xfs_mount_t *mp; /* filesystem mount point */
xfs_trans_t *tp; /* transaction pointer */
+ enum xfs_dacmp cmp; /* comparison result */
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- /*
- * Read the buffer, return error if we can't get it.
- */
- if ((error =
- xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+
+ error = xfs_dir3_block_read(tp, dp, &bp);
+ if (error)
return error;
- }
- ASSERT(bp != NULL);
- block = bp->data;
- xfs_dir2_data_check(dp, bp);
- btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
- blp = XFS_DIR2_BLOCK_LEAF_P(btp);
+
+ hdr = bp->b_addr;
+ xfs_dir3_data_check(dp, bp);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
/*
* Loop doing a binary search for our hash value.
* Find our entry, ENOENT if it's not there.
*/
- for (low = 0, high = INT_GET(btp->count, ARCH_CONVERT) - 1; ; ) {
+ for (low = 0, high = be32_to_cpu(btp->count) - 1; ; ) {
ASSERT(low <= high);
mid = (low + high) >> 1;
- if ((hash = INT_GET(blp[mid].hashval, ARCH_CONVERT)) == args->hashval)
+ if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
break;
if (hash < args->hashval)
low = mid + 1;
else
high = mid - 1;
if (low > high) {
- ASSERT(args->oknoent);
- xfs_da_brelse(tp, bp);
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ xfs_trans_brelse(tp, bp);
return XFS_ERROR(ENOENT);
}
}
/*
* Back up to the first one with the right hash value.
*/
- while (mid > 0 && INT_GET(blp[mid - 1].hashval, ARCH_CONVERT) == args->hashval) {
+ while (mid > 0 && be32_to_cpu(blp[mid - 1].hashval) == args->hashval) {
mid--;
}
/*
@@ -700,29 +717,40 @@ xfs_dir2_block_lookup_int(
* right hash value looking for our name.
*/
do {
- if ((addr = INT_GET(blp[mid].address, ARCH_CONVERT)) == XFS_DIR2_NULL_DATAPTR)
+ if ((addr = be32_to_cpu(blp[mid].address)) == XFS_DIR2_NULL_DATAPTR)
continue;
/*
* Get pointer to the entry from the leaf.
*/
dep = (xfs_dir2_data_entry_t *)
- ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr));
+ ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr));
/*
- * Compare, if it's right give back buffer & entry number.
+ * Compare name and if it's an exact match, return the index
+ * and buffer. If it's the first case-insensitive match, store
+ * the index and buffer and continue looking for an exact match.
*/
- if (dep->namelen == args->namelen &&
- dep->name[0] == args->name[0] &&
- memcmp(dep->name, args->name, args->namelen) == 0) {
+ cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ args->cmpresult = cmp;
*bpp = bp;
*entno = mid;
- return 0;
+ if (cmp == XFS_CMP_EXACT)
+ return 0;
}
- } while (++mid < INT_GET(btp->count, ARCH_CONVERT) && INT_GET(blp[mid].hashval, ARCH_CONVERT) == hash);
+ } while (++mid < be32_to_cpu(btp->count) &&
+ be32_to_cpu(blp[mid].hashval) == hash);
+
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ /*
+ * Here, we can only be doing a lookup (not a rename or replace).
+ * If a case-insensitive match was found earlier, return success.
+ */
+ if (args->cmpresult == XFS_CMP_CASE)
+ return 0;
/*
* No match, release the buffer and return ENOENT.
*/
- ASSERT(args->oknoent);
- xfs_da_brelse(tp, bp);
+ xfs_trans_brelse(tp, bp);
return XFS_ERROR(ENOENT);
}
@@ -734,9 +762,9 @@ int /* error */
xfs_dir2_block_removename(
xfs_da_args_t *args) /* directory operation args */
{
- xfs_dir2_block_t *block; /* block structure */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
xfs_dir2_leaf_entry_t *blp; /* block leaf pointer */
- xfs_dabuf_t *bp; /* block buffer */
+ struct xfs_buf *bp; /* block buffer */
xfs_dir2_block_tail_t *btp; /* block tail */
xfs_dir2_data_entry_t *dep; /* block data entry */
xfs_inode_t *dp; /* incore inode */
@@ -749,7 +777,8 @@ xfs_dir2_block_removename(
int size; /* shortform size */
xfs_trans_t *tp; /* transaction pointer */
- xfs_dir2_trace_args("block_removename", args);
+ trace_xfs_dir2_block_removename(args);
+
/*
* Look up the entry in the block. Gets the buffer and entry index.
* It will always be there, the vnodeops level does a lookup first.
@@ -760,48 +789,47 @@ xfs_dir2_block_removename(
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- block = bp->data;
- btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
- blp = XFS_DIR2_BLOCK_LEAF_P(btp);
+ hdr = bp->b_addr;
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
/*
* Point to the data entry using the leaf entry.
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT)));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(blp[ent].address)));
/*
* Mark the data entry's space free.
*/
needlog = needscan = 0;
- xfs_dir2_data_make_free(tp, bp,
- (xfs_dir2_data_aoff_t)((char *)dep - (char *)block),
- XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan);
+ xfs_dir2_data_make_free(args, bp,
+ (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
+ dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
/*
* Fix up the block tail.
*/
- INT_MOD(btp->stale, ARCH_CONVERT, +1);
+ be32_add_cpu(&btp->stale, 1);
xfs_dir2_block_log_tail(tp, bp);
/*
* Remove the leaf entry by marking it stale.
*/
- INT_SET(blp[ent].address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR);
+ blp[ent].address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
xfs_dir2_block_log_leaf(tp, bp, ent, ent);
/*
* Fix up bestfree, log the header if necessary.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
- NULL);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, bp);
- xfs_dir2_data_check(dp, bp);
+ xfs_dir2_data_log_header(args, bp);
+ xfs_dir3_data_check(dp, bp);
/*
* See if the size as a shortform is good enough.
*/
- if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) >
- XFS_IFORK_DSIZE(dp)) {
- xfs_da_buf_done(bp);
+ size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
+ if (size > XFS_IFORK_DSIZE(dp))
return 0;
- }
+
/*
* If it works, do the conversion.
*/
@@ -816,9 +844,9 @@ int /* error */
xfs_dir2_block_replace(
xfs_da_args_t *args) /* directory operation args */
{
- xfs_dir2_block_t *block; /* block structure */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
- xfs_dabuf_t *bp; /* block buffer */
+ struct xfs_buf *bp; /* block buffer */
xfs_dir2_block_tail_t *btp; /* block tail */
xfs_dir2_data_entry_t *dep; /* block data entry */
xfs_inode_t *dp; /* incore inode */
@@ -826,7 +854,8 @@ xfs_dir2_block_replace(
int error; /* error return value */
xfs_mount_t *mp; /* filesystem mount point */
- xfs_dir2_trace_args("block_replace", args);
+ trace_xfs_dir2_block_replace(args);
+
/*
* Lookup the entry in the directory. Get buffer and entry index.
* This will always succeed since the caller has already done a lookup.
@@ -836,22 +865,23 @@ xfs_dir2_block_replace(
}
dp = args->dp;
mp = dp->i_mount;
- block = bp->data;
- btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
- blp = XFS_DIR2_BLOCK_LEAF_P(btp);
+ hdr = bp->b_addr;
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
/*
* Point to the data entry we need to change.
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT)));
- ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) != args->inumber);
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(blp[ent].address)));
+ ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
/*
* Change the inode number to the new value.
*/
- INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
- xfs_dir2_data_log_entry(args->trans, bp, dep);
- xfs_dir2_data_check(dp, bp);
- xfs_da_buf_done(bp);
+ dep->inumber = cpu_to_be64(args->inumber);
+ dp->d_ops->data_put_ftype(dep, args->filetype);
+ xfs_dir2_data_log_entry(args, bp, dep);
+ xfs_dir3_data_check(dp, bp);
return 0;
}
@@ -868,8 +898,8 @@ xfs_dir2_block_sort(
la = a;
lb = b;
- return INT_GET(la->hashval, ARCH_CONVERT) < INT_GET(lb->hashval, ARCH_CONVERT) ? -1 :
- (INT_GET(la->hashval, ARCH_CONVERT) > INT_GET(lb->hashval, ARCH_CONVERT) ? 1 : 0);
+ return be32_to_cpu(la->hashval) < be32_to_cpu(lb->hashval) ? -1 :
+ (be32_to_cpu(la->hashval) > be32_to_cpu(lb->hashval) ? 1 : 0);
}
/*
@@ -878,11 +908,11 @@ xfs_dir2_block_sort(
int /* error */
xfs_dir2_leaf_to_block(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t *lbp, /* leaf buffer */
- xfs_dabuf_t *dbp) /* data buffer */
+ struct xfs_buf *lbp, /* leaf buffer */
+ struct xfs_buf *dbp) /* data buffer */
{
- xfs_dir2_data_off_t *bestsp; /* leaf bests table */
- xfs_dir2_block_t *block; /* block structure */
+ __be16 *bestsp; /* leaf bests table */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
xfs_dir2_block_tail_t *btp; /* block tail */
xfs_inode_t *dp; /* incore directory inode */
xfs_dir2_data_unused_t *dup; /* unused data entry */
@@ -896,123 +926,125 @@ xfs_dir2_leaf_to_block(
int needscan; /* need to scan for bestfree */
xfs_dir2_sf_hdr_t sfh; /* shortform header */
int size; /* bytes used */
- xfs_dir2_data_off_t *tagp; /* end of entry (tag) */
+ __be16 *tagp; /* end of entry (tag) */
int to; /* block/leaf to index */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ trace_xfs_dir2_leaf_to_block(args);
- xfs_dir2_trace_args_bb("leaf_to_block", args, lbp, dbp);
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- leaf = lbp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
- ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+ leaf = lbp->b_addr;
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
/*
* If there are data blocks other than the first one, take this
* opportunity to remove trailing empty data blocks that may have
* been left behind during no-space-reservation operations.
* These will show up in the leaf bests table.
*/
- while (dp->i_d.di_size > mp->m_dirblksize) {
- bestsp = XFS_DIR2_LEAF_BESTS_P(ltp);
- if (INT_GET(bestsp[INT_GET(ltp->bestcount, ARCH_CONVERT) - 1], ARCH_CONVERT) ==
- mp->m_dirblksize - (uint)sizeof(block->hdr)) {
+ while (dp->i_d.di_size > args->geo->blksize) {
+ int hdrsz;
+
+ hdrsz = dp->d_ops->data_entry_offset;
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
+ args->geo->blksize - hdrsz) {
if ((error =
xfs_dir2_leaf_trim_data(args, lbp,
- (xfs_dir2_db_t)(INT_GET(ltp->bestcount, ARCH_CONVERT) - 1))))
- goto out;
- } else {
- error = 0;
- goto out;
- }
+ (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1))))
+ return error;
+ } else
+ return 0;
}
/*
* Read the data block if we don't already have it, give up if it fails.
*/
- if (dbp == NULL &&
- (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp,
- XFS_DATA_FORK))) {
- goto out;
+ if (!dbp) {
+ error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp);
+ if (error)
+ return error;
}
- block = dbp->data;
- ASSERT(INT_GET(block->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC);
+ hdr = dbp->b_addr;
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+
/*
* Size of the "leaf" area in the block.
*/
- size = (uint)sizeof(block->tail) +
- (uint)sizeof(*lep) * (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT));
+ size = (uint)sizeof(xfs_dir2_block_tail_t) +
+ (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale);
/*
* Look at the last data entry.
*/
- tagp = (xfs_dir2_data_off_t *)((char *)block + mp->m_dirblksize) - 1;
- dup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT));
+ tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1;
+ dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
/*
* If it's not free or is too short we can't do it.
*/
- if (INT_GET(dup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG || INT_GET(dup->length, ARCH_CONVERT) < size) {
- error = 0;
- goto out;
- }
+ if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG ||
+ be16_to_cpu(dup->length) < size)
+ return 0;
+
/*
* Start converting it to block form.
*/
- INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_BLOCK_MAGIC);
+ xfs_dir3_block_init(mp, tp, dbp, dp);
+
needlog = 1;
needscan = 0;
/*
* Use up the space at the end of the block (blp/btp).
*/
- xfs_dir2_data_use_free(tp, dbp, dup, mp->m_dirblksize - size, size,
+ xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size,
&needlog, &needscan);
/*
* Initialize the block tail.
*/
- btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
- INT_SET(btp->count, ARCH_CONVERT, INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT));
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale);
btp->stale = 0;
xfs_dir2_block_log_tail(tp, dbp);
/*
* Initialize the block leaf area. We compact out stale entries.
*/
- lep = XFS_DIR2_BLOCK_LEAF_P(btp);
- for (from = to = 0; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) {
- if (INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+ lep = xfs_dir2_block_leaf_p(btp);
+ for (from = to = 0; from < leafhdr.count; from++) {
+ if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
continue;
- lep[to++] = leaf->ents[from];
+ lep[to++] = ents[from];
}
- ASSERT(to == INT_GET(btp->count, ARCH_CONVERT));
- xfs_dir2_block_log_leaf(tp, dbp, 0, INT_GET(btp->count, ARCH_CONVERT) - 1);
+ ASSERT(to == be32_to_cpu(btp->count));
+ xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1);
/*
* Scan the bestfree if we need it and log the data block header.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
- NULL);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, dbp);
+ xfs_dir2_data_log_header(args, dbp);
/*
* Pitch the old leaf block.
*/
- error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp);
- lbp = NULL;
- if (error) {
- goto out;
- }
+ error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp);
+ if (error)
+ return error;
+
/*
* Now see if the resulting block can be shrunken to shortform.
*/
- if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) >
- XFS_IFORK_DSIZE(dp)) {
- error = 0;
- goto out;
- }
+ size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
+ if (size > XFS_IFORK_DSIZE(dp))
+ return 0;
+
return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
-out:
- if (lbp)
- xfs_da_buf_done(lbp);
- if (dbp)
- xfs_da_buf_done(dbp);
- return error;
}
/*
@@ -1023,12 +1055,10 @@ xfs_dir2_sf_to_block(
xfs_da_args_t *args) /* operation arguments */
{
xfs_dir2_db_t blkno; /* dir-relative block # (0) */
- xfs_dir2_block_t *block; /* block structure */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
- xfs_dabuf_t *bp; /* block buffer */
+ struct xfs_buf *bp; /* block buffer */
xfs_dir2_block_tail_t *btp; /* block tail pointer */
- char *buf; /* sf buffer */
- int buf_len;
xfs_dir2_data_entry_t *dep; /* data entry pointer */
xfs_inode_t *dp; /* incore directory inode */
int dummy; /* trash */
@@ -1042,15 +1072,20 @@ xfs_dir2_sf_to_block(
int newoffset; /* offset from current entry */
int offset; /* target block offset */
xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */
- xfs_dir2_sf_t *sfp; /* shortform structure */
- xfs_dir2_data_off_t *tagp; /* end of data entry */
+ xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform header */
+ __be16 *tagp; /* end of data entry */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_name name;
+ struct xfs_ifork *ifp;
+
+ trace_xfs_dir2_sf_to_block(args);
- xfs_dir2_trace_args("sf_to_block", args);
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+ ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+ ASSERT(ifp->if_flags & XFS_IFINLINE);
/*
* Bomb out if the shortform directory is way too short.
*/
@@ -1058,106 +1093,109 @@ xfs_dir2_sf_to_block(
ASSERT(XFS_FORCED_SHUTDOWN(mp));
return XFS_ERROR(EIO);
}
- ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
- ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
+
+ oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
+
+ ASSERT(ifp->if_bytes == dp->i_d.di_size);
+ ASSERT(ifp->if_u1.if_data != NULL);
+ ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
+ ASSERT(dp->i_d.di_nextents == 0);
+
/*
- * Copy the directory into the stack buffer.
+ * Copy the directory into a temporary buffer.
* Then pitch the incore inode data so we can make extents.
*/
+ sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
+ memcpy(sfp, oldsfp, ifp->if_bytes);
- buf_len = dp->i_df.if_bytes;
- buf = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP);
-
- memcpy(buf, sfp, dp->i_df.if_bytes);
- xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK);
+ xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
+ xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
dp->i_d.di_size = 0;
- xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
- /*
- * Reset pointer - old sfp is gone.
- */
- sfp = (xfs_dir2_sf_t *)buf;
+
/*
* Add block 0 to the inode.
*/
error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
if (error) {
- kmem_free(buf, buf_len);
+ kmem_free(sfp);
return error;
}
/*
- * Initialize the data block.
+ * Initialize the data block, then convert it to block format.
*/
- error = xfs_dir2_data_init(args, blkno, &bp);
+ error = xfs_dir3_data_init(args, blkno, &bp);
if (error) {
- kmem_free(buf, buf_len);
+ kmem_free(sfp);
return error;
}
- block = bp->data;
- INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_BLOCK_MAGIC);
+ xfs_dir3_block_init(mp, tp, bp, dp);
+ hdr = bp->b_addr;
+
/*
* Compute size of block "tail" area.
*/
i = (uint)sizeof(*btp) +
- (INT_GET(sfp->hdr.count, ARCH_CONVERT) + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t);
+ (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t);
/*
* The whole thing is initialized to free by the init routine.
* Say we're using the leaf and tail area.
*/
- dup = (xfs_dir2_data_unused_t *)block->u;
+ dup = dp->d_ops->data_unused_p(hdr);
needlog = needscan = 0;
- xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog,
- &needscan);
+ xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
+ i, &needlog, &needscan);
ASSERT(needscan == 0);
/*
* Fill in the tail.
*/
- btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
- INT_SET(btp->count, ARCH_CONVERT, INT_GET(sfp->hdr.count, ARCH_CONVERT) + 2); /* ., .. */
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ btp->count = cpu_to_be32(sfp->count + 2); /* ., .. */
btp->stale = 0;
- blp = XFS_DIR2_BLOCK_LEAF_P(btp);
- endoffset = (uint)((char *)blp - (char *)block);
+ blp = xfs_dir2_block_leaf_p(btp);
+ endoffset = (uint)((char *)blp - (char *)hdr);
/*
* Remove the freespace, we'll manage it.
*/
- xfs_dir2_data_use_free(tp, bp, dup,
- (xfs_dir2_data_aoff_t)((char *)dup - (char *)block),
- INT_GET(dup->length, ARCH_CONVERT), &needlog, &needscan);
+ xfs_dir2_data_use_free(args, bp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+ be16_to_cpu(dup->length), &needlog, &needscan);
/*
* Create entry for .
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)block + XFS_DIR2_DATA_DOT_OFFSET);
- INT_SET(dep->inumber, ARCH_CONVERT, dp->i_ino);
+ dep = dp->d_ops->data_dot_entry_p(hdr);
+ dep->inumber = cpu_to_be64(dp->i_ino);
dep->namelen = 1;
dep->name[0] = '.';
- tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
- INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
- xfs_dir2_data_log_entry(tp, bp, dep);
- INT_SET(blp[0].hashval, ARCH_CONVERT, xfs_dir_hash_dot);
- INT_SET(blp[0].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block));
+ dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
+ tagp = dp->d_ops->data_entry_tag_p(dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+ xfs_dir2_data_log_entry(args, bp, dep);
+ blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
+ blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+ (char *)dep - (char *)hdr));
/*
* Create entry for ..
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)block + XFS_DIR2_DATA_DOTDOT_OFFSET);
- INT_SET(dep->inumber, ARCH_CONVERT, XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent));
+ dep = dp->d_ops->data_dotdot_entry_p(hdr);
+ dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp));
dep->namelen = 2;
dep->name[0] = dep->name[1] = '.';
- tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
- INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
- xfs_dir2_data_log_entry(tp, bp, dep);
- INT_SET(blp[1].hashval, ARCH_CONVERT, xfs_dir_hash_dotdot);
- INT_SET(blp[1].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block));
- offset = XFS_DIR2_DATA_FIRST_OFFSET;
+ dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
+ tagp = dp->d_ops->data_entry_tag_p(dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+ xfs_dir2_data_log_entry(args, bp, dep);
+ blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
+ blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+ (char *)dep - (char *)hdr));
+ offset = dp->d_ops->data_first_offset;
/*
* Loop over existing entries, stuff them in.
*/
- if ((i = 0) == INT_GET(sfp->hdr.count, ARCH_CONVERT))
+ i = 0;
+ if (!sfp->count)
sfep = NULL;
else
- sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+ sfep = xfs_dir2_sf_firstentry(sfp);
/*
* Need to preserve the existing offset values in the sf directory.
* Insert holes (unused entries) where necessary.
@@ -1169,58 +1207,59 @@ xfs_dir2_sf_to_block(
if (sfep == NULL)
newoffset = endoffset;
else
- newoffset = XFS_DIR2_SF_GET_OFFSET(sfep);
+ newoffset = xfs_dir2_sf_get_offset(sfep);
/*
* There should be a hole here, make one.
*/
if (offset < newoffset) {
- dup = (xfs_dir2_data_unused_t *)
- ((char *)block + offset);
- INT_SET(dup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
- INT_SET(dup->length, ARCH_CONVERT, newoffset - offset);
- INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT,
- (xfs_dir2_data_off_t)
- ((char *)dup - (char *)block));
- xfs_dir2_data_log_unused(tp, bp, dup);
- (void)xfs_dir2_data_freeinsert((xfs_dir2_data_t *)block,
- dup, &dummy);
- offset += INT_GET(dup->length, ARCH_CONVERT);
+ dup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+ dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ dup->length = cpu_to_be16(newoffset - offset);
+ *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(
+ ((char *)dup - (char *)hdr));
+ xfs_dir2_data_log_unused(args, bp, dup);
+ xfs_dir2_data_freeinsert(hdr,
+ dp->d_ops->data_bestfree_p(hdr),
+ dup, &dummy);
+ offset += be16_to_cpu(dup->length);
continue;
}
/*
* Copy a real entry.
*/
- dep = (xfs_dir2_data_entry_t *)((char *)block + newoffset);
- INT_SET(dep->inumber, ARCH_CONVERT, XFS_DIR2_SF_GET_INUMBER(sfp,
- XFS_DIR2_SF_INUMBERP(sfep)));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
+ dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep));
dep->namelen = sfep->namelen;
+ dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep));
memcpy(dep->name, sfep->name, dep->namelen);
- tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
- INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
- xfs_dir2_data_log_entry(tp, bp, dep);
- INT_SET(blp[2 + i].hashval, ARCH_CONVERT, xfs_da_hashname((char *)sfep->name, sfep->namelen));
- INT_SET(blp[2 + i].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp,
- (char *)dep - (char *)block));
- offset = (int)((char *)(tagp + 1) - (char *)block);
- if (++i == INT_GET(sfp->hdr.count, ARCH_CONVERT))
+ tagp = dp->d_ops->data_entry_tag_p(dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+ xfs_dir2_data_log_entry(args, bp, dep);
+ name.name = sfep->name;
+ name.len = sfep->namelen;
+ blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops->
+ hashname(&name));
+ blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+ (char *)dep - (char *)hdr));
+ offset = (int)((char *)(tagp + 1) - (char *)hdr);
+ if (++i == sfp->count)
sfep = NULL;
else
- sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
+ sfep = dp->d_ops->sf_nextentry(sfp, sfep);
}
/* Done with the temporary buffer */
- kmem_free(buf, buf_len);
+ kmem_free(sfp);
/*
* Sort the leaf entries by hash value.
*/
- xfs_sort(blp, INT_GET(btp->count, ARCH_CONVERT), sizeof(*blp), xfs_dir2_block_sort);
+ xfs_sort(blp, be32_to_cpu(btp->count), sizeof(*blp), xfs_dir2_block_sort);
/*
* Log the leaf entry area and tail.
* Already logged the header in data_init, ignore needlog.
*/
ASSERT(needscan == 0);
- xfs_dir2_block_log_leaf(tp, bp, 0, INT_GET(btp->count, ARCH_CONVERT) - 1);
+ xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1);
xfs_dir2_block_log_tail(tp, bp);
- xfs_dir2_data_check(dp, bp);
- xfs_da_buf_done(bp);
+ xfs_dir3_data_check(dp, bp);
return 0;
}
diff --git a/fs/xfs/xfs_dir2_block.h b/fs/xfs/xfs_dir2_block.h
deleted file mode 100644
index a2e5cb98a83..00000000000
--- a/fs/xfs/xfs_dir2_block.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_DIR2_BLOCK_H__
-#define __XFS_DIR2_BLOCK_H__
-
-/*
- * xfs_dir2_block.h
- * Directory version 2, single block format structures
- */
-
-struct uio;
-struct xfs_dabuf;
-struct xfs_da_args;
-struct xfs_dir2_data_hdr;
-struct xfs_dir2_leaf_entry;
-struct xfs_inode;
-struct xfs_mount;
-struct xfs_trans;
-
-/*
- * The single block format is as follows:
- * xfs_dir2_data_hdr_t structure
- * xfs_dir2_data_entry_t and xfs_dir2_data_unused_t structures
- * xfs_dir2_leaf_entry_t structures
- * xfs_dir2_block_tail_t structure
- */
-
-#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: for one block dirs */
-
-typedef struct xfs_dir2_block_tail {
- __uint32_t count; /* count of leaf entries */
- __uint32_t stale; /* count of stale lf entries */
-} xfs_dir2_block_tail_t;
-
-/*
- * Generic single-block structure, for xfs_db.
- */
-typedef struct xfs_dir2_block {
- xfs_dir2_data_hdr_t hdr; /* magic XFS_DIR2_BLOCK_MAGIC */
- xfs_dir2_data_union_t u[1];
- xfs_dir2_leaf_entry_t leaf[1];
- xfs_dir2_block_tail_t tail;
-} xfs_dir2_block_t;
-
-/*
- * Pointer to the leaf header embedded in a data block (1-block format)
- */
-#define XFS_DIR2_BLOCK_TAIL_P(mp,block) xfs_dir2_block_tail_p(mp,block)
-static inline xfs_dir2_block_tail_t *
-xfs_dir2_block_tail_p(struct xfs_mount *mp, xfs_dir2_block_t *block)
-{
- return (((xfs_dir2_block_tail_t *)
- ((char *)(block) + (mp)->m_dirblksize)) - 1);
-}
-
-/*
- * Pointer to the leaf entries embedded in a data block (1-block format)
- */
-#define XFS_DIR2_BLOCK_LEAF_P(btp) xfs_dir2_block_leaf_p(btp)
-static inline struct xfs_dir2_leaf_entry *
-xfs_dir2_block_leaf_p(xfs_dir2_block_tail_t *btp)
-{
- return (((struct xfs_dir2_leaf_entry *)
- (btp)) - INT_GET((btp)->count, ARCH_CONVERT));
-}
-
-/*
- * Function declarations.
- */
-extern int xfs_dir2_block_addname(struct xfs_da_args *args);
-extern int xfs_dir2_block_getdents(struct xfs_trans *tp, struct xfs_inode *dp,
- struct uio *uio, int *eofp,
- struct xfs_dirent *dbp, xfs_dir2_put_t put);
-extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
-extern int xfs_dir2_block_removename(struct xfs_da_args *args);
-extern int xfs_dir2_block_replace(struct xfs_da_args *args);
-extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
- struct xfs_dabuf *lbp, struct xfs_dabuf *dbp);
-extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
-
-#endif /* __XFS_DIR2_BLOCK_H__ */
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 5b7c47e2f14..8c2f6422648 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -17,44 +18,37 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
+#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_dir_leaf.h"
-#include "xfs_dir2_data.h"
-#include "xfs_dir2_leaf.h"
-#include "xfs_dir2_block.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
-#ifdef DEBUG
/*
* Check the consistency of the data block.
* The input can also be a block-format directory.
- * Pop an assert if we find anything bad.
+ * Return 0 is the buffer is good, otherwise an error.
*/
-void
-xfs_dir2_data_check(
- xfs_inode_t *dp, /* incore inode pointer */
- xfs_dabuf_t *bp) /* data block's buffer */
+int
+__xfs_dir3_data_check(
+ struct xfs_inode *dp, /* incore inode pointer */
+ struct xfs_buf *bp) /* data block's buffer */
{
xfs_dir2_dataptr_t addr; /* addr for leaf lookup */
xfs_dir2_data_free_t *bf; /* bestfree table */
xfs_dir2_block_tail_t *btp=NULL; /* block tail */
int count; /* count of entries found */
- xfs_dir2_data_t *d; /* data block pointer */
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_data_entry_t *dep; /* data entry */
xfs_dir2_data_free_t *dfp; /* bestfree entry */
xfs_dir2_data_unused_t *dup; /* unused entry */
@@ -67,37 +61,70 @@ xfs_dir2_data_check(
xfs_mount_t *mp; /* filesystem mount point */
char *p; /* current data position */
int stale; /* count of stale leaves */
+ struct xfs_name name;
+ const struct xfs_dir_ops *ops;
+ struct xfs_da_geometry *geo;
- mp = dp->i_mount;
- d = bp->data;
- ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
- INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
- bf = d->hdr.bestfree;
- p = (char *)d->u;
- if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
- btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d);
- lep = XFS_DIR2_BLOCK_LEAF_P(btp);
+ mp = bp->b_target->bt_mount;
+ geo = mp->m_dir_geo;
+
+ /*
+ * We can be passed a null dp here from a verifier, so we need to go the
+ * hard way to get them.
+ */
+ ops = xfs_dir_get_ops(mp, dp);
+
+ hdr = bp->b_addr;
+ p = (char *)ops->data_entry_p(hdr);
+
+ switch (hdr->magic) {
+ case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+ case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+ btp = xfs_dir2_block_tail_p(geo, hdr);
+ lep = xfs_dir2_block_leaf_p(btp);
endp = (char *)lep;
- } else
- endp = (char *)d + mp->m_dirblksize;
- count = lastfree = freeseen = 0;
+
+ /*
+ * The number of leaf entries is limited by the size of the
+ * block and the amount of space used by the data entries.
+ * We don't know how much space is used by the data entries yet,
+ * so just ensure that the count falls somewhere inside the
+ * block right now.
+ */
+ XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) <
+ ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry));
+ break;
+ case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+ case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+ endp = (char *)hdr + geo->blksize;
+ break;
+ default:
+ XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
+ return EFSCORRUPTED;
+ }
+
/*
* Account for zero bestfree entries.
*/
+ bf = ops->data_bestfree_p(hdr);
+ count = lastfree = freeseen = 0;
if (!bf[0].length) {
- ASSERT(!bf[0].offset);
+ XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
freeseen |= 1 << 0;
}
if (!bf[1].length) {
- ASSERT(!bf[1].offset);
+ XFS_WANT_CORRUPTED_RETURN(!bf[1].offset);
freeseen |= 1 << 1;
}
if (!bf[2].length) {
- ASSERT(!bf[2].offset);
+ XFS_WANT_CORRUPTED_RETURN(!bf[2].offset);
freeseen |= 1 << 2;
}
- ASSERT(INT_GET(bf[0].length, ARCH_CONVERT) >= INT_GET(bf[1].length, ARCH_CONVERT));
- ASSERT(INT_GET(bf[1].length, ARCH_CONVERT) >= INT_GET(bf[2].length, ARCH_CONVERT));
+
+ XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >=
+ be16_to_cpu(bf[1].length));
+ XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >=
+ be16_to_cpu(bf[2].length));
/*
* Loop over the data/unused entries.
*/
@@ -108,18 +135,23 @@ xfs_dir2_data_check(
* If we find it, account for that, else make sure it
* doesn't need to be there.
*/
- if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
- ASSERT(lastfree == 0);
- ASSERT(INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT) ==
- (char *)dup - (char *)d);
- dfp = xfs_dir2_data_freefind(d, dup);
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ XFS_WANT_CORRUPTED_RETURN(lastfree == 0);
+ XFS_WANT_CORRUPTED_RETURN(
+ be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
+ (char *)dup - (char *)hdr);
+ dfp = xfs_dir2_data_freefind(hdr, bf, dup);
if (dfp) {
i = (int)(dfp - bf);
- ASSERT((freeseen & (1 << i)) == 0);
+ XFS_WANT_CORRUPTED_RETURN(
+ (freeseen & (1 << i)) == 0);
freeseen |= 1 << i;
- } else
- ASSERT(INT_GET(dup->length, ARCH_CONVERT) <= INT_GET(bf[2].length, ARCH_CONVERT));
- p += INT_GET(dup->length, ARCH_CONVERT);
+ } else {
+ XFS_WANT_CORRUPTED_RETURN(
+ be16_to_cpu(dup->length) <=
+ be16_to_cpu(bf[2].length));
+ }
+ p += be16_to_cpu(dup->length);
lastfree = 1;
continue;
}
@@ -130,42 +162,182 @@ xfs_dir2_data_check(
* The linear search is crude but this is DEBUG code.
*/
dep = (xfs_dir2_data_entry_t *)p;
- ASSERT(dep->namelen != 0);
- ASSERT(xfs_dir_ino_validate(mp, INT_GET(dep->inumber, ARCH_CONVERT)) == 0);
- ASSERT(INT_GET(*XFS_DIR2_DATA_ENTRY_TAG_P(dep), ARCH_CONVERT) ==
- (char *)dep - (char *)d);
+ XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0);
+ XFS_WANT_CORRUPTED_RETURN(
+ !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
+ XFS_WANT_CORRUPTED_RETURN(
+ be16_to_cpu(*ops->data_entry_tag_p(dep)) ==
+ (char *)dep - (char *)hdr);
+ XFS_WANT_CORRUPTED_RETURN(
+ ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX);
count++;
lastfree = 0;
- if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
- addr = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
- (xfs_dir2_data_aoff_t)
- ((char *)dep - (char *)d));
- hash = xfs_da_hashname((char *)dep->name, dep->namelen);
- for (i = 0; i < INT_GET(btp->count, ARCH_CONVERT); i++) {
- if (INT_GET(lep[i].address, ARCH_CONVERT) == addr &&
- INT_GET(lep[i].hashval, ARCH_CONVERT) == hash)
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+ addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ (xfs_dir2_data_aoff_t)
+ ((char *)dep - (char *)hdr));
+ name.name = dep->name;
+ name.len = dep->namelen;
+ hash = mp->m_dirnameops->hashname(&name);
+ for (i = 0; i < be32_to_cpu(btp->count); i++) {
+ if (be32_to_cpu(lep[i].address) == addr &&
+ be32_to_cpu(lep[i].hashval) == hash)
break;
}
- ASSERT(i < INT_GET(btp->count, ARCH_CONVERT));
+ XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
}
- p += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
+ p += ops->data_entsize(dep->namelen);
}
/*
* Need to have seen all the entries and all the bestfree slots.
*/
- ASSERT(freeseen == 7);
- if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
- for (i = stale = 0; i < INT_GET(btp->count, ARCH_CONVERT); i++) {
- if (INT_GET(lep[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+ XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+ for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
+ if (lep[i].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
stale++;
if (i > 0)
- ASSERT(INT_GET(lep[i].hashval, ARCH_CONVERT) >= INT_GET(lep[i - 1].hashval, ARCH_CONVERT));
+ XFS_WANT_CORRUPTED_RETURN(
+ be32_to_cpu(lep[i].hashval) >=
+ be32_to_cpu(lep[i - 1].hashval));
}
- ASSERT(count == INT_GET(btp->count, ARCH_CONVERT) - INT_GET(btp->stale, ARCH_CONVERT));
- ASSERT(stale == INT_GET(btp->stale, ARCH_CONVERT));
+ XFS_WANT_CORRUPTED_RETURN(count ==
+ be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
+ XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale));
+ }
+ return 0;
+}
+
+static bool
+xfs_dir3_data_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
+ return false;
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ return false;
+ } else {
+ if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
+ return false;
}
+ if (__xfs_dir3_data_check(NULL, bp))
+ return false;
+ return true;
+}
+
+/*
+ * Readahead of the first block of the directory when it is opened is completely
+ * oblivious to the format of the directory. Hence we can either get a block
+ * format buffer or a data format buffer on readahead.
+ */
+static void
+xfs_dir3_data_reada_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+ switch (hdr->magic) {
+ case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+ case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+ bp->b_ops = &xfs_dir3_block_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+ case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+ xfs_dir3_data_verify(bp);
+ return;
+ default:
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ break;
+ }
+}
+
+static void
+xfs_dir3_data_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_dir3_data_verify(bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
+}
+
+static void
+xfs_dir3_data_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_dir3_data_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
+ .verify_read = xfs_dir3_data_read_verify,
+ .verify_write = xfs_dir3_data_write_verify,
+};
+
+static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
+ .verify_read = xfs_dir3_data_reada_verify,
+ .verify_write = xfs_dir3_data_write_verify,
+};
+
+
+int
+xfs_dir3_data_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mapped_bno,
+ struct xfs_buf **bpp)
+{
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
+ XFS_DATA_FORK, &xfs_dir3_data_buf_ops);
+ if (!err && tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
+ return err;
+}
+
+int
+xfs_dir3_data_readahead(
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mapped_bno)
+{
+ return xfs_da_reada_buf(dp, bno, mapped_bno,
+ XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops);
}
-#endif
/*
* Given a data block and an unused entry from that block,
@@ -173,27 +345,31 @@ xfs_dir2_data_check(
*/
xfs_dir2_data_free_t *
xfs_dir2_data_freefind(
- xfs_dir2_data_t *d, /* data block */
- xfs_dir2_data_unused_t *dup) /* data unused entry */
+ struct xfs_dir2_data_hdr *hdr, /* data block header */
+ struct xfs_dir2_data_free *bf, /* bestfree table pointer */
+ struct xfs_dir2_data_unused *dup) /* unused space */
{
xfs_dir2_data_free_t *dfp; /* bestfree entry */
xfs_dir2_data_aoff_t off; /* offset value needed */
-#if defined(DEBUG) && defined(__KERNEL__)
+#ifdef DEBUG
int matched; /* matched the value */
int seenzero; /* saw a 0 bestfree entry */
#endif
- off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)d);
-#if defined(DEBUG) && defined(__KERNEL__)
+ off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
+
+#ifdef DEBUG
/*
* Validate some consistency in the bestfree table.
* Check order, non-overlapping entries, and if we find the
* one we're looking for it has to be exact.
*/
- ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
- INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
- for (dfp = &d->hdr.bestfree[0], seenzero = matched = 0;
- dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT];
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+ for (dfp = &bf[0], seenzero = matched = 0;
+ dfp < &bf[XFS_DIR2_DATA_FD_COUNT];
dfp++) {
if (!dfp->offset) {
ASSERT(!dfp->length);
@@ -201,33 +377,32 @@ xfs_dir2_data_freefind(
continue;
}
ASSERT(seenzero == 0);
- if (INT_GET(dfp->offset, ARCH_CONVERT) == off) {
+ if (be16_to_cpu(dfp->offset) == off) {
matched = 1;
- ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(dup->length, ARCH_CONVERT));
- } else if (off < INT_GET(dfp->offset, ARCH_CONVERT))
- ASSERT(off + INT_GET(dup->length, ARCH_CONVERT) <= INT_GET(dfp->offset, ARCH_CONVERT));
+ ASSERT(dfp->length == dup->length);
+ } else if (off < be16_to_cpu(dfp->offset))
+ ASSERT(off + be16_to_cpu(dup->length) <= be16_to_cpu(dfp->offset));
else
- ASSERT(INT_GET(dfp->offset, ARCH_CONVERT) + INT_GET(dfp->length, ARCH_CONVERT) <= off);
- ASSERT(matched || INT_GET(dfp->length, ARCH_CONVERT) >= INT_GET(dup->length, ARCH_CONVERT));
- if (dfp > &d->hdr.bestfree[0])
- ASSERT(INT_GET(dfp[-1].length, ARCH_CONVERT) >= INT_GET(dfp[0].length, ARCH_CONVERT));
+ ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off);
+ ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length));
+ if (dfp > &bf[0])
+ ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length));
}
#endif
/*
* If this is smaller than the smallest bestfree entry,
* it can't be there since they're sorted.
*/
- if (INT_GET(dup->length, ARCH_CONVERT) < INT_GET(d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length, ARCH_CONVERT))
+ if (be16_to_cpu(dup->length) <
+ be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length))
return NULL;
/*
* Look at the three bestfree entries for our guy.
*/
- for (dfp = &d->hdr.bestfree[0];
- dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT];
- dfp++) {
+ for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
if (!dfp->offset)
return NULL;
- if (INT_GET(dfp->offset, ARCH_CONVERT) == off)
+ if (be16_to_cpu(dfp->offset) == off)
return dfp;
}
/*
@@ -241,37 +416,38 @@ xfs_dir2_data_freefind(
*/
xfs_dir2_data_free_t * /* entry inserted */
xfs_dir2_data_freeinsert(
- xfs_dir2_data_t *d, /* data block pointer */
- xfs_dir2_data_unused_t *dup, /* unused space */
+ struct xfs_dir2_data_hdr *hdr, /* data block pointer */
+ struct xfs_dir2_data_free *dfp, /* bestfree table pointer */
+ struct xfs_dir2_data_unused *dup, /* unused space */
int *loghead) /* log the data header (out) */
{
- xfs_dir2_data_free_t *dfp; /* bestfree table pointer */
xfs_dir2_data_free_t new; /* new bestfree entry */
-#ifdef __KERNEL__
- ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
- INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
-#endif
- dfp = d->hdr.bestfree;
- INT_COPY(new.length, dup->length, ARCH_CONVERT);
- INT_SET(new.offset, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dup - (char *)d));
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+ new.length = dup->length;
+ new.offset = cpu_to_be16((char *)dup - (char *)hdr);
+
/*
* Insert at position 0, 1, or 2; or not at all.
*/
- if (INT_GET(new.length, ARCH_CONVERT) > INT_GET(dfp[0].length, ARCH_CONVERT)) {
+ if (be16_to_cpu(new.length) > be16_to_cpu(dfp[0].length)) {
dfp[2] = dfp[1];
dfp[1] = dfp[0];
dfp[0] = new;
*loghead = 1;
return &dfp[0];
}
- if (INT_GET(new.length, ARCH_CONVERT) > INT_GET(dfp[1].length, ARCH_CONVERT)) {
+ if (be16_to_cpu(new.length) > be16_to_cpu(dfp[1].length)) {
dfp[2] = dfp[1];
dfp[1] = new;
*loghead = 1;
return &dfp[1];
}
- if (INT_GET(new.length, ARCH_CONVERT) > INT_GET(dfp[2].length, ARCH_CONVERT)) {
+ if (be16_to_cpu(new.length) > be16_to_cpu(dfp[2].length)) {
dfp[2] = new;
*loghead = 1;
return &dfp[2];
@@ -284,36 +460,39 @@ xfs_dir2_data_freeinsert(
*/
STATIC void
xfs_dir2_data_freeremove(
- xfs_dir2_data_t *d, /* data block pointer */
- xfs_dir2_data_free_t *dfp, /* bestfree entry pointer */
+ struct xfs_dir2_data_hdr *hdr, /* data block header */
+ struct xfs_dir2_data_free *bf, /* bestfree table pointer */
+ struct xfs_dir2_data_free *dfp, /* bestfree entry pointer */
int *loghead) /* out: log data header */
{
-#ifdef __KERNEL__
- ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
- INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
-#endif
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
/*
* It's the first entry, slide the next 2 up.
*/
- if (dfp == &d->hdr.bestfree[0]) {
- d->hdr.bestfree[0] = d->hdr.bestfree[1];
- d->hdr.bestfree[1] = d->hdr.bestfree[2];
+ if (dfp == &bf[0]) {
+ bf[0] = bf[1];
+ bf[1] = bf[2];
}
/*
* It's the second entry, slide the 3rd entry up.
*/
- else if (dfp == &d->hdr.bestfree[1])
- d->hdr.bestfree[1] = d->hdr.bestfree[2];
+ else if (dfp == &bf[1])
+ bf[1] = bf[2];
/*
* Must be the last entry.
*/
else
- ASSERT(dfp == &d->hdr.bestfree[2]);
+ ASSERT(dfp == &bf[2]);
/*
* Clear the 3rd entry, must be zero now.
*/
- d->hdr.bestfree[2].length = 0;
- d->hdr.bestfree[2].offset = 0;
+ bf[2].length = 0;
+ bf[2].offset = 0;
*loghead = 1;
}
@@ -322,37 +501,39 @@ xfs_dir2_data_freeremove(
*/
void
xfs_dir2_data_freescan(
- xfs_mount_t *mp, /* filesystem mount point */
- xfs_dir2_data_t *d, /* data block pointer */
- int *loghead, /* out: log data header */
- char *aendp) /* in: caller's endp */
+ struct xfs_inode *dp,
+ struct xfs_dir2_data_hdr *hdr,
+ int *loghead)
{
xfs_dir2_block_tail_t *btp; /* block tail */
xfs_dir2_data_entry_t *dep; /* active data entry */
xfs_dir2_data_unused_t *dup; /* unused data entry */
+ struct xfs_dir2_data_free *bf;
char *endp; /* end of block's data */
char *p; /* current entry pointer */
+ struct xfs_da_geometry *geo = dp->i_mount->m_dir_geo;
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
-#ifdef __KERNEL__
- ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
- INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
-#endif
/*
* Start by clearing the table.
*/
- memset(d->hdr.bestfree, 0, sizeof(d->hdr.bestfree));
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
*loghead = 1;
/*
* Set up pointers.
*/
- p = (char *)d->u;
- if (aendp)
- endp = aendp;
- else if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
- btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d);
- endp = (char *)XFS_DIR2_BLOCK_LEAF_P(btp);
+ p = (char *)dp->d_ops->data_entry_p(hdr);
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+ btp = xfs_dir2_block_tail_p(geo, hdr);
+ endp = (char *)xfs_dir2_block_leaf_p(btp);
} else
- endp = (char *)d + mp->m_dirblksize;
+ endp = (char *)hdr + geo->blksize;
/*
* Loop over the block's entries.
*/
@@ -361,20 +542,20 @@ xfs_dir2_data_freescan(
/*
* If it's a free entry, insert it.
*/
- if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
- ASSERT((char *)dup - (char *)d ==
- INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT));
- xfs_dir2_data_freeinsert(d, dup, loghead);
- p += INT_GET(dup->length, ARCH_CONVERT);
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ ASSERT((char *)dup - (char *)hdr ==
+ be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
+ xfs_dir2_data_freeinsert(hdr, bf, dup, loghead);
+ p += be16_to_cpu(dup->length);
}
/*
* For active entries, check their tags and skip them.
*/
else {
dep = (xfs_dir2_data_entry_t *)p;
- ASSERT((char *)dep - (char *)d ==
- INT_GET(*XFS_DIR2_DATA_ENTRY_TAG_P(dep), ARCH_CONVERT));
- p += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
+ ASSERT((char *)dep - (char *)hdr ==
+ be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep)));
+ p += dp->d_ops->data_entsize(dep->namelen);
}
}
}
@@ -384,15 +565,16 @@ xfs_dir2_data_freescan(
* Give back the buffer for the created block.
*/
int /* error */
-xfs_dir2_data_init(
+xfs_dir3_data_init(
xfs_da_args_t *args, /* directory operation args */
xfs_dir2_db_t blkno, /* logical dir block number */
- xfs_dabuf_t **bpp) /* output block buffer */
+ struct xfs_buf **bpp) /* output block buffer */
{
- xfs_dabuf_t *bp; /* block buffer */
- xfs_dir2_data_t *d; /* pointer to block */
+ struct xfs_buf *bp; /* block buffer */
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_inode_t *dp; /* incore directory inode */
xfs_dir2_data_unused_t *dup; /* unused entry pointer */
+ struct xfs_dir2_data_free *bf;
int error; /* error return value */
int i; /* bestfree index */
xfs_mount_t *mp; /* filesystem mount point */
@@ -405,38 +587,51 @@ xfs_dir2_data_init(
/*
* Get the buffer set up for the block.
*/
- error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, blkno), -1, &bp,
- XFS_DATA_FORK);
- if (error) {
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno),
+ -1, &bp, XFS_DATA_FORK);
+ if (error)
return error;
- }
- ASSERT(bp != NULL);
+ bp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF);
+
/*
* Initialize the header.
*/
- d = bp->data;
- INT_SET(d->hdr.magic, ARCH_CONVERT, XFS_DIR2_DATA_MAGIC);
- INT_SET(d->hdr.bestfree[0].offset, ARCH_CONVERT, (xfs_dir2_data_off_t)sizeof(d->hdr));
+ hdr = bp->b_addr;
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ memset(hdr3, 0, sizeof(*hdr3));
+ hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+ hdr3->blkno = cpu_to_be64(bp->b_bn);
+ hdr3->owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+
+ } else
+ hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset);
for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
- d->hdr.bestfree[i].length = 0;
- d->hdr.bestfree[i].offset = 0;
+ bf[i].length = 0;
+ bf[i].offset = 0;
}
+
/*
* Set up an unused entry for the block's body.
*/
- dup = &d->u[0].unused;
- INT_SET(dup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
+ dup = dp->d_ops->data_unused_p(hdr);
+ dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
- t=mp->m_dirblksize - (uint)sizeof(d->hdr);
- INT_SET(d->hdr.bestfree[0].length, ARCH_CONVERT, t);
- INT_SET(dup->length, ARCH_CONVERT, t);
- INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT,
- (xfs_dir2_data_off_t)((char *)dup - (char *)d));
+ t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset;
+ bf[0].length = cpu_to_be16(t);
+ dup->length = cpu_to_be16(t);
+ *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
/*
* Log it and return it.
*/
- xfs_dir2_data_log_header(tp, bp);
- xfs_dir2_data_log_unused(tp, bp, dup);
+ xfs_dir2_data_log_header(args, bp);
+ xfs_dir2_data_log_unused(args, bp, dup);
*bpp = bp;
return 0;
}
@@ -446,18 +641,20 @@ xfs_dir2_data_init(
*/
void
xfs_dir2_data_log_entry(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp, /* block buffer */
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
xfs_dir2_data_entry_t *dep) /* data entry pointer */
{
- xfs_dir2_data_t *d; /* data block pointer */
-
- d = bp->data;
- ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
- INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
- xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)d),
- (uint)((char *)(XFS_DIR2_DATA_ENTRY_TAG_P(dep) + 1) -
- (char *)d - 1));
+ struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+ xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr),
+ (uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) -
+ (char *)hdr - 1));
}
/*
@@ -465,16 +662,20 @@ xfs_dir2_data_log_entry(
*/
void
xfs_dir2_data_log_header(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp) /* block buffer */
+ struct xfs_da_args *args,
+ struct xfs_buf *bp)
{
- xfs_dir2_data_t *d; /* data block pointer */
+#ifdef DEBUG
+ struct xfs_dir2_data_hdr *hdr = bp->b_addr;
- d = bp->data;
- ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
- INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
- xfs_da_log_buf(tp, bp, (uint)((char *)&d->hdr - (char *)d),
- (uint)(sizeof(d->hdr) - 1));
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+#endif
+
+ xfs_trans_log_buf(args->trans, bp, 0,
+ args->dp->d_ops->data_entry_offset - 1);
}
/*
@@ -482,27 +683,29 @@ xfs_dir2_data_log_header(
*/
void
xfs_dir2_data_log_unused(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp, /* block buffer */
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
xfs_dir2_data_unused_t *dup) /* data unused pointer */
{
- xfs_dir2_data_t *d; /* data block pointer */
+ xfs_dir2_data_hdr_t *hdr = bp->b_addr;
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
- d = bp->data;
- ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
- INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
/*
* Log the first part of the unused entry.
*/
- xfs_da_log_buf(tp, bp, (uint)((char *)dup - (char *)d),
+ xfs_trans_log_buf(args->trans, bp, (uint)((char *)dup - (char *)hdr),
(uint)((char *)&dup->length + sizeof(dup->length) -
- 1 - (char *)d));
+ 1 - (char *)hdr));
/*
* Log the end (tag) of the unused entry.
*/
- xfs_da_log_buf(tp, bp,
- (uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P(dup) - (char *)d),
- (uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P(dup) - (char *)d +
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr),
+ (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr +
sizeof(xfs_dir2_data_off_t) - 1));
}
@@ -512,46 +715,48 @@ xfs_dir2_data_log_unused(
*/
void
xfs_dir2_data_make_free(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp, /* block buffer */
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
xfs_dir2_data_aoff_t offset, /* starting byte offset */
xfs_dir2_data_aoff_t len, /* length in bytes */
int *needlogp, /* out: log header */
int *needscanp) /* out: regen bestfree */
{
- xfs_dir2_data_t *d; /* data block pointer */
+ xfs_dir2_data_hdr_t *hdr; /* data block pointer */
xfs_dir2_data_free_t *dfp; /* bestfree pointer */
char *endptr; /* end of data area */
- xfs_mount_t *mp; /* filesystem mount point */
int needscan; /* need to regen bestfree */
xfs_dir2_data_unused_t *newdup; /* new unused entry */
xfs_dir2_data_unused_t *postdup; /* unused entry after us */
xfs_dir2_data_unused_t *prevdup; /* unused entry before us */
+ struct xfs_dir2_data_free *bf;
+
+ hdr = bp->b_addr;
- mp = tp->t_mountp;
- d = bp->data;
/*
* Figure out where the end of the data area is.
*/
- if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC)
- endptr = (char *)d + mp->m_dirblksize;
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC))
+ endptr = (char *)hdr + args->geo->blksize;
else {
xfs_dir2_block_tail_t *btp; /* block tail */
- ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
- btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d);
- endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp);
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ endptr = (char *)xfs_dir2_block_leaf_p(btp);
}
/*
* If this isn't the start of the block, then back up to
* the previous entry and see if it's free.
*/
- if (offset > sizeof(d->hdr)) {
- xfs_dir2_data_off_t *tagp; /* tag just before us */
+ if (offset > args->dp->d_ops->data_entry_offset) {
+ __be16 *tagp; /* tag just before us */
- tagp = (xfs_dir2_data_off_t *)((char *)d + offset) - 1;
- prevdup = (xfs_dir2_data_unused_t *)((char *)d + INT_GET(*tagp, ARCH_CONVERT));
- if (INT_GET(prevdup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG)
+ tagp = (__be16 *)((char *)hdr + offset) - 1;
+ prevdup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+ if (be16_to_cpu(prevdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
prevdup = NULL;
} else
prevdup = NULL;
@@ -559,10 +764,10 @@ xfs_dir2_data_make_free(
* If this isn't the end of the block, see if the entry after
* us is free.
*/
- if ((char *)d + offset + len < endptr) {
+ if ((char *)hdr + offset + len < endptr) {
postdup =
- (xfs_dir2_data_unused_t *)((char *)d + offset + len);
- if (INT_GET(postdup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG)
+ (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+ if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
postdup = NULL;
} else
postdup = NULL;
@@ -572,28 +777,29 @@ xfs_dir2_data_make_free(
* Previous and following entries are both free,
* merge everything into a single free entry.
*/
+ bf = args->dp->d_ops->data_bestfree_p(hdr);
if (prevdup && postdup) {
xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */
/*
* See if prevdup and/or postdup are in bestfree table.
*/
- dfp = xfs_dir2_data_freefind(d, prevdup);
- dfp2 = xfs_dir2_data_freefind(d, postdup);
+ dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+ dfp2 = xfs_dir2_data_freefind(hdr, bf, postdup);
/*
* We need a rescan unless there are exactly 2 free entries
* namely our two. Then we know what's happening, otherwise
* since the third bestfree is there, there might be more
* entries.
*/
- needscan = d->hdr.bestfree[2].length;
+ needscan = (bf[2].length != 0);
/*
* Fix up the new big freespace.
*/
- INT_MOD(prevdup->length, ARCH_CONVERT, len + INT_GET(postdup->length, ARCH_CONVERT));
- INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(prevdup), ARCH_CONVERT,
- (xfs_dir2_data_off_t)((char *)prevdup - (char *)d));
- xfs_dir2_data_log_unused(tp, bp, prevdup);
+ be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length));
+ *xfs_dir2_data_unused_tag_p(prevdup) =
+ cpu_to_be16((char *)prevdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, prevdup);
if (!needscan) {
/*
* Has to be the case that entries 0 and 1 are
@@ -602,19 +808,20 @@ xfs_dir2_data_make_free(
* Remove entry 1 first then entry 0.
*/
ASSERT(dfp && dfp2);
- if (dfp == &d->hdr.bestfree[1]) {
- dfp = &d->hdr.bestfree[0];
+ if (dfp == &bf[1]) {
+ dfp = &bf[0];
ASSERT(dfp2 == dfp);
- dfp2 = &d->hdr.bestfree[1];
+ dfp2 = &bf[1];
}
- xfs_dir2_data_freeremove(d, dfp2, needlogp);
- xfs_dir2_data_freeremove(d, dfp, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp2, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
/*
* Now insert the new entry.
*/
- dfp = xfs_dir2_data_freeinsert(d, prevdup, needlogp);
- ASSERT(dfp == &d->hdr.bestfree[0]);
- ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(prevdup->length, ARCH_CONVERT));
+ dfp = xfs_dir2_data_freeinsert(hdr, bf, prevdup,
+ needlogp);
+ ASSERT(dfp == &bf[0]);
+ ASSERT(dfp->length == prevdup->length);
ASSERT(!dfp[1].length);
ASSERT(!dfp[2].length);
}
@@ -623,63 +830,67 @@ xfs_dir2_data_make_free(
* The entry before us is free, merge with it.
*/
else if (prevdup) {
- dfp = xfs_dir2_data_freefind(d, prevdup);
- INT_MOD(prevdup->length, ARCH_CONVERT, len);
- INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(prevdup), ARCH_CONVERT,
- (xfs_dir2_data_off_t)((char *)prevdup - (char *)d));
- xfs_dir2_data_log_unused(tp, bp, prevdup);
+ dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+ be16_add_cpu(&prevdup->length, len);
+ *xfs_dir2_data_unused_tag_p(prevdup) =
+ cpu_to_be16((char *)prevdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, prevdup);
/*
* If the previous entry was in the table, the new entry
* is longer, so it will be in the table too. Remove
* the old one and add the new one.
*/
if (dfp) {
- xfs_dir2_data_freeremove(d, dfp, needlogp);
- (void)xfs_dir2_data_freeinsert(d, prevdup, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, prevdup, needlogp);
}
/*
* Otherwise we need a scan if the new entry is big enough.
*/
- else
- needscan = INT_GET(prevdup->length, ARCH_CONVERT) > INT_GET(d->hdr.bestfree[2].length, ARCH_CONVERT);
+ else {
+ needscan = be16_to_cpu(prevdup->length) >
+ be16_to_cpu(bf[2].length);
+ }
}
/*
* The following entry is free, merge with it.
*/
else if (postdup) {
- dfp = xfs_dir2_data_freefind(d, postdup);
- newdup = (xfs_dir2_data_unused_t *)((char *)d + offset);
- INT_SET(newdup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
- INT_SET(newdup->length, ARCH_CONVERT, len + INT_GET(postdup->length, ARCH_CONVERT));
- INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT,
- (xfs_dir2_data_off_t)((char *)newdup - (char *)d));
- xfs_dir2_data_log_unused(tp, bp, newdup);
+ dfp = xfs_dir2_data_freefind(hdr, bf, postdup);
+ newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+ newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length));
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, newdup);
/*
* If the following entry was in the table, the new entry
* is longer, so it will be in the table too. Remove
* the old one and add the new one.
*/
if (dfp) {
- xfs_dir2_data_freeremove(d, dfp, needlogp);
- (void)xfs_dir2_data_freeinsert(d, newdup, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
}
/*
* Otherwise we need a scan if the new entry is big enough.
*/
- else
- needscan = INT_GET(newdup->length, ARCH_CONVERT) > INT_GET(d->hdr.bestfree[2].length, ARCH_CONVERT);
+ else {
+ needscan = be16_to_cpu(newdup->length) >
+ be16_to_cpu(bf[2].length);
+ }
}
/*
* Neither neighbor is free. Make a new entry.
*/
else {
- newdup = (xfs_dir2_data_unused_t *)((char *)d + offset);
- INT_SET(newdup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
- INT_SET(newdup->length, ARCH_CONVERT, len);
- INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT,
- (xfs_dir2_data_off_t)((char *)newdup - (char *)d));
- xfs_dir2_data_log_unused(tp, bp, newdup);
- (void)xfs_dir2_data_freeinsert(d, newdup, needlogp);
+ newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+ newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ newdup->length = cpu_to_be16(len);
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, newdup);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
}
*needscanp = needscan;
}
@@ -689,15 +900,15 @@ xfs_dir2_data_make_free(
*/
void
xfs_dir2_data_use_free(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp, /* data block buffer */
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
xfs_dir2_data_unused_t *dup, /* unused entry */
xfs_dir2_data_aoff_t offset, /* starting offset to use */
xfs_dir2_data_aoff_t len, /* length to use */
int *needlogp, /* out: need to log header */
int *needscanp) /* out: need regen bestfree */
{
- xfs_dir2_data_t *d; /* data block */
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_data_free_t *dfp; /* bestfree pointer */
int matchback; /* matches end of freespace */
int matchfront; /* matches start of freespace */
@@ -705,25 +916,29 @@ xfs_dir2_data_use_free(
xfs_dir2_data_unused_t *newdup; /* new unused entry */
xfs_dir2_data_unused_t *newdup2; /* another new unused entry */
int oldlen; /* old unused entry's length */
+ struct xfs_dir2_data_free *bf;
- d = bp->data;
- ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
- INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
- ASSERT(INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG);
- ASSERT(offset >= (char *)dup - (char *)d);
- ASSERT(offset + len <= (char *)dup + INT_GET(dup->length, ARCH_CONVERT) - (char *)d);
- ASSERT((char *)dup - (char *)d == INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT));
+ hdr = bp->b_addr;
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+ ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG);
+ ASSERT(offset >= (char *)dup - (char *)hdr);
+ ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr);
+ ASSERT((char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
/*
* Look up the entry in the bestfree table.
*/
- dfp = xfs_dir2_data_freefind(d, dup);
- oldlen = INT_GET(dup->length, ARCH_CONVERT);
- ASSERT(dfp || oldlen <= INT_GET(d->hdr.bestfree[2].length, ARCH_CONVERT));
+ oldlen = be16_to_cpu(dup->length);
+ bf = args->dp->d_ops->data_bestfree_p(hdr);
+ dfp = xfs_dir2_data_freefind(hdr, bf, dup);
+ ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
/*
* Check for alignment with front and back of the entry.
*/
- matchfront = (char *)dup - (char *)d == offset;
- matchback = (char *)dup + oldlen - (char *)d == offset + len;
+ matchfront = (char *)dup - (char *)hdr == offset;
+ matchback = (char *)dup + oldlen - (char *)hdr == offset + len;
ASSERT(*needscanp == 0);
needscan = 0;
/*
@@ -732,9 +947,10 @@ xfs_dir2_data_use_free(
*/
if (matchfront && matchback) {
if (dfp) {
- needscan = d->hdr.bestfree[2].offset;
+ needscan = (bf[2].offset != 0);
if (!needscan)
- xfs_dir2_data_freeremove(d, dfp, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp,
+ needlogp);
}
}
/*
@@ -742,27 +958,28 @@ xfs_dir2_data_use_free(
* Make a new entry with the remaining freespace.
*/
else if (matchfront) {
- newdup = (xfs_dir2_data_unused_t *)((char *)d + offset + len);
- INT_SET(newdup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
- INT_SET(newdup->length, ARCH_CONVERT, oldlen - len);
- INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT,
- (xfs_dir2_data_off_t)((char *)newdup - (char *)d));
- xfs_dir2_data_log_unused(tp, bp, newdup);
+ newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+ newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ newdup->length = cpu_to_be16(oldlen - len);
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, newdup);
/*
* If it was in the table, remove it and add the new one.
*/
if (dfp) {
- xfs_dir2_data_freeremove(d, dfp, needlogp);
- dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+ needlogp);
ASSERT(dfp != NULL);
- ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(newdup->length, ARCH_CONVERT));
- ASSERT(INT_GET(dfp->offset, ARCH_CONVERT) == (char *)newdup - (char *)d);
+ ASSERT(dfp->length == newdup->length);
+ ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
/*
* If we got inserted at the last slot,
* that means we don't know if there was a better
* choice for the last slot, or not. Rescan.
*/
- needscan = dfp == &d->hdr.bestfree[2];
+ needscan = dfp == &bf[2];
}
}
/*
@@ -771,26 +988,26 @@ xfs_dir2_data_use_free(
*/
else if (matchback) {
newdup = dup;
- INT_SET(newdup->length, ARCH_CONVERT, (xfs_dir2_data_off_t)
- (((char *)d + offset) - (char *)newdup));
- INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT,
- (xfs_dir2_data_off_t)((char *)newdup - (char *)d));
- xfs_dir2_data_log_unused(tp, bp, newdup);
+ newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, newdup);
/*
* If it was in the table, remove it and add the new one.
*/
if (dfp) {
- xfs_dir2_data_freeremove(d, dfp, needlogp);
- dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+ needlogp);
ASSERT(dfp != NULL);
- ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(newdup->length, ARCH_CONVERT));
- ASSERT(INT_GET(dfp->offset, ARCH_CONVERT) == (char *)newdup - (char *)d);
+ ASSERT(dfp->length == newdup->length);
+ ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
/*
* If we got inserted at the last slot,
* that means we don't know if there was a better
* choice for the last slot, or not. Rescan.
*/
- needscan = dfp == &d->hdr.bestfree[2];
+ needscan = dfp == &bf[2];
}
}
/*
@@ -799,17 +1016,16 @@ xfs_dir2_data_use_free(
*/
else {
newdup = dup;
- INT_SET(newdup->length, ARCH_CONVERT, (xfs_dir2_data_off_t)
- (((char *)d + offset) - (char *)newdup));
- INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT,
- (xfs_dir2_data_off_t)((char *)newdup - (char *)d));
- xfs_dir2_data_log_unused(tp, bp, newdup);
- newdup2 = (xfs_dir2_data_unused_t *)((char *)d + offset + len);
- INT_SET(newdup2->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
- INT_SET(newdup2->length, ARCH_CONVERT, oldlen - len - INT_GET(newdup->length, ARCH_CONVERT));
- INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup2), ARCH_CONVERT,
- (xfs_dir2_data_off_t)((char *)newdup2 - (char *)d));
- xfs_dir2_data_log_unused(tp, bp, newdup2);
+ newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, newdup);
+ newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+ newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length));
+ *xfs_dir2_data_unused_tag_p(newdup2) =
+ cpu_to_be16((char *)newdup2 - (char *)hdr);
+ xfs_dir2_data_log_unused(args, bp, newdup2);
/*
* If the old entry was in the table, we need to scan
* if the 3rd entry was valid, since these entries
@@ -819,13 +1035,14 @@ xfs_dir2_data_use_free(
* the 2 new will work.
*/
if (dfp) {
- needscan = d->hdr.bestfree[2].length;
+ needscan = (bf[2].length != 0);
if (!needscan) {
- xfs_dir2_data_freeremove(d, dfp, needlogp);
- (void)xfs_dir2_data_freeinsert(d, newdup,
- needlogp);
- (void)xfs_dir2_data_freeinsert(d, newdup2,
- needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp,
+ needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup,
+ needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup2,
+ needlogp);
}
}
}
diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h
deleted file mode 100644
index 5e3a7f9ec73..00000000000
--- a/fs/xfs/xfs_dir2_data.h
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_DIR2_DATA_H__
-#define __XFS_DIR2_DATA_H__
-
-/*
- * Directory format 2, data block structures.
- */
-
-struct xfs_dabuf;
-struct xfs_da_args;
-struct xfs_inode;
-struct xfs_trans;
-
-/*
- * Constants.
- */
-#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: for multiblock dirs */
-#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */
-#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG)
-#define XFS_DIR2_DATA_FREE_TAG 0xffff
-#define XFS_DIR2_DATA_FD_COUNT 3
-
-/*
- * Directory address space divided into sections,
- * spaces separated by 32gb.
- */
-#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
-#define XFS_DIR2_DATA_SPACE 0
-#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
-#define XFS_DIR2_DATA_FIRSTDB(mp) \
- XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATA_OFFSET)
-
-/*
- * Offsets of . and .. in data space (always block 0)
- */
-#define XFS_DIR2_DATA_DOT_OFFSET \
- ((xfs_dir2_data_aoff_t)sizeof(xfs_dir2_data_hdr_t))
-#define XFS_DIR2_DATA_DOTDOT_OFFSET \
- (XFS_DIR2_DATA_DOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(1))
-#define XFS_DIR2_DATA_FIRST_OFFSET \
- (XFS_DIR2_DATA_DOTDOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(2))
-
-/*
- * Structures.
- */
-
-/*
- * Describe a free area in the data block.
- * The freespace will be formatted as a xfs_dir2_data_unused_t.
- */
-typedef struct xfs_dir2_data_free {
- xfs_dir2_data_off_t offset; /* start of freespace */
- xfs_dir2_data_off_t length; /* length of freespace */
-} xfs_dir2_data_free_t;
-
-/*
- * Header for the data blocks.
- * Always at the beginning of a directory-sized block.
- * The code knows that XFS_DIR2_DATA_FD_COUNT is 3.
- */
-typedef struct xfs_dir2_data_hdr {
- __uint32_t magic; /* XFS_DIR2_DATA_MAGIC */
- /* or XFS_DIR2_BLOCK_MAGIC */
- xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT];
-} xfs_dir2_data_hdr_t;
-
-/*
- * Active entry in a data block. Aligned to 8 bytes.
- * Tag appears as the last 2 bytes.
- */
-typedef struct xfs_dir2_data_entry {
- xfs_ino_t inumber; /* inode number */
- __uint8_t namelen; /* name length */
- __uint8_t name[1]; /* name bytes, no null */
- /* variable offset */
- xfs_dir2_data_off_t tag; /* starting offset of us */
-} xfs_dir2_data_entry_t;
-
-/*
- * Unused entry in a data block. Aligned to 8 bytes.
- * Tag appears as the last 2 bytes.
- */
-typedef struct xfs_dir2_data_unused {
- __uint16_t freetag; /* XFS_DIR2_DATA_FREE_TAG */
- xfs_dir2_data_off_t length; /* total free length */
- /* variable offset */
- xfs_dir2_data_off_t tag; /* starting offset of us */
-} xfs_dir2_data_unused_t;
-
-typedef union {
- xfs_dir2_data_entry_t entry;
- xfs_dir2_data_unused_t unused;
-} xfs_dir2_data_union_t;
-
-/*
- * Generic data block structure, for xfs_db.
- */
-typedef struct xfs_dir2_data {
- xfs_dir2_data_hdr_t hdr; /* magic XFS_DIR2_DATA_MAGIC */
- xfs_dir2_data_union_t u[1];
-} xfs_dir2_data_t;
-
-/*
- * Macros.
- */
-
-/*
- * Size of a data entry.
- */
-#define XFS_DIR2_DATA_ENTSIZE(n) xfs_dir2_data_entsize(n)
-static inline int xfs_dir2_data_entsize(int n)
-{
- return (int)roundup(offsetof(xfs_dir2_data_entry_t, name[0]) + (n) + \
- (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN);
-}
-
-/*
- * Pointer to an entry's tag word.
- */
-#define XFS_DIR2_DATA_ENTRY_TAG_P(dep) xfs_dir2_data_entry_tag_p(dep)
-static inline xfs_dir2_data_off_t *
-xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep)
-{
- return (xfs_dir2_data_off_t *) \
- ((char *)(dep) + XFS_DIR2_DATA_ENTSIZE((dep)->namelen) - \
- (uint)sizeof(xfs_dir2_data_off_t));
-}
-
-/*
- * Pointer to a freespace's tag word.
- */
-#define XFS_DIR2_DATA_UNUSED_TAG_P(dup) \
- xfs_dir2_data_unused_tag_p(dup)
-static inline xfs_dir2_data_off_t *
-xfs_dir2_data_unused_tag_p(xfs_dir2_data_unused_t *dup)
-{
- return (xfs_dir2_data_off_t *) \
- ((char *)(dup) + INT_GET((dup)->length, ARCH_CONVERT) \
- - (uint)sizeof(xfs_dir2_data_off_t));
-}
-
-/*
- * Function declarations.
- */
-#ifdef DEBUG
-extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_dabuf *bp);
-#else
-#define xfs_dir2_data_check(dp,bp)
-#endif
-extern xfs_dir2_data_free_t *xfs_dir2_data_freefind(xfs_dir2_data_t *d,
- xfs_dir2_data_unused_t *dup);
-extern xfs_dir2_data_free_t *xfs_dir2_data_freeinsert(xfs_dir2_data_t *d,
- xfs_dir2_data_unused_t *dup, int *loghead);
-extern void xfs_dir2_data_freescan(struct xfs_mount *mp, xfs_dir2_data_t *d,
- int *loghead, char *aendp);
-extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
- struct xfs_dabuf **bpp);
-extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_dabuf *bp,
- xfs_dir2_data_entry_t *dep);
-extern void xfs_dir2_data_log_header(struct xfs_trans *tp,
- struct xfs_dabuf *bp);
-extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_dabuf *bp,
- xfs_dir2_data_unused_t *dup);
-extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_dabuf *bp,
- xfs_dir2_data_aoff_t offset,
- xfs_dir2_data_aoff_t len, int *needlogp,
- int *needscanp);
-extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_dabuf *bp,
- xfs_dir2_data_unused_t *dup,
- xfs_dir2_data_aoff_t offset,
- xfs_dir2_data_aoff_t len, int *needlogp,
- int *needscanp);
-
-#endif /* __XFS_DIR2_DATA_H__ */
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index d342b6b5523..fb0aad4440c 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -17,46 +18,352 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
-#include "xfs_dir2_data.h"
-#include "xfs_dir2_leaf.h"
-#include "xfs_dir2_block.h"
-#include "xfs_dir2_node.h"
-#include "xfs_dir2_trace.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
/*
* Local function declarations.
*/
+static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
+ int *indexp, struct xfs_buf **dbpp);
+static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args,
+ struct xfs_buf *bp, int first, int last);
+static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args,
+ struct xfs_buf *bp);
+
+/*
+ * Check the internal consistency of a leaf1 block.
+ * Pop an assert if something is wrong.
+ */
#ifdef DEBUG
-static void xfs_dir2_leaf_check(xfs_inode_t *dp, xfs_dabuf_t *bp);
+#define xfs_dir3_leaf_check(dp, bp) \
+do { \
+ if (!xfs_dir3_leaf1_check((dp), (bp))) \
+ ASSERT(0); \
+} while (0);
+
+STATIC bool
+xfs_dir3_leaf1_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+ if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
+ struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+ if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+ return false;
+ } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
+ return false;
+
+ return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
+}
#else
-#define xfs_dir2_leaf_check(dp, bp)
+#define xfs_dir3_leaf_check(dp, bp)
#endif
-static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **lbpp,
- int *indexp, xfs_dabuf_t **dbpp);
-static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
- int first, int last);
-static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp);
+bool
+xfs_dir3_leaf_check_int(
+ struct xfs_mount *mp,
+ struct xfs_inode *dp,
+ struct xfs_dir3_icleaf_hdr *hdr,
+ struct xfs_dir2_leaf *leaf)
+{
+ struct xfs_dir2_leaf_entry *ents;
+ xfs_dir2_leaf_tail_t *ltp;
+ int stale;
+ int i;
+ const struct xfs_dir_ops *ops;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+
+ /*
+ * we can be passed a null dp here from a verifier, so we need to go the
+ * hard way to get them.
+ */
+ ops = xfs_dir_get_ops(mp, dp);
+
+ if (!hdr) {
+ ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ hdr = &leafhdr;
+ }
+
+ ents = ops->leaf_ents_p(leaf);
+ ltp = xfs_dir2_leaf_tail_p(geo, leaf);
+
+ /*
+ * XXX (dgc): This value is not restrictive enough.
+ * Should factor in the size of the bests table as well.
+ * We can deduce a value for that from di_size.
+ */
+ if (hdr->count > ops->leaf_max_ents(geo))
+ return false;
+
+ /* Leaves and bests don't overlap in leaf format. */
+ if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+ hdr->magic == XFS_DIR3_LEAF1_MAGIC) &&
+ (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
+ return false;
+
+ /* Check hash value order, count stale entries. */
+ for (i = stale = 0; i < hdr->count; i++) {
+ if (i + 1 < hdr->count) {
+ if (be32_to_cpu(ents[i].hashval) >
+ be32_to_cpu(ents[i + 1].hashval))
+ return false;
+ }
+ if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ stale++;
+ }
+ if (hdr->stale != stale)
+ return false;
+ return true;
+}
+
+/*
+ * We verify the magic numbers before decoding the leaf header so that on debug
+ * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due
+ * to incorrect magic numbers.
+ */
+static bool
+xfs_dir3_leaf_verify(
+ struct xfs_buf *bp,
+ __uint16_t magic)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+
+ ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+ __uint16_t magic3;
+
+ magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
+ : XFS_DIR3_LEAFN_MAGIC;
+
+ if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
+ return false;
+ if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+ return false;
+ } else {
+ if (leaf->hdr.info.magic != cpu_to_be16(magic))
+ return false;
+ }
+
+ return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
+}
+
+static void
+__read_verify(
+ struct xfs_buf *bp,
+ __uint16_t magic)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_dir3_leaf_verify(bp, magic))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
+}
+
+static void
+__write_verify(
+ struct xfs_buf *bp,
+ __uint16_t magic)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_dir3_leaf_verify(bp, magic)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
+}
+
+static void
+xfs_dir3_leaf1_read_verify(
+ struct xfs_buf *bp)
+{
+ __read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
+}
+
+static void
+xfs_dir3_leaf1_write_verify(
+ struct xfs_buf *bp)
+{
+ __write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
+}
+
+static void
+xfs_dir3_leafn_read_verify(
+ struct xfs_buf *bp)
+{
+ __read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
+}
+
+static void
+xfs_dir3_leafn_write_verify(
+ struct xfs_buf *bp)
+{
+ __write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
+}
+
+const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
+ .verify_read = xfs_dir3_leaf1_read_verify,
+ .verify_write = xfs_dir3_leaf1_write_verify,
+};
+
+const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
+ .verify_read = xfs_dir3_leafn_read_verify,
+ .verify_write = xfs_dir3_leafn_write_verify,
+};
+
+static int
+xfs_dir3_leaf_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp)
+{
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+ XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
+ if (!err && tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
+ return err;
+}
+
+int
+xfs_dir3_leafn_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp)
+{
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+ XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
+ if (!err && tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
+ return err;
+}
+
+/*
+ * Initialize a new leaf block, leaf1 or leafn magic accepted.
+ */
+static void
+xfs_dir3_leaf_init(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ xfs_ino_t owner,
+ __uint16_t type)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+
+ ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+
+ memset(leaf3, 0, sizeof(*leaf3));
+
+ leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC)
+ ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)
+ : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+ leaf3->info.blkno = cpu_to_be64(bp->b_bn);
+ leaf3->info.owner = cpu_to_be64(owner);
+ uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid);
+ } else {
+ memset(leaf, 0, sizeof(*leaf));
+ leaf->hdr.info.magic = cpu_to_be16(type);
+ }
+
+ /*
+ * If it's a leaf-format directory initialize the tail.
+ * Caller is responsible for initialising the bests table.
+ */
+ if (type == XFS_DIR2_LEAF1_MAGIC) {
+ struct xfs_dir2_leaf_tail *ltp;
+
+ ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf);
+ ltp->bestcount = 0;
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF);
+ } else {
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+ }
+}
+
+int
+xfs_dir3_leaf_get_buf(
+ xfs_da_args_t *args,
+ xfs_dir2_db_t bno,
+ struct xfs_buf **bpp,
+ __uint16_t magic)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+ ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) &&
+ bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
+
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno),
+ -1, &bp, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic);
+ xfs_dir3_leaf_log_header(args, bp);
+ if (magic == XFS_DIR2_LEAF1_MAGIC)
+ xfs_dir3_leaf_log_tail(args, bp);
+ *bpp = bp;
+ return 0;
+}
/*
* Convert a block form directory to a leaf form directory.
@@ -64,16 +371,16 @@ static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp);
int /* error */
xfs_dir2_block_to_leaf(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t *dbp) /* input block's buffer */
+ struct xfs_buf *dbp) /* input block's buffer */
{
- xfs_dir2_data_off_t *bestsp; /* leaf's bestsp entries */
+ __be16 *bestsp; /* leaf's bestsp entries */
xfs_dablk_t blkno; /* leaf block's bno */
- xfs_dir2_block_t *block; /* block structure */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
xfs_dir2_leaf_entry_t *blp; /* block's leaf entries */
xfs_dir2_block_tail_t *btp; /* block's tail */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return code */
- xfs_dabuf_t *lbp; /* leaf block's buffer */
+ struct xfs_buf *lbp; /* leaf block's buffer */
xfs_dir2_db_t ldb; /* leaf block's bno */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */
@@ -81,8 +388,12 @@ xfs_dir2_block_to_leaf(
int needlog; /* need to log block header */
int needscan; /* need to rescan bestfree */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_data_free *bf;
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ trace_xfs_dir2_block_to_leaf(args);
- xfs_dir2_trace_args_b("block_to_leaf", args, dbp);
dp = args->dp;
mp = dp->i_mount;
tp = args->trans;
@@ -94,68 +405,200 @@ xfs_dir2_block_to_leaf(
if ((error = xfs_da_grow_inode(args, &blkno))) {
return error;
}
- ldb = XFS_DIR2_DA_TO_DB(mp, blkno);
- ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp));
+ ldb = xfs_dir2_da_to_db(args->geo, blkno);
+ ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET));
/*
* Initialize the leaf block, get a buffer for it.
*/
- if ((error = xfs_dir2_leaf_init(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC))) {
+ error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC);
+ if (error)
return error;
- }
- ASSERT(lbp != NULL);
- leaf = lbp->data;
- block = dbp->data;
- xfs_dir2_data_check(dp, dbp);
- btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
- blp = XFS_DIR2_BLOCK_LEAF_P(btp);
+
+ leaf = lbp->b_addr;
+ hdr = dbp->b_addr;
+ xfs_dir3_data_check(dp, dbp);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+
/*
* Set the counts in the leaf header.
*/
- INT_COPY(leaf->hdr.count, btp->count, ARCH_CONVERT); /* INT_: type change */
- INT_COPY(leaf->hdr.stale, btp->stale, ARCH_CONVERT); /* INT_: type change */
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ leafhdr.count = be32_to_cpu(btp->count);
+ leafhdr.stale = be32_to_cpu(btp->stale);
+ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
+
/*
* Could compact these but I think we always do the conversion
* after squeezing out stale entries.
*/
- memcpy(leaf->ents, blp, INT_GET(btp->count, ARCH_CONVERT) * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir2_leaf_log_ents(tp, lbp, 0, INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1);
+ memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
+ xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1);
needscan = 0;
needlog = 1;
/*
* Make the space formerly occupied by the leaf entries and block
* tail be free.
*/
- xfs_dir2_data_make_free(tp, dbp,
- (xfs_dir2_data_aoff_t)((char *)blp - (char *)block),
- (xfs_dir2_data_aoff_t)((char *)block + mp->m_dirblksize -
+ xfs_dir2_data_make_free(args, dbp,
+ (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+ (xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize -
(char *)blp),
&needlog, &needscan);
/*
* Fix up the block header, make it a data block.
*/
- INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_DATA_MAGIC);
+ dbp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF);
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+ hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+ else
+ hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+
if (needscan)
- xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
- NULL);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
/*
* Set up leaf tail and bests table.
*/
- ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
- INT_SET(ltp->bestcount, ARCH_CONVERT, 1);
- bestsp = XFS_DIR2_LEAF_BESTS_P(ltp);
- INT_COPY(bestsp[0], block->hdr.bestfree[0].length, ARCH_CONVERT);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ ltp->bestcount = cpu_to_be32(1);
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ bestsp[0] = bf[0].length;
/*
* Log the data header and leaf bests table.
*/
if (needlog)
- xfs_dir2_data_log_header(tp, dbp);
- xfs_dir2_leaf_check(dp, lbp);
- xfs_dir2_data_check(dp, dbp);
- xfs_dir2_leaf_log_bests(tp, lbp, 0, 0);
- xfs_da_buf_done(lbp);
+ xfs_dir2_data_log_header(args, dbp);
+ xfs_dir3_leaf_check(dp, lbp);
+ xfs_dir3_data_check(dp, dbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0, 0);
return 0;
}
+STATIC void
+xfs_dir3_leaf_find_stale(
+ struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents,
+ int index,
+ int *lowstale,
+ int *highstale)
+{
+ /*
+ * Find the first stale entry before our index, if any.
+ */
+ for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) {
+ if (ents[*lowstale].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ break;
+ }
+
+ /*
+ * Find the first stale entry at or after our index, if any.
+ * Stop if the result would require moving more entries than using
+ * lowstale.
+ */
+ for (*highstale = index; *highstale < leafhdr->count; ++*highstale) {
+ if (ents[*highstale].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ break;
+ if (*lowstale >= 0 && index - *lowstale <= *highstale - index)
+ break;
+ }
+}
+
+struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_find_entry(
+ struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents,
+ int index, /* leaf table position */
+ int compact, /* need to compact leaves */
+ int lowstale, /* index of prev stale leaf */
+ int highstale, /* index of next stale leaf */
+ int *lfloglow, /* low leaf logging index */
+ int *lfloghigh) /* high leaf logging index */
+{
+ if (!leafhdr->stale) {
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */
+
+ /*
+ * Now we need to make room to insert the leaf entry.
+ *
+ * If there are no stale entries, just insert a hole at index.
+ */
+ lep = &ents[index];
+ if (index < leafhdr->count)
+ memmove(lep + 1, lep,
+ (leafhdr->count - index) * sizeof(*lep));
+
+ /*
+ * Record low and high logging indices for the leaf.
+ */
+ *lfloglow = index;
+ *lfloghigh = leafhdr->count++;
+ return lep;
+ }
+
+ /*
+ * There are stale entries.
+ *
+ * We will use one of them for the new entry. It's probably not at
+ * the right location, so we'll have to shift some up or down first.
+ *
+ * If we didn't compact before, we need to find the nearest stale
+ * entries before and after our insertion point.
+ */
+ if (compact == 0)
+ xfs_dir3_leaf_find_stale(leafhdr, ents, index,
+ &lowstale, &highstale);
+
+ /*
+ * If the low one is better, use it.
+ */
+ if (lowstale >= 0 &&
+ (highstale == leafhdr->count ||
+ index - lowstale - 1 < highstale - index)) {
+ ASSERT(index - lowstale - 1 >= 0);
+ ASSERT(ents[lowstale].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+
+ /*
+ * Copy entries up to cover the stale entry and make room
+ * for the new entry.
+ */
+ if (index - lowstale - 1 > 0) {
+ memmove(&ents[lowstale], &ents[lowstale + 1],
+ (index - lowstale - 1) *
+ sizeof(xfs_dir2_leaf_entry_t));
+ }
+ *lfloglow = MIN(lowstale, *lfloglow);
+ *lfloghigh = MAX(index - 1, *lfloghigh);
+ leafhdr->stale--;
+ return &ents[index - 1];
+ }
+
+ /*
+ * The high one is better, so use that one.
+ */
+ ASSERT(highstale - index >= 0);
+ ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+
+ /*
+ * Copy entries down to cover the stale entry and make room for the
+ * new entry.
+ */
+ if (highstale - index > 0) {
+ memmove(&ents[index + 1], &ents[index],
+ (highstale - index) * sizeof(xfs_dir2_leaf_entry_t));
+ }
+ *lfloglow = MIN(index, *lfloglow);
+ *lfloghigh = MAX(highstale, *lfloghigh);
+ leafhdr->stale--;
+ return &ents[index];
+}
+
/*
* Add an entry to a leaf form directory.
*/
@@ -163,10 +606,10 @@ int /* error */
xfs_dir2_leaf_addname(
xfs_da_args_t *args) /* operation arguments */
{
- xfs_dir2_data_off_t *bestsp; /* freespace table in leaf */
+ __be16 *bestsp; /* freespace table in leaf */
int compact; /* need to compact leaves */
- xfs_dir2_data_t *data; /* data block structure */
- xfs_dabuf_t *dbp; /* data block buffer */
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
+ struct xfs_buf *dbp; /* data block buffer */
xfs_dir2_data_entry_t *dep; /* data block entry */
xfs_inode_t *dp; /* incore directory inode */
xfs_dir2_data_unused_t *dup; /* data unused entry */
@@ -175,7 +618,7 @@ xfs_dir2_leaf_addname(
int highstale; /* index of next stale leaf */
int i; /* temporary, index */
int index; /* leaf table position */
- xfs_dabuf_t *lbp; /* leaf's buffer */
+ struct xfs_buf *lbp; /* leaf's buffer */
xfs_dir2_leaf_t *leaf; /* leaf structure */
int length; /* length of new entry */
xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */
@@ -187,23 +630,23 @@ xfs_dir2_leaf_addname(
int needbytes; /* leaf block bytes needed */
int needlog; /* need to log data header */
int needscan; /* need to rescan data free */
- xfs_dir2_data_off_t *tagp; /* end of data entry */
+ __be16 *tagp; /* end of data entry */
xfs_trans_t *tp; /* transaction pointer */
xfs_dir2_db_t use_block; /* data block number */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ trace_xfs_dir2_leaf_addname(args);
- xfs_dir2_trace_args("leaf_addname", args);
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- /*
- * Read the leaf block.
- */
- error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
- XFS_DATA_FORK);
- if (error) {
+
+ error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
+ if (error)
return error;
- }
- ASSERT(lbp != NULL);
+
/*
* Look up the entry by hash value and name.
* We know it's not there, our caller has already done a lookup.
@@ -211,25 +654,28 @@ xfs_dir2_leaf_addname(
* But if there are dup hash values the index is of the first of those.
*/
index = xfs_dir2_leaf_search_hash(args, lbp);
- leaf = lbp->data;
- ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
- bestsp = XFS_DIR2_LEAF_BESTS_P(ltp);
- length = XFS_DIR2_DATA_ENTSIZE(args->namelen);
+ leaf = lbp->b_addr;
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ length = dp->d_ops->data_entsize(args->namelen);
+
/*
* See if there are any entries with the same hash value
* and space in their block for the new entry.
* This is good because it puts multiple same-hash value entries
* in a data block, improving the lookup of those entries.
*/
- for (use_block = -1, lep = &leaf->ents[index];
- index < INT_GET(leaf->hdr.count, ARCH_CONVERT) && INT_GET(lep->hashval, ARCH_CONVERT) == args->hashval;
+ for (use_block = -1, lep = &ents[index];
+ index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
index++, lep++) {
- if (INT_GET(lep->address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+ if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
continue;
- i = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
- ASSERT(i < INT_GET(ltp->bestcount, ARCH_CONVERT));
- ASSERT(INT_GET(bestsp[i], ARCH_CONVERT) != NULLDATAOFF);
- if (INT_GET(bestsp[i], ARCH_CONVERT) >= length) {
+ i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+ ASSERT(i < be32_to_cpu(ltp->bestcount));
+ ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF));
+ if (be16_to_cpu(bestsp[i]) >= length) {
use_block = i;
break;
}
@@ -238,13 +684,14 @@ xfs_dir2_leaf_addname(
* Didn't find a block yet, linear search all the data blocks.
*/
if (use_block == -1) {
- for (i = 0; i < INT_GET(ltp->bestcount, ARCH_CONVERT); i++) {
+ for (i = 0; i < be32_to_cpu(ltp->bestcount); i++) {
/*
* Remember a block we see that's missing.
*/
- if (INT_GET(bestsp[i], ARCH_CONVERT) == NULLDATAOFF && use_block == -1)
+ if (bestsp[i] == cpu_to_be16(NULLDATAOFF) &&
+ use_block == -1)
use_block = i;
- else if (INT_GET(bestsp[i], ARCH_CONVERT) >= length) {
+ else if (be16_to_cpu(bestsp[i]) >= length) {
use_block = i;
break;
}
@@ -253,41 +700,43 @@ xfs_dir2_leaf_addname(
/*
* How many bytes do we need in the leaf block?
*/
- needbytes =
- (leaf->hdr.stale ? 0 : (uint)sizeof(leaf->ents[0])) +
- (use_block != -1 ? 0 : (uint)sizeof(leaf->bests[0]));
+ needbytes = 0;
+ if (!leafhdr.stale)
+ needbytes += sizeof(xfs_dir2_leaf_entry_t);
+ if (use_block == -1)
+ needbytes += sizeof(xfs_dir2_data_off_t);
+
/*
* Now kill use_block if it refers to a missing block, so we
* can use it as an indication of allocation needed.
*/
- if (use_block != -1 && INT_GET(bestsp[use_block], ARCH_CONVERT) == NULLDATAOFF)
+ if (use_block != -1 && bestsp[use_block] == cpu_to_be16(NULLDATAOFF))
use_block = -1;
/*
* If we don't have enough free bytes but we can make enough
* by compacting out stale entries, we'll do that.
*/
- if ((char *)bestsp - (char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] < needbytes &&
- INT_GET(leaf->hdr.stale, ARCH_CONVERT) > 1) {
+ if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes &&
+ leafhdr.stale > 1)
compact = 1;
- }
+
/*
* Otherwise if we don't have enough free bytes we need to
* convert to node form.
*/
- else if ((char *)bestsp - (char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] <
- needbytes) {
+ else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) {
/*
* Just checking or no space reservation, give up.
*/
- if (args->justcheck || args->total == 0) {
- xfs_da_brelse(tp, lbp);
+ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
+ args->total == 0) {
+ xfs_trans_brelse(tp, lbp);
return XFS_ERROR(ENOSPC);
}
/*
* Convert to node form.
*/
error = xfs_dir2_leaf_to_node(args, lbp);
- xfs_da_buf_done(lbp);
if (error)
return error;
/*
@@ -304,8 +753,8 @@ xfs_dir2_leaf_addname(
* If just checking, then it will fit unless we needed to allocate
* a new data block.
*/
- if (args->justcheck) {
- xfs_da_brelse(tp, lbp);
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+ xfs_trans_brelse(tp, lbp);
return use_block == -1 ? XFS_ERROR(ENOSPC) : 0;
}
/*
@@ -313,7 +762,7 @@ xfs_dir2_leaf_addname(
* changed anything.
*/
if (args->total == 0 && use_block == -1) {
- xfs_da_brelse(tp, lbp);
+ xfs_trans_brelse(tp, lbp);
return XFS_ERROR(ENOSPC);
}
/*
@@ -323,15 +772,15 @@ xfs_dir2_leaf_addname(
* point later.
*/
if (compact) {
- xfs_dir2_leaf_compact_x1(lbp, &index, &lowstale, &highstale,
- &lfloglow, &lfloghigh);
+ xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+ &highstale, &lfloglow, &lfloghigh);
}
/*
* There are stale entries, so we'll need log-low and log-high
* impossibly bad values later.
*/
- else if (INT_GET(leaf->hdr.stale, ARCH_CONVERT)) {
- lfloglow = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+ else if (leafhdr.stale) {
+ lfloglow = leafhdr.count;
lfloghigh = -1;
}
/*
@@ -344,275 +793,146 @@ xfs_dir2_leaf_addname(
*/
if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE,
&use_block))) {
- xfs_da_brelse(tp, lbp);
+ xfs_trans_brelse(tp, lbp);
return error;
}
/*
* Initialize the block.
*/
- if ((error = xfs_dir2_data_init(args, use_block, &dbp))) {
- xfs_da_brelse(tp, lbp);
+ if ((error = xfs_dir3_data_init(args, use_block, &dbp))) {
+ xfs_trans_brelse(tp, lbp);
return error;
}
/*
* If we're adding a new data block on the end we need to
* extend the bests table. Copy it up one entry.
*/
- if (use_block >= INT_GET(ltp->bestcount, ARCH_CONVERT)) {
+ if (use_block >= be32_to_cpu(ltp->bestcount)) {
bestsp--;
memmove(&bestsp[0], &bestsp[1],
- INT_GET(ltp->bestcount, ARCH_CONVERT) * sizeof(bestsp[0]));
- INT_MOD(ltp->bestcount, ARCH_CONVERT, +1);
- xfs_dir2_leaf_log_tail(tp, lbp);
- xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
+ be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
+ be32_add_cpu(&ltp->bestcount, 1);
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0,
+ be32_to_cpu(ltp->bestcount) - 1);
}
/*
* If we're filling in a previously empty block just log it.
*/
else
- xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
- data = dbp->data;
- INT_COPY(bestsp[use_block], data->hdr.bestfree[0].length, ARCH_CONVERT);
+ xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
+ hdr = dbp->b_addr;
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ bestsp[use_block] = bf[0].length;
grown = 1;
- }
- /*
- * Already had space in some data block.
- * Just read that one in.
- */
- else {
- if ((error =
- xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, use_block),
- -1, &dbp, XFS_DATA_FORK))) {
- xfs_da_brelse(tp, lbp);
+ } else {
+ /*
+ * Already had space in some data block.
+ * Just read that one in.
+ */
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, use_block),
+ -1, &dbp);
+ if (error) {
+ xfs_trans_brelse(tp, lbp);
return error;
}
- data = dbp->data;
+ hdr = dbp->b_addr;
+ bf = dp->d_ops->data_bestfree_p(hdr);
grown = 0;
}
- xfs_dir2_data_check(dp, dbp);
/*
* Point to the biggest freespace in our data block.
*/
dup = (xfs_dir2_data_unused_t *)
- ((char *)data + INT_GET(data->hdr.bestfree[0].offset, ARCH_CONVERT));
- ASSERT(INT_GET(dup->length, ARCH_CONVERT) >= length);
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
+ ASSERT(be16_to_cpu(dup->length) >= length);
needscan = needlog = 0;
/*
* Mark the initial part of our freespace in use for the new entry.
*/
- xfs_dir2_data_use_free(tp, dbp, dup,
- (xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length,
+ xfs_dir2_data_use_free(args, dbp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
&needlog, &needscan);
/*
* Initialize our new entry (at last).
*/
dep = (xfs_dir2_data_entry_t *)dup;
- INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+ dep->inumber = cpu_to_be64(args->inumber);
dep->namelen = args->namelen;
memcpy(dep->name, args->name, dep->namelen);
- tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
- INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)data));
+ dp->d_ops->data_put_ftype(dep, args->filetype);
+ tagp = dp->d_ops->data_entry_tag_p(dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)hdr);
/*
* Need to scan fix up the bestfree table.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, data, &needlog, NULL);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
/*
* Need to log the data block's header.
*/
if (needlog)
- xfs_dir2_data_log_header(tp, dbp);
- xfs_dir2_data_log_entry(tp, dbp, dep);
+ xfs_dir2_data_log_header(args, dbp);
+ xfs_dir2_data_log_entry(args, dbp, dep);
/*
* If the bests table needs to be changed, do it.
* Log the change unless we've already done that.
*/
- if (INT_GET(bestsp[use_block], ARCH_CONVERT) != INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT)) {
- INT_COPY(bestsp[use_block], data->hdr.bestfree[0].length, ARCH_CONVERT);
+ if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) {
+ bestsp[use_block] = bf[0].length;
if (!grown)
- xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
- }
- /*
- * Now we need to make room to insert the leaf entry.
- * If there are no stale entries, we just insert a hole at index.
- */
- if (!leaf->hdr.stale) {
- /*
- * lep is still good as the index leaf entry.
- */
- if (index < INT_GET(leaf->hdr.count, ARCH_CONVERT))
- memmove(lep + 1, lep,
- (INT_GET(leaf->hdr.count, ARCH_CONVERT) - index) * sizeof(*lep));
- /*
- * Record low and high logging indices for the leaf.
- */
- lfloglow = index;
- lfloghigh = INT_GET(leaf->hdr.count, ARCH_CONVERT);
- INT_MOD(leaf->hdr.count, ARCH_CONVERT, +1);
- }
- /*
- * There are stale entries.
- * We will use one of them for the new entry.
- * It's probably not at the right location, so we'll have to
- * shift some up or down first.
- */
- else {
- /*
- * If we didn't compact before, we need to find the nearest
- * stale entries before and after our insertion point.
- */
- if (compact == 0) {
- /*
- * Find the first stale entry before the insertion
- * point, if any.
- */
- for (lowstale = index - 1;
- lowstale >= 0 &&
- INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) !=
- XFS_DIR2_NULL_DATAPTR;
- lowstale--)
- continue;
- /*
- * Find the next stale entry at or after the insertion
- * point, if any. Stop if we go so far that the
- * lowstale entry would be better.
- */
- for (highstale = index;
- highstale < INT_GET(leaf->hdr.count, ARCH_CONVERT) &&
- INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) !=
- XFS_DIR2_NULL_DATAPTR &&
- (lowstale < 0 ||
- index - lowstale - 1 >= highstale - index);
- highstale++)
- continue;
- }
- /*
- * If the low one is better, use it.
- */
- if (lowstale >= 0 &&
- (highstale == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
- index - lowstale - 1 < highstale - index)) {
- ASSERT(index - lowstale - 1 >= 0);
- ASSERT(INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) ==
- XFS_DIR2_NULL_DATAPTR);
- /*
- * Copy entries up to cover the stale entry
- * and make room for the new entry.
- */
- if (index - lowstale - 1 > 0)
- memmove(&leaf->ents[lowstale],
- &leaf->ents[lowstale + 1],
- (index - lowstale - 1) * sizeof(*lep));
- lep = &leaf->ents[index - 1];
- lfloglow = MIN(lowstale, lfloglow);
- lfloghigh = MAX(index - 1, lfloghigh);
- }
- /*
- * The high one is better, so use that one.
- */
- else {
- ASSERT(highstale - index >= 0);
- ASSERT(INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) ==
- XFS_DIR2_NULL_DATAPTR);
- /*
- * Copy entries down to copver the stale entry
- * and make room for the new entry.
- */
- if (highstale - index > 0)
- memmove(&leaf->ents[index + 1],
- &leaf->ents[index],
- (highstale - index) * sizeof(*lep));
- lep = &leaf->ents[index];
- lfloglow = MIN(index, lfloglow);
- lfloghigh = MAX(highstale, lfloghigh);
- }
- INT_MOD(leaf->hdr.stale, ARCH_CONVERT, -1);
+ xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
}
+
+ lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
+ highstale, &lfloglow, &lfloghigh);
+
/*
* Fill in the new leaf entry.
*/
- INT_SET(lep->hashval, ARCH_CONVERT, args->hashval);
- INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_DB_OFF_TO_DATAPTR(mp, use_block, INT_GET(*tagp, ARCH_CONVERT)));
+ lep->hashval = cpu_to_be32(args->hashval);
+ lep->address = cpu_to_be32(
+ xfs_dir2_db_off_to_dataptr(args->geo, use_block,
+ be16_to_cpu(*tagp)));
/*
* Log the leaf fields and give up the buffers.
*/
- xfs_dir2_leaf_log_header(tp, lbp);
- xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh);
- xfs_dir2_leaf_check(dp, lbp);
- xfs_da_buf_done(lbp);
- xfs_dir2_data_check(dp, dbp);
- xfs_da_buf_done(dbp);
+ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
+ xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh);
+ xfs_dir3_leaf_check(dp, lbp);
+ xfs_dir3_data_check(dp, dbp);
return 0;
}
-#ifdef DEBUG
-/*
- * Check the internal consistency of a leaf1 block.
- * Pop an assert if something is wrong.
- */
-void
-xfs_dir2_leaf_check(
- xfs_inode_t *dp, /* incore directory inode */
- xfs_dabuf_t *bp) /* leaf's buffer */
-{
- int i; /* leaf index */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
- xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */
- xfs_mount_t *mp; /* filesystem mount point */
- int stale; /* count of stale leaves */
-
- leaf = bp->data;
- mp = dp->i_mount;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
- /*
- * This value is not restrictive enough.
- * Should factor in the size of the bests table as well.
- * We can deduce a value for that from di_size.
- */
- ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) <= XFS_DIR2_MAX_LEAF_ENTS(mp));
- ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
- /*
- * Leaves and bests don't overlap.
- */
- ASSERT((char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] <=
- (char *)XFS_DIR2_LEAF_BESTS_P(ltp));
- /*
- * Check hash value order, count stale entries.
- */
- for (i = stale = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); i++) {
- if (i + 1 < INT_GET(leaf->hdr.count, ARCH_CONVERT))
- ASSERT(INT_GET(leaf->ents[i].hashval, ARCH_CONVERT) <=
- INT_GET(leaf->ents[i + 1].hashval, ARCH_CONVERT));
- if (INT_GET(leaf->ents[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
- stale++;
- }
- ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) == stale);
-}
-#endif /* DEBUG */
-
/*
* Compact out any stale entries in the leaf.
* Log the header and changed leaf entries, if any.
*/
void
-xfs_dir2_leaf_compact(
+xfs_dir3_leaf_compact(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t *bp) /* leaf buffer */
+ struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_buf *bp) /* leaf buffer */
{
int from; /* source leaf index */
xfs_dir2_leaf_t *leaf; /* leaf structure */
int loglow; /* first leaf entry to log */
int to; /* target leaf index */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_inode *dp = args->dp;
- leaf = bp->data;
- if (!leaf->hdr.stale) {
+ leaf = bp->b_addr;
+ if (!leafhdr->stale)
return;
- }
+
/*
* Compress out the stale entries in place.
*/
- for (from = to = 0, loglow = -1; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) {
- if (INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+ ents = dp->d_ops->leaf_ents_p(leaf);
+ for (from = to = 0, loglow = -1; from < leafhdr->count; from++) {
+ if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
continue;
/*
* Only actually copy the entries that are different.
@@ -620,19 +940,21 @@ xfs_dir2_leaf_compact(
if (from > to) {
if (loglow == -1)
loglow = to;
- leaf->ents[to] = leaf->ents[from];
+ ents[to] = ents[from];
}
to++;
}
/*
* Update and log the header, log the leaf entries.
*/
- ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) == from - to);
- INT_MOD(leaf->hdr.count, ARCH_CONVERT, -(INT_GET(leaf->hdr.stale, ARCH_CONVERT)));
- leaf->hdr.stale = 0;
- xfs_dir2_leaf_log_header(args->trans, bp);
+ ASSERT(leafhdr->stale == from - to);
+ leafhdr->count -= leafhdr->stale;
+ leafhdr->stale = 0;
+
+ dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr);
+ xfs_dir3_leaf_log_header(args, bp);
if (loglow != -1)
- xfs_dir2_leaf_log_ents(args->trans, bp, loglow, to - 1);
+ xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1);
}
/*
@@ -644,8 +966,9 @@ xfs_dir2_leaf_compact(
* and leaf logging indices.
*/
void
-xfs_dir2_leaf_compact_x1(
- xfs_dabuf_t *bp, /* leaf buffer */
+xfs_dir3_leaf_compact_x1(
+ struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents,
int *indexp, /* insertion index */
int *lowstalep, /* out: stale entry before us */
int *highstalep, /* out: stale entry after us */
@@ -656,37 +979,20 @@ xfs_dir2_leaf_compact_x1(
int highstale; /* stale entry at/after index */
int index; /* insertion index */
int keepstale; /* source index of kept stale */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
int lowstale; /* stale entry before index */
int newindex=0; /* new insertion index */
int to; /* destination copy index */
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) > 1);
+ ASSERT(leafhdr->stale > 1);
index = *indexp;
- /*
- * Find the first stale entry before our index, if any.
- */
- for (lowstale = index - 1;
- lowstale >= 0 &&
- INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR;
- lowstale--)
- continue;
- /*
- * Find the first stale entry at or after our index, if any.
- * Stop if the answer would be worse than lowstale.
- */
- for (highstale = index;
- highstale < INT_GET(leaf->hdr.count, ARCH_CONVERT) &&
- INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR &&
- (lowstale < 0 || index - lowstale > highstale - index);
- highstale++)
- continue;
+
+ xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale);
+
/*
* Pick the better of lowstale and highstale.
*/
if (lowstale >= 0 &&
- (highstale == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
+ (highstale == leafhdr->count ||
index - lowstale <= highstale - index))
keepstale = lowstale;
else
@@ -695,14 +1001,14 @@ xfs_dir2_leaf_compact_x1(
* Copy the entries in place, removing all the stale entries
* except keepstale.
*/
- for (from = to = 0; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) {
+ for (from = to = 0; from < leafhdr->count; from++) {
/*
* Notice the new value of index.
*/
if (index == from)
newindex = to;
if (from != keepstale &&
- INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) {
+ ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
if (from == to)
*lowlogp = to;
continue;
@@ -716,7 +1022,7 @@ xfs_dir2_leaf_compact_x1(
* Copy only the entries that have moved.
*/
if (from > to)
- leaf->ents[to] = leaf->ents[from];
+ ents[to] = ents[from];
to++;
}
ASSERT(from > to);
@@ -730,8 +1036,8 @@ xfs_dir2_leaf_compact_x1(
/*
* Adjust the leaf header values.
*/
- INT_MOD(leaf->hdr.count, ARCH_CONVERT, -(from - to));
- INT_SET(leaf->hdr.stale, ARCH_CONVERT, 1);
+ leafhdr->count -= from - to;
+ leafhdr->stale = 1;
/*
* Remember the low/high stale value only in the "right"
* direction.
@@ -739,479 +1045,35 @@ xfs_dir2_leaf_compact_x1(
if (lowstale >= newindex)
lowstale = -1;
else
- highstale = INT_GET(leaf->hdr.count, ARCH_CONVERT);
- *highlogp = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1;
+ highstale = leafhdr->count;
+ *highlogp = leafhdr->count - 1;
*lowstalep = lowstale;
*highstalep = highstale;
}
/*
- * Getdents (readdir) for leaf and node directories.
- * This reads the data blocks only, so is the same for both forms.
- */
-int /* error */
-xfs_dir2_leaf_getdents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *dp, /* incore directory inode */
- uio_t *uio, /* I/O control & vectors */
- int *eofp, /* out: reached end of dir */
- xfs_dirent_t *dbp, /* caller's buffer */
- xfs_dir2_put_t put) /* ABI formatting routine */
-{
- xfs_dabuf_t *bp; /* data block buffer */
- int byteoff; /* offset in current block */
- xfs_dir2_db_t curdb; /* db for current block */
- xfs_dir2_off_t curoff; /* current overall offset */
- xfs_dir2_data_t *data; /* data block structure */
- xfs_dir2_data_entry_t *dep; /* data entry */
- xfs_dir2_data_unused_t *dup; /* unused entry */
- int eof; /* reached end of directory */
- int error=0; /* error return value */
- int i; /* temporary loop index */
- int j; /* temporary loop index */
- int length; /* temporary length value */
- xfs_bmbt_irec_t *map; /* map vector for blocks */
- xfs_extlen_t map_blocks; /* number of fsbs in map */
- xfs_dablk_t map_off; /* last mapped file offset */
- int map_size; /* total entries in *map */
- int map_valid; /* valid entries in *map */
- xfs_mount_t *mp; /* filesystem mount point */
- xfs_dir2_off_t newoff; /* new curoff after new blk */
- int nmap; /* mappings to ask xfs_bmapi */
- xfs_dir2_put_args_t p; /* formatting arg bundle */
- char *ptr=NULL; /* pointer to current data */
- int ra_current; /* number of read-ahead blks */
- int ra_index; /* *map index for read-ahead */
- int ra_offset; /* map entry offset for ra */
- int ra_want; /* readahead count wanted */
-
- /*
- * If the offset is at or past the largest allowed value,
- * give up right away, return eof.
- */
- if (uio->uio_offset >= XFS_DIR2_MAX_DATAPTR) {
- *eofp = 1;
- return 0;
- }
- mp = dp->i_mount;
- /*
- * Setup formatting arguments.
- */
- p.dbp = dbp;
- p.put = put;
- p.uio = uio;
- /*
- * Set up to bmap a number of blocks based on the caller's
- * buffer size, the directory block size, and the filesystem
- * block size.
- */
- map_size =
- howmany(uio->uio_resid + mp->m_dirblksize,
- mp->m_sb.sb_blocksize);
- map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP);
- map_valid = ra_index = ra_offset = ra_current = map_blocks = 0;
- bp = NULL;
- eof = 1;
- /*
- * Inside the loop we keep the main offset value as a byte offset
- * in the directory file.
- */
- curoff = XFS_DIR2_DATAPTR_TO_BYTE(mp, uio->uio_offset);
- /*
- * Force this conversion through db so we truncate the offset
- * down to get the start of the data block.
- */
- map_off = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, curoff));
- /*
- * Loop over directory entries until we reach the end offset.
- * Get more blocks and readahead as necessary.
- */
- while (curoff < XFS_DIR2_LEAF_OFFSET) {
- /*
- * If we have no buffer, or we're off the end of the
- * current buffer, need to get another one.
- */
- if (!bp || ptr >= (char *)bp->data + mp->m_dirblksize) {
- /*
- * If we have a buffer, we need to release it and
- * take it out of the mapping.
- */
- if (bp) {
- xfs_da_brelse(tp, bp);
- bp = NULL;
- map_blocks -= mp->m_dirblkfsbs;
- /*
- * Loop to get rid of the extents for the
- * directory block.
- */
- for (i = mp->m_dirblkfsbs; i > 0; ) {
- j = MIN((int)map->br_blockcount, i);
- map->br_blockcount -= j;
- map->br_startblock += j;
- map->br_startoff += j;
- /*
- * If mapping is done, pitch it from
- * the table.
- */
- if (!map->br_blockcount && --map_valid)
- memmove(&map[0], &map[1],
- sizeof(map[0]) *
- map_valid);
- i -= j;
- }
- }
- /*
- * Recalculate the readahead blocks wanted.
- */
- ra_want = howmany(uio->uio_resid + mp->m_dirblksize,
- mp->m_sb.sb_blocksize) - 1;
- /*
- * If we don't have as many as we want, and we haven't
- * run out of data blocks, get some more mappings.
- */
- if (1 + ra_want > map_blocks &&
- map_off <
- XFS_DIR2_BYTE_TO_DA(mp, XFS_DIR2_LEAF_OFFSET)) {
- /*
- * Get more bmaps, fill in after the ones
- * we already have in the table.
- */
- nmap = map_size - map_valid;
- error = xfs_bmapi(tp, dp,
- map_off,
- XFS_DIR2_BYTE_TO_DA(mp,
- XFS_DIR2_LEAF_OFFSET) - map_off,
- XFS_BMAPI_METADATA, NULL, 0,
- &map[map_valid], &nmap, NULL);
- /*
- * Don't know if we should ignore this or
- * try to return an error.
- * The trouble with returning errors
- * is that readdir will just stop without
- * actually passing the error through.
- */
- if (error)
- break; /* XXX */
- /*
- * If we got all the mappings we asked for,
- * set the final map offset based on the
- * last bmap value received.
- * Otherwise, we've reached the end.
- */
- if (nmap == map_size - map_valid)
- map_off =
- map[map_valid + nmap - 1].br_startoff +
- map[map_valid + nmap - 1].br_blockcount;
- else
- map_off =
- XFS_DIR2_BYTE_TO_DA(mp,
- XFS_DIR2_LEAF_OFFSET);
- /*
- * Look for holes in the mapping, and
- * eliminate them. Count up the valid blocks.
- */
- for (i = map_valid; i < map_valid + nmap; ) {
- if (map[i].br_startblock ==
- HOLESTARTBLOCK) {
- nmap--;
- length = map_valid + nmap - i;
- if (length)
- memmove(&map[i],
- &map[i + 1],
- sizeof(map[i]) *
- length);
- } else {
- map_blocks +=
- map[i].br_blockcount;
- i++;
- }
- }
- map_valid += nmap;
- }
- /*
- * No valid mappings, so no more data blocks.
- */
- if (!map_valid) {
- curoff = XFS_DIR2_DA_TO_BYTE(mp, map_off);
- break;
- }
- /*
- * Read the directory block starting at the first
- * mapping.
- */
- curdb = XFS_DIR2_DA_TO_DB(mp, map->br_startoff);
- error = xfs_da_read_buf(tp, dp, map->br_startoff,
- map->br_blockcount >= mp->m_dirblkfsbs ?
- XFS_FSB_TO_DADDR(mp, map->br_startblock) :
- -1,
- &bp, XFS_DATA_FORK);
- /*
- * Should just skip over the data block instead
- * of giving up.
- */
- if (error)
- break; /* XXX */
- /*
- * Adjust the current amount of read-ahead: we just
- * read a block that was previously ra.
- */
- if (ra_current)
- ra_current -= mp->m_dirblkfsbs;
- /*
- * Do we need more readahead?
- */
- for (ra_index = ra_offset = i = 0;
- ra_want > ra_current && i < map_blocks;
- i += mp->m_dirblkfsbs) {
- ASSERT(ra_index < map_valid);
- /*
- * Read-ahead a contiguous directory block.
- */
- if (i > ra_current &&
- map[ra_index].br_blockcount >=
- mp->m_dirblkfsbs) {
- xfs_baread(mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(mp,
- map[ra_index].br_startblock +
- ra_offset),
- (int)BTOBB(mp->m_dirblksize));
- ra_current = i;
- }
- /*
- * Read-ahead a non-contiguous directory block.
- * This doesn't use our mapping, but this
- * is a very rare case.
- */
- else if (i > ra_current) {
- (void)xfs_da_reada_buf(tp, dp,
- map[ra_index].br_startoff +
- ra_offset, XFS_DATA_FORK);
- ra_current = i;
- }
- /*
- * Advance offset through the mapping table.
- */
- for (j = 0; j < mp->m_dirblkfsbs; j++) {
- /*
- * The rest of this extent but not
- * more than a dir block.
- */
- length = MIN(mp->m_dirblkfsbs,
- (int)(map[ra_index].br_blockcount -
- ra_offset));
- j += length;
- ra_offset += length;
- /*
- * Advance to the next mapping if
- * this one is used up.
- */
- if (ra_offset ==
- map[ra_index].br_blockcount) {
- ra_offset = 0;
- ra_index++;
- }
- }
- }
- /*
- * Having done a read, we need to set a new offset.
- */
- newoff = XFS_DIR2_DB_OFF_TO_BYTE(mp, curdb, 0);
- /*
- * Start of the current block.
- */
- if (curoff < newoff)
- curoff = newoff;
- /*
- * Make sure we're in the right block.
- */
- else if (curoff > newoff)
- ASSERT(XFS_DIR2_BYTE_TO_DB(mp, curoff) ==
- curdb);
- data = bp->data;
- xfs_dir2_data_check(dp, bp);
- /*
- * Find our position in the block.
- */
- ptr = (char *)&data->u;
- byteoff = XFS_DIR2_BYTE_TO_OFF(mp, curoff);
- /*
- * Skip past the header.
- */
- if (byteoff == 0)
- curoff += (uint)sizeof(data->hdr);
- /*
- * Skip past entries until we reach our offset.
- */
- else {
- while ((char *)ptr - (char *)data < byteoff) {
- dup = (xfs_dir2_data_unused_t *)ptr;
-
- if (INT_GET(dup->freetag, ARCH_CONVERT)
- == XFS_DIR2_DATA_FREE_TAG) {
-
- length = INT_GET(dup->length,
- ARCH_CONVERT);
- ptr += length;
- continue;
- }
- dep = (xfs_dir2_data_entry_t *)ptr;
- length =
- XFS_DIR2_DATA_ENTSIZE(dep->namelen);
- ptr += length;
- }
- /*
- * Now set our real offset.
- */
- curoff =
- XFS_DIR2_DB_OFF_TO_BYTE(mp,
- XFS_DIR2_BYTE_TO_DB(mp, curoff),
- (char *)ptr - (char *)data);
- if (ptr >= (char *)data + mp->m_dirblksize) {
- continue;
- }
- }
- }
- /*
- * We have a pointer to an entry.
- * Is it a live one?
- */
- dup = (xfs_dir2_data_unused_t *)ptr;
- /*
- * No, it's unused, skip over it.
- */
- if (INT_GET(dup->freetag, ARCH_CONVERT)
- == XFS_DIR2_DATA_FREE_TAG) {
- length = INT_GET(dup->length, ARCH_CONVERT);
- ptr += length;
- curoff += length;
- continue;
- }
-
- /*
- * Copy the entry into the putargs, and try formatting it.
- */
- dep = (xfs_dir2_data_entry_t *)ptr;
-
- p.namelen = dep->namelen;
-
- length = XFS_DIR2_DATA_ENTSIZE(p.namelen);
-
- p.cook = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff + length);
-
- p.ino = INT_GET(dep->inumber, ARCH_CONVERT);
-#if XFS_BIG_INUMS
- p.ino += mp->m_inoadd;
-#endif
- p.name = (char *)dep->name;
-
- error = p.put(&p);
-
- /*
- * Won't fit. Return to caller.
- */
- if (!p.done) {
- eof = 0;
- break;
- }
- /*
- * Advance to next entry in the block.
- */
- ptr += length;
- curoff += length;
- }
-
- /*
- * All done. Set output offset value to current offset.
- */
- *eofp = eof;
- if (curoff > XFS_DIR2_DATAPTR_TO_BYTE(mp, XFS_DIR2_MAX_DATAPTR))
- uio->uio_offset = XFS_DIR2_MAX_DATAPTR;
- else
- uio->uio_offset = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff);
- kmem_free(map, map_size * sizeof(*map));
- if (bp)
- xfs_da_brelse(tp, bp);
- return error;
-}
-
-/*
- * Initialize a new leaf block, leaf1 or leafn magic accepted.
- */
-int
-xfs_dir2_leaf_init(
- xfs_da_args_t *args, /* operation arguments */
- xfs_dir2_db_t bno, /* directory block number */
- xfs_dabuf_t **bpp, /* out: leaf buffer */
- int magic) /* magic number for block */
-{
- xfs_dabuf_t *bp; /* leaf buffer */
- xfs_inode_t *dp; /* incore directory inode */
- int error; /* error return code */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
- xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
- xfs_mount_t *mp; /* filesystem mount point */
- xfs_trans_t *tp; /* transaction pointer */
-
- dp = args->dp;
- ASSERT(dp != NULL);
- tp = args->trans;
- mp = dp->i_mount;
- ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) &&
- bno < XFS_DIR2_FREE_FIRSTDB(mp));
- /*
- * Get the buffer for the block.
- */
- error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, bno), -1, &bp,
- XFS_DATA_FORK);
- if (error) {
- return error;
- }
- ASSERT(bp != NULL);
- leaf = bp->data;
- /*
- * Initialize the header.
- */
- INT_SET(leaf->hdr.info.magic, ARCH_CONVERT, magic);
- leaf->hdr.info.forw = 0;
- leaf->hdr.info.back = 0;
- leaf->hdr.count = 0;
- leaf->hdr.stale = 0;
- xfs_dir2_leaf_log_header(tp, bp);
- /*
- * If it's a leaf-format directory initialize the tail.
- * In this case our caller has the real bests table to copy into
- * the block.
- */
- if (magic == XFS_DIR2_LEAF1_MAGIC) {
- ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
- ltp->bestcount = 0;
- xfs_dir2_leaf_log_tail(tp, bp);
- }
- *bpp = bp;
- return 0;
-}
-
-/*
* Log the bests entries indicated from a leaf1 block.
*/
static void
-xfs_dir2_leaf_log_bests(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp, /* leaf buffer */
+xfs_dir3_leaf_log_bests(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp, /* leaf buffer */
int first, /* first entry to log */
int last) /* last entry to log */
{
- xfs_dir2_data_off_t *firstb; /* pointer to first entry */
- xfs_dir2_data_off_t *lastb; /* pointer to last entry */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
+ __be16 *firstb; /* pointer to first entry */
+ __be16 *lastb; /* pointer to last entry */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
- ltp = XFS_DIR2_LEAF_TAIL_P(tp->t_mountp, leaf);
- firstb = XFS_DIR2_LEAF_BESTS_P(ltp) + first;
- lastb = XFS_DIR2_LEAF_BESTS_P(ltp) + last;
- xfs_da_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf),
+ ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC));
+
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ firstb = xfs_dir2_leaf_bests_p(ltp) + first;
+ lastb = xfs_dir2_leaf_bests_p(ltp) + last;
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)firstb - (char *)leaf),
(uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
}
@@ -1219,22 +1081,27 @@ xfs_dir2_leaf_log_bests(
* Log the leaf entries indicated from a leaf1 or leafn block.
*/
void
-xfs_dir2_leaf_log_ents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp, /* leaf buffer */
- int first, /* first entry to log */
- int last) /* last entry to log */
+xfs_dir3_leaf_log_ents(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
+ int first,
+ int last)
{
xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */
xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir2_leaf_entry *ents;
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC ||
- INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
- firstlep = &leaf->ents[first];
- lastlep = &leaf->ents[last];
- xfs_da_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf),
+ ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+ ents = args->dp->d_ops->leaf_ents_p(leaf);
+ firstlep = &ents[first];
+ lastlep = &ents[last];
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)firstlep - (char *)leaf),
(uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
}
@@ -1242,37 +1109,41 @@ xfs_dir2_leaf_log_ents(
* Log the header of the leaf1 or leafn block.
*/
void
-xfs_dir2_leaf_log_header(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp) /* leaf buffer */
+xfs_dir3_leaf_log_header(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp)
{
- xfs_dir2_leaf_t *leaf; /* leaf structure */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+
+ ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC ||
- INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
- xfs_da_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf),
- (uint)(sizeof(leaf->hdr) - 1));
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)&leaf->hdr - (char *)leaf),
+ args->dp->d_ops->leaf_hdr_size - 1);
}
/*
* Log the tail of the leaf1 block.
*/
STATIC void
-xfs_dir2_leaf_log_tail(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp) /* leaf buffer */
+xfs_dir3_leaf_log_tail(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp)
{
- xfs_dir2_leaf_t *leaf; /* leaf structure */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
- xfs_mount_t *mp; /* filesystem mount point */
- mp = tp->t_mountp;
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
- ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
- xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf),
- (uint)(mp->m_dirblksize - 1));
+ ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ xfs_trans_log_buf(args->trans, bp, (uint)((char *)ltp - (char *)leaf),
+ (uint)(args->geo->blksize - 1));
}
/*
@@ -1284,17 +1155,19 @@ int
xfs_dir2_leaf_lookup(
xfs_da_args_t *args) /* operation arguments */
{
- xfs_dabuf_t *dbp; /* data block buffer */
+ struct xfs_buf *dbp; /* data block buffer */
xfs_dir2_data_entry_t *dep; /* data block entry */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return code */
int index; /* found entry index */
- xfs_dabuf_t *lbp; /* leaf buffer */
+ struct xfs_buf *lbp; /* leaf buffer */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_leaf_entry *ents;
+
+ trace_xfs_dir2_leaf_lookup(args);
- xfs_dir2_trace_args("leaf_lookup", args);
/*
* Look up name in the leaf block, returning both buffers and index.
*/
@@ -1303,25 +1176,29 @@ xfs_dir2_leaf_lookup(
}
tp = args->trans;
dp = args->dp;
- xfs_dir2_leaf_check(dp, lbp);
- leaf = lbp->data;
+ xfs_dir3_leaf_check(dp, lbp);
+ leaf = lbp->b_addr;
+ ents = dp->d_ops->leaf_ents_p(leaf);
/*
* Get to the leaf entry and contained data entry address.
*/
- lep = &leaf->ents[index];
+ lep = &ents[index];
+
/*
* Point to the data entry.
*/
dep = (xfs_dir2_data_entry_t *)
- ((char *)dbp->data +
- XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, INT_GET(lep->address, ARCH_CONVERT)));
+ ((char *)dbp->b_addr +
+ xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
/*
- * Return the found inode number.
+ * Return the found inode number & CI name if appropriate
*/
- args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
- xfs_da_brelse(tp, dbp);
- xfs_da_brelse(tp, lbp);
- return XFS_ERROR(EEXIST);
+ args->inumber = be64_to_cpu(dep->inumber);
+ args->filetype = dp->d_ops->data_get_ftype(dep);
+ error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+ xfs_trans_brelse(tp, dbp);
+ xfs_trans_brelse(tp, lbp);
+ return XFS_ERROR(error);
}
/*
@@ -1333,37 +1210,41 @@ xfs_dir2_leaf_lookup(
static int /* error */
xfs_dir2_leaf_lookup_int(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t **lbpp, /* out: leaf buffer */
+ struct xfs_buf **lbpp, /* out: leaf buffer */
int *indexp, /* out: index in leaf block */
- xfs_dabuf_t **dbpp) /* out: data buffer */
+ struct xfs_buf **dbpp) /* out: data buffer */
{
- xfs_dir2_db_t curdb; /* current data block number */
- xfs_dabuf_t *dbp; /* data buffer */
+ xfs_dir2_db_t curdb = -1; /* current data block number */
+ struct xfs_buf *dbp = NULL; /* data buffer */
xfs_dir2_data_entry_t *dep; /* data entry */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return code */
int index; /* index in leaf block */
- xfs_dabuf_t *lbp; /* leaf buffer */
+ struct xfs_buf *lbp; /* leaf buffer */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_mount_t *mp; /* filesystem mount point */
xfs_dir2_db_t newdb; /* new data block number */
xfs_trans_t *tp; /* transaction pointer */
+ xfs_dir2_db_t cidb = -1; /* case match data block no. */
+ enum xfs_dacmp cmp; /* name compare result */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- /*
- * Read the leaf block into the buffer.
- */
- if ((error =
- xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
- XFS_DATA_FORK))) {
+
+ error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
+ if (error)
return error;
- }
+
*lbpp = lbp;
- leaf = lbp->data;
- xfs_dir2_leaf_check(dp, lbp);
+ leaf = lbp->b_addr;
+ xfs_dir3_leaf_check(dp, lbp);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
/*
* Look for the first leaf entry with our hash value.
*/
@@ -1372,59 +1253,86 @@ xfs_dir2_leaf_lookup_int(
* Loop over all the entries with the right hash value
* looking to match the name.
*/
- for (lep = &leaf->ents[index], dbp = NULL, curdb = -1;
- index < INT_GET(leaf->hdr.count, ARCH_CONVERT) && INT_GET(lep->hashval, ARCH_CONVERT) == args->hashval;
+ for (lep = &ents[index];
+ index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
lep++, index++) {
/*
* Skip over stale leaf entries.
*/
- if (INT_GET(lep->address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+ if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
continue;
/*
* Get the new data block number.
*/
- newdb = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
+ newdb = xfs_dir2_dataptr_to_db(args->geo,
+ be32_to_cpu(lep->address));
/*
* If it's not the same as the old data block number,
* need to pitch the old one and read the new one.
*/
if (newdb != curdb) {
if (dbp)
- xfs_da_brelse(tp, dbp);
- if ((error =
- xfs_da_read_buf(tp, dp,
- XFS_DIR2_DB_TO_DA(mp, newdb), -1, &dbp,
- XFS_DATA_FORK))) {
- xfs_da_brelse(tp, lbp);
+ xfs_trans_brelse(tp, dbp);
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, newdb),
+ -1, &dbp);
+ if (error) {
+ xfs_trans_brelse(tp, lbp);
return error;
}
- xfs_dir2_data_check(dp, dbp);
curdb = newdb;
}
/*
* Point to the data entry.
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)dbp->data +
- XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT)));
+ dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(lep->address)));
/*
- * If it matches then return it.
+ * Compare name and if it's an exact match, return the index
+ * and buffer. If it's the first case-insensitive match, store
+ * the index and buffer and continue looking for an exact match.
*/
- if (dep->namelen == args->namelen &&
- dep->name[0] == args->name[0] &&
- memcmp(dep->name, args->name, args->namelen) == 0) {
- *dbpp = dbp;
+ cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ args->cmpresult = cmp;
*indexp = index;
- return 0;
+ /* case exact match: return the current buffer. */
+ if (cmp == XFS_CMP_EXACT) {
+ *dbpp = dbp;
+ return 0;
+ }
+ cidb = curdb;
+ }
+ }
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ /*
+ * Here, we can only be doing a lookup (not a rename or remove).
+ * If a case-insensitive match was found earlier, re-read the
+ * appropriate data block if required and return it.
+ */
+ if (args->cmpresult == XFS_CMP_CASE) {
+ ASSERT(cidb != -1);
+ if (cidb != curdb) {
+ xfs_trans_brelse(tp, dbp);
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, cidb),
+ -1, &dbp);
+ if (error) {
+ xfs_trans_brelse(tp, lbp);
+ return error;
+ }
}
+ *dbpp = dbp;
+ return 0;
}
/*
* No match found, return ENOENT.
*/
- ASSERT(args->oknoent);
+ ASSERT(cidb == -1);
if (dbp)
- xfs_da_brelse(tp, dbp);
- xfs_da_brelse(tp, lbp);
+ xfs_trans_brelse(tp, dbp);
+ xfs_trans_brelse(tp, lbp);
return XFS_ERROR(ENOENT);
}
@@ -1435,16 +1343,16 @@ int /* error */
xfs_dir2_leaf_removename(
xfs_da_args_t *args) /* operation arguments */
{
- xfs_dir2_data_off_t *bestsp; /* leaf block best freespace */
- xfs_dir2_data_t *data; /* data block structure */
+ __be16 *bestsp; /* leaf block best freespace */
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_db_t db; /* data block number */
- xfs_dabuf_t *dbp; /* data block buffer */
+ struct xfs_buf *dbp; /* data block buffer */
xfs_dir2_data_entry_t *dep; /* data entry structure */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return code */
xfs_dir2_db_t i; /* temporary data block # */
int index; /* index into leaf entries */
- xfs_dabuf_t *lbp; /* leaf buffer */
+ struct xfs_buf *lbp; /* leaf buffer */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
@@ -1453,8 +1361,12 @@ xfs_dir2_leaf_removename(
int needscan; /* need to rescan data frees */
xfs_dir2_data_off_t oldbest; /* old value of best free */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ trace_xfs_dir2_leaf_removename(args);
- xfs_dir2_trace_args("leaf_removename", args);
/*
* Lookup the leaf entry, get the leaf and data blocks read in.
*/
@@ -1464,57 +1376,63 @@ xfs_dir2_leaf_removename(
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- leaf = lbp->data;
- data = dbp->data;
- xfs_dir2_data_check(dp, dbp);
+ leaf = lbp->b_addr;
+ hdr = dbp->b_addr;
+ xfs_dir3_data_check(dp, dbp);
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
/*
* Point to the leaf entry, use that to point to the data entry.
*/
- lep = &leaf->ents[index];
- db = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
- dep = (xfs_dir2_data_entry_t *)
- ((char *)data + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT)));
+ lep = &ents[index];
+ db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
needscan = needlog = 0;
- oldbest = INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT);
- ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
- bestsp = XFS_DIR2_LEAF_BESTS_P(ltp);
- ASSERT(INT_GET(bestsp[db], ARCH_CONVERT) == oldbest);
+ oldbest = be16_to_cpu(bf[0].length);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ ASSERT(be16_to_cpu(bestsp[db]) == oldbest);
/*
* Mark the former data entry unused.
*/
- xfs_dir2_data_make_free(tp, dbp,
- (xfs_dir2_data_aoff_t)((char *)dep - (char *)data),
- XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan);
+ xfs_dir2_data_make_free(args, dbp,
+ (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
+ dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
/*
* We just mark the leaf entry stale by putting a null in it.
*/
- INT_MOD(leaf->hdr.stale, ARCH_CONVERT, +1);
- xfs_dir2_leaf_log_header(tp, lbp);
- INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR);
- xfs_dir2_leaf_log_ents(tp, lbp, index, index);
+ leafhdr.stale++;
+ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
+
+ lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+ xfs_dir3_leaf_log_ents(args, lbp, index, index);
+
/*
* Scan the freespace in the data block again if necessary,
* log the data block header if necessary.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, data, &needlog, NULL);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, dbp);
+ xfs_dir2_data_log_header(args, dbp);
/*
* If the longest freespace in the data block has changed,
* put the new value in the bests table and log that.
*/
- if (INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) != oldbest) {
- INT_COPY(bestsp[db], data->hdr.bestfree[0].length, ARCH_CONVERT);
- xfs_dir2_leaf_log_bests(tp, lbp, db, db);
+ if (be16_to_cpu(bf[0].length) != oldbest) {
+ bestsp[db] = bf[0].length;
+ xfs_dir3_leaf_log_bests(args, lbp, db, db);
}
- xfs_dir2_data_check(dp, dbp);
+ xfs_dir3_data_check(dp, dbp);
/*
* If the data block is now empty then get rid of the data block.
*/
- if (INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) ==
- mp->m_dirblksize - (uint)sizeof(data->hdr)) {
- ASSERT(db != mp->m_dirdatablk);
+ if (be16_to_cpu(bf[0].length) ==
+ args->geo->blksize - dp->d_ops->data_entry_offset) {
+ ASSERT(db != args->geo->datablk);
if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
/*
* Nope, can't get rid of it because it caused
@@ -1522,12 +1440,9 @@ xfs_dir2_leaf_removename(
* Just go on, returning success, leaving the
* empty block in place.
*/
- if (error == ENOSPC && args->total == 0) {
- xfs_da_buf_done(dbp);
+ if (error == ENOSPC && args->total == 0)
error = 0;
- }
- xfs_dir2_leaf_check(dp, lbp);
- xfs_da_buf_done(lbp);
+ xfs_dir3_leaf_check(dp, lbp);
return error;
}
dbp = NULL;
@@ -1535,12 +1450,12 @@ xfs_dir2_leaf_removename(
* If this is the last data block then compact the
* bests table by getting rid of entries.
*/
- if (db == INT_GET(ltp->bestcount, ARCH_CONVERT) - 1) {
+ if (db == be32_to_cpu(ltp->bestcount) - 1) {
/*
* Look for the last active entry (i).
*/
for (i = db - 1; i > 0; i--) {
- if (INT_GET(bestsp[i], ARCH_CONVERT) != NULLDATAOFF)
+ if (bestsp[i] != cpu_to_be16(NULLDATAOFF))
break;
}
/*
@@ -1548,21 +1463,21 @@ xfs_dir2_leaf_removename(
* end are removed.
*/
memmove(&bestsp[db - i], bestsp,
- (INT_GET(ltp->bestcount, ARCH_CONVERT) - (db - i)) * sizeof(*bestsp));
- INT_MOD(ltp->bestcount, ARCH_CONVERT, -(db - i));
- xfs_dir2_leaf_log_tail(tp, lbp);
- xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
+ (be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
+ be32_add_cpu(&ltp->bestcount, -(db - i));
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0,
+ be32_to_cpu(ltp->bestcount) - 1);
} else
- INT_SET(bestsp[db], ARCH_CONVERT, NULLDATAOFF);
+ bestsp[db] = cpu_to_be16(NULLDATAOFF);
}
/*
* If the data block was not the first one, drop it.
*/
- else if (db != mp->m_dirdatablk && dbp != NULL) {
- xfs_da_buf_done(dbp);
+ else if (db != args->geo->datablk)
dbp = NULL;
- }
- xfs_dir2_leaf_check(dp, lbp);
+
+ xfs_dir3_leaf_check(dp, lbp);
/*
* See if we can convert to block form.
*/
@@ -1576,17 +1491,19 @@ int /* error */
xfs_dir2_leaf_replace(
xfs_da_args_t *args) /* operation arguments */
{
- xfs_dabuf_t *dbp; /* data block buffer */
+ struct xfs_buf *dbp; /* data block buffer */
xfs_dir2_data_entry_t *dep; /* data block entry */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return code */
int index; /* index of leaf entry */
- xfs_dabuf_t *lbp; /* leaf buffer */
+ struct xfs_buf *lbp; /* leaf buffer */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_leaf_entry *ents;
+
+ trace_xfs_dir2_leaf_replace(args);
- xfs_dir2_trace_args("leaf_replace", args);
/*
* Look up the entry.
*/
@@ -1594,27 +1511,28 @@ xfs_dir2_leaf_replace(
return error;
}
dp = args->dp;
- leaf = lbp->data;
+ leaf = lbp->b_addr;
+ ents = dp->d_ops->leaf_ents_p(leaf);
/*
* Point to the leaf entry, get data address from it.
*/
- lep = &leaf->ents[index];
+ lep = &ents[index];
/*
* Point to the data entry.
*/
dep = (xfs_dir2_data_entry_t *)
- ((char *)dbp->data +
- XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, INT_GET(lep->address, ARCH_CONVERT)));
- ASSERT(args->inumber != INT_GET(dep->inumber, ARCH_CONVERT));
+ ((char *)dbp->b_addr +
+ xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
+ ASSERT(args->inumber != be64_to_cpu(dep->inumber));
/*
* Put the new inode number in, log it.
*/
- INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+ dep->inumber = cpu_to_be64(args->inumber);
+ dp->d_ops->data_put_ftype(dep, args->filetype);
tp = args->trans;
- xfs_dir2_data_log_entry(tp, dbp, dep);
- xfs_da_buf_done(dbp);
- xfs_dir2_leaf_check(dp, lbp);
- xfs_da_brelse(tp, lbp);
+ xfs_dir2_data_log_entry(args, dbp, dep);
+ xfs_dir3_leaf_check(dp, lbp);
+ xfs_trans_brelse(tp, lbp);
return 0;
}
@@ -1626,7 +1544,7 @@ xfs_dir2_leaf_replace(
int /* index value */
xfs_dir2_leaf_search_hash(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t *lbp) /* leaf buffer */
+ struct xfs_buf *lbp) /* leaf buffer */
{
xfs_dahash_t hash=0; /* hash from this entry */
xfs_dahash_t hashwant; /* hash value looking for */
@@ -1635,21 +1553,22 @@ xfs_dir2_leaf_search_hash(
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
int mid=0; /* current leaf index */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ leaf = lbp->b_addr;
+ ents = args->dp->d_ops->leaf_ents_p(leaf);
+ args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- leaf = lbp->data;
-#ifndef __KERNEL__
- if (!leaf->hdr.count)
- return 0;
-#endif
/*
* Note, the table cannot be empty, so we have to go through the loop.
* Binary search the leaf entries looking for our hash value.
*/
- for (lep = leaf->ents, low = 0, high = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1,
+ for (lep = ents, low = 0, high = leafhdr.count - 1,
hashwant = args->hashval;
low <= high; ) {
mid = (low + high) >> 1;
- if ((hash = INT_GET(lep[mid].hashval, ARCH_CONVERT)) == hashwant)
+ if ((hash = be32_to_cpu(lep[mid].hashval)) == hashwant)
break;
if (hash < hashwant)
low = mid + 1;
@@ -1660,7 +1579,7 @@ xfs_dir2_leaf_search_hash(
* Found one, back up through all the equal hash values.
*/
if (hash == hashwant) {
- while (mid > 0 && INT_GET(lep[mid - 1].hashval, ARCH_CONVERT) == hashwant) {
+ while (mid > 0 && be32_to_cpu(lep[mid - 1].hashval) == hashwant) {
mid--;
}
}
@@ -1679,14 +1598,11 @@ xfs_dir2_leaf_search_hash(
int /* error */
xfs_dir2_leaf_trim_data(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t *lbp, /* leaf buffer */
+ struct xfs_buf *lbp, /* leaf buffer */
xfs_dir2_db_t db) /* data block number */
{
- xfs_dir2_data_off_t *bestsp; /* leaf bests table */
-#ifdef DEBUG
- xfs_dir2_data_t *data; /* data block structure */
-#endif
- xfs_dabuf_t *dbp; /* data block buffer */
+ __be16 *bestsp; /* leaf bests table */
+ struct xfs_buf *dbp; /* data block buffer */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return value */
xfs_dir2_leaf_t *leaf; /* leaf structure */
@@ -1700,43 +1616,66 @@ xfs_dir2_leaf_trim_data(
/*
* Read the offending data block. We need its buffer.
*/
- if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, db), -1, &dbp,
- XFS_DATA_FORK))) {
+ error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db),
+ -1, &dbp);
+ if (error)
return error;
- }
+
+ leaf = lbp->b_addr;
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+
#ifdef DEBUG
- data = dbp->data;
- ASSERT(INT_GET(data->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC);
+{
+ struct xfs_dir2_data_hdr *hdr = dbp->b_addr;
+ struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr);
+
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+ ASSERT(be16_to_cpu(bf[0].length) ==
+ args->geo->blksize - dp->d_ops->data_entry_offset);
+ ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
+}
#endif
- /* this seems to be an error
- * data is only valid if DEBUG is defined?
- * RMC 09/08/1999
- */
- leaf = lbp->data;
- ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
- ASSERT(INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) ==
- mp->m_dirblksize - (uint)sizeof(data->hdr));
- ASSERT(db == INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
/*
* Get rid of the data block.
*/
if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
ASSERT(error != ENOSPC);
- xfs_da_brelse(tp, dbp);
+ xfs_trans_brelse(tp, dbp);
return error;
}
/*
* Eliminate the last bests entry from the table.
*/
- bestsp = XFS_DIR2_LEAF_BESTS_P(ltp);
- INT_MOD(ltp->bestcount, ARCH_CONVERT, -1);
- memmove(&bestsp[1], &bestsp[0], INT_GET(ltp->bestcount, ARCH_CONVERT) * sizeof(*bestsp));
- xfs_dir2_leaf_log_tail(tp, lbp);
- xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ be32_add_cpu(&ltp->bestcount, -1);
+ memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
return 0;
}
+static inline size_t
+xfs_dir3_leaf_size(
+ struct xfs_dir3_icleaf_hdr *hdr,
+ int counts)
+{
+ int entries;
+ int hdrsize;
+
+ entries = hdr->count - hdr->stale;
+ if (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+ hdr->magic == XFS_DIR2_LEAFN_MAGIC)
+ hdrsize = sizeof(struct xfs_dir2_leaf_hdr);
+ else
+ hdrsize = sizeof(struct xfs_dir3_leaf_hdr);
+
+ return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t)
+ + counts * sizeof(xfs_dir2_data_off_t)
+ + sizeof(xfs_dir2_leaf_tail_t);
+}
+
/*
* Convert node form directory to leaf form directory.
* The root of the node form dir needs to already be a LEAFN block.
@@ -1749,15 +1688,17 @@ xfs_dir2_node_to_leaf(
xfs_da_args_t *args; /* operation arguments */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return code */
- xfs_dabuf_t *fbp; /* buffer for freespace block */
+ struct xfs_buf *fbp; /* buffer for freespace block */
xfs_fileoff_t fo; /* freespace file offset */
xfs_dir2_free_t *free; /* freespace structure */
- xfs_dabuf_t *lbp; /* buffer for leaf block */
+ struct xfs_buf *lbp; /* buffer for leaf block */
xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_mount_t *mp; /* filesystem mount point */
int rval; /* successful free trim? */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir3_icfree_hdr freehdr;
/*
* There's more than a leaf level in the btree, so there must
@@ -1766,29 +1707,31 @@ xfs_dir2_node_to_leaf(
if (state->path.active > 1)
return 0;
args = state->args;
- xfs_dir2_trace_args("node_to_leaf", args);
+
+ trace_xfs_dir2_node_to_leaf(args);
+
mp = state->mp;
dp = args->dp;
tp = args->trans;
/*
* Get the last offset in the file.
*/
- if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) {
+ if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) {
return error;
}
- fo -= mp->m_dirblkfsbs;
+ fo -= args->geo->fsbcount;
/*
* If there are freespace blocks other than the first one,
* take this opportunity to remove trailing empty freespace blocks
* that may have been left behind during no-space-reservation
* operations.
*/
- while (fo > mp->m_dirfreeblk) {
+ while (fo > args->geo->freeblk) {
if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
return error;
}
if (rval)
- fo -= mp->m_dirblkfsbs;
+ fo -= args->geo->fsbcount;
else
return 0;
}
@@ -1801,59 +1744,71 @@ xfs_dir2_node_to_leaf(
/*
* If it's not the single leaf block, give up.
*/
- if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize)
+ if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize)
return 0;
lbp = state->path.blk[0].bp;
- leaf = lbp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+ leaf = lbp->b_addr;
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+
/*
* Read the freespace block.
*/
- if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp,
- XFS_DATA_FORK))) {
+ error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp);
+ if (error)
return error;
- }
- free = fbp->data;
- ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
- ASSERT(!free->hdr.firstdb);
+ free = fbp->b_addr;
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+ ASSERT(!freehdr.firstdb);
+
/*
* Now see if the leafn and free data will fit in a leaf1.
* If not, release the buffer and give up.
*/
- if ((uint)sizeof(leaf->hdr) +
- (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT)) * (uint)sizeof(leaf->ents[0]) +
- INT_GET(free->hdr.nvalid, ARCH_CONVERT) * (uint)sizeof(leaf->bests[0]) +
- (uint)sizeof(leaf->tail) >
- mp->m_dirblksize) {
- xfs_da_brelse(tp, fbp);
+ if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) {
+ xfs_trans_brelse(tp, fbp);
return 0;
}
+
/*
* If the leaf has any stale entries in it, compress them out.
- * The compact routine will log the header.
*/
- if (INT_GET(leaf->hdr.stale, ARCH_CONVERT))
- xfs_dir2_leaf_compact(args, lbp);
- else
- xfs_dir2_leaf_log_header(tp, lbp);
- INT_SET(leaf->hdr.info.magic, ARCH_CONVERT, XFS_DIR2_LEAF1_MAGIC);
+ if (leafhdr.stale)
+ xfs_dir3_leaf_compact(args, &leafhdr, lbp);
+
+ lbp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF);
+ leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC)
+ ? XFS_DIR2_LEAF1_MAGIC
+ : XFS_DIR3_LEAF1_MAGIC;
+
/*
* Set up the leaf tail from the freespace block.
*/
- ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
- INT_COPY(ltp->bestcount, free->hdr.nvalid, ARCH_CONVERT);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ ltp->bestcount = cpu_to_be32(freehdr.nvalid);
+
/*
* Set up the leaf bests table.
*/
- memcpy(XFS_DIR2_LEAF_BESTS_P(ltp), free->bests,
- INT_GET(ltp->bestcount, ARCH_CONVERT) * sizeof(leaf->bests[0]));
- xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
- xfs_dir2_leaf_log_tail(tp, lbp);
- xfs_dir2_leaf_check(dp, lbp);
+ memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free),
+ freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
+
+ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_check(dp, lbp);
+
/*
* Get rid of the freespace block.
*/
- error = xfs_dir2_shrink_inode(args, XFS_DIR2_FREE_FIRSTDB(mp), fbp);
+ error = xfs_dir2_shrink_inode(args,
+ xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET),
+ fbp);
if (error) {
/*
* This can't fail here because it can only happen when
diff --git a/fs/xfs/xfs_dir2_leaf.h b/fs/xfs/xfs_dir2_leaf.h
deleted file mode 100644
index 1393993d61e..00000000000
--- a/fs/xfs/xfs_dir2_leaf.h
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_DIR2_LEAF_H__
-#define __XFS_DIR2_LEAF_H__
-
-struct uio;
-struct xfs_dabuf;
-struct xfs_da_args;
-struct xfs_inode;
-struct xfs_mount;
-struct xfs_trans;
-
-/*
- * Offset of the leaf/node space. First block in this space
- * is the btree root.
- */
-#define XFS_DIR2_LEAF_SPACE 1
-#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
-#define XFS_DIR2_LEAF_FIRSTDB(mp) \
- XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_LEAF_OFFSET)
-
-/*
- * Offset in data space of a data entry.
- */
-typedef __uint32_t xfs_dir2_dataptr_t;
-#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff)
-#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0)
-
-/*
- * Leaf block header.
- */
-typedef struct xfs_dir2_leaf_hdr {
- xfs_da_blkinfo_t info; /* header for da routines */
- __uint16_t count; /* count of entries */
- __uint16_t stale; /* count of stale entries */
-} xfs_dir2_leaf_hdr_t;
-
-/*
- * Leaf block entry.
- */
-typedef struct xfs_dir2_leaf_entry {
- xfs_dahash_t hashval; /* hash value of name */
- xfs_dir2_dataptr_t address; /* address of data entry */
-} xfs_dir2_leaf_entry_t;
-
-/*
- * Leaf block tail.
- */
-typedef struct xfs_dir2_leaf_tail {
- __uint32_t bestcount;
-} xfs_dir2_leaf_tail_t;
-
-/*
- * Leaf block.
- * bests and tail are at the end of the block for single-leaf only
- * (magic = XFS_DIR2_LEAF1_MAGIC not XFS_DIR2_LEAFN_MAGIC).
- */
-typedef struct xfs_dir2_leaf {
- xfs_dir2_leaf_hdr_t hdr; /* leaf header */
- xfs_dir2_leaf_entry_t ents[1]; /* entries */
- /* ... */
- xfs_dir2_data_off_t bests[1]; /* best free counts */
- xfs_dir2_leaf_tail_t tail; /* leaf tail */
-} xfs_dir2_leaf_t;
-
-/*
- * DB blocks here are logical directory block numbers, not filesystem blocks.
- */
-
-#define XFS_DIR2_MAX_LEAF_ENTS(mp) xfs_dir2_max_leaf_ents(mp)
-static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp)
-{
- return (int)(((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_leaf_hdr_t)) /
- (uint)sizeof(xfs_dir2_leaf_entry_t));
-}
-
-/*
- * Get address of the bestcount field in the single-leaf block.
- */
-#define XFS_DIR2_LEAF_TAIL_P(mp,lp) xfs_dir2_leaf_tail_p(mp, lp)
-static inline xfs_dir2_leaf_tail_t *
-xfs_dir2_leaf_tail_p(struct xfs_mount *mp, xfs_dir2_leaf_t *lp)
-{
- return (xfs_dir2_leaf_tail_t *)
- ((char *)(lp) + (mp)->m_dirblksize -
- (uint)sizeof(xfs_dir2_leaf_tail_t));
-}
-
-/*
- * Get address of the bests array in the single-leaf block.
- */
-#define XFS_DIR2_LEAF_BESTS_P(ltp) xfs_dir2_leaf_bests_p(ltp)
-static inline xfs_dir2_data_off_t *
-xfs_dir2_leaf_bests_p(xfs_dir2_leaf_tail_t *ltp)
-{
- return (xfs_dir2_data_off_t *)
- (ltp) - INT_GET((ltp)->bestcount, ARCH_CONVERT);
-}
-
-/*
- * Convert dataptr to byte in file space
- */
-#define XFS_DIR2_DATAPTR_TO_BYTE(mp,dp) xfs_dir2_dataptr_to_byte(mp, dp)
-static inline xfs_dir2_off_t
-xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
-{
- return (xfs_dir2_off_t)(dp) << XFS_DIR2_DATA_ALIGN_LOG;
-}
-
-/*
- * Convert byte in file space to dataptr. It had better be aligned.
- */
-#define XFS_DIR2_BYTE_TO_DATAPTR(mp,by) xfs_dir2_byte_to_dataptr(mp,by)
-static inline xfs_dir2_dataptr_t
-xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
- return (xfs_dir2_dataptr_t)((by) >> XFS_DIR2_DATA_ALIGN_LOG);
-}
-
-/*
- * Convert byte in space to (DB) block
- */
-#define XFS_DIR2_BYTE_TO_DB(mp,by) xfs_dir2_byte_to_db(mp, by)
-static inline xfs_dir2_db_t
-xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
- return (xfs_dir2_db_t)((by) >> \
- ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog));
-}
-
-/*
- * Convert dataptr to a block number
- */
-#define XFS_DIR2_DATAPTR_TO_DB(mp,dp) xfs_dir2_dataptr_to_db(mp, dp)
-static inline xfs_dir2_db_t
-xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
-{
- return XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp));
-}
-
-/*
- * Convert byte in space to offset in a block
- */
-#define XFS_DIR2_BYTE_TO_OFF(mp,by) xfs_dir2_byte_to_off(mp, by)
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
- return (xfs_dir2_data_aoff_t)((by) & \
- ((1 << ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)) - 1));
-}
-
-/*
- * Convert dataptr to a byte offset in a block
- */
-#define XFS_DIR2_DATAPTR_TO_OFF(mp,dp) xfs_dir2_dataptr_to_off(mp, dp)
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
-{
- return XFS_DIR2_BYTE_TO_OFF(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp));
-}
-
-/*
- * Convert block and offset to byte in space
- */
-#define XFS_DIR2_DB_OFF_TO_BYTE(mp,db,o) \
- xfs_dir2_db_off_to_byte(mp, db, o)
-static inline xfs_dir2_off_t
-xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db,
- xfs_dir2_data_aoff_t o)
-{
- return ((xfs_dir2_off_t)(db) << \
- ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)) + (o);
-}
-
-/*
- * Convert block (DB) to block (dablk)
- */
-#define XFS_DIR2_DB_TO_DA(mp,db) xfs_dir2_db_to_da(mp, db)
-static inline xfs_dablk_t
-xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db)
-{
- return (xfs_dablk_t)((db) << (mp)->m_sb.sb_dirblklog);
-}
-
-/*
- * Convert byte in space to (DA) block
- */
-#define XFS_DIR2_BYTE_TO_DA(mp,by) xfs_dir2_byte_to_da(mp, by)
-static inline xfs_dablk_t
-xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
- return XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, by));
-}
-
-/*
- * Convert block and offset to dataptr
- */
-#define XFS_DIR2_DB_OFF_TO_DATAPTR(mp,db,o) \
- xfs_dir2_db_off_to_dataptr(mp, db, o)
-static inline xfs_dir2_dataptr_t
-xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db,
- xfs_dir2_data_aoff_t o)
-{
- return XFS_DIR2_BYTE_TO_DATAPTR(mp, XFS_DIR2_DB_OFF_TO_BYTE(mp, db, o));
-}
-
-/*
- * Convert block (dablk) to block (DB)
- */
-#define XFS_DIR2_DA_TO_DB(mp,da) xfs_dir2_da_to_db(mp, da)
-static inline xfs_dir2_db_t
-xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da)
-{
- return (xfs_dir2_db_t)((da) >> (mp)->m_sb.sb_dirblklog);
-}
-
-/*
- * Convert block (dablk) to byte offset in space
- */
-#define XFS_DIR2_DA_TO_BYTE(mp,da) xfs_dir2_da_to_byte(mp, da)
-static inline xfs_dir2_off_t
-xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da)
-{
- return XFS_DIR2_DB_OFF_TO_BYTE(mp, XFS_DIR2_DA_TO_DB(mp, da), 0);
-}
-
-/*
- * Function declarations.
- */
-extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
- struct xfs_dabuf *dbp);
-extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
-extern void xfs_dir2_leaf_compact(struct xfs_da_args *args,
- struct xfs_dabuf *bp);
-extern void xfs_dir2_leaf_compact_x1(struct xfs_dabuf *bp, int *indexp,
- int *lowstalep, int *highstalep,
- int *lowlogp, int *highlogp);
-extern int xfs_dir2_leaf_getdents(struct xfs_trans *tp, struct xfs_inode *dp,
- struct uio *uio, int *eofp,
- struct xfs_dirent *dbp, xfs_dir2_put_t put);
-extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno,
- struct xfs_dabuf **bpp, int magic);
-extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_dabuf *bp,
- int first, int last);
-extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp,
- struct xfs_dabuf *bp);
-extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
-extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
-extern int xfs_dir2_leaf_replace(struct xfs_da_args *args);
-extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
- struct xfs_dabuf *lbp);
-extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args,
- struct xfs_dabuf *lbp, xfs_dir2_db_t db);
-extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
-
-#endif /* __XFS_DIR2_LEAF_H__ */
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 641f8633d25..da43d304fca 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -17,70 +18,245 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
+#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
-#include "xfs_dir2_data.h"
-#include "xfs_dir2_leaf.h"
-#include "xfs_dir2_block.h"
-#include "xfs_dir2_node.h"
-#include "xfs_dir2_trace.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
/*
* Function declarations.
*/
-static void xfs_dir2_free_log_header(xfs_trans_t *tp, xfs_dabuf_t *bp);
-static int xfs_dir2_leafn_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index);
-#ifdef DEBUG
-static void xfs_dir2_leafn_check(xfs_inode_t *dp, xfs_dabuf_t *bp);
-#else
-#define xfs_dir2_leafn_check(dp, bp)
-#endif
-static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, xfs_dabuf_t *bp_s,
- int start_s, xfs_dabuf_t *bp_d, int start_d,
- int count);
+static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args,
+ int index);
static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
xfs_da_state_blk_t *blk1,
xfs_da_state_blk_t *blk2);
-static int xfs_dir2_leafn_remove(xfs_da_args_t *args, xfs_dabuf_t *bp,
+static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
int index, xfs_da_state_blk_t *dblk,
int *rval);
static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
xfs_da_state_blk_t *fblk);
/*
+ * Check internal consistency of a leafn block.
+ */
+#ifdef DEBUG
+#define xfs_dir3_leaf_check(dp, bp) \
+do { \
+ if (!xfs_dir3_leafn_check((dp), (bp))) \
+ ASSERT(0); \
+} while (0);
+
+static bool
+xfs_dir3_leafn_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+ if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
+ struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+ if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+ return false;
+ } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
+ return false;
+
+ return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
+}
+#else
+#define xfs_dir3_leaf_check(dp, bp)
+#endif
+
+static bool
+xfs_dir3_free_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
+ return false;
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ return false;
+ } else {
+ if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
+ return false;
+ }
+
+ /* XXX: should bounds check the xfs_dir3_icfree_hdr here */
+
+ return true;
+}
+
+static void
+xfs_dir3_free_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_dir3_free_verify(bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
+}
+
+static void
+xfs_dir3_free_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_dir3_free_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
+ .verify_read = xfs_dir3_free_read_verify,
+ .verify_write = xfs_dir3_free_write_verify,
+};
+
+
+static int
+__xfs_dir3_free_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp)
+{
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+ XFS_DATA_FORK, &xfs_dir3_free_buf_ops);
+
+ /* try read returns without an error or *bpp if it lands in a hole */
+ if (!err && tp && *bpp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF);
+ return err;
+}
+
+int
+xfs_dir2_free_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ struct xfs_buf **bpp)
+{
+ return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp);
+}
+
+static int
+xfs_dir2_free_try_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ struct xfs_buf **bpp)
+{
+ return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp);
+}
+
+static int
+xfs_dir3_free_get_buf(
+ xfs_da_args_t *args,
+ xfs_dir2_db_t fbno,
+ struct xfs_buf **bpp)
+{
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ int error;
+ struct xfs_dir3_icfree_hdr hdr;
+
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno),
+ -1, &bp, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF);
+ bp->b_ops = &xfs_dir3_free_buf_ops;
+
+ /*
+ * Initialize the new block to be empty, and remember
+ * its first slot as our empty slot.
+ */
+ memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr));
+ memset(&hdr, 0, sizeof(hdr));
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
+
+ hdr.magic = XFS_DIR3_FREE_MAGIC;
+
+ hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
+ hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
+ } else
+ hdr.magic = XFS_DIR2_FREE_MAGIC;
+ dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr);
+ *bpp = bp;
+ return 0;
+}
+
+/*
* Log entries from a freespace block.
*/
-void
+STATIC void
xfs_dir2_free_log_bests(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp, /* freespace buffer */
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
int first, /* first entry to log */
int last) /* last entry to log */
{
xfs_dir2_free_t *free; /* freespace structure */
+ __be16 *bests;
- free = bp->data;
- ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
- xfs_da_log_buf(tp, bp,
- (uint)((char *)&free->bests[first] - (char *)free),
- (uint)((char *)&free->bests[last] - (char *)free +
- sizeof(free->bests[0]) - 1));
+ free = bp->b_addr;
+ bests = args->dp->d_ops->free_bests_p(free);
+ ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+ free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)&bests[first] - (char *)free),
+ (uint)((char *)&bests[last] - (char *)free +
+ sizeof(bests[0]) - 1));
}
/*
@@ -88,15 +264,18 @@ xfs_dir2_free_log_bests(
*/
static void
xfs_dir2_free_log_header(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_dabuf_t *bp) /* freespace buffer */
+ struct xfs_da_args *args,
+ struct xfs_buf *bp)
{
+#ifdef DEBUG
xfs_dir2_free_t *free; /* freespace structure */
- free = bp->data;
- ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
- xfs_da_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free),
- (uint)(sizeof(xfs_dir2_free_hdr_t) - 1));
+ free = bp->b_addr;
+ ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+ free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+#endif
+ xfs_trans_log_buf(args->trans, bp, 0,
+ args->dp->d_ops->free_hdr_size - 1);
}
/*
@@ -107,24 +286,26 @@ xfs_dir2_free_log_header(
int /* error */
xfs_dir2_leaf_to_node(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t *lbp) /* leaf buffer */
+ struct xfs_buf *lbp) /* leaf buffer */
{
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return value */
- xfs_dabuf_t *fbp; /* freespace buffer */
+ struct xfs_buf *fbp; /* freespace buffer */
xfs_dir2_db_t fdb; /* freespace block number */
xfs_dir2_free_t *free; /* freespace structure */
- xfs_dir2_data_off_t *from; /* pointer to freespace entry */
+ __be16 *from; /* pointer to freespace entry */
int i; /* leaf freespace index */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
xfs_mount_t *mp; /* filesystem mount point */
int n; /* count of live freespc ents */
xfs_dir2_data_off_t off; /* freespace entry value */
- xfs_dir2_data_off_t *to; /* pointer to freespace entry */
+ __be16 *to; /* pointer to freespace entry */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icfree_hdr freehdr;
+
+ trace_xfs_dir2_leaf_to_node(args);
- xfs_dir2_trace_args_b("leaf_to_node", args, lbp);
dp = args->dp;
mp = dp->i_mount;
tp = args->trans;
@@ -134,45 +315,57 @@ xfs_dir2_leaf_to_node(
if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) {
return error;
}
- ASSERT(fdb == XFS_DIR2_FREE_FIRSTDB(mp));
+ ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
/*
* Get the buffer for the new freespace block.
*/
- if ((error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb), -1, &fbp,
- XFS_DATA_FORK))) {
+ error = xfs_dir3_free_get_buf(args, fdb, &fbp);
+ if (error)
return error;
- }
- ASSERT(fbp != NULL);
- free = fbp->data;
- leaf = lbp->data;
- ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
- /*
- * Initialize the freespace block header.
- */
- INT_SET(free->hdr.magic, ARCH_CONVERT, XFS_DIR2_FREE_MAGIC);
- free->hdr.firstdb = 0;
- ASSERT(INT_GET(ltp->bestcount, ARCH_CONVERT) <= (uint)dp->i_d.di_size / mp->m_dirblksize);
- INT_COPY(free->hdr.nvalid, ltp->bestcount, ARCH_CONVERT);
+
+ free = fbp->b_addr;
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ leaf = lbp->b_addr;
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ ASSERT(be32_to_cpu(ltp->bestcount) <=
+ (uint)dp->i_d.di_size / args->geo->blksize);
+
/*
* Copy freespace entries from the leaf block to the new block.
* Count active entries.
*/
- for (i = n = 0, from = XFS_DIR2_LEAF_BESTS_P(ltp), to = free->bests;
- i < INT_GET(ltp->bestcount, ARCH_CONVERT); i++, from++, to++) {
- if ((off = INT_GET(*from, ARCH_CONVERT)) != NULLDATAOFF)
+ from = xfs_dir2_leaf_bests_p(ltp);
+ to = dp->d_ops->free_bests_p(free);
+ for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
+ if ((off = be16_to_cpu(*from)) != NULLDATAOFF)
n++;
- INT_SET(*to, ARCH_CONVERT, off);
+ *to = cpu_to_be16(off);
}
- INT_SET(free->hdr.nused, ARCH_CONVERT, n);
- INT_SET(leaf->hdr.info.magic, ARCH_CONVERT, XFS_DIR2_LEAFN_MAGIC);
+
+ /*
+ * Now initialize the freespace block header.
+ */
+ freehdr.nused = n;
+ freehdr.nvalid = be32_to_cpu(ltp->bestcount);
+
+ dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+ xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1);
+ xfs_dir2_free_log_header(args, fbp);
+
/*
- * Log everything.
+ * Converting the leaf to a leafnode is just a matter of changing the
+ * magic number and the ops. Do the change directly to the buffer as
+ * it's less work (and less code) than decoding the header to host
+ * format and back again.
*/
- xfs_dir2_leaf_log_header(tp, lbp);
- xfs_dir2_free_log_header(tp, fbp);
- xfs_dir2_free_log_bests(tp, fbp, 0, INT_GET(free->hdr.nvalid, ARCH_CONVERT) - 1);
- xfs_da_buf_done(fbp);
- xfs_dir2_leafn_check(dp, lbp);
+ if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC))
+ leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
+ else
+ leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+ lbp->b_ops = &xfs_dir3_leafn_buf_ops;
+ xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF);
+ xfs_dir3_leaf_log_header(args, lbp);
+ xfs_dir3_leaf_check(dp, lbp);
return 0;
}
@@ -182,7 +375,7 @@ xfs_dir2_leaf_to_node(
*/
static int /* error */
xfs_dir2_leafn_add(
- xfs_dabuf_t *bp, /* leaf buffer */
+ struct xfs_buf *bp, /* leaf buffer */
xfs_da_args_t *args, /* operation arguments */
int index) /* insertion pt for new entry */
{
@@ -196,12 +389,17 @@ xfs_dir2_leafn_add(
int lowstale; /* previous stale entry */
xfs_mount_t *mp; /* filesystem mount point */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
+
+ trace_xfs_dir2_leafn_add(args, index);
- xfs_dir2_trace_args_sb("leafn_add", args, index, bp);
dp = args->dp;
mp = dp->i_mount;
tp = args->trans;
- leaf = bp->data;
+ leaf = bp->b_addr;
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
/*
* Quick check just to make sure we are not going to index
@@ -217,153 +415,69 @@ xfs_dir2_leafn_add(
* a compact.
*/
- if (INT_GET(leaf->hdr.count, ARCH_CONVERT) == XFS_DIR2_MAX_LEAF_ENTS(mp)) {
- if (!leaf->hdr.stale)
+ if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) {
+ if (!leafhdr.stale)
return XFS_ERROR(ENOSPC);
- compact = INT_GET(leaf->hdr.stale, ARCH_CONVERT) > 1;
+ compact = leafhdr.stale > 1;
} else
compact = 0;
- ASSERT(index == 0 || INT_GET(leaf->ents[index - 1].hashval, ARCH_CONVERT) <= args->hashval);
- ASSERT(index == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
- INT_GET(leaf->ents[index].hashval, ARCH_CONVERT) >= args->hashval);
+ ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval);
+ ASSERT(index == leafhdr.count ||
+ be32_to_cpu(ents[index].hashval) >= args->hashval);
- if (args->justcheck)
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK)
return 0;
/*
* Compact out all but one stale leaf entry. Leaves behind
* the entry closest to index.
*/
- if (compact) {
- xfs_dir2_leaf_compact_x1(bp, &index, &lowstale, &highstale,
- &lfloglow, &lfloghigh);
- }
- /*
- * Set impossible logging indices for this case.
- */
- else if (leaf->hdr.stale) {
- lfloglow = INT_GET(leaf->hdr.count, ARCH_CONVERT);
- lfloghigh = -1;
- }
- /*
- * No stale entries, just insert a space for the new entry.
- */
- if (!leaf->hdr.stale) {
- lep = &leaf->ents[index];
- if (index < INT_GET(leaf->hdr.count, ARCH_CONVERT))
- memmove(lep + 1, lep,
- (INT_GET(leaf->hdr.count, ARCH_CONVERT) - index) * sizeof(*lep));
- lfloglow = index;
- lfloghigh = INT_GET(leaf->hdr.count, ARCH_CONVERT);
- INT_MOD(leaf->hdr.count, ARCH_CONVERT, +1);
- }
- /*
- * There are stale entries. We'll use one for the new entry.
- */
- else {
- /*
- * If we didn't do a compact then we need to figure out
- * which stale entry will be used.
- */
- if (compact == 0) {
- /*
- * Find first stale entry before our insertion point.
- */
- for (lowstale = index - 1;
- lowstale >= 0 &&
- INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) !=
- XFS_DIR2_NULL_DATAPTR;
- lowstale--)
- continue;
- /*
- * Find next stale entry after insertion point.
- * Stop looking if the answer would be worse than
- * lowstale already found.
- */
- for (highstale = index;
- highstale < INT_GET(leaf->hdr.count, ARCH_CONVERT) &&
- INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) !=
- XFS_DIR2_NULL_DATAPTR &&
- (lowstale < 0 ||
- index - lowstale - 1 >= highstale - index);
- highstale++)
- continue;
- }
+ if (compact)
+ xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+ &highstale, &lfloglow, &lfloghigh);
+ else if (leafhdr.stale) {
/*
- * Using the low stale entry.
- * Shift entries up toward the stale slot.
+ * Set impossible logging indices for this case.
*/
- if (lowstale >= 0 &&
- (highstale == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
- index - lowstale - 1 < highstale - index)) {
- ASSERT(INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) ==
- XFS_DIR2_NULL_DATAPTR);
- ASSERT(index - lowstale - 1 >= 0);
- if (index - lowstale - 1 > 0)
- memmove(&leaf->ents[lowstale],
- &leaf->ents[lowstale + 1],
- (index - lowstale - 1) * sizeof(*lep));
- lep = &leaf->ents[index - 1];
- lfloglow = MIN(lowstale, lfloglow);
- lfloghigh = MAX(index - 1, lfloghigh);
- }
- /*
- * Using the high stale entry.
- * Shift entries down toward the stale slot.
- */
- else {
- ASSERT(INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) ==
- XFS_DIR2_NULL_DATAPTR);
- ASSERT(highstale - index >= 0);
- if (highstale - index > 0)
- memmove(&leaf->ents[index + 1],
- &leaf->ents[index],
- (highstale - index) * sizeof(*lep));
- lep = &leaf->ents[index];
- lfloglow = MIN(index, lfloglow);
- lfloghigh = MAX(highstale, lfloghigh);
- }
- INT_MOD(leaf->hdr.stale, ARCH_CONVERT, -1);
+ lfloglow = leafhdr.count;
+ lfloghigh = -1;
}
+
/*
* Insert the new entry, log everything.
*/
- INT_SET(lep->hashval, ARCH_CONVERT, args->hashval);
- INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_DB_OFF_TO_DATAPTR(mp, args->blkno, args->index));
- xfs_dir2_leaf_log_header(tp, bp);
- xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh);
- xfs_dir2_leafn_check(dp, bp);
+ lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
+ highstale, &lfloglow, &lfloghigh);
+
+ lep->hashval = cpu_to_be32(args->hashval);
+ lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo,
+ args->blkno, args->index));
+
+ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, bp);
+ xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh);
+ xfs_dir3_leaf_check(dp, bp);
return 0;
}
#ifdef DEBUG
-/*
- * Check internal consistency of a leafn block.
- */
-void
-xfs_dir2_leafn_check(
- xfs_inode_t *dp, /* incore directory inode */
- xfs_dabuf_t *bp) /* leaf buffer */
+static void
+xfs_dir2_free_hdr_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp,
+ xfs_dir2_db_t db)
{
- int i; /* leaf index */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
- xfs_mount_t *mp; /* filesystem mount point */
- int stale; /* count of stale leaves */
+ struct xfs_dir3_icfree_hdr hdr;
- leaf = bp->data;
- mp = dp->i_mount;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
- ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) <= XFS_DIR2_MAX_LEAF_ENTS(mp));
- for (i = stale = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); i++) {
- if (i + 1 < INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
- ASSERT(INT_GET(leaf->ents[i].hashval, ARCH_CONVERT) <=
- INT_GET(leaf->ents[i + 1].hashval, ARCH_CONVERT));
- }
- if (INT_GET(leaf->ents[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
- stale++;
- }
- ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) == stale);
+ dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr);
+
+ ASSERT((hdr.firstdb %
+ dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0);
+ ASSERT(hdr.firstdb <= db);
+ ASSERT(db < hdr.firstdb + hdr.nvalid);
}
+#else
+#define xfs_dir2_free_hdr_check(dp, bp, db)
#endif /* DEBUG */
/*
@@ -372,58 +486,67 @@ xfs_dir2_leafn_check(
*/
xfs_dahash_t /* hash value */
xfs_dir2_leafn_lasthash(
- xfs_dabuf_t *bp, /* leaf buffer */
+ struct xfs_inode *dp,
+ struct xfs_buf *bp, /* leaf buffer */
int *count) /* count of entries in leaf */
{
- xfs_dir2_leaf_t *leaf; /* leaf structure */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
if (count)
- *count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
- if (!leaf->hdr.count)
+ *count = leafhdr.count;
+ if (!leafhdr.count)
return 0;
- return INT_GET(leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+
+ ents = dp->d_ops->leaf_ents_p(leaf);
+ return be32_to_cpu(ents[leafhdr.count - 1].hashval);
}
/*
- * Look up a leaf entry in a node-format leaf block.
- * If this is an addname then the extrablk in state is a freespace block,
- * otherwise it's a data block.
+ * Look up a leaf entry for space to add a name in a node-format leaf block.
+ * The extrablk in state is a freespace block.
*/
-int
-xfs_dir2_leafn_lookup_int(
- xfs_dabuf_t *bp, /* leaf buffer */
+STATIC int
+xfs_dir2_leafn_lookup_for_addname(
+ struct xfs_buf *bp, /* leaf buffer */
xfs_da_args_t *args, /* operation arguments */
int *indexp, /* out: leaf entry index */
xfs_da_state_t *state) /* state to fill in */
{
- xfs_dabuf_t *curbp; /* current data/free buffer */
- xfs_dir2_db_t curdb; /* current data block number */
- xfs_dir2_db_t curfdb; /* current free block number */
- xfs_dir2_data_entry_t *dep; /* data block entry */
+ struct xfs_buf *curbp = NULL; /* current data/free buffer */
+ xfs_dir2_db_t curdb = -1; /* current data block number */
+ xfs_dir2_db_t curfdb = -1; /* current free block number */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return value */
int fi; /* free entry index */
- xfs_dir2_free_t *free=NULL; /* free block structure */
+ xfs_dir2_free_t *free = NULL; /* free block structure */
int index; /* leaf entry index */
xfs_dir2_leaf_t *leaf; /* leaf structure */
- int length=0; /* length of new data entry */
+ int length; /* length of new data entry */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_mount_t *mp; /* filesystem mount point */
xfs_dir2_db_t newdb; /* new data block number */
xfs_dir2_db_t newfdb; /* new free block number */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
-#ifdef __KERNEL__
- ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) > 0);
-#endif
- xfs_dir2_leafn_check(dp, bp);
+ leaf = bp->b_addr;
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+
+ xfs_dir3_leaf_check(dp, bp);
+ ASSERT(leafhdr.count > 0);
+
/*
* Look up the hash value in the leaf entries.
*/
@@ -431,248 +554,333 @@ xfs_dir2_leafn_lookup_int(
/*
* Do we have a buffer coming in?
*/
- if (state->extravalid)
+ if (state->extravalid) {
+ /* If so, it's a free block buffer, get the block number. */
curbp = state->extrablk.bp;
- else
- curbp = NULL;
- /*
- * For addname, it's a free block buffer, get the block number.
- */
- if (args->addname) {
- curfdb = curbp ? state->extrablk.blkno : -1;
- curdb = -1;
- length = XFS_DIR2_DATA_ENTSIZE(args->namelen);
- if ((free = (curbp ? curbp->data : NULL)))
- ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
- }
- /*
- * For others, it's a data block buffer, get the block number.
- */
- else {
- curfdb = -1;
- curdb = curbp ? state->extrablk.blkno : -1;
+ curfdb = state->extrablk.blkno;
+ free = curbp->b_addr;
+ ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+ free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
}
+ length = dp->d_ops->data_entsize(args->namelen);
/*
* Loop over leaf entries with the right hash value.
*/
- for (lep = &leaf->ents[index];
- index < INT_GET(leaf->hdr.count, ARCH_CONVERT) && INT_GET(lep->hashval, ARCH_CONVERT) == args->hashval;
+ for (lep = &ents[index];
+ index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
lep++, index++) {
/*
* Skip stale leaf entries.
*/
- if (INT_GET(lep->address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+ if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
continue;
/*
* Pull the data block number from the entry.
*/
- newdb = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
+ newdb = xfs_dir2_dataptr_to_db(args->geo,
+ be32_to_cpu(lep->address));
/*
* For addname, we're looking for a place to put the new entry.
* We want to use a data block with an entry of equal
* hash value to ours if there is one with room.
+ *
+ * If this block isn't the data block we already have
+ * in hand, take a look at it.
*/
- if (args->addname) {
+ if (newdb != curdb) {
+ __be16 *bests;
+
+ curdb = newdb;
/*
- * If this block isn't the data block we already have
- * in hand, take a look at it.
+ * Convert the data block to the free block
+ * holding its freespace information.
*/
- if (newdb != curdb) {
- curdb = newdb;
- /*
- * Convert the data block to the free block
- * holding its freespace information.
- */
- newfdb = XFS_DIR2_DB_TO_FDB(mp, newdb);
- /*
- * If it's not the one we have in hand,
- * read it in.
- */
- if (newfdb != curfdb) {
- /*
- * If we had one before, drop it.
- */
- if (curbp)
- xfs_da_brelse(tp, curbp);
- /*
- * Read the free block.
- */
- if ((error = xfs_da_read_buf(tp, dp,
- XFS_DIR2_DB_TO_DA(mp,
- newfdb),
- -1, &curbp,
- XFS_DATA_FORK))) {
- return error;
- }
- curfdb = newfdb;
- free = curbp->data;
- ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) ==
- XFS_DIR2_FREE_MAGIC);
- ASSERT((INT_GET(free->hdr.firstdb, ARCH_CONVERT) %
- XFS_DIR2_MAX_FREE_BESTS(mp)) ==
- 0);
- ASSERT(INT_GET(free->hdr.firstdb, ARCH_CONVERT) <= curdb);
- ASSERT(curdb <
- INT_GET(free->hdr.firstdb, ARCH_CONVERT) +
- INT_GET(free->hdr.nvalid, ARCH_CONVERT));
- }
- /*
- * Get the index for our entry.
- */
- fi = XFS_DIR2_DB_TO_FDINDEX(mp, curdb);
- /*
- * If it has room, return it.
- */
- if (unlikely(INT_GET(free->bests[fi], ARCH_CONVERT) == NULLDATAOFF)) {
- XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
- XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
- }
- if (INT_GET(free->bests[fi], ARCH_CONVERT) >= length) {
- *indexp = index;
- state->extravalid = 1;
- state->extrablk.bp = curbp;
- state->extrablk.blkno = curfdb;
- state->extrablk.index = fi;
- state->extrablk.magic =
- XFS_DIR2_FREE_MAGIC;
- ASSERT(args->oknoent);
- return XFS_ERROR(ENOENT);
- }
- }
- }
- /*
- * Not adding a new entry, so we really want to find
- * the name given to us.
- */
- else {
+ newfdb = dp->d_ops->db_to_fdb(args->geo, newdb);
/*
- * If it's a different data block, go get it.
+ * If it's not the one we have in hand, read it in.
*/
- if (newdb != curdb) {
+ if (newfdb != curfdb) {
/*
- * If we had a block before, drop it.
+ * If we had one before, drop it.
*/
if (curbp)
- xfs_da_brelse(tp, curbp);
- /*
- * Read the data block.
- */
- if ((error =
- xfs_da_read_buf(tp, dp,
- XFS_DIR2_DB_TO_DA(mp, newdb), -1,
- &curbp, XFS_DATA_FORK))) {
+ xfs_trans_brelse(tp, curbp);
+
+ error = xfs_dir2_free_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo,
+ newfdb),
+ &curbp);
+ if (error)
return error;
- }
- xfs_dir2_data_check(dp, curbp);
- curdb = newdb;
+ free = curbp->b_addr;
+
+ xfs_dir2_free_hdr_check(dp, curbp, curdb);
}
/*
- * Point to the data entry.
+ * Get the index for our entry.
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)curbp->data +
- XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT)));
+ fi = dp->d_ops->db_to_fdindex(args->geo, curdb);
/*
- * Compare the entry, return it if it matches.
+ * If it has room, return it.
*/
- if (dep->namelen == args->namelen &&
- dep->name[0] == args->name[0] &&
- memcmp(dep->name, args->name, args->namelen) == 0) {
- args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
- *indexp = index;
- state->extravalid = 1;
- state->extrablk.bp = curbp;
- state->extrablk.blkno = curdb;
- state->extrablk.index =
- (int)((char *)dep -
- (char *)curbp->data);
- state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
- return XFS_ERROR(EEXIST);
+ bests = dp->d_ops->free_bests_p(free);
+ if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) {
+ XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
+ XFS_ERRLEVEL_LOW, mp);
+ if (curfdb != newfdb)
+ xfs_trans_brelse(tp, curbp);
+ return XFS_ERROR(EFSCORRUPTED);
}
+ curfdb = newfdb;
+ if (be16_to_cpu(bests[fi]) >= length)
+ goto out;
}
}
+ /* Didn't find any space */
+ fi = -1;
+out:
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ if (curbp) {
+ /* Giving back a free block. */
+ state->extravalid = 1;
+ state->extrablk.bp = curbp;
+ state->extrablk.index = fi;
+ state->extrablk.blkno = curfdb;
+
+ /*
+ * Important: this magic number is not in the buffer - it's for
+ * buffer type information and therefore only the free/data type
+ * matters here, not whether CRCs are enabled or not.
+ */
+ state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
+ } else {
+ state->extravalid = 0;
+ }
/*
- * Didn't find a match.
- * If we are holding a buffer, give it back in case our caller
- * finds it useful.
+ * Return the index, that will be the insertion point.
*/
- if ((state->extravalid = (curbp != NULL))) {
- state->extrablk.bp = curbp;
- state->extrablk.index = -1;
+ *indexp = index;
+ return XFS_ERROR(ENOENT);
+}
+
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ * The extrablk in state a data block.
+ */
+STATIC int
+xfs_dir2_leafn_lookup_for_entry(
+ struct xfs_buf *bp, /* leaf buffer */
+ xfs_da_args_t *args, /* operation arguments */
+ int *indexp, /* out: leaf entry index */
+ xfs_da_state_t *state) /* state to fill in */
+{
+ struct xfs_buf *curbp = NULL; /* current data/free buffer */
+ xfs_dir2_db_t curdb = -1; /* current data block number */
+ xfs_dir2_data_entry_t *dep; /* data block entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return value */
+ int index; /* leaf entry index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_dir2_db_t newdb; /* new data block number */
+ xfs_trans_t *tp; /* transaction pointer */
+ enum xfs_dacmp cmp; /* comparison result */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ leaf = bp->b_addr;
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+
+ xfs_dir3_leaf_check(dp, bp);
+ ASSERT(leafhdr.count > 0);
+
+ /*
+ * Look up the hash value in the leaf entries.
+ */
+ index = xfs_dir2_leaf_search_hash(args, bp);
+ /*
+ * Do we have a buffer coming in?
+ */
+ if (state->extravalid) {
+ curbp = state->extrablk.bp;
+ curdb = state->extrablk.blkno;
+ }
+ /*
+ * Loop over leaf entries with the right hash value.
+ */
+ for (lep = &ents[index];
+ index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
+ /*
+ * Skip stale leaf entries.
+ */
+ if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ /*
+ * Pull the data block number from the entry.
+ */
+ newdb = xfs_dir2_dataptr_to_db(args->geo,
+ be32_to_cpu(lep->address));
/*
- * For addname, giving back a free block.
+ * Not adding a new entry, so we really want to find
+ * the name given to us.
+ *
+ * If it's a different data block, go get it.
*/
- if (args->addname) {
- state->extrablk.blkno = curfdb;
- state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
+ if (newdb != curdb) {
+ /*
+ * If we had a block before that we aren't saving
+ * for a CI name, drop it
+ */
+ if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT ||
+ curdb != state->extrablk.blkno))
+ xfs_trans_brelse(tp, curbp);
+ /*
+ * If needing the block that is saved with a CI match,
+ * use it otherwise read in the new data block.
+ */
+ if (args->cmpresult != XFS_CMP_DIFFERENT &&
+ newdb == state->extrablk.blkno) {
+ ASSERT(state->extravalid);
+ curbp = state->extrablk.bp;
+ } else {
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo,
+ newdb),
+ -1, &curbp);
+ if (error)
+ return error;
+ }
+ xfs_dir3_data_check(dp, curbp);
+ curdb = newdb;
}
/*
- * For other callers, giving back a data block.
+ * Point to the data entry.
*/
- else {
+ dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(lep->address)));
+ /*
+ * Compare the entry and if it's an exact match, return
+ * EEXIST immediately. If it's the first case-insensitive
+ * match, store the block & inode number and continue looking.
+ */
+ cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ /* If there is a CI match block, drop it */
+ if (args->cmpresult != XFS_CMP_DIFFERENT &&
+ curdb != state->extrablk.blkno)
+ xfs_trans_brelse(tp, state->extrablk.bp);
+ args->cmpresult = cmp;
+ args->inumber = be64_to_cpu(dep->inumber);
+ args->filetype = dp->d_ops->data_get_ftype(dep);
+ *indexp = index;
+ state->extravalid = 1;
+ state->extrablk.bp = curbp;
state->extrablk.blkno = curdb;
+ state->extrablk.index = (int)((char *)dep -
+ (char *)curbp->b_addr);
state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+ curbp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
+ if (cmp == XFS_CMP_EXACT)
+ return XFS_ERROR(EEXIST);
}
}
- /*
- * Return the final index, that will be the insertion point.
- */
+ ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT));
+ if (curbp) {
+ if (args->cmpresult == XFS_CMP_DIFFERENT) {
+ /* Giving back last used data block. */
+ state->extravalid = 1;
+ state->extrablk.bp = curbp;
+ state->extrablk.index = -1;
+ state->extrablk.blkno = curdb;
+ state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+ curbp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
+ } else {
+ /* If the curbp is not the CI match block, drop it */
+ if (state->extrablk.bp != curbp)
+ xfs_trans_brelse(tp, curbp);
+ }
+ } else {
+ state->extravalid = 0;
+ }
*indexp = index;
- ASSERT(index == INT_GET(leaf->hdr.count, ARCH_CONVERT) || args->oknoent);
return XFS_ERROR(ENOENT);
}
/*
+ * Look up a leaf entry in a node-format leaf block.
+ * If this is an addname then the extrablk in state is a freespace block,
+ * otherwise it's a data block.
+ */
+int
+xfs_dir2_leafn_lookup_int(
+ struct xfs_buf *bp, /* leaf buffer */
+ xfs_da_args_t *args, /* operation arguments */
+ int *indexp, /* out: leaf entry index */
+ xfs_da_state_t *state) /* state to fill in */
+{
+ if (args->op_flags & XFS_DA_OP_ADDNAME)
+ return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp,
+ state);
+ return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state);
+}
+
+/*
* Move count leaf entries from source to destination leaf.
* Log entries and headers. Stale entries are preserved.
*/
static void
-xfs_dir2_leafn_moveents(
- xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t *bp_s, /* source leaf buffer */
- int start_s, /* source leaf index */
- xfs_dabuf_t *bp_d, /* destination leaf buffer */
- int start_d, /* destination leaf index */
- int count) /* count of leaves to copy */
+xfs_dir3_leafn_moveents(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf *bp_s, /* source */
+ struct xfs_dir3_icleaf_hdr *shdr,
+ struct xfs_dir2_leaf_entry *sents,
+ int start_s,/* source leaf index */
+ struct xfs_buf *bp_d, /* destination */
+ struct xfs_dir3_icleaf_hdr *dhdr,
+ struct xfs_dir2_leaf_entry *dents,
+ int start_d,/* destination leaf index */
+ int count) /* count of leaves to copy */
{
- xfs_dir2_leaf_t *leaf_d; /* destination leaf structure */
- xfs_dir2_leaf_t *leaf_s; /* source leaf structure */
- int stale; /* count stale leaves copied */
- xfs_trans_t *tp; /* transaction pointer */
+ int stale; /* count stale leaves copied */
+
+ trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
- xfs_dir2_trace_args_bibii("leafn_moveents", args, bp_s, start_s, bp_d,
- start_d, count);
/*
* Silently return if nothing to do.
*/
- if (count == 0) {
+ if (count == 0)
return;
- }
- tp = args->trans;
- leaf_s = bp_s->data;
- leaf_d = bp_d->data;
+
/*
* If the destination index is not the end of the current
* destination leaf entries, open up a hole in the destination
* to hold the new entries.
*/
- if (start_d < INT_GET(leaf_d->hdr.count, ARCH_CONVERT)) {
- memmove(&leaf_d->ents[start_d + count], &leaf_d->ents[start_d],
- (INT_GET(leaf_d->hdr.count, ARCH_CONVERT) - start_d) *
- sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir2_leaf_log_ents(tp, bp_d, start_d + count,
- count + INT_GET(leaf_d->hdr.count, ARCH_CONVERT) - 1);
+ if (start_d < dhdr->count) {
+ memmove(&dents[start_d + count], &dents[start_d],
+ (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t));
+ xfs_dir3_leaf_log_ents(args, bp_d, start_d + count,
+ count + dhdr->count - 1);
}
/*
* If the source has stale leaves, count the ones in the copy range
* so we can update the header correctly.
*/
- if (leaf_s->hdr.stale) {
+ if (shdr->stale) {
int i; /* temp leaf index */
for (i = start_s, stale = 0; i < start_s + count; i++) {
- if (INT_GET(leaf_s->ents[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+ if (sents[i].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
stale++;
}
} else
@@ -680,29 +888,27 @@ xfs_dir2_leafn_moveents(
/*
* Copy the leaf entries from source to destination.
*/
- memcpy(&leaf_d->ents[start_d], &leaf_s->ents[start_s],
+ memcpy(&dents[start_d], &sents[start_s],
count * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir2_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1);
+ xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1);
+
/*
* If there are source entries after the ones we copied,
* delete the ones we copied by sliding the next ones down.
*/
- if (start_s + count < INT_GET(leaf_s->hdr.count, ARCH_CONVERT)) {
- memmove(&leaf_s->ents[start_s], &leaf_s->ents[start_s + count],
+ if (start_s + count < shdr->count) {
+ memmove(&sents[start_s], &sents[start_s + count],
count * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir2_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1);
+ xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1);
}
+
/*
* Update the headers and log them.
*/
- INT_MOD(leaf_s->hdr.count, ARCH_CONVERT, -(count));
- INT_MOD(leaf_s->hdr.stale, ARCH_CONVERT, -(stale));
- INT_MOD(leaf_d->hdr.count, ARCH_CONVERT, count);
- INT_MOD(leaf_d->hdr.stale, ARCH_CONVERT, stale);
- xfs_dir2_leaf_log_header(tp, bp_s);
- xfs_dir2_leaf_log_header(tp, bp_d);
- xfs_dir2_leafn_check(args->dp, bp_s);
- xfs_dir2_leafn_check(args->dp, bp_d);
+ shdr->count -= count;
+ shdr->stale -= stale;
+ dhdr->count += count;
+ dhdr->stale += stale;
}
/*
@@ -711,21 +917,26 @@ xfs_dir2_leafn_moveents(
*/
int /* sort order */
xfs_dir2_leafn_order(
- xfs_dabuf_t *leaf1_bp, /* leaf1 buffer */
- xfs_dabuf_t *leaf2_bp) /* leaf2 buffer */
+ struct xfs_inode *dp,
+ struct xfs_buf *leaf1_bp, /* leaf1 buffer */
+ struct xfs_buf *leaf2_bp) /* leaf2 buffer */
{
- xfs_dir2_leaf_t *leaf1; /* leaf1 structure */
- xfs_dir2_leaf_t *leaf2; /* leaf2 structure */
-
- leaf1 = leaf1_bp->data;
- leaf2 = leaf2_bp->data;
- ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
- ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
- if (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0 &&
- INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0 &&
- (INT_GET(leaf2->ents[0].hashval, ARCH_CONVERT) < INT_GET(leaf1->ents[0].hashval, ARCH_CONVERT) ||
- INT_GET(leaf2->ents[INT_GET(leaf2->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT) <
- INT_GET(leaf1->ents[INT_GET(leaf1->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT)))
+ struct xfs_dir2_leaf *leaf1 = leaf1_bp->b_addr;
+ struct xfs_dir2_leaf *leaf2 = leaf2_bp->b_addr;
+ struct xfs_dir2_leaf_entry *ents1;
+ struct xfs_dir2_leaf_entry *ents2;
+ struct xfs_dir3_icleaf_hdr hdr1;
+ struct xfs_dir3_icleaf_hdr hdr2;
+
+ dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
+ dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
+ ents1 = dp->d_ops->leaf_ents_p(leaf1);
+ ents2 = dp->d_ops->leaf_ents_p(leaf2);
+
+ if (hdr1.count > 0 && hdr2.count > 0 &&
+ (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) ||
+ be32_to_cpu(ents2[hdr2.count - 1].hashval) <
+ be32_to_cpu(ents1[hdr1.count - 1].hashval)))
return 1;
return 0;
}
@@ -749,30 +960,41 @@ xfs_dir2_leafn_rebalance(
xfs_dir2_leaf_t *leaf1; /* first leaf structure */
xfs_dir2_leaf_t *leaf2; /* second leaf structure */
int mid; /* midpoint leaf index */
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
int oldstale; /* old count of stale leaves */
#endif
int oldsum; /* old total leaf count */
int swap; /* swapped leaf blocks */
+ struct xfs_dir2_leaf_entry *ents1;
+ struct xfs_dir2_leaf_entry *ents2;
+ struct xfs_dir3_icleaf_hdr hdr1;
+ struct xfs_dir3_icleaf_hdr hdr2;
+ struct xfs_inode *dp = state->args->dp;
args = state->args;
/*
* If the block order is wrong, swap the arguments.
*/
- if ((swap = xfs_dir2_leafn_order(blk1->bp, blk2->bp))) {
+ if ((swap = xfs_dir2_leafn_order(dp, blk1->bp, blk2->bp))) {
xfs_da_state_blk_t *tmp; /* temp for block swap */
tmp = blk1;
blk1 = blk2;
blk2 = tmp;
}
- leaf1 = blk1->bp->data;
- leaf2 = blk2->bp->data;
- oldsum = INT_GET(leaf1->hdr.count, ARCH_CONVERT) + INT_GET(leaf2->hdr.count, ARCH_CONVERT);
-#ifdef DEBUG
- oldstale = INT_GET(leaf1->hdr.stale, ARCH_CONVERT) + INT_GET(leaf2->hdr.stale, ARCH_CONVERT);
+ leaf1 = blk1->bp->b_addr;
+ leaf2 = blk2->bp->b_addr;
+ dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
+ dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
+ ents1 = dp->d_ops->leaf_ents_p(leaf1);
+ ents2 = dp->d_ops->leaf_ents_p(leaf2);
+
+ oldsum = hdr1.count + hdr2.count;
+#if defined(DEBUG) || defined(XFS_WARN)
+ oldstale = hdr1.stale + hdr2.stale;
#endif
mid = oldsum >> 1;
+
/*
* If the old leaf count was odd then the new one will be even,
* so we need to divide the new count evenly.
@@ -780,10 +1002,10 @@ xfs_dir2_leafn_rebalance(
if (oldsum & 1) {
xfs_dahash_t midhash; /* middle entry hash value */
- if (mid >= INT_GET(leaf1->hdr.count, ARCH_CONVERT))
- midhash = INT_GET(leaf2->ents[mid - INT_GET(leaf1->hdr.count, ARCH_CONVERT)].hashval, ARCH_CONVERT);
+ if (mid >= hdr1.count)
+ midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval);
else
- midhash = INT_GET(leaf1->ents[mid].hashval, ARCH_CONVERT);
+ midhash = be32_to_cpu(ents1[mid].hashval);
isleft = args->hashval <= midhash;
}
/*
@@ -797,44 +1019,135 @@ xfs_dir2_leafn_rebalance(
* Calculate moved entry count. Positive means left-to-right,
* negative means right-to-left. Then move the entries.
*/
- count = INT_GET(leaf1->hdr.count, ARCH_CONVERT) - mid + (isleft == 0);
+ count = hdr1.count - mid + (isleft == 0);
if (count > 0)
- xfs_dir2_leafn_moveents(args, blk1->bp,
- INT_GET(leaf1->hdr.count, ARCH_CONVERT) - count, blk2->bp, 0, count);
+ xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1,
+ hdr1.count - count, blk2->bp,
+ &hdr2, ents2, 0, count);
else if (count < 0)
- xfs_dir2_leafn_moveents(args, blk2->bp, 0, blk1->bp,
- INT_GET(leaf1->hdr.count, ARCH_CONVERT), count);
- ASSERT(INT_GET(leaf1->hdr.count, ARCH_CONVERT) + INT_GET(leaf2->hdr.count, ARCH_CONVERT) == oldsum);
- ASSERT(INT_GET(leaf1->hdr.stale, ARCH_CONVERT) + INT_GET(leaf2->hdr.stale, ARCH_CONVERT) == oldstale);
+ xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0,
+ blk1->bp, &hdr1, ents1,
+ hdr1.count, count);
+
+ ASSERT(hdr1.count + hdr2.count == oldsum);
+ ASSERT(hdr1.stale + hdr2.stale == oldstale);
+
+ /* log the changes made when moving the entries */
+ dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1);
+ dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2);
+ xfs_dir3_leaf_log_header(args, blk1->bp);
+ xfs_dir3_leaf_log_header(args, blk2->bp);
+
+ xfs_dir3_leaf_check(dp, blk1->bp);
+ xfs_dir3_leaf_check(dp, blk2->bp);
+
/*
* Mark whether we're inserting into the old or new leaf.
*/
- if (INT_GET(leaf1->hdr.count, ARCH_CONVERT) < INT_GET(leaf2->hdr.count, ARCH_CONVERT))
+ if (hdr1.count < hdr2.count)
state->inleaf = swap;
- else if (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > INT_GET(leaf2->hdr.count, ARCH_CONVERT))
+ else if (hdr1.count > hdr2.count)
state->inleaf = !swap;
else
- state->inleaf =
- swap ^ (blk1->index <= INT_GET(leaf1->hdr.count, ARCH_CONVERT));
+ state->inleaf = swap ^ (blk1->index <= hdr1.count);
/*
* Adjust the expected index for insertion.
*/
if (!state->inleaf)
- blk2->index = blk1->index - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
-
- /*
- * Finally sanity check just to make sure we are not returning a negative index
+ blk2->index = blk1->index - hdr1.count;
+
+ /*
+ * Finally sanity check just to make sure we are not returning a
+ * negative index
*/
- if(blk2->index < 0) {
+ if (blk2->index < 0) {
state->inleaf = 1;
blk2->index = 0;
- cmn_err(CE_ALERT,
- "xfs_dir2_leafn_rebalance: picked the wrong leaf? reverting orignal leaf: "
- "blk1->index %d\n",
- blk1->index);
+ xfs_alert(dp->i_mount,
+ "%s: picked the wrong leaf? reverting original leaf: blk1->index %d",
+ __func__, blk1->index);
}
}
+static int
+xfs_dir3_data_block_free(
+ xfs_da_args_t *args,
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_free *free,
+ xfs_dir2_db_t fdb,
+ int findex,
+ struct xfs_buf *fbp,
+ int longest)
+{
+ int logfree = 0;
+ __be16 *bests;
+ struct xfs_dir3_icfree_hdr freehdr;
+ struct xfs_inode *dp = args->dp;
+
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ bests = dp->d_ops->free_bests_p(free);
+ if (hdr) {
+ /*
+ * Data block is not empty, just set the free entry to the new
+ * value.
+ */
+ bests[findex] = cpu_to_be16(longest);
+ xfs_dir2_free_log_bests(args, fbp, findex, findex);
+ return 0;
+ }
+
+ /* One less used entry in the free table. */
+ freehdr.nused--;
+
+ /*
+ * If this was the last entry in the table, we can trim the table size
+ * back. There might be other entries at the end referring to
+ * non-existent data blocks, get those too.
+ */
+ if (findex == freehdr.nvalid - 1) {
+ int i; /* free entry index */
+
+ for (i = findex - 1; i >= 0; i--) {
+ if (bests[i] != cpu_to_be16(NULLDATAOFF))
+ break;
+ }
+ freehdr.nvalid = i + 1;
+ logfree = 0;
+ } else {
+ /* Not the last entry, just punch it out. */
+ bests[findex] = cpu_to_be16(NULLDATAOFF);
+ logfree = 1;
+ }
+
+ dp->d_ops->free_hdr_to_disk(free, &freehdr);
+ xfs_dir2_free_log_header(args, fbp);
+
+ /*
+ * If there are no useful entries left in the block, get rid of the
+ * block if we can.
+ */
+ if (!freehdr.nused) {
+ int error;
+
+ error = xfs_dir2_shrink_inode(args, fdb, fbp);
+ if (error == 0) {
+ fbp = NULL;
+ logfree = 0;
+ } else if (error != ENOSPC || args->total != 0)
+ return error;
+ /*
+ * It's possible to get ENOSPC if there is no
+ * space reservation. In this case some one
+ * else will eventually get rid of this block.
+ */
+ }
+
+ /* Log the free entry that changed, unless we got rid of it. */
+ if (logfree)
+ xfs_dir2_free_log_bests(args, fbp, findex, findex);
+ return 0;
+}
+
/*
* Remove an entry from a node directory.
* This removes the leaf entry and the data entry,
@@ -843,14 +1156,14 @@ xfs_dir2_leafn_rebalance(
static int /* error */
xfs_dir2_leafn_remove(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t *bp, /* leaf buffer */
+ struct xfs_buf *bp, /* leaf buffer */
int index, /* leaf entry index */
xfs_da_state_blk_t *dblk, /* data block */
int *rval) /* resulting block needs join */
{
- xfs_dir2_data_t *data; /* data block structure */
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_db_t db; /* data block number */
- xfs_dabuf_t *dbp; /* data block buffer */
+ struct xfs_buf *dbp; /* data block buffer */
xfs_dir2_data_entry_t *dep; /* data block entry */
xfs_inode_t *dp; /* incore directory inode */
xfs_dir2_leaf_t *leaf; /* leaf structure */
@@ -861,186 +1174,140 @@ xfs_dir2_leafn_remove(
int needlog; /* need to log data header */
int needscan; /* need to rescan data frees */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
+
+ trace_xfs_dir2_leafn_remove(args, index);
- xfs_dir2_trace_args_sb("leafn_remove", args, index, bp);
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+ leaf = bp->b_addr;
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+
/*
* Point to the entry we're removing.
*/
- lep = &leaf->ents[index];
+ lep = &ents[index];
+
/*
* Extract the data block and offset from the entry.
*/
- db = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
+ db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
ASSERT(dblk->blkno == db);
- off = XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT));
+ off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address));
ASSERT(dblk->index == off);
+
/*
* Kill the leaf entry by marking it stale.
* Log the leaf block changes.
*/
- INT_MOD(leaf->hdr.stale, ARCH_CONVERT, +1);
- xfs_dir2_leaf_log_header(tp, bp);
- INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR);
- xfs_dir2_leaf_log_ents(tp, bp, index, index);
+ leafhdr.stale++;
+ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, bp);
+
+ lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+ xfs_dir3_leaf_log_ents(args, bp, index, index);
+
/*
* Make the data entry free. Keep track of the longest freespace
* in the data block in case it changes.
*/
dbp = dblk->bp;
- data = dbp->data;
- dep = (xfs_dir2_data_entry_t *)((char *)data + off);
- longest = INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT);
+ hdr = dbp->b_addr;
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ longest = be16_to_cpu(bf[0].length);
needlog = needscan = 0;
- xfs_dir2_data_make_free(tp, dbp, off,
- XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan);
+ xfs_dir2_data_make_free(args, dbp, off,
+ dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
/*
* Rescan the data block freespaces for bestfree.
* Log the data block header if needed.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, data, &needlog, NULL);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, dbp);
- xfs_dir2_data_check(dp, dbp);
+ xfs_dir2_data_log_header(args, dbp);
+ xfs_dir3_data_check(dp, dbp);
/*
* If the longest data block freespace changes, need to update
* the corresponding freeblock entry.
*/
- if (longest < INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT)) {
+ if (longest < be16_to_cpu(bf[0].length)) {
int error; /* error return value */
- xfs_dabuf_t *fbp; /* freeblock buffer */
+ struct xfs_buf *fbp; /* freeblock buffer */
xfs_dir2_db_t fdb; /* freeblock block number */
int findex; /* index in freeblock entries */
xfs_dir2_free_t *free; /* freeblock structure */
- int logfree; /* need to log free entry */
/*
* Convert the data block number to a free block,
* read in the free block.
*/
- fdb = XFS_DIR2_DB_TO_FDB(mp, db);
- if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb),
- -1, &fbp, XFS_DATA_FORK))) {
+ fdb = dp->d_ops->db_to_fdb(args->geo, db);
+ error = xfs_dir2_free_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, fdb),
+ &fbp);
+ if (error)
return error;
- }
- free = fbp->data;
- ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
- ASSERT(INT_GET(free->hdr.firstdb, ARCH_CONVERT) ==
- XFS_DIR2_MAX_FREE_BESTS(mp) *
- (fdb - XFS_DIR2_FREE_FIRSTDB(mp)));
+ free = fbp->b_addr;
+#ifdef DEBUG
+ {
+ struct xfs_dir3_icfree_hdr freehdr;
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) *
+ (fdb - xfs_dir2_byte_to_db(args->geo,
+ XFS_DIR2_FREE_OFFSET)));
+ }
+#endif
/*
* Calculate which entry we need to fix.
*/
- findex = XFS_DIR2_DB_TO_FDINDEX(mp, db);
- longest = INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT);
+ findex = dp->d_ops->db_to_fdindex(args->geo, db);
+ longest = be16_to_cpu(bf[0].length);
/*
* If the data block is now empty we can get rid of it
* (usually).
*/
- if (longest == mp->m_dirblksize - (uint)sizeof(data->hdr)) {
+ if (longest == args->geo->blksize -
+ dp->d_ops->data_entry_offset) {
/*
* Try to punch out the data block.
*/
error = xfs_dir2_shrink_inode(args, db, dbp);
if (error == 0) {
dblk->bp = NULL;
- data = NULL;
+ hdr = NULL;
}
/*
* We can get ENOSPC if there's no space reservation.
* In this case just drop the buffer and some one else
* will eventually get rid of the empty block.
*/
- else if (error == ENOSPC && args->total == 0)
- xfs_da_buf_done(dbp);
- else
+ else if (!(error == ENOSPC && args->total == 0))
return error;
}
/*
* If we got rid of the data block, we can eliminate that entry
* in the free block.
*/
- if (data == NULL) {
- /*
- * One less used entry in the free table.
- */
- INT_MOD(free->hdr.nused, ARCH_CONVERT, -1);
- xfs_dir2_free_log_header(tp, fbp);
- /*
- * If this was the last entry in the table, we can
- * trim the table size back. There might be other
- * entries at the end referring to non-existent
- * data blocks, get those too.
- */
- if (findex == INT_GET(free->hdr.nvalid, ARCH_CONVERT) - 1) {
- int i; /* free entry index */
-
- for (i = findex - 1;
- i >= 0 && INT_GET(free->bests[i], ARCH_CONVERT) == NULLDATAOFF;
- i--)
- continue;
- INT_SET(free->hdr.nvalid, ARCH_CONVERT, i + 1);
- logfree = 0;
- }
- /*
- * Not the last entry, just punch it out.
- */
- else {
- INT_SET(free->bests[findex], ARCH_CONVERT, NULLDATAOFF);
- logfree = 1;
- }
- /*
- * If there are no useful entries left in the block,
- * get rid of the block if we can.
- */
- if (!free->hdr.nused) {
- error = xfs_dir2_shrink_inode(args, fdb, fbp);
- if (error == 0) {
- fbp = NULL;
- logfree = 0;
- } else if (error != ENOSPC || args->total != 0)
- return error;
- /*
- * It's possible to get ENOSPC if there is no
- * space reservation. In this case some one
- * else will eventually get rid of this block.
- */
- }
- }
- /*
- * Data block is not empty, just set the free entry to
- * the new value.
- */
- else {
- INT_SET(free->bests[findex], ARCH_CONVERT, longest);
- logfree = 1;
- }
- /*
- * Log the free entry that changed, unless we got rid of it.
- */
- if (logfree)
- xfs_dir2_free_log_bests(tp, fbp, findex, findex);
- /*
- * Drop the buffer if we still have it.
- */
- if (fbp)
- xfs_da_buf_done(fbp);
+ error = xfs_dir3_data_block_free(args, hdr, free,
+ fdb, findex, fbp, longest);
+ if (error)
+ return error;
}
- xfs_dir2_leafn_check(dp, bp);
+
+ xfs_dir3_leaf_check(dp, bp);
/*
- * Return indication of whether this leaf block is emtpy enough
+ * Return indication of whether this leaf block is empty enough
* to justify trying to join it with a neighbor.
*/
- *rval =
- ((uint)sizeof(leaf->hdr) +
- (uint)sizeof(leaf->ents[0]) *
- (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT))) <
- mp->m_dir_magicpct;
+ *rval = (dp->d_ops->leaf_hdr_size +
+ (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) <
+ args->geo->magicpct;
return 0;
}
@@ -1057,13 +1324,14 @@ xfs_dir2_leafn_split(
xfs_dablk_t blkno; /* new leaf block number */
int error; /* error return value */
xfs_mount_t *mp; /* filesystem mount point */
+ struct xfs_inode *dp;
/*
* Allocate space for a new leaf node.
*/
args = state->args;
- mp = args->dp->i_mount;
- ASSERT(args != NULL);
+ dp = args->dp;
+ mp = dp->i_mount;
ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
error = xfs_da_grow_inode(args, &blkno);
if (error) {
@@ -1072,11 +1340,11 @@ xfs_dir2_leafn_split(
/*
* Initialize the new leaf block.
*/
- error = xfs_dir2_leaf_init(args, XFS_DIR2_DA_TO_DB(mp, blkno),
- &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
- if (error) {
+ error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno),
+ &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
+ if (error)
return error;
- }
+
newblk->blkno = blkno;
newblk->magic = XFS_DIR2_LEAFN_MAGIC;
/*
@@ -1084,7 +1352,7 @@ xfs_dir2_leafn_split(
* block into the leaves.
*/
xfs_dir2_leafn_rebalance(state, oldblk, newblk);
- error = xfs_da_blk_link(state, oldblk, newblk);
+ error = xfs_da3_blk_link(state, oldblk, newblk);
if (error) {
return error;
}
@@ -1098,10 +1366,10 @@ xfs_dir2_leafn_split(
/*
* Update last hashval in each block since we added the name.
*/
- oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL);
- newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL);
- xfs_dir2_leafn_check(args->dp, oldblk->bp);
- xfs_dir2_leafn_check(args->dp, newblk->bp);
+ oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL);
+ newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL);
+ xfs_dir3_leaf_check(dp, oldblk->bp);
+ xfs_dir3_leaf_check(dp, newblk->bp);
return error;
}
@@ -1121,15 +1389,17 @@ xfs_dir2_leafn_toosmall(
{
xfs_da_state_blk_t *blk; /* leaf block */
xfs_dablk_t blkno; /* leaf block number */
- xfs_dabuf_t *bp; /* leaf buffer */
+ struct xfs_buf *bp; /* leaf buffer */
int bytes; /* bytes in use */
int count; /* leaf live entry count */
int error; /* error return value */
int forward; /* sibling block direction */
int i; /* sibling counter */
- xfs_da_blkinfo_t *info; /* leaf block header */
xfs_dir2_leaf_t *leaf; /* leaf structure */
int rval; /* result from path_shift */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_inode *dp = state->args->dp;
/*
* Check for the degenerate case of the block being over 50% full.
@@ -1137,12 +1407,14 @@ xfs_dir2_leafn_toosmall(
* to coalesce with a sibling.
*/
blk = &state->path.blk[state->path.active - 1];
- info = blk->bp->data;
- ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
- leaf = (xfs_dir2_leaf_t *)info;
- count = INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT);
- bytes = (uint)sizeof(leaf->hdr) + count * (uint)sizeof(leaf->ents[0]);
- if (bytes > (state->blocksize >> 1)) {
+ leaf = blk->bp->b_addr;
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir3_leaf_check(dp, blk->bp);
+
+ count = leafhdr.count - leafhdr.stale;
+ bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]);
+ if (bytes > (state->args->geo->blksize >> 1)) {
/*
* Blk over 50%, don't try to join.
*/
@@ -1160,9 +1432,9 @@ xfs_dir2_leafn_toosmall(
* Make altpath point to the block we want to keep and
* path point to the block we want to drop (this one).
*/
- forward = info->forw;
+ forward = (leafhdr.forw != 0);
memcpy(&state->altpath, &state->path, sizeof(state->path));
- error = xfs_da_path_shift(state, &state->altpath, forward, 0,
+ error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
&rval);
if (error)
return error;
@@ -1176,36 +1448,40 @@ xfs_dir2_leafn_toosmall(
* We prefer coalescing with the lower numbered sibling so as
* to shrink a directory over time.
*/
- forward = INT_GET(info->forw, ARCH_CONVERT) < INT_GET(info->back, ARCH_CONVERT);
+ forward = leafhdr.forw < leafhdr.back;
for (i = 0, bp = NULL; i < 2; forward = !forward, i++) {
- blkno = forward ?INT_GET( info->forw, ARCH_CONVERT) : INT_GET(info->back, ARCH_CONVERT);
+ struct xfs_dir3_icleaf_hdr hdr2;
+
+ blkno = forward ? leafhdr.forw : leafhdr.back;
if (blkno == 0)
continue;
/*
* Read the sibling leaf block.
*/
- if ((error =
- xfs_da_read_buf(state->args->trans, state->args->dp, blkno,
- -1, &bp, XFS_DATA_FORK))) {
+ error = xfs_dir3_leafn_read(state->args->trans, dp,
+ blkno, -1, &bp);
+ if (error)
return error;
- }
- ASSERT(bp != NULL);
+
/*
* Count bytes in the two blocks combined.
*/
- leaf = (xfs_dir2_leaf_t *)info;
- count = INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT);
- bytes = state->blocksize - (state->blocksize >> 2);
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
- count += INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT);
- bytes -= count * (uint)sizeof(leaf->ents[0]);
+ count = leafhdr.count - leafhdr.stale;
+ bytes = state->args->geo->blksize -
+ (state->args->geo->blksize >> 2);
+
+ leaf = bp->b_addr;
+ dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+ count += hdr2.count - hdr2.stale;
+ bytes -= count * sizeof(ents[0]);
+
/*
* Fits with at least 25% to spare.
*/
if (bytes >= 0)
break;
- xfs_da_brelse(state->args->trans, bp);
+ xfs_trans_brelse(state->args->trans, bp);
}
/*
* Didn't like either block, give up.
@@ -1214,21 +1490,17 @@ xfs_dir2_leafn_toosmall(
*action = 0;
return 0;
}
- /*
- * Done with the sibling leaf block here, drop the dabuf
- * so path_shift can get it.
- */
- xfs_da_buf_done(bp);
+
/*
* Make altpath point to the block we want to keep (the lower
* numbered block) and path point to the block we want to drop.
*/
memcpy(&state->altpath, &state->path, sizeof(state->path));
if (blkno < blk->blkno)
- error = xfs_da_path_shift(state, &state->altpath, forward, 0,
+ error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
&rval);
else
- error = xfs_da_path_shift(state, &state->path, forward, 0,
+ error = xfs_da3_path_shift(state, &state->path, forward, 0,
&rval);
if (error) {
return error;
@@ -1250,34 +1522,54 @@ xfs_dir2_leafn_unbalance(
xfs_da_args_t *args; /* operation arguments */
xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */
xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */
+ struct xfs_dir3_icleaf_hdr savehdr;
+ struct xfs_dir3_icleaf_hdr drophdr;
+ struct xfs_dir2_leaf_entry *sents;
+ struct xfs_dir2_leaf_entry *dents;
+ struct xfs_inode *dp = state->args->dp;
args = state->args;
ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
- drop_leaf = drop_blk->bp->data;
- save_leaf = save_blk->bp->data;
- ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
- ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+ drop_leaf = drop_blk->bp->b_addr;
+ save_leaf = save_blk->bp->b_addr;
+
+ dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf);
+ dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf);
+ sents = dp->d_ops->leaf_ents_p(save_leaf);
+ dents = dp->d_ops->leaf_ents_p(drop_leaf);
+
/*
* If there are any stale leaf entries, take this opportunity
* to purge them.
*/
- if (INT_GET(drop_leaf->hdr.stale, ARCH_CONVERT))
- xfs_dir2_leaf_compact(args, drop_blk->bp);
- if (INT_GET(save_leaf->hdr.stale, ARCH_CONVERT))
- xfs_dir2_leaf_compact(args, save_blk->bp);
+ if (drophdr.stale)
+ xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp);
+ if (savehdr.stale)
+ xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp);
+
/*
* Move the entries from drop to the appropriate end of save.
*/
- drop_blk->hashval = INT_GET(drop_leaf->ents[INT_GET(drop_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
- if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp))
- xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, 0,
- INT_GET(drop_leaf->hdr.count, ARCH_CONVERT));
+ drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval);
+ if (xfs_dir2_leafn_order(dp, save_blk->bp, drop_blk->bp))
+ xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+ save_blk->bp, &savehdr, sents, 0,
+ drophdr.count);
else
- xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp,
- INT_GET(save_leaf->hdr.count, ARCH_CONVERT), INT_GET(drop_leaf->hdr.count, ARCH_CONVERT));
- save_blk->hashval = INT_GET(save_leaf->ents[INT_GET(save_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
- xfs_dir2_leafn_check(args->dp, save_blk->bp);
+ xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+ save_blk->bp, &savehdr, sents,
+ savehdr.count, drophdr.count);
+ save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval);
+
+ /* log the changes made when moving the entries */
+ dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr);
+ dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr);
+ xfs_dir3_leaf_log_header(args, save_blk->bp);
+ xfs_dir3_leaf_log_header(args, drop_blk->bp);
+
+ xfs_dir3_leaf_check(dp, save_blk->bp);
+ xfs_dir3_leaf_check(dp, drop_blk->bp);
}
/*
@@ -1292,20 +1584,19 @@ xfs_dir2_node_addname(
int rval; /* sub-return value */
xfs_da_state_t *state; /* btree cursor */
- xfs_dir2_trace_args("node_addname", args);
+ trace_xfs_dir2_node_addname(args);
+
/*
* Allocate and initialize the state (btree cursor).
*/
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_dirblksize;
- state->node_ents = state->mp->m_dir_node_ents;
/*
* Look up the name. We're not supposed to find it, but
* this gives us the insertion point.
*/
- error = xfs_da_node_lookup_int(state, &rval);
+ error = xfs_da3_node_lookup_int(state, &rval);
if (error)
rval = error;
if (rval != ENOENT) {
@@ -1330,8 +1621,8 @@ xfs_dir2_node_addname(
/*
* It worked, fix the hash values up the btree.
*/
- if (!args->justcheck)
- xfs_da_fixhashpath(state, &state->path);
+ if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
+ xfs_da3_fixhashpath(state, &state->path);
} else {
/*
* It didn't work, we need to split the leaf block.
@@ -1343,7 +1634,7 @@ xfs_dir2_node_addname(
/*
* Split the leaf block and insert the new entry.
*/
- rval = xfs_da_split(state);
+ rval = xfs_da3_split(state);
}
done:
xfs_da_state_free(state);
@@ -1360,15 +1651,15 @@ xfs_dir2_node_addname_int(
xfs_da_args_t *args, /* operation arguments */
xfs_da_state_blk_t *fblk) /* optional freespace block */
{
- xfs_dir2_data_t *data; /* data block structure */
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_db_t dbno; /* data block number */
- xfs_dabuf_t *dbp; /* data block buffer */
+ struct xfs_buf *dbp; /* data block buffer */
xfs_dir2_data_entry_t *dep; /* data entry pointer */
xfs_inode_t *dp; /* incore directory inode */
xfs_dir2_data_unused_t *dup; /* data unused entry pointer */
int error; /* error return value */
xfs_dir2_db_t fbno; /* freespace block number */
- xfs_dabuf_t *fbp; /* freespace buffer */
+ struct xfs_buf *fbp; /* freespace buffer */
int findex; /* freespace entry index */
xfs_dir2_free_t *free=NULL; /* freespace block structure */
xfs_dir2_db_t ifbno; /* initial freespace block no */
@@ -1378,13 +1669,16 @@ xfs_dir2_node_addname_int(
xfs_mount_t *mp; /* filesystem mount point */
int needlog; /* need to log data header */
int needscan; /* need to rescan data frees */
- xfs_dir2_data_off_t *tagp; /* data entry tag pointer */
+ __be16 *tagp; /* data entry tag pointer */
xfs_trans_t *tp; /* transaction pointer */
+ __be16 *bests;
+ struct xfs_dir3_icfree_hdr freehdr;
+ struct xfs_dir2_data_free *bf;
dp = args->dp;
mp = dp->i_mount;
tp = args->trans;
- length = XFS_DIR2_DATA_ENTSIZE(args->namelen);
+ length = dp->d_ops->data_entsize(args->namelen);
/*
* If we came in with a freespace block that means that lookup
* found an entry with our hash value. This is the freespace
@@ -1396,37 +1690,38 @@ xfs_dir2_node_addname_int(
* Remember initial freespace block number.
*/
ifbno = fblk->blkno;
- free = fbp->data;
- ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+ free = fbp->b_addr;
findex = fblk->index;
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
/*
* This means the free entry showed that the data block had
* space for our entry, so we remembered it.
* Use that data block.
*/
if (findex >= 0) {
- ASSERT(findex < INT_GET(free->hdr.nvalid, ARCH_CONVERT));
- ASSERT(INT_GET(free->bests[findex], ARCH_CONVERT) != NULLDATAOFF);
- ASSERT(INT_GET(free->bests[findex], ARCH_CONVERT) >= length);
- dbno = INT_GET(free->hdr.firstdb, ARCH_CONVERT) + findex;
- }
- /*
- * The data block looked at didn't have enough room.
- * We'll start at the beginning of the freespace entries.
- */
- else {
+ ASSERT(findex < freehdr.nvalid);
+ ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
+ ASSERT(be16_to_cpu(bests[findex]) >= length);
+ dbno = freehdr.firstdb + findex;
+ } else {
+ /*
+ * The data block looked at didn't have enough room.
+ * We'll start at the beginning of the freespace entries.
+ */
dbno = -1;
findex = 0;
}
- }
- /*
- * Didn't come in with a freespace block, so don't have a data block.
- */
- else {
+ } else {
+ /*
+ * Didn't come in with a freespace block, so no data block.
+ */
ifbno = dbno = -1;
fbp = NULL;
findex = 0;
}
+
/*
* If we don't have a data block yet, we're going to scan the
* freespace blocks looking for one. Figure out what the
@@ -1435,9 +1730,9 @@ xfs_dir2_node_addname_int(
if (dbno == -1) {
xfs_fileoff_t fo; /* freespace block number */
- if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK)))
+ if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK)))
return error;
- lastfbno = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo);
+ lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
fbno = ifbno;
}
/*
@@ -1455,7 +1750,8 @@ xfs_dir2_node_addname_int(
* us a freespace block to start with.
*/
if (++fbno == 0)
- fbno = XFS_DIR2_FREE_FIRSTDB(mp);
+ fbno = xfs_dir2_byte_to_db(args->geo,
+ XFS_DIR2_FREE_OFFSET);
/*
* If it's ifbno we already looked at it.
*/
@@ -1472,33 +1768,38 @@ xfs_dir2_node_addname_int(
* This should be really rare, so there's no reason
* to avoid it.
*/
- if ((error = xfs_da_read_buf(tp, dp,
- XFS_DIR2_DB_TO_DA(mp, fbno), -2, &fbp,
- XFS_DATA_FORK))) {
+ error = xfs_dir2_free_try_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, fbno),
+ &fbp);
+ if (error)
return error;
- }
- if (unlikely(fbp == NULL)) {
+ if (!fbp)
continue;
- }
- free = fbp->data;
- ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+ free = fbp->b_addr;
findex = 0;
}
/*
* Look at the current free entry. Is it good enough?
+ *
+ * The bests initialisation should be where the bufer is read in
+ * the above branch. But gcc is too stupid to realise that bests
+ * and the freehdr are actually initialised if they are placed
+ * there, so we have to do it here to avoid warnings. Blech.
*/
- if (INT_GET(free->bests[findex], ARCH_CONVERT) != NULLDATAOFF &&
- INT_GET(free->bests[findex], ARCH_CONVERT) >= length)
- dbno = INT_GET(free->hdr.firstdb, ARCH_CONVERT) + findex;
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
+ be16_to_cpu(bests[findex]) >= length)
+ dbno = freehdr.firstdb + findex;
else {
/*
* Are we done with the freeblock?
*/
- if (++findex == INT_GET(free->hdr.nvalid, ARCH_CONVERT)) {
+ if (++findex == freehdr.nvalid) {
/*
* Drop the block.
*/
- xfs_da_brelse(tp, fbp);
+ xfs_trans_brelse(tp, fbp);
fbp = NULL;
if (fblk && fblk->bp)
fblk->bp = NULL;
@@ -1513,35 +1814,23 @@ xfs_dir2_node_addname_int(
/*
* Not allowed to allocate, return failure.
*/
- if (args->justcheck || args->total == 0) {
- /*
- * Drop the freespace buffer unless it came from our
- * caller.
- */
- if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
- xfs_da_buf_done(fbp);
+ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
return XFS_ERROR(ENOSPC);
- }
+
/*
* Allocate and initialize the new data block.
*/
if (unlikely((error = xfs_dir2_grow_inode(args,
XFS_DIR2_DATA_SPACE,
&dbno)) ||
- (error = xfs_dir2_data_init(args, dbno, &dbp)))) {
- /*
- * Drop the freespace buffer unless it came from our
- * caller.
- */
- if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
- xfs_da_buf_done(fbp);
+ (error = xfs_dir3_data_init(args, dbno, &dbp))))
return error;
- }
+
/*
* If (somehow) we have a freespace block, get rid of it.
*/
if (fbp)
- xfs_da_brelse(tp, fbp);
+ xfs_trans_brelse(tp, fbp);
if (fblk && fblk->bp)
fblk->bp = NULL;
@@ -1549,44 +1838,41 @@ xfs_dir2_node_addname_int(
* Get the freespace block corresponding to the data block
* that was just allocated.
*/
- fbno = XFS_DIR2_DB_TO_FDB(mp, dbno);
- if (unlikely(error = xfs_da_read_buf(tp, dp,
- XFS_DIR2_DB_TO_DA(mp, fbno), -2, &fbp,
- XFS_DATA_FORK))) {
- xfs_da_buf_done(dbp);
+ fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
+ error = xfs_dir2_free_try_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, fbno),
+ &fbp);
+ if (error)
return error;
- }
+
/*
* If there wasn't a freespace block, the read will
* return a NULL fbp. Allocate and initialize a new one.
*/
- if( fbp == NULL ) {
- if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
- &fbno))) {
+ if (!fbp) {
+ error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
+ &fbno);
+ if (error)
return error;
- }
- if (unlikely(XFS_DIR2_DB_TO_FDB(mp, dbno) != fbno)) {
- cmn_err(CE_ALERT,
- "xfs_dir2_node_addname_int: dir ino "
- "%llu needed freesp block %lld for\n"
- " data block %lld, got %lld\n"
- " ifbno %llu lastfbno %d\n",
- (unsigned long long)dp->i_ino,
- (long long)XFS_DIR2_DB_TO_FDB(mp, dbno),
+ if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
+ xfs_alert(mp,
+ "%s: dir ino %llu needed freesp block %lld for\n"
+ " data block %lld, got %lld ifbno %llu lastfbno %d",
+ __func__, (unsigned long long)dp->i_ino,
+ (long long)dp->d_ops->db_to_fdb(
+ args->geo, dbno),
(long long)dbno, (long long)fbno,
(unsigned long long)ifbno, lastfbno);
if (fblk) {
- cmn_err(CE_ALERT,
- " fblk 0x%p blkno %llu "
- "index %d magic 0x%x\n",
+ xfs_alert(mp,
+ " fblk 0x%p blkno %llu index %d magic 0x%x",
fblk,
(unsigned long long)fblk->blkno,
fblk->index,
fblk->magic);
} else {
- cmn_err(CE_ALERT,
- " ... fblk is NULL\n");
+ xfs_alert(mp, " ... fblk is NULL");
}
XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
XFS_ERRLEVEL_LOW, mp);
@@ -1596,60 +1882,59 @@ xfs_dir2_node_addname_int(
/*
* Get a buffer for the new block.
*/
- if ((error = xfs_da_get_buf(tp, dp,
- XFS_DIR2_DB_TO_DA(mp, fbno),
- -1, &fbp, XFS_DATA_FORK))) {
+ error = xfs_dir3_free_get_buf(args, fbno, &fbp);
+ if (error)
return error;
- }
- ASSERT(fbp != NULL);
+ free = fbp->b_addr;
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
/*
- * Initialize the new block to be empty, and remember
- * its first slot as our empty slot.
+ * Remember the first slot as our empty slot.
*/
- free = fbp->data;
- INT_SET(free->hdr.magic, ARCH_CONVERT, XFS_DIR2_FREE_MAGIC);
- INT_SET(free->hdr.firstdb, ARCH_CONVERT,
- (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) *
- XFS_DIR2_MAX_FREE_BESTS(mp));
- free->hdr.nvalid = 0;
- free->hdr.nused = 0;
+ freehdr.firstdb =
+ (fbno - xfs_dir2_byte_to_db(args->geo,
+ XFS_DIR2_FREE_OFFSET)) *
+ dp->d_ops->free_max_bests(args->geo);
} else {
- free = fbp->data;
- ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+ free = fbp->b_addr;
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
}
/*
* Set the freespace block index from the data block number.
*/
- findex = XFS_DIR2_DB_TO_FDINDEX(mp, dbno);
+ findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
/*
* If it's after the end of the current entries in the
* freespace block, extend that table.
*/
- if (findex >= INT_GET(free->hdr.nvalid, ARCH_CONVERT)) {
- ASSERT(findex < XFS_DIR2_MAX_FREE_BESTS(mp));
- INT_SET(free->hdr.nvalid, ARCH_CONVERT, findex + 1);
+ if (findex >= freehdr.nvalid) {
+ ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
+ freehdr.nvalid = findex + 1;
/*
* Tag new entry so nused will go up.
*/
- INT_SET(free->bests[findex], ARCH_CONVERT, NULLDATAOFF);
+ bests[findex] = cpu_to_be16(NULLDATAOFF);
}
/*
* If this entry was for an empty data block
* (this should always be true) then update the header.
*/
- if (INT_GET(free->bests[findex], ARCH_CONVERT) == NULLDATAOFF) {
- INT_MOD(free->hdr.nused, ARCH_CONVERT, +1);
- xfs_dir2_free_log_header(tp, fbp);
+ if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
+ freehdr.nused++;
+ dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+ xfs_dir2_free_log_header(args, fbp);
}
/*
* Update the real value in the table.
* We haven't allocated the data entry yet so this will
* change again.
*/
- data = dbp->data;
- INT_COPY(free->bests[findex], data->hdr.bestfree[0].length, ARCH_CONVERT);
+ hdr = dbp->b_addr;
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ bests[findex] = bf[0].length;
logfree = 1;
}
/*
@@ -1659,86 +1944,79 @@ xfs_dir2_node_addname_int(
/*
* If just checking, we succeeded.
*/
- if (args->justcheck) {
- if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
- xfs_da_buf_done(fbp);
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK)
return 0;
- }
+
/*
* Read the data block in.
*/
- if (unlikely(
- error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, dbno),
- -1, &dbp, XFS_DATA_FORK))) {
- if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
- xfs_da_buf_done(fbp);
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, dbno),
+ -1, &dbp);
+ if (error)
return error;
- }
- data = dbp->data;
+ hdr = dbp->b_addr;
+ bf = dp->d_ops->data_bestfree_p(hdr);
logfree = 0;
}
- ASSERT(INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) >= length);
+ ASSERT(be16_to_cpu(bf[0].length) >= length);
/*
* Point to the existing unused space.
*/
dup = (xfs_dir2_data_unused_t *)
- ((char *)data + INT_GET(data->hdr.bestfree[0].offset, ARCH_CONVERT));
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
needscan = needlog = 0;
/*
* Mark the first part of the unused space, inuse for us.
*/
- xfs_dir2_data_use_free(tp, dbp, dup,
- (xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length,
+ xfs_dir2_data_use_free(args, dbp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
&needlog, &needscan);
/*
* Fill in the new entry and log it.
*/
dep = (xfs_dir2_data_entry_t *)dup;
- INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+ dep->inumber = cpu_to_be64(args->inumber);
dep->namelen = args->namelen;
memcpy(dep->name, args->name, dep->namelen);
- tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
- INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)data));
- xfs_dir2_data_log_entry(tp, dbp, dep);
+ dp->d_ops->data_put_ftype(dep, args->filetype);
+ tagp = dp->d_ops->data_entry_tag_p(dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+ xfs_dir2_data_log_entry(args, dbp, dep);
/*
* Rescan the block for bestfree if needed.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, data, &needlog, NULL);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
/*
* Log the data block header if needed.
*/
if (needlog)
- xfs_dir2_data_log_header(tp, dbp);
+ xfs_dir2_data_log_header(args, dbp);
/*
* If the freespace entry is now wrong, update it.
*/
- if (INT_GET(free->bests[findex], ARCH_CONVERT) != INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT)) {
- INT_COPY(free->bests[findex], data->hdr.bestfree[0].length, ARCH_CONVERT);
+ bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */
+ if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) {
+ bests[findex] = bf[0].length;
logfree = 1;
}
/*
* Log the freespace entry if needed.
*/
if (logfree)
- xfs_dir2_free_log_bests(tp, fbp, findex, findex);
- /*
- * If the caller didn't hand us the freespace block, drop it.
- */
- if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
- xfs_da_buf_done(fbp);
+ xfs_dir2_free_log_bests(args, fbp, findex, findex);
/*
* Return the data block and offset in args, then drop the data block.
*/
args->blkno = (xfs_dablk_t)dbno;
- args->index = INT_GET(*tagp, ARCH_CONVERT);
- xfs_da_buf_done(dbp);
+ args->index = be16_to_cpu(*tagp);
return 0;
}
/*
* Lookup an entry in a node-format directory.
- * All the real work happens in xfs_da_node_lookup_int.
+ * All the real work happens in xfs_da3_node_lookup_int.
* The only real output is the inode number of the entry.
*/
int /* error */
@@ -1750,33 +2028,41 @@ xfs_dir2_node_lookup(
int rval; /* operation return value */
xfs_da_state_t *state; /* btree cursor */
- xfs_dir2_trace_args("node_lookup", args);
+ trace_xfs_dir2_node_lookup(args);
+
/*
* Allocate and initialize the btree cursor.
*/
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_dirblksize;
- state->node_ents = state->mp->m_dir_node_ents;
/*
* Fill in the path to the entry in the cursor.
*/
- error = xfs_da_node_lookup_int(state, &rval);
+ error = xfs_da3_node_lookup_int(state, &rval);
if (error)
rval = error;
+ else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) {
+ /* If a CI match, dup the actual name and return EEXIST */
+ xfs_dir2_data_entry_t *dep;
+
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)state->extrablk.bp->b_addr +
+ state->extrablk.index);
+ rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+ }
/*
* Release the btree blocks and leaf block.
*/
for (i = 0; i < state->path.active; i++) {
- xfs_da_brelse(args->trans, state->path.blk[i].bp);
+ xfs_trans_brelse(args->trans, state->path.blk[i].bp);
state->path.blk[i].bp = NULL;
}
/*
* Release the data block if we have it.
*/
if (state->extravalid && state->extrablk.bp) {
- xfs_da_brelse(args->trans, state->extrablk.bp);
+ xfs_trans_brelse(args->trans, state->extrablk.bp);
state->extrablk.bp = NULL;
}
xfs_da_state_free(state);
@@ -1788,36 +2074,33 @@ xfs_dir2_node_lookup(
*/
int /* error */
xfs_dir2_node_removename(
- xfs_da_args_t *args) /* operation arguments */
+ struct xfs_da_args *args) /* operation arguments */
{
- xfs_da_state_blk_t *blk; /* leaf block */
+ struct xfs_da_state_blk *blk; /* leaf block */
int error; /* error return value */
int rval; /* operation return value */
- xfs_da_state_t *state; /* btree cursor */
+ struct xfs_da_state *state; /* btree cursor */
+
+ trace_xfs_dir2_node_removename(args);
- xfs_dir2_trace_args("node_removename", args);
/*
* Allocate and initialize the btree cursor.
*/
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_dirblksize;
- state->node_ents = state->mp->m_dir_node_ents;
- /*
- * Look up the entry we're deleting, set up the cursor.
- */
- error = xfs_da_node_lookup_int(state, &rval);
- if (error) {
- rval = error;
- }
- /*
- * Didn't find it, upper layer screwed up.
- */
+
+ /* Look up the entry we're deleting, set up the cursor. */
+ error = xfs_da3_node_lookup_int(state, &rval);
+ if (error)
+ goto out_free;
+
+ /* Didn't find it, upper layer screwed up. */
if (rval != EEXIST) {
- xfs_da_state_free(state);
- return rval;
+ error = rval;
+ goto out_free;
}
+
blk = &state->path.blk[state->path.active - 1];
ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
ASSERT(state->extravalid);
@@ -1827,23 +2110,23 @@ xfs_dir2_node_removename(
*/
error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
&state->extrablk, &rval);
- if (error) {
- return error;
- }
+ if (error)
+ goto out_free;
/*
* Fix the hash values up the btree.
*/
- xfs_da_fixhashpath(state, &state->path);
+ xfs_da3_fixhashpath(state, &state->path);
/*
* If we need to join leaf blocks, do it.
*/
if (rval && state->path.active > 1)
- error = xfs_da_join(state);
+ error = xfs_da3_join(state);
/*
* If no errors so far, try conversion to leaf format.
*/
if (!error)
error = xfs_dir2_node_to_leaf(state);
+out_free:
xfs_da_state_free(state);
return error;
}
@@ -1856,7 +2139,7 @@ xfs_dir2_node_replace(
xfs_da_args_t *args) /* operation arguments */
{
xfs_da_state_blk_t *blk; /* leaf block */
- xfs_dir2_data_t *data; /* data block structure */
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_data_entry_t *dep; /* data entry changed */
int error; /* error return value */
int i; /* btree level */
@@ -1866,20 +2149,19 @@ xfs_dir2_node_replace(
int rval; /* internal return value */
xfs_da_state_t *state; /* btree cursor */
- xfs_dir2_trace_args("node_replace", args);
+ trace_xfs_dir2_node_replace(args);
+
/*
* Allocate and initialize the btree cursor.
*/
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_dirblksize;
- state->node_ents = state->mp->m_dir_node_ents;
inum = args->inumber;
/*
* Lookup the entry to change in the btree.
*/
- error = xfs_da_node_lookup_int(state, &rval);
+ error = xfs_da3_node_lookup_int(state, &rval);
if (error) {
rval = error;
}
@@ -1888,42 +2170,47 @@ xfs_dir2_node_replace(
* and locked it. But paranoia is good.
*/
if (rval == EEXIST) {
+ struct xfs_dir2_leaf_entry *ents;
/*
* Find the leaf entry.
*/
blk = &state->path.blk[state->path.active - 1];
ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
- leaf = blk->bp->data;
- lep = &leaf->ents[blk->index];
+ leaf = blk->bp->b_addr;
+ ents = args->dp->d_ops->leaf_ents_p(leaf);
+ lep = &ents[blk->index];
ASSERT(state->extravalid);
/*
* Point to the data entry.
*/
- data = state->extrablk.bp->data;
- ASSERT(INT_GET(data->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC);
+ hdr = state->extrablk.bp->b_addr;
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
dep = (xfs_dir2_data_entry_t *)
- ((char *)data +
- XFS_DIR2_DATAPTR_TO_OFF(state->mp, INT_GET(lep->address, ARCH_CONVERT)));
- ASSERT(inum != INT_GET(dep->inumber, ARCH_CONVERT));
+ ((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(lep->address)));
+ ASSERT(inum != be64_to_cpu(dep->inumber));
/*
* Fill in the new inode number and log the entry.
*/
- INT_SET(dep->inumber, ARCH_CONVERT, inum);
- xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep);
+ dep->inumber = cpu_to_be64(inum);
+ args->dp->d_ops->data_put_ftype(dep, args->filetype);
+ xfs_dir2_data_log_entry(args, state->extrablk.bp, dep);
rval = 0;
}
/*
* Didn't find it, and we're holding a data block. Drop it.
*/
else if (state->extravalid) {
- xfs_da_brelse(args->trans, state->extrablk.bp);
+ xfs_trans_brelse(args->trans, state->extrablk.bp);
state->extrablk.bp = NULL;
}
/*
* Release all the buffers in the cursor.
*/
for (i = 0; i < state->path.active; i++) {
- xfs_da_brelse(args->trans, state->path.blk[i].bp);
+ xfs_trans_brelse(args->trans, state->path.blk[i].bp);
state->path.blk[i].bp = NULL;
}
xfs_da_state_free(state);
@@ -1940,12 +2227,13 @@ xfs_dir2_node_trim_free(
xfs_fileoff_t fo, /* free block number */
int *rvalp) /* out: did something */
{
- xfs_dabuf_t *bp; /* freespace buffer */
+ struct xfs_buf *bp; /* freespace buffer */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return code */
xfs_dir2_free_t *free; /* freespace structure */
xfs_mount_t *mp; /* filesystem mount point */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icfree_hdr freehdr;
dp = args->dp;
mp = dp->i_mount;
@@ -1953,41 +2241,39 @@ xfs_dir2_node_trim_free(
/*
* Read the freespace block.
*/
- if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp,
- XFS_DATA_FORK))) {
+ error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
+ if (error)
return error;
- }
-
/*
* There can be holes in freespace. If fo is a hole, there's
* nothing to do.
*/
- if (bp == NULL) {
+ if (!bp)
return 0;
- }
- free = bp->data;
- ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+ free = bp->b_addr;
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
/*
* If there are used entries, there's nothing to do.
*/
- if (INT_GET(free->hdr.nused, ARCH_CONVERT) > 0) {
- xfs_da_brelse(tp, bp);
+ if (freehdr.nused > 0) {
+ xfs_trans_brelse(tp, bp);
*rvalp = 0;
return 0;
}
/*
* Blow the block away.
*/
- if ((error =
- xfs_dir2_shrink_inode(args, XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo),
- bp))) {
+ error = xfs_dir2_shrink_inode(args,
+ xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp);
+ if (error) {
/*
* Can't fail with ENOSPC since that only happens with no
* space reservation, when breaking up an extent into two
* pieces. This is the last block of an extent.
*/
ASSERT(error != ENOSPC);
- xfs_da_brelse(tp, bp);
+ xfs_trans_brelse(tp, bp);
return error;
}
/*
diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h
deleted file mode 100644
index 0ab8fbd5951..00000000000
--- a/fs/xfs/xfs_dir2_node.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_DIR2_NODE_H__
-#define __XFS_DIR2_NODE_H__
-
-/*
- * Directory version 2, btree node format structures
- */
-
-struct uio;
-struct xfs_dabuf;
-struct xfs_da_args;
-struct xfs_da_state;
-struct xfs_da_state_blk;
-struct xfs_inode;
-struct xfs_trans;
-
-/*
- * Offset of the freespace index.
- */
-#define XFS_DIR2_FREE_SPACE 2
-#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
-#define XFS_DIR2_FREE_FIRSTDB(mp) \
- XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_FREE_OFFSET)
-
-#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F */
-
-typedef struct xfs_dir2_free_hdr {
- __uint32_t magic; /* XFS_DIR2_FREE_MAGIC */
- __int32_t firstdb; /* db of first entry */
- __int32_t nvalid; /* count of valid entries */
- __int32_t nused; /* count of used entries */
-} xfs_dir2_free_hdr_t;
-
-typedef struct xfs_dir2_free {
- xfs_dir2_free_hdr_t hdr; /* block header */
- xfs_dir2_data_off_t bests[1]; /* best free counts */
- /* unused entries are -1 */
-} xfs_dir2_free_t;
-
-#define XFS_DIR2_MAX_FREE_BESTS(mp) \
- (((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_free_hdr_t)) / \
- (uint)sizeof(xfs_dir2_data_off_t))
-
-/*
- * Convert data space db to the corresponding free db.
- */
-#define XFS_DIR2_DB_TO_FDB(mp,db) xfs_dir2_db_to_fdb(mp, db)
-static inline xfs_dir2_db_t
-xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db)
-{
- return (XFS_DIR2_FREE_FIRSTDB(mp) + (db) / XFS_DIR2_MAX_FREE_BESTS(mp));
-}
-
-/*
- * Convert data space db to the corresponding index in a free db.
- */
-#define XFS_DIR2_DB_TO_FDINDEX(mp,db) xfs_dir2_db_to_fdindex(mp, db)
-static inline int
-xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
-{
- return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp));
-}
-
-extern void xfs_dir2_free_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
- int first, int last);
-extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
- struct xfs_dabuf *lbp);
-extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count);
-extern int xfs_dir2_leafn_lookup_int(struct xfs_dabuf *bp,
- struct xfs_da_args *args, int *indexp,
- struct xfs_da_state *state);
-extern int xfs_dir2_leafn_order(struct xfs_dabuf *leaf1_bp,
- struct xfs_dabuf *leaf2_bp);
-extern int xfs_dir2_leafn_split(struct xfs_da_state *state,
- struct xfs_da_state_blk *oldblk,
- struct xfs_da_state_blk *newblk);
-extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action);
-extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state,
- struct xfs_da_state_blk *drop_blk,
- struct xfs_da_state_blk *save_blk);
-extern int xfs_dir2_node_addname(struct xfs_da_args *args);
-extern int xfs_dir2_node_lookup(struct xfs_da_args *args);
-extern int xfs_dir2_node_removename(struct xfs_da_args *args);
-extern int xfs_dir2_node_replace(struct xfs_da_args *args);
-extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
- int *rvalp);
-
-#endif /* __XFS_DIR2_NODE_H__ */
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
new file mode 100644
index 00000000000..27ce0794d19
--- /dev/null
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_DIR2_PRIV_H__
+#define __XFS_DIR2_PRIV_H__
+
+struct dir_context;
+
+/*
+ * Directory offset/block conversion functions.
+ *
+ * DB blocks here are logical directory block numbers, not filesystem blocks.
+ */
+
+/*
+ * Convert dataptr to byte in file space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
+{
+ return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
+}
+
+/*
+ * Convert byte in file space to dataptr. It had better be aligned.
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
+{
+ return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
+}
+
+/*
+ * Convert byte in space to (DB) block
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return (xfs_dir2_db_t)(by >> geo->blklog);
+}
+
+/*
+ * Convert dataptr to a block number
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+ return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert byte in space to offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1));
+}
+
+/*
+ * Convert dataptr to a byte offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+ return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert block and offset to byte in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+ xfs_dir2_data_aoff_t o)
+{
+ return ((xfs_dir2_off_t)db << geo->blklog) + o;
+}
+
+/*
+ * Convert block (DB) to block (dablk)
+ */
+static inline xfs_dablk_t
+xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert byte in space to (DA) block
+ */
+static inline xfs_dablk_t
+xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by));
+}
+
+/*
+ * Convert block and offset to dataptr
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+ xfs_dir2_data_aoff_t o)
+{
+ return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o));
+}
+
+/*
+ * Convert block (dablk) to block (DB)
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+ return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert block (dablk) to byte offset in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+ return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0);
+}
+
+/*
+ * Directory tail pointer accessor functions. Based on block geometry.
+ */
+static inline struct xfs_dir2_block_tail *
+xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
+{
+ return ((struct xfs_dir2_block_tail *)
+ ((char *)hdr + geo->blksize)) - 1;
+}
+
+static inline struct xfs_dir2_leaf_tail *
+xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
+{
+ return (struct xfs_dir2_leaf_tail *)
+ ((char *)lp + geo->blksize -
+ sizeof(struct xfs_dir2_leaf_tail));
+}
+
+/* xfs_dir2.c */
+extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
+extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
+ xfs_dir2_db_t *dbp);
+extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
+ const unsigned char *name, int len);
+
+#define S_SHIFT 12
+extern const unsigned char xfs_mode_to_ftype[];
+
+extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp,
+ __uint8_t filetype);
+
+
+/* xfs_dir2_block.c */
+extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_buf **bpp);
+extern int xfs_dir2_block_addname(struct xfs_da_args *args);
+extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_block_removename(struct xfs_da_args *args);
+extern int xfs_dir2_block_replace(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
+ struct xfs_buf *lbp, struct xfs_buf *dbp);
+
+/* xfs_dir2_data.c */
+#ifdef DEBUG
+#define xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp);
+#else
+#define xfs_dir3_data_check(dp,bp)
+#endif
+
+extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
+extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
+ xfs_daddr_t mapped_bno);
+
+extern struct xfs_dir2_data_free *
+xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup,
+ int *loghead);
+extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
+ struct xfs_buf **bpp);
+
+/* xfs_dir2_leaf.c */
+extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
+extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
+ struct xfs_buf *dbp);
+extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
+extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
+ struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp);
+extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents, int *indexp,
+ int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
+extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
+ struct xfs_buf **bpp, __uint16_t magic);
+extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args,
+ struct xfs_buf *bp, int first, int last);
+extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args,
+ struct xfs_buf *bp);
+extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_replace(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
+ struct xfs_buf *lbp);
+extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args,
+ struct xfs_buf *lbp, xfs_dir2_db_t db);
+extern struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents, int index, int compact,
+ int lowstale, int highstale, int *lfloglow, int *lfloghigh);
+extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
+
+extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp,
+ struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf);
+
+/* xfs_dir2_node.c */
+extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
+ struct xfs_buf *lbp);
+extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp,
+ struct xfs_buf *bp, int *count);
+extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp,
+ struct xfs_da_args *args, int *indexp,
+ struct xfs_da_state *state);
+extern int xfs_dir2_leafn_order(struct xfs_inode *dp, struct xfs_buf *leaf1_bp,
+ struct xfs_buf *leaf2_bp);
+extern int xfs_dir2_leafn_split(struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk);
+extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action);
+extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk);
+extern int xfs_dir2_node_addname(struct xfs_da_args *args);
+extern int xfs_dir2_node_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_node_removename(struct xfs_da_args *args);
+extern int xfs_dir2_node_replace(struct xfs_da_args *args);
+extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
+ int *rvalp);
+extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t fbno, struct xfs_buf **bpp);
+
+/* xfs_dir2_sf.c */
+extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
+ struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp);
+extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
+ int size, xfs_dir2_sf_hdr_t *sfhp);
+extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
+extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
+extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
+extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
+
+/* xfs_dir2_readdir.c */
+extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx,
+ size_t bufsize);
+
+#endif /* __XFS_DIR2_PRIV_H__ */
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
new file mode 100644
index 00000000000..48e99afb9cb
--- /dev/null
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -0,0 +1,700 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_bmap.h"
+#include "xfs_trans.h"
+#include "xfs_dinode.h"
+
+/*
+ * Directory file type support functions
+ */
+static unsigned char xfs_dir3_filetype_table[] = {
+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK,
+ DT_FIFO, DT_SOCK, DT_LNK, DT_WHT,
+};
+
+unsigned char
+xfs_dir3_get_dtype(
+ struct xfs_mount *mp,
+ __uint8_t filetype)
+{
+ if (!xfs_sb_version_hasftype(&mp->m_sb))
+ return DT_UNKNOWN;
+
+ if (filetype >= XFS_DIR3_FT_MAX)
+ return DT_UNKNOWN;
+
+ return xfs_dir3_filetype_table[filetype];
+}
+/*
+ * @mode, if set, indicates that the type field needs to be set up.
+ * This uses the transformation from file mode to DT_* as defined in linux/fs.h
+ * for file type specification. This will be propagated into the directory
+ * structure if appropriate for the given operation and filesystem config.
+ */
+const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = {
+ [0] = XFS_DIR3_FT_UNKNOWN,
+ [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE,
+ [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR,
+ [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV,
+ [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV,
+ [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO,
+ [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK,
+ [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK,
+};
+
+STATIC int
+xfs_dir2_sf_getdents(
+ struct xfs_da_args *args,
+ struct dir_context *ctx)
+{
+ int i; /* shortform entry number */
+ struct xfs_inode *dp = args->dp; /* incore directory inode */
+ xfs_dir2_dataptr_t off; /* current entry's offset */
+ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ xfs_dir2_dataptr_t dot_offset;
+ xfs_dir2_dataptr_t dotdot_offset;
+ xfs_ino_t ino;
+ struct xfs_da_geometry *geo = args->geo;
+
+ ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+ /*
+ * Give up if the directory is way too short.
+ */
+ if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+ ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+ return XFS_ERROR(EIO);
+ }
+
+ ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+ ASSERT(dp->i_df.if_u1.if_data != NULL);
+
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+
+ ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+
+ /*
+ * If the block number in the offset is out of range, we're done.
+ */
+ if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
+ return 0;
+
+ /*
+ * Precalculate offsets for . and .. as we will always need them.
+ *
+ * XXX(hch): the second argument is sometimes 0 and sometimes
+ * geo->datablk
+ */
+ dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ dp->d_ops->data_dot_offset);
+ dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ dp->d_ops->data_dotdot_offset);
+
+ /*
+ * Put . entry unless we're starting past it.
+ */
+ if (ctx->pos <= dot_offset) {
+ ctx->pos = dot_offset & 0x7fffffff;
+ if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
+ return 0;
+ }
+
+ /*
+ * Put .. entry unless we're starting past it.
+ */
+ if (ctx->pos <= dotdot_offset) {
+ ino = dp->d_ops->sf_get_parent_ino(sfp);
+ ctx->pos = dotdot_offset & 0x7fffffff;
+ if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
+ return 0;
+ }
+
+ /*
+ * Loop while there are more entries and put'ing works.
+ */
+ sfep = xfs_dir2_sf_firstentry(sfp);
+ for (i = 0; i < sfp->count; i++) {
+ __uint8_t filetype;
+
+ off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ xfs_dir2_sf_get_offset(sfep));
+
+ if (ctx->pos > off) {
+ sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+ continue;
+ }
+
+ ino = dp->d_ops->sf_get_ino(sfp, sfep);
+ filetype = dp->d_ops->sf_get_ftype(sfep);
+ ctx->pos = off & 0x7fffffff;
+ if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
+ xfs_dir3_get_dtype(dp->i_mount, filetype)))
+ return 0;
+ sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+ }
+
+ ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
+ 0x7fffffff;
+ return 0;
+}
+
+/*
+ * Readdir for block directories.
+ */
+STATIC int
+xfs_dir2_block_getdents(
+ struct xfs_da_args *args,
+ struct dir_context *ctx)
+{
+ struct xfs_inode *dp = args->dp; /* incore directory inode */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
+ struct xfs_buf *bp; /* buffer for block */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_dir2_data_entry_t *dep; /* block data entry */
+ xfs_dir2_data_unused_t *dup; /* block unused entry */
+ char *endptr; /* end of the data entries */
+ int error; /* error return value */
+ char *ptr; /* current data entry */
+ int wantoff; /* starting block offset */
+ xfs_off_t cook;
+ struct xfs_da_geometry *geo = args->geo;
+
+ /*
+ * If the block number in the offset is out of range, we're done.
+ */
+ if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
+ return 0;
+
+ error = xfs_dir3_block_read(NULL, dp, &bp);
+ if (error)
+ return error;
+
+ /*
+ * Extract the byte offset we start at from the seek pointer.
+ * We'll skip entries before this.
+ */
+ wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos);
+ hdr = bp->b_addr;
+ xfs_dir3_data_check(dp, bp);
+ /*
+ * Set up values for the loop.
+ */
+ btp = xfs_dir2_block_tail_p(geo, hdr);
+ ptr = (char *)dp->d_ops->data_entry_p(hdr);
+ endptr = (char *)xfs_dir2_block_leaf_p(btp);
+
+ /*
+ * Loop over the data portion of the block.
+ * Each object is a real entry (dep) or an unused one (dup).
+ */
+ while (ptr < endptr) {
+ __uint8_t filetype;
+
+ dup = (xfs_dir2_data_unused_t *)ptr;
+ /*
+ * Unused, skip it.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ ptr += be16_to_cpu(dup->length);
+ continue;
+ }
+
+ dep = (xfs_dir2_data_entry_t *)ptr;
+
+ /*
+ * Bump pointer for the next iteration.
+ */
+ ptr += dp->d_ops->data_entsize(dep->namelen);
+ /*
+ * The entry is before the desired starting point, skip it.
+ */
+ if ((char *)dep - (char *)hdr < wantoff)
+ continue;
+
+ cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ (char *)dep - (char *)hdr);
+
+ ctx->pos = cook & 0x7fffffff;
+ filetype = dp->d_ops->data_get_ftype(dep);
+ /*
+ * If it didn't fit, set the final offset to here & return.
+ */
+ if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
+ be64_to_cpu(dep->inumber),
+ xfs_dir3_get_dtype(dp->i_mount, filetype))) {
+ xfs_trans_brelse(NULL, bp);
+ return 0;
+ }
+ }
+
+ /*
+ * Reached the end of the block.
+ * Set the offset to a non-existent block 1 and return.
+ */
+ ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
+ 0x7fffffff;
+ xfs_trans_brelse(NULL, bp);
+ return 0;
+}
+
+struct xfs_dir2_leaf_map_info {
+ xfs_extlen_t map_blocks; /* number of fsbs in map */
+ xfs_dablk_t map_off; /* last mapped file offset */
+ int map_size; /* total entries in *map */
+ int map_valid; /* valid entries in *map */
+ int nmap; /* mappings to ask xfs_bmapi */
+ xfs_dir2_db_t curdb; /* db for current block */
+ int ra_current; /* number of read-ahead blks */
+ int ra_index; /* *map index for read-ahead */
+ int ra_offset; /* map entry offset for ra */
+ int ra_want; /* readahead count wanted */
+ struct xfs_bmbt_irec map[]; /* map vector for blocks */
+};
+
+STATIC int
+xfs_dir2_leaf_readbuf(
+ struct xfs_da_args *args,
+ size_t bufsize,
+ struct xfs_dir2_leaf_map_info *mip,
+ xfs_dir2_off_t *curoff,
+ struct xfs_buf **bpp)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp = *bpp;
+ struct xfs_bmbt_irec *map = mip->map;
+ struct blk_plug plug;
+ int error = 0;
+ int length;
+ int i;
+ int j;
+ struct xfs_da_geometry *geo = args->geo;
+
+ /*
+ * If we have a buffer, we need to release it and
+ * take it out of the mapping.
+ */
+
+ if (bp) {
+ xfs_trans_brelse(NULL, bp);
+ bp = NULL;
+ mip->map_blocks -= geo->fsbcount;
+ /*
+ * Loop to get rid of the extents for the
+ * directory block.
+ */
+ for (i = geo->fsbcount; i > 0; ) {
+ j = min_t(int, map->br_blockcount, i);
+ map->br_blockcount -= j;
+ map->br_startblock += j;
+ map->br_startoff += j;
+ /*
+ * If mapping is done, pitch it from
+ * the table.
+ */
+ if (!map->br_blockcount && --mip->map_valid)
+ memmove(&map[0], &map[1],
+ sizeof(map[0]) * mip->map_valid);
+ i -= j;
+ }
+ }
+
+ /*
+ * Recalculate the readahead blocks wanted.
+ */
+ mip->ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)) - 1;
+ ASSERT(mip->ra_want >= 0);
+
+ /*
+ * If we don't have as many as we want, and we haven't
+ * run out of data blocks, get some more mappings.
+ */
+ if (1 + mip->ra_want > mip->map_blocks &&
+ mip->map_off < xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET)) {
+ /*
+ * Get more bmaps, fill in after the ones
+ * we already have in the table.
+ */
+ mip->nmap = mip->map_size - mip->map_valid;
+ error = xfs_bmapi_read(dp, mip->map_off,
+ xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET) -
+ mip->map_off,
+ &map[mip->map_valid], &mip->nmap, 0);
+
+ /*
+ * Don't know if we should ignore this or try to return an
+ * error. The trouble with returning errors is that readdir
+ * will just stop without actually passing the error through.
+ */
+ if (error)
+ goto out; /* XXX */
+
+ /*
+ * If we got all the mappings we asked for, set the final map
+ * offset based on the last bmap value received. Otherwise,
+ * we've reached the end.
+ */
+ if (mip->nmap == mip->map_size - mip->map_valid) {
+ i = mip->map_valid + mip->nmap - 1;
+ mip->map_off = map[i].br_startoff + map[i].br_blockcount;
+ } else
+ mip->map_off = xfs_dir2_byte_to_da(geo,
+ XFS_DIR2_LEAF_OFFSET);
+
+ /*
+ * Look for holes in the mapping, and eliminate them. Count up
+ * the valid blocks.
+ */
+ for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) {
+ if (map[i].br_startblock == HOLESTARTBLOCK) {
+ mip->nmap--;
+ length = mip->map_valid + mip->nmap - i;
+ if (length)
+ memmove(&map[i], &map[i + 1],
+ sizeof(map[i]) * length);
+ } else {
+ mip->map_blocks += map[i].br_blockcount;
+ i++;
+ }
+ }
+ mip->map_valid += mip->nmap;
+ }
+
+ /*
+ * No valid mappings, so no more data blocks.
+ */
+ if (!mip->map_valid) {
+ *curoff = xfs_dir2_da_to_byte(geo, mip->map_off);
+ goto out;
+ }
+
+ /*
+ * Read the directory block starting at the first mapping.
+ */
+ mip->curdb = xfs_dir2_da_to_db(geo, map->br_startoff);
+ error = xfs_dir3_data_read(NULL, dp, map->br_startoff,
+ map->br_blockcount >= geo->fsbcount ?
+ XFS_FSB_TO_DADDR(dp->i_mount, map->br_startblock) :
+ -1, &bp);
+ /*
+ * Should just skip over the data block instead of giving up.
+ */
+ if (error)
+ goto out; /* XXX */
+
+ /*
+ * Adjust the current amount of read-ahead: we just read a block that
+ * was previously ra.
+ */
+ if (mip->ra_current)
+ mip->ra_current -= geo->fsbcount;
+
+ /*
+ * Do we need more readahead?
+ */
+ blk_start_plug(&plug);
+ for (mip->ra_index = mip->ra_offset = i = 0;
+ mip->ra_want > mip->ra_current && i < mip->map_blocks;
+ i += geo->fsbcount) {
+ ASSERT(mip->ra_index < mip->map_valid);
+ /*
+ * Read-ahead a contiguous directory block.
+ */
+ if (i > mip->ra_current &&
+ map[mip->ra_index].br_blockcount >= geo->fsbcount) {
+ xfs_dir3_data_readahead(dp,
+ map[mip->ra_index].br_startoff + mip->ra_offset,
+ XFS_FSB_TO_DADDR(dp->i_mount,
+ map[mip->ra_index].br_startblock +
+ mip->ra_offset));
+ mip->ra_current = i;
+ }
+
+ /*
+ * Read-ahead a non-contiguous directory block. This doesn't
+ * use our mapping, but this is a very rare case.
+ */
+ else if (i > mip->ra_current) {
+ xfs_dir3_data_readahead(dp,
+ map[mip->ra_index].br_startoff +
+ mip->ra_offset, -1);
+ mip->ra_current = i;
+ }
+
+ /*
+ * Advance offset through the mapping table.
+ */
+ for (j = 0; j < geo->fsbcount; j += length ) {
+ /*
+ * The rest of this extent but not more than a dir
+ * block.
+ */
+ length = min_t(int, geo->fsbcount,
+ map[mip->ra_index].br_blockcount -
+ mip->ra_offset);
+ mip->ra_offset += length;
+
+ /*
+ * Advance to the next mapping if this one is used up.
+ */
+ if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
+ mip->ra_offset = 0;
+ mip->ra_index++;
+ }
+ }
+ }
+ blk_finish_plug(&plug);
+
+out:
+ *bpp = bp;
+ return error;
+}
+
+/*
+ * Getdents (readdir) for leaf and node directories.
+ * This reads the data blocks only, so is the same for both forms.
+ */
+STATIC int
+xfs_dir2_leaf_getdents(
+ struct xfs_da_args *args,
+ struct dir_context *ctx,
+ size_t bufsize)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp = NULL; /* data block buffer */
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
+ xfs_dir2_data_entry_t *dep; /* data entry */
+ xfs_dir2_data_unused_t *dup; /* unused entry */
+ int error = 0; /* error return value */
+ int length; /* temporary length value */
+ int byteoff; /* offset in current block */
+ xfs_dir2_off_t curoff; /* current overall offset */
+ xfs_dir2_off_t newoff; /* new curoff after new blk */
+ char *ptr = NULL; /* pointer to current data */
+ struct xfs_dir2_leaf_map_info *map_info;
+ struct xfs_da_geometry *geo = args->geo;
+
+ /*
+ * If the offset is at or past the largest allowed value,
+ * give up right away.
+ */
+ if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
+ return 0;
+
+ /*
+ * Set up to bmap a number of blocks based on the caller's
+ * buffer size, the directory block size, and the filesystem
+ * block size.
+ */
+ length = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
+ map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
+ (length * sizeof(struct xfs_bmbt_irec)),
+ KM_SLEEP | KM_NOFS);
+ map_info->map_size = length;
+
+ /*
+ * Inside the loop we keep the main offset value as a byte offset
+ * in the directory file.
+ */
+ curoff = xfs_dir2_dataptr_to_byte(ctx->pos);
+
+ /*
+ * Force this conversion through db so we truncate the offset
+ * down to get the start of the data block.
+ */
+ map_info->map_off = xfs_dir2_db_to_da(geo,
+ xfs_dir2_byte_to_db(geo, curoff));
+
+ /*
+ * Loop over directory entries until we reach the end offset.
+ * Get more blocks and readahead as necessary.
+ */
+ while (curoff < XFS_DIR2_LEAF_OFFSET) {
+ __uint8_t filetype;
+
+ /*
+ * If we have no buffer, or we're off the end of the
+ * current buffer, need to get another one.
+ */
+ if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
+
+ error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
+ &curoff, &bp);
+ if (error || !map_info->map_valid)
+ break;
+
+ /*
+ * Having done a read, we need to set a new offset.
+ */
+ newoff = xfs_dir2_db_off_to_byte(geo,
+ map_info->curdb, 0);
+ /*
+ * Start of the current block.
+ */
+ if (curoff < newoff)
+ curoff = newoff;
+ /*
+ * Make sure we're in the right block.
+ */
+ else if (curoff > newoff)
+ ASSERT(xfs_dir2_byte_to_db(geo, curoff) ==
+ map_info->curdb);
+ hdr = bp->b_addr;
+ xfs_dir3_data_check(dp, bp);
+ /*
+ * Find our position in the block.
+ */
+ ptr = (char *)dp->d_ops->data_entry_p(hdr);
+ byteoff = xfs_dir2_byte_to_off(geo, curoff);
+ /*
+ * Skip past the header.
+ */
+ if (byteoff == 0)
+ curoff += dp->d_ops->data_entry_offset;
+ /*
+ * Skip past entries until we reach our offset.
+ */
+ else {
+ while ((char *)ptr - (char *)hdr < byteoff) {
+ dup = (xfs_dir2_data_unused_t *)ptr;
+
+ if (be16_to_cpu(dup->freetag)
+ == XFS_DIR2_DATA_FREE_TAG) {
+
+ length = be16_to_cpu(dup->length);
+ ptr += length;
+ continue;
+ }
+ dep = (xfs_dir2_data_entry_t *)ptr;
+ length =
+ dp->d_ops->data_entsize(dep->namelen);
+ ptr += length;
+ }
+ /*
+ * Now set our real offset.
+ */
+ curoff =
+ xfs_dir2_db_off_to_byte(geo,
+ xfs_dir2_byte_to_db(geo, curoff),
+ (char *)ptr - (char *)hdr);
+ if (ptr >= (char *)hdr + geo->blksize) {
+ continue;
+ }
+ }
+ }
+ /*
+ * We have a pointer to an entry.
+ * Is it a live one?
+ */
+ dup = (xfs_dir2_data_unused_t *)ptr;
+ /*
+ * No, it's unused, skip over it.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ length = be16_to_cpu(dup->length);
+ ptr += length;
+ curoff += length;
+ continue;
+ }
+
+ dep = (xfs_dir2_data_entry_t *)ptr;
+ length = dp->d_ops->data_entsize(dep->namelen);
+ filetype = dp->d_ops->data_get_ftype(dep);
+
+ ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
+ if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
+ be64_to_cpu(dep->inumber),
+ xfs_dir3_get_dtype(dp->i_mount, filetype)))
+ break;
+
+ /*
+ * Advance to next entry in the block.
+ */
+ ptr += length;
+ curoff += length;
+ /* bufsize may have just been a guess; don't go negative */
+ bufsize = bufsize > length ? bufsize - length : 0;
+ }
+
+ /*
+ * All done. Set output offset value to current offset.
+ */
+ if (curoff > xfs_dir2_dataptr_to_byte(XFS_DIR2_MAX_DATAPTR))
+ ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
+ else
+ ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
+ kmem_free(map_info);
+ if (bp)
+ xfs_trans_brelse(NULL, bp);
+ return error;
+}
+
+/*
+ * Read a directory.
+ */
+int
+xfs_readdir(
+ struct xfs_inode *dp,
+ struct dir_context *ctx,
+ size_t bufsize)
+{
+ struct xfs_da_args args = { NULL };
+ int rval;
+ int v;
+ uint lock_mode;
+
+ trace_xfs_readdir(dp);
+
+ if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ return XFS_ERROR(EIO);
+
+ ASSERT(S_ISDIR(dp->i_d.di_mode));
+ XFS_STATS_INC(xs_dir_getdents);
+
+ args.dp = dp;
+ args.geo = dp->i_mount->m_dir_geo;
+
+ lock_mode = xfs_ilock_data_map_shared(dp);
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+ rval = xfs_dir2_sf_getdents(&args, ctx);
+ else if ((rval = xfs_dir2_isblock(&args, &v)))
+ ;
+ else if (v)
+ rval = xfs_dir2_block_getdents(&args, ctx);
+ else
+ rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
+ xfs_iunlock(dp, lock_mode);
+
+ return rval;
+}
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index ec8e7476c8b..53c3be619db 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -17,29 +17,22 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
+#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
-#include "xfs_dir_leaf.h"
#include "xfs_error.h"
-#include "xfs_dir2_data.h"
-#include "xfs_dir2_leaf.h"
-#include "xfs_dir2_block.h"
-#include "xfs_dir2_trace.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trace.h"
+#include "xfs_dinode.h"
/*
* Prototypes for internal functions.
@@ -72,7 +65,7 @@ static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
int /* size for sf form */
xfs_dir2_block_sfsize(
xfs_inode_t *dp, /* incore inode pointer */
- xfs_dir2_block_t *block, /* block directory data */
+ xfs_dir2_data_hdr_t *hdr, /* block directory data */
xfs_dir2_sf_hdr_t *sfhp) /* output: header for sf form */
{
xfs_dir2_dataptr_t addr; /* data entry address */
@@ -88,24 +81,33 @@ xfs_dir2_block_sfsize(
int namelen; /* total name bytes */
xfs_ino_t parent = 0; /* parent inode number */
int size=0; /* total computed size */
+ int has_ftype;
+ struct xfs_da_geometry *geo;
mp = dp->i_mount;
+ geo = mp->m_dir_geo;
+
+ /*
+ * if there is a filetype field, add the extra byte to the namelen
+ * for each entry that we see.
+ */
+ has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0;
count = i8count = namelen = 0;
- btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
- blp = XFS_DIR2_BLOCK_LEAF_P(btp);
+ btp = xfs_dir2_block_tail_p(geo, hdr);
+ blp = xfs_dir2_block_leaf_p(btp);
/*
* Iterate over the block's data entries by using the leaf pointers.
*/
- for (i = 0; i < INT_GET(btp->count, ARCH_CONVERT); i++) {
- if ((addr = INT_GET(blp[i].address, ARCH_CONVERT)) == XFS_DIR2_NULL_DATAPTR)
+ for (i = 0; i < be32_to_cpu(btp->count); i++) {
+ if ((addr = be32_to_cpu(blp[i].address)) == XFS_DIR2_NULL_DATAPTR)
continue;
/*
* Calculate the pointer to the entry at hand.
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(geo, addr));
/*
* Detect . and .., so we can special-case them.
* . is not included in sf directories.
@@ -117,17 +119,18 @@ xfs_dir2_block_sfsize(
dep->name[0] == '.' && dep->name[1] == '.';
#if XFS_BIG_INUMS
if (!isdot)
- i8count += INT_GET(dep->inumber, ARCH_CONVERT) > XFS_DIR2_MAX_SHORT_INUM;
+ i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
#endif
+ /* take into account the file type field */
if (!isdot && !isdotdot) {
count++;
- namelen += dep->namelen;
+ namelen += dep->namelen + has_ftype;
} else if (isdotdot)
- parent = INT_GET(dep->inumber, ARCH_CONVERT);
+ parent = be64_to_cpu(dep->inumber);
/*
* Calculate the new size, see if we should give up yet.
*/
- size = XFS_DIR2_SF_HDR_SIZE(i8count) + /* header */
+ size = xfs_dir2_sf_hdr_size(i8count) + /* header */
count + /* namelen */
count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
namelen + /* name */
@@ -142,7 +145,7 @@ xfs_dir2_block_sfsize(
*/
sfhp->count = count;
sfhp->i8count = i8count;
- XFS_DIR2_SF_PUT_INUMBER((xfs_dir2_sf_t *)sfhp, &parent, &sfhp->parent);
+ dp->d_ops->sf_put_parent_ino(sfhp, parent);
return size;
}
@@ -153,11 +156,11 @@ xfs_dir2_block_sfsize(
int /* error */
xfs_dir2_block_to_sf(
xfs_da_args_t *args, /* operation arguments */
- xfs_dabuf_t *bp, /* block buffer */
+ struct xfs_buf *bp,
int size, /* shortform directory size */
xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */
{
- xfs_dir2_block_t *block; /* block structure */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
xfs_dir2_block_tail_t *btp; /* block tail pointer */
xfs_dir2_data_entry_t *dep; /* data entry pointer */
xfs_inode_t *dp; /* incore directory inode */
@@ -168,49 +171,36 @@ xfs_dir2_block_to_sf(
xfs_mount_t *mp; /* filesystem mount point */
char *ptr; /* current data pointer */
xfs_dir2_sf_entry_t *sfep; /* shortform entry */
- xfs_dir2_sf_t *sfp; /* shortform structure */
- xfs_ino_t temp;
+ xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */
+ xfs_dir2_sf_hdr_t *dst; /* temporary data buffer */
+
+ trace_xfs_dir2_block_to_sf(args);
- xfs_dir2_trace_args_sb("block_to_sf", args, size, bp);
dp = args->dp;
mp = dp->i_mount;
/*
- * Make a copy of the block data, so we can shrink the inode
- * and add local data.
+ * allocate a temporary destination buffer the size of the inode
+ * to format the data into. Once we have formatted the data, we
+ * can free the block and copy the formatted data into the inode literal
+ * area.
*/
- block = kmem_alloc(mp->m_dirblksize, KM_SLEEP);
- memcpy(block, bp->data, mp->m_dirblksize);
- logflags = XFS_ILOG_CORE;
- if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) {
- ASSERT(error != ENOSPC);
- goto out;
- }
- /*
- * The buffer is now unconditionally gone, whether
- * xfs_dir2_shrink_inode worked or not.
- *
- * Convert the inode to local format.
- */
- dp->i_df.if_flags &= ~XFS_IFEXTENTS;
- dp->i_df.if_flags |= XFS_IFINLINE;
- dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
- ASSERT(dp->i_df.if_bytes == 0);
- xfs_idata_realloc(dp, size, XFS_DATA_FORK);
- logflags |= XFS_ILOG_DDATA;
+ dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
+ hdr = bp->b_addr;
+
/*
* Copy the header into the newly allocate local space.
*/
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
- memcpy(sfp, sfhp, XFS_DIR2_SF_HDR_SIZE(sfhp->i8count));
- dp->i_d.di_size = size;
+ sfp = (xfs_dir2_sf_hdr_t *)dst;
+ memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
+
/*
* Set up to loop over the block's entries.
*/
- btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
- ptr = (char *)block->u;
- endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp);
- sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ ptr = (char *)dp->d_ops->data_entry_p(hdr);
+ endptr = (char *)xfs_dir2_block_leaf_p(btp);
+ sfep = xfs_dir2_sf_firstentry(sfp);
/*
* Loop over the active and unused entries.
* Stop when we reach the leaf/tail portion of the block.
@@ -220,8 +210,8 @@ xfs_dir2_block_to_sf(
* If it's unused, just skip over it.
*/
dup = (xfs_dir2_data_unused_t *)ptr;
- if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
- ptr += INT_GET(dup->length, ARCH_CONVERT);
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ ptr += be16_to_cpu(dup->length);
continue;
}
dep = (xfs_dir2_data_entry_t *)ptr;
@@ -229,35 +219,61 @@ xfs_dir2_block_to_sf(
* Skip .
*/
if (dep->namelen == 1 && dep->name[0] == '.')
- ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) == dp->i_ino);
+ ASSERT(be64_to_cpu(dep->inumber) == dp->i_ino);
/*
* Skip .., but make sure the inode number is right.
*/
else if (dep->namelen == 2 &&
dep->name[0] == '.' && dep->name[1] == '.')
- ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) ==
- XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent));
+ ASSERT(be64_to_cpu(dep->inumber) ==
+ dp->d_ops->sf_get_parent_ino(sfp));
/*
* Normal entry, copy it into shortform.
*/
else {
sfep->namelen = dep->namelen;
- XFS_DIR2_SF_PUT_OFFSET(sfep,
+ xfs_dir2_sf_put_offset(sfep,
(xfs_dir2_data_aoff_t)
- ((char *)dep - (char *)block));
+ ((char *)dep - (char *)hdr));
memcpy(sfep->name, dep->name, dep->namelen);
- temp=INT_GET(dep->inumber, ARCH_CONVERT);
- XFS_DIR2_SF_PUT_INUMBER(sfp, &temp,
- XFS_DIR2_SF_INUMBERP(sfep));
- sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
+ dp->d_ops->sf_put_ino(sfp, sfep,
+ be64_to_cpu(dep->inumber));
+ dp->d_ops->sf_put_ftype(sfep,
+ dp->d_ops->data_get_ftype(dep));
+
+ sfep = dp->d_ops->sf_nextentry(sfp, sfep);
}
- ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
+ ptr += dp->d_ops->data_entsize(dep->namelen);
}
ASSERT((char *)sfep - (char *)sfp == size);
+
+ /* now we are done with the block, we can shrink the inode */
+ logflags = XFS_ILOG_CORE;
+ error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp);
+ if (error) {
+ ASSERT(error != ENOSPC);
+ goto out;
+ }
+
+ /*
+ * The buffer is now unconditionally gone, whether
+ * xfs_dir2_shrink_inode worked or not.
+ *
+ * Convert the inode to local format and copy the data in.
+ */
+ dp->i_df.if_flags &= ~XFS_IFEXTENTS;
+ dp->i_df.if_flags |= XFS_IFINLINE;
+ dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+ ASSERT(dp->i_df.if_bytes == 0);
+ xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+
+ logflags |= XFS_ILOG_DDATA;
+ memcpy(dp->i_df.if_u1.if_data, dst, size);
+ dp->i_d.di_size = size;
xfs_dir2_sf_check(args);
out:
xfs_trans_log_inode(args->trans, dp, logflags);
- kmem_free(block, mp->m_dirblksize);
+ kmem_free(dst);
return error;
}
@@ -271,19 +287,18 @@ int /* error */
xfs_dir2_sf_addname(
xfs_da_args_t *args) /* operation arguments */
{
- int add_entsize; /* size of the new entry */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return value */
int incr_isize; /* total change in size */
int new_isize; /* di_size after adding name */
int objchange; /* changing to 8-byte inodes */
xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */
- int old_isize; /* di_size before adding name */
int pick; /* which algorithm to use */
- xfs_dir2_sf_t *sfp; /* shortform structure */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */
- xfs_dir2_trace_args("sf_addname", args);
+ trace_xfs_dir2_sf_addname(args);
+
ASSERT(xfs_dir2_sf_lookup(args) == ENOENT);
dp = args->dp;
ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
@@ -296,34 +311,29 @@ xfs_dir2_sf_addname(
}
ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
ASSERT(dp->i_df.if_u1.if_data != NULL);
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
- ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
/*
* Compute entry (and change in) size.
*/
- add_entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen);
- incr_isize = add_entsize;
+ incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen);
objchange = 0;
#if XFS_BIG_INUMS
/*
* Do we have to change to 8 byte inodes?
*/
- if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) {
+ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
/*
- * Yes, adjust the entry size and the total size.
+ * Yes, adjust the inode size. old count + (parent + new)
*/
- add_entsize +=
- (uint)sizeof(xfs_dir2_ino8_t) -
- (uint)sizeof(xfs_dir2_ino4_t);
incr_isize +=
- (sfp->hdr.count + 2) *
+ (sfp->count + 2) *
((uint)sizeof(xfs_dir2_ino8_t) -
(uint)sizeof(xfs_dir2_ino4_t));
objchange = 1;
}
#endif
- old_isize = (int)dp->i_d.di_size;
- new_isize = old_isize + incr_isize;
+ new_isize = (int)dp->i_d.di_size + incr_isize;
/*
* Won't fit as shortform any more (due to size),
* or the pick routine says it won't (due to offset values).
@@ -334,7 +344,7 @@ xfs_dir2_sf_addname(
/*
* Just checking or no space reservation, it doesn't fit.
*/
- if (args->justcheck || args->total == 0)
+ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
return XFS_ERROR(ENOSPC);
/*
* Convert to block form then add the name.
@@ -347,7 +357,7 @@ xfs_dir2_sf_addname(
/*
* Just checking, it fits.
*/
- if (args->justcheck)
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK)
return 0;
/*
* Do it the easy way - just add it at the end.
@@ -386,37 +396,38 @@ xfs_dir2_sf_addname_easy(
{
int byteoff; /* byte offset in sf dir */
xfs_inode_t *dp; /* incore directory inode */
- xfs_dir2_sf_t *sfp; /* shortform structure */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
dp = args->dp;
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
byteoff = (int)((char *)sfep - (char *)sfp);
/*
* Grow the in-inode space.
*/
- xfs_idata_realloc(dp, XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen),
- XFS_DATA_FORK);
+ xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen),
+ XFS_DATA_FORK);
/*
* Need to set up again due to realloc of the inode data.
*/
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff);
/*
* Fill in the new entry.
*/
sfep->namelen = args->namelen;
- XFS_DIR2_SF_PUT_OFFSET(sfep, offset);
+ xfs_dir2_sf_put_offset(sfep, offset);
memcpy(sfep->name, args->name, sfep->namelen);
- XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber,
- XFS_DIR2_SF_INUMBERP(sfep));
+ dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+ dp->d_ops->sf_put_ftype(sfep, args->filetype);
+
/*
* Update the header and inode.
*/
- sfp->hdr.count++;
+ sfp->count++;
#if XFS_BIG_INUMS
if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
- sfp->hdr.i8count++;
+ sfp->i8count++;
#endif
dp->i_d.di_size = new_isize;
xfs_dir2_sf_check(args);
@@ -446,34 +457,36 @@ xfs_dir2_sf_addname_hard(
xfs_dir2_data_aoff_t offset; /* current offset value */
int old_isize; /* previous di_size */
xfs_dir2_sf_entry_t *oldsfep; /* entry in original dir */
- xfs_dir2_sf_t *oldsfp; /* original shortform dir */
+ xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */
xfs_dir2_sf_entry_t *sfep; /* entry in new dir */
- xfs_dir2_sf_t *sfp; /* new shortform dir */
+ xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */
+ struct xfs_mount *mp;
/*
* Copy the old directory to the stack buffer.
*/
dp = args->dp;
+ mp = dp->i_mount;
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
old_isize = (int)dp->i_d.di_size;
buf = kmem_alloc(old_isize, KM_SLEEP);
- oldsfp = (xfs_dir2_sf_t *)buf;
+ oldsfp = (xfs_dir2_sf_hdr_t *)buf;
memcpy(oldsfp, sfp, old_isize);
/*
* Loop over the old directory finding the place we're going
* to insert the new entry.
* If it's going to end up at the end then oldsfep will point there.
*/
- for (offset = XFS_DIR2_DATA_FIRST_OFFSET,
- oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp),
- add_datasize = XFS_DIR2_DATA_ENTSIZE(args->namelen),
+ for (offset = dp->d_ops->data_first_offset,
+ oldsfep = xfs_dir2_sf_firstentry(oldsfp),
+ add_datasize = dp->d_ops->data_entsize(args->namelen),
eof = (char *)oldsfep == &buf[old_isize];
!eof;
- offset = new_offset + XFS_DIR2_DATA_ENTSIZE(oldsfep->namelen),
- oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep),
+ offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen),
+ oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep),
eof = (char *)oldsfep == &buf[old_isize]) {
- new_offset = XFS_DIR2_SF_GET_OFFSET(oldsfep);
+ new_offset = xfs_dir2_sf_get_offset(oldsfep);
if (offset + add_datasize <= new_offset)
break;
}
@@ -487,7 +500,7 @@ xfs_dir2_sf_addname_hard(
/*
* Reset the pointer since the buffer was reallocated.
*/
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
/*
* Copy the first part of the directory, including the header.
*/
@@ -498,23 +511,23 @@ xfs_dir2_sf_addname_hard(
* Fill in the new entry, and update the header counts.
*/
sfep->namelen = args->namelen;
- XFS_DIR2_SF_PUT_OFFSET(sfep, offset);
+ xfs_dir2_sf_put_offset(sfep, offset);
memcpy(sfep->name, args->name, sfep->namelen);
- XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber,
- XFS_DIR2_SF_INUMBERP(sfep));
- sfp->hdr.count++;
+ dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+ dp->d_ops->sf_put_ftype(sfep, args->filetype);
+ sfp->count++;
#if XFS_BIG_INUMS
if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
- sfp->hdr.i8count++;
+ sfp->i8count++;
#endif
/*
* If there's more left to copy, do that.
*/
if (!eof) {
- sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
+ sfep = dp->d_ops->sf_nextentry(sfp, sfep);
memcpy(sfep, oldsfep, old_isize - nbytes);
}
- kmem_free(buf, old_isize);
+ kmem_free(buf);
dp->i_d.di_size = new_isize;
xfs_dir2_sf_check(args);
}
@@ -539,43 +552,43 @@ xfs_dir2_sf_addname_pick(
xfs_mount_t *mp; /* filesystem mount point */
xfs_dir2_data_aoff_t offset; /* data block offset */
xfs_dir2_sf_entry_t *sfep; /* shortform entry */
- xfs_dir2_sf_t *sfp; /* shortform structure */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
int size; /* entry's data size */
int used; /* data bytes used */
dp = args->dp;
mp = dp->i_mount;
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
- size = XFS_DIR2_DATA_ENTSIZE(args->namelen);
- offset = XFS_DIR2_DATA_FIRST_OFFSET;
- sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ size = dp->d_ops->data_entsize(args->namelen);
+ offset = dp->d_ops->data_first_offset;
+ sfep = xfs_dir2_sf_firstentry(sfp);
holefit = 0;
/*
* Loop over sf entries.
* Keep track of data offset and whether we've seen a place
* to insert the new entry.
*/
- for (i = 0; i < sfp->hdr.count; i++) {
+ for (i = 0; i < sfp->count; i++) {
if (!holefit)
- holefit = offset + size <= XFS_DIR2_SF_GET_OFFSET(sfep);
- offset = XFS_DIR2_SF_GET_OFFSET(sfep) +
- XFS_DIR2_DATA_ENTSIZE(sfep->namelen);
- sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
+ holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
+ offset = xfs_dir2_sf_get_offset(sfep) +
+ dp->d_ops->data_entsize(sfep->namelen);
+ sfep = dp->d_ops->sf_nextentry(sfp, sfep);
}
/*
* Calculate data bytes used excluding the new entry, if this
* was a data block (block form directory).
*/
used = offset +
- (sfp->hdr.count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+ (sfp->count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
(uint)sizeof(xfs_dir2_block_tail_t);
/*
* If it won't fit in a block form then we can't insert it,
* we'll go back, convert to block, then try the insert and convert
* to leaf.
*/
- if (used + (holefit ? 0 : size) > mp->m_dirblksize)
+ if (used + (holefit ? 0 : size) > args->geo->blksize)
return 0;
/*
* If changing the inode number size, do it the hard way.
@@ -590,7 +603,7 @@ xfs_dir2_sf_addname_pick(
/*
* If it won't fit at the end then do it the hard way (use the hole).
*/
- if (used + size > mp->m_dirblksize)
+ if (used + size > args->geo->blksize)
return 2;
/*
* Do it the easy way.
@@ -614,32 +627,34 @@ xfs_dir2_sf_check(
xfs_ino_t ino; /* entry inode number */
int offset; /* data offset */
xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */
- xfs_dir2_sf_t *sfp; /* shortform structure */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ struct xfs_mount *mp;
dp = args->dp;
+ mp = dp->i_mount;
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
- offset = XFS_DIR2_DATA_FIRST_OFFSET;
- ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent);
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ offset = dp->d_ops->data_first_offset;
+ ino = dp->d_ops->sf_get_parent_ino(sfp);
i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
- for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
- i < sfp->hdr.count;
- i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
- ASSERT(XFS_DIR2_SF_GET_OFFSET(sfep) >= offset);
- ino = XFS_DIR2_SF_GET_INUMBER(sfp, XFS_DIR2_SF_INUMBERP(sfep));
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
+ i < sfp->count;
+ i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+ ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
+ ino = dp->d_ops->sf_get_ino(sfp, sfep);
i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
offset =
- XFS_DIR2_SF_GET_OFFSET(sfep) +
- XFS_DIR2_DATA_ENTSIZE(sfep->namelen);
+ xfs_dir2_sf_get_offset(sfep) +
+ dp->d_ops->data_entsize(sfep->namelen);
+ ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX);
}
- ASSERT(i8count == sfp->hdr.i8count);
+ ASSERT(i8count == sfp->i8count);
ASSERT(XFS_BIG_INUMS || i8count == 0);
ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
ASSERT(offset +
- (sfp->hdr.count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
- (uint)sizeof(xfs_dir2_block_tail_t) <=
- dp->i_mount->m_dirblksize);
+ (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+ (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize);
}
#endif /* DEBUG */
@@ -653,10 +668,11 @@ xfs_dir2_sf_create(
{
xfs_inode_t *dp; /* incore directory inode */
int i8count; /* parent inode is an 8-byte number */
- xfs_dir2_sf_t *sfp; /* shortform structure */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
int size; /* directory size */
- xfs_dir2_trace_args_i("sf_create", args, pino);
+ trace_xfs_dir2_sf_create(args);
+
dp = args->dp;
ASSERT(dp != NULL);
@@ -674,7 +690,7 @@ xfs_dir2_sf_create(
ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
ASSERT(dp->i_df.if_bytes == 0);
i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
- size = XFS_DIR2_SF_HDR_SIZE(i8count);
+ size = xfs_dir2_sf_hdr_size(i8count);
/*
* Make a buffer for the data.
*/
@@ -682,164 +698,19 @@ xfs_dir2_sf_create(
/*
* Fill in the header,
*/
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
- sfp->hdr.i8count = i8count;
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp->i8count = i8count;
/*
* Now can put in the inode number, since i8count is set.
*/
- XFS_DIR2_SF_PUT_INUMBER(sfp, &pino, &sfp->hdr.parent);
- sfp->hdr.count = 0;
+ dp->d_ops->sf_put_parent_ino(sfp, pino);
+ sfp->count = 0;
dp->i_d.di_size = size;
xfs_dir2_sf_check(args);
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
return 0;
}
-int /* error */
-xfs_dir2_sf_getdents(
- xfs_inode_t *dp, /* incore directory inode */
- uio_t *uio, /* caller's buffer control */
- int *eofp, /* eof reached? (out) */
- xfs_dirent_t *dbp, /* caller's buffer */
- xfs_dir2_put_t put) /* abi's formatting function */
-{
- int error; /* error return value */
- int i; /* shortform entry number */
- xfs_mount_t *mp; /* filesystem mount point */
- xfs_dir2_dataptr_t off; /* current entry's offset */
- xfs_dir2_put_args_t p; /* arg package for put rtn */
- xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
- xfs_dir2_sf_t *sfp; /* shortform structure */
- xfs_off_t dir_offset;
-
- mp = dp->i_mount;
-
- ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
- /*
- * Give up if the directory is way too short.
- */
- if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
- return XFS_ERROR(EIO);
- }
-
- dir_offset = uio->uio_offset;
-
- ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
-
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
-
- ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
-
- /*
- * If the block number in the offset is out of range, we're done.
- */
- if (XFS_DIR2_DATAPTR_TO_DB(mp, dir_offset) > mp->m_dirdatablk) {
- *eofp = 1;
- return 0;
- }
-
- /*
- * Set up putargs structure.
- */
- p.dbp = dbp;
- p.put = put;
- p.uio = uio;
- /*
- * Put . entry unless we're starting past it.
- */
- if (dir_offset <=
- XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
- XFS_DIR2_DATA_DOT_OFFSET)) {
- p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, 0,
- XFS_DIR2_DATA_DOTDOT_OFFSET);
- p.ino = dp->i_ino;
-#if XFS_BIG_INUMS
- p.ino += mp->m_inoadd;
-#endif
- p.name = ".";
- p.namelen = 1;
-
- error = p.put(&p);
-
- if (!p.done) {
- uio->uio_offset =
- XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
- XFS_DIR2_DATA_DOT_OFFSET);
- return error;
- }
- }
-
- /*
- * Put .. entry unless we're starting past it.
- */
- if (dir_offset <=
- XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
- XFS_DIR2_DATA_DOTDOT_OFFSET)) {
- p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
- XFS_DIR2_DATA_FIRST_OFFSET);
- p.ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent);
-#if XFS_BIG_INUMS
- p.ino += mp->m_inoadd;
-#endif
- p.name = "..";
- p.namelen = 2;
-
- error = p.put(&p);
-
- if (!p.done) {
- uio->uio_offset =
- XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
- XFS_DIR2_DATA_DOTDOT_OFFSET);
- return error;
- }
- }
-
- /*
- * Loop while there are more entries and put'ing works.
- */
- for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
- i < sfp->hdr.count;
- i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
-
- off = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
- XFS_DIR2_SF_GET_OFFSET(sfep));
-
- if (dir_offset > off)
- continue;
-
- p.namelen = sfep->namelen;
-
- p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
- XFS_DIR2_SF_GET_OFFSET(sfep) +
- XFS_DIR2_DATA_ENTSIZE(p.namelen));
-
- p.ino = XFS_DIR2_SF_GET_INUMBER(sfp, XFS_DIR2_SF_INUMBERP(sfep));
-#if XFS_BIG_INUMS
- p.ino += mp->m_inoadd;
-#endif
- p.name = (char *)sfep->name;
-
- error = p.put(&p);
-
- if (!p.done) {
- uio->uio_offset = off;
- return error;
- }
- }
-
- /*
- * They all fit.
- */
- *eofp = 1;
-
- uio->uio_offset =
- XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0);
-
- return 0;
-}
-
/*
* Lookup an entry in a shortform directory.
* Returns EEXIST if found, ENOENT if not found.
@@ -850,10 +721,14 @@ xfs_dir2_sf_lookup(
{
xfs_inode_t *dp; /* incore directory inode */
int i; /* entry index */
+ int error;
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
- xfs_dir2_sf_t *sfp; /* shortform structure */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ enum xfs_dacmp cmp; /* comparison result */
+ xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */
+
+ trace_xfs_dir2_sf_lookup(args);
- xfs_dir2_trace_args("sf_lookup", args);
xfs_dir2_sf_check(args);
dp = args->dp;
@@ -867,13 +742,15 @@ xfs_dir2_sf_lookup(
}
ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
ASSERT(dp->i_df.if_u1.if_data != NULL);
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
- ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
/*
* Special case for .
*/
if (args->namelen == 1 && args->name[0] == '.') {
args->inumber = dp->i_ino;
+ args->cmpresult = XFS_CMP_EXACT;
+ args->filetype = XFS_DIR3_FT_DIR;
return XFS_ERROR(EEXIST);
}
/*
@@ -881,29 +758,43 @@ xfs_dir2_sf_lookup(
*/
if (args->namelen == 2 &&
args->name[0] == '.' && args->name[1] == '.') {
- args->inumber = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent);
+ args->inumber = dp->d_ops->sf_get_parent_ino(sfp);
+ args->cmpresult = XFS_CMP_EXACT;
+ args->filetype = XFS_DIR3_FT_DIR;
return XFS_ERROR(EEXIST);
}
/*
* Loop over all the entries trying to match ours.
*/
- for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
- i < sfp->hdr.count;
- i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
- if (sfep->namelen == args->namelen &&
- sfep->name[0] == args->name[0] &&
- memcmp(args->name, sfep->name, args->namelen) == 0) {
- args->inumber =
- XFS_DIR2_SF_GET_INUMBER(sfp,
- XFS_DIR2_SF_INUMBERP(sfep));
- return XFS_ERROR(EEXIST);
+ ci_sfep = NULL;
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+ i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+ /*
+ * Compare name and if it's an exact match, return the inode
+ * number. If it's the first case-insensitive match, store the
+ * inode number and continue looking for an exact match.
+ */
+ cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name,
+ sfep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ args->cmpresult = cmp;
+ args->inumber = dp->d_ops->sf_get_ino(sfp, sfep);
+ args->filetype = dp->d_ops->sf_get_ftype(sfep);
+ if (cmp == XFS_CMP_EXACT)
+ return XFS_ERROR(EEXIST);
+ ci_sfep = sfep;
}
}
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
/*
- * Didn't find it.
+ * Here, we can only be doing a lookup (not a rename or replace).
+ * If a case-insensitive match was not found, return ENOENT.
*/
- ASSERT(args->oknoent);
- return XFS_ERROR(ENOENT);
+ if (!ci_sfep)
+ return XFS_ERROR(ENOENT);
+ /* otherwise process the CI match as required by the caller */
+ error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
+ return XFS_ERROR(error);
}
/*
@@ -920,9 +811,10 @@ xfs_dir2_sf_removename(
int newsize; /* new inode size */
int oldsize; /* old inode size */
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
- xfs_dir2_sf_t *sfp; /* shortform structure */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+
+ trace_xfs_dir2_sf_removename(args);
- xfs_dir2_trace_args("sf_removename", args);
dp = args->dp;
ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
@@ -936,35 +828,31 @@ xfs_dir2_sf_removename(
}
ASSERT(dp->i_df.if_bytes == oldsize);
ASSERT(dp->i_df.if_u1.if_data != NULL);
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
- ASSERT(oldsize >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count));
/*
* Loop over the old directory entries.
* Find the one we're deleting.
*/
- for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
- i < sfp->hdr.count;
- i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
- if (sfep->namelen == args->namelen &&
- sfep->name[0] == args->name[0] &&
- memcmp(sfep->name, args->name, args->namelen) == 0) {
- ASSERT(XFS_DIR2_SF_GET_INUMBER(sfp,
- XFS_DIR2_SF_INUMBERP(sfep)) ==
- args->inumber);
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+ i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+ if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+ XFS_CMP_EXACT) {
+ ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) ==
+ args->inumber);
break;
}
}
/*
* Didn't find it.
*/
- if (i == sfp->hdr.count) {
+ if (i == sfp->count)
return XFS_ERROR(ENOENT);
- }
/*
* Calculate sizes.
*/
byteoff = (int)((char *)sfep - (char *)sfp);
- entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen);
+ entsize = dp->d_ops->sf_entsize(sfp, args->namelen);
newsize = oldsize - entsize;
/*
* Copy the part if any after the removed entry, sliding it down.
@@ -975,22 +863,22 @@ xfs_dir2_sf_removename(
/*
* Fix up the header and file size.
*/
- sfp->hdr.count--;
+ sfp->count--;
dp->i_d.di_size = newsize;
/*
* Reallocate, making it smaller.
*/
xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
#if XFS_BIG_INUMS
/*
* Are we changing inode number size?
*/
if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
- if (sfp->hdr.i8count == 1)
+ if (sfp->i8count == 1)
xfs_dir2_sf_toino4(args);
else
- sfp->hdr.i8count--;
+ sfp->i8count--;
}
#endif
xfs_dir2_sf_check(args);
@@ -1014,9 +902,10 @@ xfs_dir2_sf_replace(
int i8elevated; /* sf_toino8 set i8count=1 */
#endif
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
- xfs_dir2_sf_t *sfp; /* shortform structure */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+
+ trace_xfs_dir2_sf_replace(args);
- xfs_dir2_trace_args("sf_replace", args);
dp = args->dp;
ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
@@ -1029,19 +918,19 @@ xfs_dir2_sf_replace(
}
ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
ASSERT(dp->i_df.if_u1.if_data != NULL);
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
- ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
#if XFS_BIG_INUMS
/*
* New inode number is large, and need to convert to 8-byte inodes.
*/
- if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) {
+ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
int error; /* error return value */
int newsize; /* new inode size */
newsize =
dp->i_df.if_bytes +
- (sfp->hdr.count + 1) *
+ (sfp->count + 1) *
((uint)sizeof(xfs_dir2_ino8_t) -
(uint)sizeof(xfs_dir2_ino4_t));
/*
@@ -1059,7 +948,7 @@ xfs_dir2_sf_replace(
*/
xfs_dir2_sf_toino8(args);
i8elevated = 1;
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
} else
i8elevated = 0;
#endif
@@ -1070,36 +959,33 @@ xfs_dir2_sf_replace(
if (args->namelen == 2 &&
args->name[0] == '.' && args->name[1] == '.') {
#if XFS_BIG_INUMS || defined(DEBUG)
- ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent);
+ ino = dp->d_ops->sf_get_parent_ino(sfp);
ASSERT(args->inumber != ino);
#endif
- XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, &sfp->hdr.parent);
+ dp->d_ops->sf_put_parent_ino(sfp, args->inumber);
}
/*
* Normal entry, look for the name.
*/
else {
- for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
- i < sfp->hdr.count;
- i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
- if (sfep->namelen == args->namelen &&
- sfep->name[0] == args->name[0] &&
- memcmp(args->name, sfep->name, args->namelen) == 0) {
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+ i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+ if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+ XFS_CMP_EXACT) {
#if XFS_BIG_INUMS || defined(DEBUG)
- ino = XFS_DIR2_SF_GET_INUMBER(sfp,
- XFS_DIR2_SF_INUMBERP(sfep));
+ ino = dp->d_ops->sf_get_ino(sfp, sfep);
ASSERT(args->inumber != ino);
#endif
- XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber,
- XFS_DIR2_SF_INUMBERP(sfep));
+ dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+ dp->d_ops->sf_put_ftype(sfep, args->filetype);
break;
}
}
/*
* Didn't find it.
*/
- if (i == sfp->hdr.count) {
- ASSERT(args->oknoent);
+ if (i == sfp->count) {
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
#if XFS_BIG_INUMS
if (i8elevated)
xfs_dir2_sf_toino4(args);
@@ -1116,10 +1002,10 @@ xfs_dir2_sf_replace(
/*
* And the old count was one, so need to convert to small.
*/
- if (sfp->hdr.i8count == 1)
+ if (sfp->i8count == 1)
xfs_dir2_sf_toino4(args);
else
- sfp->hdr.i8count--;
+ sfp->i8count--;
}
/*
* See if the old number was small, the new number is large.
@@ -1130,9 +1016,9 @@ xfs_dir2_sf_replace(
* add to the i8count unless we just converted to 8-byte
* inodes (which does an implied i8count = 1)
*/
- ASSERT(sfp->hdr.i8count != 0);
+ ASSERT(sfp->i8count != 0);
if (!i8elevated)
- sfp->hdr.i8count++;
+ sfp->i8count++;
}
#endif
xfs_dir2_sf_check(args);
@@ -1152,16 +1038,18 @@ xfs_dir2_sf_toino4(
char *buf; /* old dir's buffer */
xfs_inode_t *dp; /* incore directory inode */
int i; /* entry index */
- xfs_ino_t ino; /* entry inode number */
int newsize; /* new inode size */
xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
- xfs_dir2_sf_t *oldsfp; /* old sf directory */
+ xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */
int oldsize; /* old inode size */
xfs_dir2_sf_entry_t *sfep; /* new sf entry */
- xfs_dir2_sf_t *sfp; /* new sf directory */
+ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
+ struct xfs_mount *mp;
+
+ trace_xfs_dir2_sf_toino4(args);
- xfs_dir2_trace_args("sf_toino4", args);
dp = args->dp;
+ mp = dp->i_mount;
/*
* Copy the old directory to the buffer.
@@ -1170,57 +1058,56 @@ xfs_dir2_sf_toino4(
*/
oldsize = dp->i_df.if_bytes;
buf = kmem_alloc(oldsize, KM_SLEEP);
- oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
- ASSERT(oldsfp->hdr.i8count == 1);
+ oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(oldsfp->i8count == 1);
memcpy(buf, oldsfp, oldsize);
/*
* Compute the new inode size.
*/
newsize =
oldsize -
- (oldsfp->hdr.count + 1) *
+ (oldsfp->count + 1) *
((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
/*
* Reset our pointers, the data has moved.
*/
- oldsfp = (xfs_dir2_sf_t *)buf;
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
/*
* Fill in the new header.
*/
- sfp->hdr.count = oldsfp->hdr.count;
- sfp->hdr.i8count = 0;
- ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, &oldsfp->hdr.parent);
- XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, &sfp->hdr.parent);
+ sfp->count = oldsfp->count;
+ sfp->i8count = 0;
+ dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
/*
* Copy the entries field by field.
*/
- for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp),
- oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp);
- i < sfp->hdr.count;
- i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep),
- oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) {
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
+ oldsfep = xfs_dir2_sf_firstentry(oldsfp);
+ i < sfp->count;
+ i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
+ oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
sfep->namelen = oldsfep->namelen;
sfep->offset = oldsfep->offset;
memcpy(sfep->name, oldsfep->name, sfep->namelen);
- ino = XFS_DIR2_SF_GET_INUMBER(oldsfp,
- XFS_DIR2_SF_INUMBERP(oldsfep));
- XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep));
+ dp->d_ops->sf_put_ino(sfp, sfep,
+ dp->d_ops->sf_get_ino(oldsfp, oldsfep));
+ dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
}
/*
* Clean up the inode.
*/
- kmem_free(buf, oldsize);
+ kmem_free(buf);
dp->i_d.di_size = newsize;
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
}
/*
- * Convert from 4-byte inode numbers to 8-byte inode numbers.
- * The new 8-byte inode number is not there yet, we leave with the
- * count 1 but no corresponding entry.
+ * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers.
+ * The new entry w/ an 8-byte inode number is not there yet; we leave with
+ * i8count set to 1, but no corresponding 8-byte entry.
*/
static void
xfs_dir2_sf_toino8(
@@ -1229,16 +1116,18 @@ xfs_dir2_sf_toino8(
char *buf; /* old dir's buffer */
xfs_inode_t *dp; /* incore directory inode */
int i; /* entry index */
- xfs_ino_t ino; /* entry inode number */
int newsize; /* new inode size */
xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
- xfs_dir2_sf_t *oldsfp; /* old sf directory */
+ xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */
int oldsize; /* old inode size */
xfs_dir2_sf_entry_t *sfep; /* new sf entry */
- xfs_dir2_sf_t *sfp; /* new sf directory */
+ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
+ struct xfs_mount *mp;
+
+ trace_xfs_dir2_sf_toino8(args);
- xfs_dir2_trace_args("sf_toino8", args);
dp = args->dp;
+ mp = dp->i_mount;
/*
* Copy the old directory to the buffer.
@@ -1247,49 +1136,48 @@ xfs_dir2_sf_toino8(
*/
oldsize = dp->i_df.if_bytes;
buf = kmem_alloc(oldsize, KM_SLEEP);
- oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
- ASSERT(oldsfp->hdr.i8count == 0);
+ oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(oldsfp->i8count == 0);
memcpy(buf, oldsfp, oldsize);
/*
- * Compute the new inode size.
+ * Compute the new inode size (nb: entry count + 1 for parent)
*/
newsize =
oldsize +
- (oldsfp->hdr.count + 1) *
+ (oldsfp->count + 1) *
((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
/*
* Reset our pointers, the data has moved.
*/
- oldsfp = (xfs_dir2_sf_t *)buf;
- sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
/*
* Fill in the new header.
*/
- sfp->hdr.count = oldsfp->hdr.count;
- sfp->hdr.i8count = 1;
- ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, &oldsfp->hdr.parent);
- XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, &sfp->hdr.parent);
+ sfp->count = oldsfp->count;
+ sfp->i8count = 1;
+ dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
/*
* Copy the entries field by field.
*/
- for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp),
- oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp);
- i < sfp->hdr.count;
- i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep),
- oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) {
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
+ oldsfep = xfs_dir2_sf_firstentry(oldsfp);
+ i < sfp->count;
+ i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
+ oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
sfep->namelen = oldsfep->namelen;
sfep->offset = oldsfep->offset;
memcpy(sfep->name, oldsfep->name, sfep->namelen);
- ino = XFS_DIR2_SF_GET_INUMBER(oldsfp,
- XFS_DIR2_SF_INUMBERP(oldsfep));
- XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep));
+ dp->d_ops->sf_put_ino(sfp, sfep,
+ dp->d_ops->sf_get_ino(oldsfp, oldsfep));
+ dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
}
/*
* Clean up the inode.
*/
- kmem_free(buf, oldsize);
+ kmem_free(buf);
dp->i_d.di_size = newsize;
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
}
diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h
deleted file mode 100644
index 42f015b7001..00000000000
--- a/fs/xfs/xfs_dir2_sf.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_DIR2_SF_H__
-#define __XFS_DIR2_SF_H__
-
-/*
- * Directory layout when stored internal to an inode.
- *
- * Small directories are packed as tightly as possible so as to
- * fit into the literal area of the inode.
- */
-
-struct uio;
-struct xfs_dabuf;
-struct xfs_da_args;
-struct xfs_dir2_block;
-struct xfs_inode;
-struct xfs_mount;
-struct xfs_trans;
-
-/*
- * Maximum size of a shortform directory.
- */
-#define XFS_DIR2_SF_MAX_SIZE \
- (XFS_DINODE_MAX_SIZE - (uint)sizeof(xfs_dinode_core_t) - \
- (uint)sizeof(xfs_agino_t))
-
-/*
- * Inode number stored as 8 8-bit values.
- */
-typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
-
-/*
- * Inode number stored as 4 8-bit values.
- * Works a lot of the time, when all the inode numbers in a directory
- * fit in 32 bits.
- */
-typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
-
-typedef union {
- xfs_dir2_ino8_t i8;
- xfs_dir2_ino4_t i4;
-} xfs_dir2_inou_t;
-#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
-
-/*
- * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
- * Only need 16 bits, this is the byte offset into the single block form.
- */
-typedef struct { __uint8_t i[2]; } xfs_dir2_sf_off_t;
-
-/*
- * The parent directory has a dedicated field, and the self-pointer must
- * be calculated on the fly.
- *
- * Entries are packed toward the top as tightly as possible. The header
- * and the elements must be memcpy'd out into a work area to get correct
- * alignment for the inode number fields.
- */
-typedef struct xfs_dir2_sf_hdr {
- __uint8_t count; /* count of entries */
- __uint8_t i8count; /* count of 8-byte inode #s */
- xfs_dir2_inou_t parent; /* parent dir inode number */
-} xfs_dir2_sf_hdr_t;
-
-typedef struct xfs_dir2_sf_entry {
- __uint8_t namelen; /* actual name length */
- xfs_dir2_sf_off_t offset; /* saved offset */
- __uint8_t name[1]; /* name, variable size */
- xfs_dir2_inou_t inumber; /* inode number, var. offset */
-} xfs_dir2_sf_entry_t;
-
-typedef struct xfs_dir2_sf {
- xfs_dir2_sf_hdr_t hdr; /* shortform header */
- xfs_dir2_sf_entry_t list[1]; /* shortform entries */
-} xfs_dir2_sf_t;
-
-#define XFS_DIR2_SF_HDR_SIZE(i8count) xfs_dir2_sf_hdr_size(i8count)
-static inline int xfs_dir2_sf_hdr_size(int i8count)
-{
- return ((uint)sizeof(xfs_dir2_sf_hdr_t) - \
- ((i8count) == 0) * \
- ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)));
-}
-
-#define XFS_DIR2_SF_INUMBERP(sfep) xfs_dir2_sf_inumberp(sfep)
-static inline xfs_dir2_inou_t *xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep)
-{
- return (xfs_dir2_inou_t *)&(sfep)->name[(sfep)->namelen];
-}
-
-#define XFS_DIR2_SF_GET_INUMBER(sfp, from) \
- xfs_dir2_sf_get_inumber(sfp, from)
-static inline xfs_intino_t
-xfs_dir2_sf_get_inumber(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from)
-{
- return ((sfp)->hdr.i8count == 0 ? \
- (xfs_intino_t)XFS_GET_DIR_INO4((from)->i4) : \
- (xfs_intino_t)XFS_GET_DIR_INO8((from)->i8));
-}
-
-#define XFS_DIR2_SF_PUT_INUMBER(sfp,from,to) \
- xfs_dir2_sf_put_inumber(sfp,from,to)
-static inline void xfs_dir2_sf_put_inumber(xfs_dir2_sf_t *sfp, xfs_ino_t *from,
- xfs_dir2_inou_t *to)
-{
- if ((sfp)->hdr.i8count == 0)
- XFS_PUT_DIR_INO4(*(from), (to)->i4);
- else
- XFS_PUT_DIR_INO8(*(from), (to)->i8);
-}
-
-#define XFS_DIR2_SF_GET_OFFSET(sfep) \
- xfs_dir2_sf_get_offset(sfep)
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
-{
- return INT_GET_UNALIGNED_16_BE(&(sfep)->offset.i);
-}
-
-#define XFS_DIR2_SF_PUT_OFFSET(sfep,off) \
- xfs_dir2_sf_put_offset(sfep,off)
-static inline void
-xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
-{
- INT_SET_UNALIGNED_16_BE(&(sfep)->offset.i, off);
-}
-
-#define XFS_DIR2_SF_ENTSIZE_BYNAME(sfp,len) \
- xfs_dir2_sf_entsize_byname(sfp,len)
-static inline int xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len)
-{
- return ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (len) - \
- ((sfp)->hdr.i8count == 0) * \
- ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)));
-}
-
-#define XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep) \
- xfs_dir2_sf_entsize_byentry(sfp,sfep)
-static inline int
-xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep)
-{
- return ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (sfep)->namelen - \
- ((sfp)->hdr.i8count == 0) * \
- ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)));
-}
-
-#define XFS_DIR2_SF_FIRSTENTRY(sfp) xfs_dir2_sf_firstentry(sfp)
-static inline xfs_dir2_sf_entry_t *xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp)
-{
- return ((xfs_dir2_sf_entry_t *) \
- ((char *)(sfp) + XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)));
-}
-
-#define XFS_DIR2_SF_NEXTENTRY(sfp,sfep) xfs_dir2_sf_nextentry(sfp,sfep)
-static inline xfs_dir2_sf_entry_t *
-xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep)
-{
- return ((xfs_dir2_sf_entry_t *) \
- ((char *)(sfep) + XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep)));
-}
-
-/*
- * Functions.
- */
-extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
- struct xfs_dir2_block *block,
- xfs_dir2_sf_hdr_t *sfhp);
-extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_dabuf *bp,
- int size, xfs_dir2_sf_hdr_t *sfhp);
-extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
-extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
-extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, struct uio *uio,
- int *eofp, struct xfs_dirent *dbp,
- xfs_dir2_put_t put);
-extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
-extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
-extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
-
-#endif /* __XFS_DIR2_SF_H__ */
diff --git a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c
deleted file mode 100644
index c626943b411..00000000000
--- a/fs/xfs/xfs_dir2_trace.c
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_inum.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_dir2_trace.h"
-
-#ifdef XFS_DIR2_TRACE
-ktrace_t *xfs_dir2_trace_buf;
-
-/*
- * Enter something in the trace buffers.
- */
-static void
-xfs_dir2_trace_enter(
- xfs_inode_t *dp,
- int type,
- char *where,
- char *name,
- int namelen,
- void *a0,
- void *a1,
- void *a2,
- void *a3,
- void *a4,
- void *a5,
- void *a6,
- void *a7)
-{
- void *n[5];
-
- ASSERT(xfs_dir2_trace_buf);
- ASSERT(dp->i_dir_trace);
- if (name)
- memcpy(n, name, min((int)sizeof(n), namelen));
- else
- memset((char *)n, 0, sizeof(n));
- ktrace_enter(xfs_dir2_trace_buf,
- (void *)(long)type, (void *)where,
- (void *)a0, (void *)a1, (void *)a2, (void *)a3,
- (void *)a4, (void *)a5, (void *)a6, (void *)a7,
- (void *)(long)namelen,
- (void *)n[0], (void *)n[1], (void *)n[2],
- (void *)n[3], (void *)n[4]);
- ktrace_enter(dp->i_dir_trace,
- (void *)(long)type, (void *)where,
- (void *)a0, (void *)a1, (void *)a2, (void *)a3,
- (void *)a4, (void *)a5, (void *)a6, (void *)a7,
- (void *)(long)namelen,
- (void *)n[0], (void *)n[1], (void *)n[2],
- (void *)n[3], (void *)n[4]);
-}
-
-void
-xfs_dir2_trace_args(
- char *where,
- xfs_da_args_t *args)
-{
- xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS, where,
- (char *)args->name, (int)args->namelen,
- (void *)(unsigned long)args->hashval,
- (void *)((unsigned long)(args->inumber >> 32)),
- (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
- (void *)args->dp, (void *)args->trans,
- (void *)(unsigned long)args->justcheck, NULL, NULL);
-}
-
-void
-xfs_dir2_trace_args_b(
- char *where,
- xfs_da_args_t *args,
- xfs_dabuf_t *bp)
-{
- xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_B, where,
- (char *)args->name, (int)args->namelen,
- (void *)(unsigned long)args->hashval,
- (void *)((unsigned long)(args->inumber >> 32)),
- (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
- (void *)args->dp, (void *)args->trans,
- (void *)(unsigned long)args->justcheck,
- (void *)(bp ? bp->bps[0] : NULL), NULL);
-}
-
-void
-xfs_dir2_trace_args_bb(
- char *where,
- xfs_da_args_t *args,
- xfs_dabuf_t *lbp,
- xfs_dabuf_t *dbp)
-{
- xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BB, where,
- (char *)args->name, (int)args->namelen,
- (void *)(unsigned long)args->hashval,
- (void *)((unsigned long)(args->inumber >> 32)),
- (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
- (void *)args->dp, (void *)args->trans,
- (void *)(unsigned long)args->justcheck,
- (void *)(lbp ? lbp->bps[0] : NULL),
- (void *)(dbp ? dbp->bps[0] : NULL));
-}
-
-void
-xfs_dir2_trace_args_bibii(
- char *where,
- xfs_da_args_t *args,
- xfs_dabuf_t *bs,
- int ss,
- xfs_dabuf_t *bd,
- int sd,
- int c)
-{
- xfs_buf_t *bpbs = bs ? bs->bps[0] : NULL;
- xfs_buf_t *bpbd = bd ? bd->bps[0] : NULL;
-
- xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BIBII, where,
- (char *)args->name, (int)args->namelen,
- (void *)args->dp, (void *)args->trans,
- (void *)bpbs, (void *)(long)ss, (void *)bpbd, (void *)(long)sd,
- (void *)(long)c, NULL);
-}
-
-void
-xfs_dir2_trace_args_db(
- char *where,
- xfs_da_args_t *args,
- xfs_dir2_db_t db,
- xfs_dabuf_t *bp)
-{
- xfs_buf_t *dbp = bp ? bp->bps[0] : NULL;
-
- xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_DB, where,
- (char *)args->name, (int)args->namelen,
- (void *)(unsigned long)args->hashval,
- (void *)((unsigned long)(args->inumber >> 32)),
- (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
- (void *)args->dp, (void *)args->trans,
- (void *)(unsigned long)args->justcheck, (void *)(long)db,
- (void *)dbp);
-}
-
-void
-xfs_dir2_trace_args_i(
- char *where,
- xfs_da_args_t *args,
- xfs_ino_t i)
-{
- xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_I, where,
- (char *)args->name, (int)args->namelen,
- (void *)(unsigned long)args->hashval,
- (void *)((unsigned long)(args->inumber >> 32)),
- (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
- (void *)args->dp, (void *)args->trans,
- (void *)(unsigned long)args->justcheck,
- (void *)((unsigned long)(i >> 32)),
- (void *)((unsigned long)(i & 0xFFFFFFFF)));
-}
-
-void
-xfs_dir2_trace_args_s(
- char *where,
- xfs_da_args_t *args,
- int s)
-{
- xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_S, where,
- (char *)args->name, (int)args->namelen,
- (void *)(unsigned long)args->hashval,
- (void *)((unsigned long)(args->inumber >> 32)),
- (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
- (void *)args->dp, (void *)args->trans,
- (void *)(unsigned long)args->justcheck, (void *)(long)s, NULL);
-}
-
-void
-xfs_dir2_trace_args_sb(
- char *where,
- xfs_da_args_t *args,
- int s,
- xfs_dabuf_t *bp)
-{
- xfs_buf_t *dbp = bp ? bp->bps[0] : NULL;
-
- xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_SB, where,
- (char *)args->name, (int)args->namelen,
- (void *)(unsigned long)args->hashval,
- (void *)((unsigned long)(args->inumber >> 32)),
- (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
- (void *)args->dp, (void *)args->trans,
- (void *)(unsigned long)args->justcheck, (void *)(long)s,
- (void *)dbp);
-}
-#endif /* XFS_DIR2_TRACE */
diff --git a/fs/xfs/xfs_dir2_trace.h b/fs/xfs/xfs_dir2_trace.h
deleted file mode 100644
index ca3c754f482..00000000000
--- a/fs/xfs/xfs_dir2_trace.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_DIR2_TRACE_H__
-#define __XFS_DIR2_TRACE_H__
-
-/*
- * Tracing for xfs v2 directories.
- */
-
-#if defined(XFS_DIR2_TRACE)
-
-struct ktrace;
-struct xfs_dabuf;
-struct xfs_da_args;
-
-#define XFS_DIR2_GTRACE_SIZE 4096 /* global buffer */
-#define XFS_DIR2_KTRACE_SIZE 32 /* per-inode buffer */
-extern struct ktrace *xfs_dir2_trace_buf;
-
-#define XFS_DIR2_KTRACE_ARGS 1 /* args only */
-#define XFS_DIR2_KTRACE_ARGS_B 2 /* args + buffer */
-#define XFS_DIR2_KTRACE_ARGS_BB 3 /* args + 2 buffers */
-#define XFS_DIR2_KTRACE_ARGS_DB 4 /* args, db, buffer */
-#define XFS_DIR2_KTRACE_ARGS_I 5 /* args, inum */
-#define XFS_DIR2_KTRACE_ARGS_S 6 /* args, int */
-#define XFS_DIR2_KTRACE_ARGS_SB 7 /* args, int, buffer */
-#define XFS_DIR2_KTRACE_ARGS_BIBII 8 /* args, buf/int/buf/int/int */
-
-void xfs_dir2_trace_args(char *where, struct xfs_da_args *args);
-void xfs_dir2_trace_args_b(char *where, struct xfs_da_args *args,
- struct xfs_dabuf *bp);
-void xfs_dir2_trace_args_bb(char *where, struct xfs_da_args *args,
- struct xfs_dabuf *lbp, struct xfs_dabuf *dbp);
-void xfs_dir2_trace_args_bibii(char *where, struct xfs_da_args *args,
- struct xfs_dabuf *bs, int ss,
- struct xfs_dabuf *bd, int sd, int c);
-void xfs_dir2_trace_args_db(char *where, struct xfs_da_args *args,
- xfs_dir2_db_t db, struct xfs_dabuf *bp);
-void xfs_dir2_trace_args_i(char *where, struct xfs_da_args *args, xfs_ino_t i);
-void xfs_dir2_trace_args_s(char *where, struct xfs_da_args *args, int s);
-void xfs_dir2_trace_args_sb(char *where, struct xfs_da_args *args, int s,
- struct xfs_dabuf *bp);
-
-#else /* XFS_DIR2_TRACE */
-
-#define xfs_dir2_trace_args(where, args)
-#define xfs_dir2_trace_args_b(where, args, bp)
-#define xfs_dir2_trace_args_bb(where, args, lbp, dbp)
-#define xfs_dir2_trace_args_bibii(where, args, bs, ss, bd, sd, c)
-#define xfs_dir2_trace_args_db(where, args, db, bp)
-#define xfs_dir2_trace_args_i(where, args, i)
-#define xfs_dir2_trace_args_s(where, args, s)
-#define xfs_dir2_trace_args_sb(where, args, s, bp)
-
-#endif /* XFS_DIR2_TRACE */
-
-#endif /* __XFS_DIR2_TRACE_H__ */
diff --git a/fs/xfs/xfs_dir_leaf.c b/fs/xfs/xfs_dir_leaf.c
deleted file mode 100644
index e83074016ab..00000000000
--- a/fs/xfs/xfs_dir_leaf.c
+++ /dev/null
@@ -1,2213 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_bmap.h"
-#include "xfs_dir_leaf.h"
-#include "xfs_error.h"
-
-/*
- * xfs_dir_leaf.c
- *
- * Routines to implement leaf blocks of directories as Btrees of hashed names.
- */
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-/*
- * Routines used for growing the Btree.
- */
-STATIC void xfs_dir_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args,
- int insertion_index,
- int freemap_index);
-STATIC int xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer,
- int musthave, int justcheck);
-STATIC void xfs_dir_leaf_rebalance(xfs_da_state_t *state,
- xfs_da_state_blk_t *blk1,
- xfs_da_state_blk_t *blk2);
-STATIC int xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
- xfs_da_state_blk_t *leaf_blk_1,
- xfs_da_state_blk_t *leaf_blk_2,
- int *number_entries_in_blk1,
- int *number_namebytes_in_blk1);
-
-STATIC int xfs_dir_leaf_create(struct xfs_da_args *args,
- xfs_dablk_t which_block,
- struct xfs_dabuf **bpp);
-
-/*
- * Utility routines.
- */
-STATIC void xfs_dir_leaf_moveents(xfs_dir_leafblock_t *src_leaf,
- int src_start,
- xfs_dir_leafblock_t *dst_leaf,
- int dst_start, int move_count,
- xfs_mount_t *mp);
-
-
-/*========================================================================
- * External routines when dirsize < XFS_IFORK_DSIZE(dp).
- *========================================================================*/
-
-
-/*
- * Validate a given inode number.
- */
-int
-xfs_dir_ino_validate(xfs_mount_t *mp, xfs_ino_t ino)
-{
- xfs_agblock_t agblkno;
- xfs_agino_t agino;
- xfs_agnumber_t agno;
- int ino_ok;
- int ioff;
-
- agno = XFS_INO_TO_AGNO(mp, ino);
- agblkno = XFS_INO_TO_AGBNO(mp, ino);
- ioff = XFS_INO_TO_OFFSET(mp, ino);
- agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
- ino_ok =
- agno < mp->m_sb.sb_agcount &&
- agblkno < mp->m_sb.sb_agblocks &&
- agblkno != 0 &&
- ioff < (1 << mp->m_sb.sb_inopblog) &&
- XFS_AGINO_TO_INO(mp, agno, agino) == ino;
- if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
- XFS_RANDOM_DIR_INO_VALIDATE))) {
- xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx",
- (unsigned long long) ino);
- XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
- }
- return 0;
-}
-
-/*
- * Create the initial contents of a shortform directory.
- */
-int
-xfs_dir_shortform_create(xfs_da_args_t *args, xfs_ino_t parent)
-{
- xfs_dir_sf_hdr_t *hdr;
- xfs_inode_t *dp;
-
- dp = args->dp;
- ASSERT(dp != NULL);
- ASSERT(dp->i_d.di_size == 0);
- if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
- dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */
- dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
- xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
- dp->i_df.if_flags |= XFS_IFINLINE;
- }
- ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
- ASSERT(dp->i_df.if_bytes == 0);
- xfs_idata_realloc(dp, sizeof(*hdr), XFS_DATA_FORK);
- hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data;
- XFS_DIR_SF_PUT_DIRINO(&parent, &hdr->parent);
-
- hdr->count = 0;
- dp->i_d.di_size = sizeof(*hdr);
- xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
- return 0;
-}
-
-/*
- * Add a name to the shortform directory structure.
- * Overflow from the inode has already been checked for.
- */
-int
-xfs_dir_shortform_addname(xfs_da_args_t *args)
-{
- xfs_dir_shortform_t *sf;
- xfs_dir_sf_entry_t *sfe;
- int i, offset, size;
- xfs_inode_t *dp;
-
- dp = args->dp;
- ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
- /*
- * Catch the case where the conversion from shortform to leaf
- * failed part way through.
- */
- if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
- ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
- return XFS_ERROR(EIO);
- }
- ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
- sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
- sfe = &sf->list[0];
- for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
- if (sfe->namelen == args->namelen &&
- args->name[0] == sfe->name[0] &&
- memcmp(args->name, sfe->name, args->namelen) == 0)
- return XFS_ERROR(EEXIST);
- sfe = XFS_DIR_SF_NEXTENTRY(sfe);
- }
-
- offset = (int)((char *)sfe - (char *)sf);
- size = XFS_DIR_SF_ENTSIZE_BYNAME(args->namelen);
- xfs_idata_realloc(dp, size, XFS_DATA_FORK);
- sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
- sfe = (xfs_dir_sf_entry_t *)((char *)sf + offset);
-
- XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sfe->inumber);
- sfe->namelen = args->namelen;
- memcpy(sfe->name, args->name, sfe->namelen);
- INT_MOD(sf->hdr.count, ARCH_CONVERT, +1);
-
- dp->i_d.di_size += size;
- xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-
- return 0;
-}
-
-/*
- * Remove a name from the shortform directory structure.
- */
-int
-xfs_dir_shortform_removename(xfs_da_args_t *args)
-{
- xfs_dir_shortform_t *sf;
- xfs_dir_sf_entry_t *sfe;
- int base, size = 0, i;
- xfs_inode_t *dp;
-
- dp = args->dp;
- ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
- /*
- * Catch the case where the conversion from shortform to leaf
- * failed part way through.
- */
- if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
- ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
- return XFS_ERROR(EIO);
- }
- ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
- base = sizeof(xfs_dir_sf_hdr_t);
- sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
- sfe = &sf->list[0];
- for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
- size = XFS_DIR_SF_ENTSIZE_BYENTRY(sfe);
- if (sfe->namelen == args->namelen &&
- sfe->name[0] == args->name[0] &&
- memcmp(sfe->name, args->name, args->namelen) == 0)
- break;
- base += size;
- sfe = XFS_DIR_SF_NEXTENTRY(sfe);
- }
- if (i < 0) {
- ASSERT(args->oknoent);
- return XFS_ERROR(ENOENT);
- }
-
- if ((base + size) != dp->i_d.di_size) {
- memmove(&((char *)sf)[base], &((char *)sf)[base+size],
- dp->i_d.di_size - (base+size));
- }
- INT_MOD(sf->hdr.count, ARCH_CONVERT, -1);
-
- xfs_idata_realloc(dp, -size, XFS_DATA_FORK);
- dp->i_d.di_size -= size;
- xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-
- return 0;
-}
-
-/*
- * Look up a name in a shortform directory structure.
- */
-int
-xfs_dir_shortform_lookup(xfs_da_args_t *args)
-{
- xfs_dir_shortform_t *sf;
- xfs_dir_sf_entry_t *sfe;
- int i;
- xfs_inode_t *dp;
-
- dp = args->dp;
- ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
- /*
- * Catch the case where the conversion from shortform to leaf
- * failed part way through.
- */
- if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
- ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
- return XFS_ERROR(EIO);
- }
- ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
- sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
- if (args->namelen == 2 &&
- args->name[0] == '.' && args->name[1] == '.') {
- XFS_DIR_SF_GET_DIRINO(&sf->hdr.parent, &args->inumber);
- return(XFS_ERROR(EEXIST));
- }
- if (args->namelen == 1 && args->name[0] == '.') {
- args->inumber = dp->i_ino;
- return(XFS_ERROR(EEXIST));
- }
- sfe = &sf->list[0];
- for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
- if (sfe->namelen == args->namelen &&
- sfe->name[0] == args->name[0] &&
- memcmp(args->name, sfe->name, args->namelen) == 0) {
- XFS_DIR_SF_GET_DIRINO(&sfe->inumber, &args->inumber);
- return(XFS_ERROR(EEXIST));
- }
- sfe = XFS_DIR_SF_NEXTENTRY(sfe);
- }
- ASSERT(args->oknoent);
- return(XFS_ERROR(ENOENT));
-}
-
-/*
- * Convert from using the shortform to the leaf.
- */
-int
-xfs_dir_shortform_to_leaf(xfs_da_args_t *iargs)
-{
- xfs_inode_t *dp;
- xfs_dir_shortform_t *sf;
- xfs_dir_sf_entry_t *sfe;
- xfs_da_args_t args;
- xfs_ino_t inumber;
- char *tmpbuffer;
- int retval, i, size;
- xfs_dablk_t blkno;
- xfs_dabuf_t *bp;
-
- dp = iargs->dp;
- /*
- * Catch the case where the conversion from shortform to leaf
- * failed part way through.
- */
- if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
- ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
- return XFS_ERROR(EIO);
- }
- ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
- size = dp->i_df.if_bytes;
- tmpbuffer = kmem_alloc(size, KM_SLEEP);
- ASSERT(tmpbuffer != NULL);
-
- memcpy(tmpbuffer, dp->i_df.if_u1.if_data, size);
-
- sf = (xfs_dir_shortform_t *)tmpbuffer;
- XFS_DIR_SF_GET_DIRINO(&sf->hdr.parent, &inumber);
-
- xfs_idata_realloc(dp, -size, XFS_DATA_FORK);
- dp->i_d.di_size = 0;
- xfs_trans_log_inode(iargs->trans, dp, XFS_ILOG_CORE);
- retval = xfs_da_grow_inode(iargs, &blkno);
- if (retval)
- goto out;
-
- ASSERT(blkno == 0);
- retval = xfs_dir_leaf_create(iargs, blkno, &bp);
- if (retval)
- goto out;
- xfs_da_buf_done(bp);
-
- args.name = ".";
- args.namelen = 1;
- args.hashval = xfs_dir_hash_dot;
- args.inumber = dp->i_ino;
- args.dp = dp;
- args.firstblock = iargs->firstblock;
- args.flist = iargs->flist;
- args.total = iargs->total;
- args.whichfork = XFS_DATA_FORK;
- args.trans = iargs->trans;
- args.justcheck = 0;
- args.addname = args.oknoent = 1;
- retval = xfs_dir_leaf_addname(&args);
- if (retval)
- goto out;
-
- args.name = "..";
- args.namelen = 2;
- args.hashval = xfs_dir_hash_dotdot;
- args.inumber = inumber;
- retval = xfs_dir_leaf_addname(&args);
- if (retval)
- goto out;
-
- sfe = &sf->list[0];
- for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
- args.name = (char *)(sfe->name);
- args.namelen = sfe->namelen;
- args.hashval = xfs_da_hashname((char *)(sfe->name),
- sfe->namelen);
- XFS_DIR_SF_GET_DIRINO(&sfe->inumber, &args.inumber);
- retval = xfs_dir_leaf_addname(&args);
- if (retval)
- goto out;
- sfe = XFS_DIR_SF_NEXTENTRY(sfe);
- }
- retval = 0;
-
-out:
- kmem_free(tmpbuffer, size);
- return retval;
-}
-
-STATIC int
-xfs_dir_shortform_compare(const void *a, const void *b)
-{
- xfs_dir_sf_sort_t *sa, *sb;
-
- sa = (xfs_dir_sf_sort_t *)a;
- sb = (xfs_dir_sf_sort_t *)b;
- if (sa->hash < sb->hash)
- return -1;
- else if (sa->hash > sb->hash)
- return 1;
- else
- return sa->entno - sb->entno;
-}
-
-/*
- * Copy out directory entries for getdents(), for shortform directories.
- */
-/*ARGSUSED*/
-int
-xfs_dir_shortform_getdents(xfs_inode_t *dp, uio_t *uio, int *eofp,
- xfs_dirent_t *dbp, xfs_dir_put_t put)
-{
- xfs_dir_shortform_t *sf;
- xfs_dir_sf_entry_t *sfe;
- int retval, i, sbsize, nsbuf, lastresid=0, want_entno;
- xfs_mount_t *mp;
- xfs_dahash_t cookhash, hash;
- xfs_dir_put_args_t p;
- xfs_dir_sf_sort_t *sbuf, *sbp;
-
- mp = dp->i_mount;
- sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
- cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
- want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset);
- nsbuf = INT_GET(sf->hdr.count, ARCH_CONVERT) + 2;
- sbsize = (nsbuf + 1) * sizeof(*sbuf);
- sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP);
-
- xfs_dir_trace_g_du("sf: start", dp, uio);
-
- /*
- * Collect all the entries into the buffer.
- * Entry 0 is .
- */
- sbp->entno = 0;
- sbp->seqno = 0;
- sbp->hash = xfs_dir_hash_dot;
- sbp->ino = dp->i_ino;
- sbp->name = ".";
- sbp->namelen = 1;
- sbp++;
-
- /*
- * Entry 1 is ..
- */
- sbp->entno = 1;
- sbp->seqno = 0;
- sbp->hash = xfs_dir_hash_dotdot;
- sbp->ino = XFS_GET_DIR_INO8(sf->hdr.parent);
- sbp->name = "..";
- sbp->namelen = 2;
- sbp++;
-
- /*
- * Scan the directory data for the rest of the entries.
- */
- for (i = 0, sfe = &sf->list[0];
- i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
-
- if (unlikely(
- ((char *)sfe < (char *)sf) ||
- ((char *)sfe >= ((char *)sf + dp->i_df.if_bytes)))) {
- xfs_dir_trace_g_du("sf: corrupted", dp, uio);
- XFS_CORRUPTION_ERROR("xfs_dir_shortform_getdents",
- XFS_ERRLEVEL_LOW, mp, sfe);
- kmem_free(sbuf, sbsize);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- sbp->entno = i + 2;
- sbp->seqno = 0;
- sbp->hash = xfs_da_hashname((char *)sfe->name, sfe->namelen);
- sbp->ino = XFS_GET_DIR_INO8(sfe->inumber);
- sbp->name = (char *)sfe->name;
- sbp->namelen = sfe->namelen;
- sfe = XFS_DIR_SF_NEXTENTRY(sfe);
- sbp++;
- }
-
- /*
- * Sort the entries on hash then entno.
- */
- xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_dir_shortform_compare);
- /*
- * Stuff in last entry.
- */
- sbp->entno = nsbuf;
- sbp->hash = XFS_DA_MAXHASH;
- sbp->seqno = 0;
- /*
- * Figure out the sequence numbers in case there's a hash duplicate.
- */
- for (hash = sbuf->hash, sbp = sbuf + 1;
- sbp < &sbuf[nsbuf + 1]; sbp++) {
- if (sbp->hash == hash)
- sbp->seqno = sbp[-1].seqno + 1;
- else
- hash = sbp->hash;
- }
-
- /*
- * Set up put routine.
- */
- p.dbp = dbp;
- p.put = put;
- p.uio = uio;
-
- /*
- * Find our place.
- */
- for (sbp = sbuf; sbp < &sbuf[nsbuf + 1]; sbp++) {
- if (sbp->hash > cookhash ||
- (sbp->hash == cookhash && sbp->seqno >= want_entno))
- break;
- }
-
- /*
- * Did we fail to find anything? We stop at the last entry,
- * the one we put maxhash into.
- */
- if (sbp == &sbuf[nsbuf]) {
- kmem_free(sbuf, sbsize);
- xfs_dir_trace_g_du("sf: hash beyond end", dp, uio);
- uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0, XFS_DA_MAXHASH);
- *eofp = 1;
- return 0;
- }
-
- /*
- * Loop putting entries into the user buffer.
- */
- while (sbp < &sbuf[nsbuf]) {
- /*
- * Save the first resid in a run of equal-hashval entries
- * so that we can back them out if they don't all fit.
- */
- if (sbp->seqno == 0 || sbp == sbuf)
- lastresid = uio->uio_resid;
- XFS_PUT_COOKIE(p.cook, mp, 0, sbp[1].seqno, sbp[1].hash);
- p.ino = sbp->ino;
-#if XFS_BIG_INUMS
- p.ino += mp->m_inoadd;
-#endif
- p.name = sbp->name;
- p.namelen = sbp->namelen;
- retval = p.put(&p);
- if (!p.done) {
- uio->uio_offset =
- XFS_DA_MAKE_COOKIE(mp, 0, 0, sbp->hash);
- kmem_free(sbuf, sbsize);
- uio->uio_resid = lastresid;
- xfs_dir_trace_g_du("sf: E-O-B", dp, uio);
- return retval;
- }
- sbp++;
- }
- kmem_free(sbuf, sbsize);
- uio->uio_offset = p.cook.o;
- *eofp = 1;
- xfs_dir_trace_g_du("sf: E-O-F", dp, uio);
- return 0;
-}
-
-/*
- * Look up a name in a shortform directory structure, replace the inode number.
- */
-int
-xfs_dir_shortform_replace(xfs_da_args_t *args)
-{
- xfs_dir_shortform_t *sf;
- xfs_dir_sf_entry_t *sfe;
- xfs_inode_t *dp;
- int i;
-
- dp = args->dp;
- ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
- /*
- * Catch the case where the conversion from shortform to leaf
- * failed part way through.
- */
- if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
- ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
- return XFS_ERROR(EIO);
- }
- ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
- sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
- if (args->namelen == 2 &&
- args->name[0] == '.' && args->name[1] == '.') {
- /* XXX - replace assert? */
- XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sf->hdr.parent);
- xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
- return 0;
- }
- ASSERT(args->namelen != 1 || args->name[0] != '.');
- sfe = &sf->list[0];
- for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
- if (sfe->namelen == args->namelen &&
- sfe->name[0] == args->name[0] &&
- memcmp(args->name, sfe->name, args->namelen) == 0) {
- ASSERT(memcmp((char *)&args->inumber,
- (char *)&sfe->inumber, sizeof(xfs_ino_t)));
- XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sfe->inumber);
- xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
- return 0;
- }
- sfe = XFS_DIR_SF_NEXTENTRY(sfe);
- }
- ASSERT(args->oknoent);
- return XFS_ERROR(ENOENT);
-}
-
-/*
- * Convert a leaf directory to shortform structure
- */
-int
-xfs_dir_leaf_to_shortform(xfs_da_args_t *iargs)
-{
- xfs_dir_leafblock_t *leaf;
- xfs_dir_leaf_hdr_t *hdr;
- xfs_dir_leaf_entry_t *entry;
- xfs_dir_leaf_name_t *namest;
- xfs_da_args_t args;
- xfs_inode_t *dp;
- xfs_ino_t parent = 0;
- char *tmpbuffer;
- int retval, i;
- xfs_dabuf_t *bp;
-
- dp = iargs->dp;
- tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
- ASSERT(tmpbuffer != NULL);
-
- retval = xfs_da_read_buf(iargs->trans, iargs->dp, 0, -1, &bp,
- XFS_DATA_FORK);
- if (retval)
- goto out;
- ASSERT(bp != NULL);
- memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount));
- leaf = (xfs_dir_leafblock_t *)tmpbuffer;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
- memset(bp->data, 0, XFS_LBSIZE(dp->i_mount));
-
- /*
- * Find and special case the parent inode number
- */
- hdr = &leaf->hdr;
- entry = &leaf->entries[0];
- for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; i >= 0; entry++, i--) {
- namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
- if ((entry->namelen == 2) &&
- (namest->name[0] == '.') &&
- (namest->name[1] == '.')) {
- XFS_DIR_SF_GET_DIRINO(&namest->inumber, &parent);
- entry->nameidx = 0;
- } else if ((entry->namelen == 1) && (namest->name[0] == '.')) {
- entry->nameidx = 0;
- }
- }
- retval = xfs_da_shrink_inode(iargs, 0, bp);
- if (retval)
- goto out;
- retval = xfs_dir_shortform_create(iargs, parent);
- if (retval)
- goto out;
-
- /*
- * Copy the rest of the filenames
- */
- entry = &leaf->entries[0];
- args.dp = dp;
- args.firstblock = iargs->firstblock;
- args.flist = iargs->flist;
- args.total = iargs->total;
- args.whichfork = XFS_DATA_FORK;
- args.trans = iargs->trans;
- args.justcheck = 0;
- args.addname = args.oknoent = 1;
- for (i = 0; i < INT_GET(hdr->count, ARCH_CONVERT); entry++, i++) {
- if (!entry->nameidx)
- continue;
- namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
- args.name = (char *)(namest->name);
- args.namelen = entry->namelen;
- args.hashval = INT_GET(entry->hashval, ARCH_CONVERT);
- XFS_DIR_SF_GET_DIRINO(&namest->inumber, &args.inumber);
- xfs_dir_shortform_addname(&args);
- }
-
-out:
- kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount));
- return retval;
-}
-
-/*
- * Convert from using a single leaf to a root node and a leaf.
- */
-int
-xfs_dir_leaf_to_node(xfs_da_args_t *args)
-{
- xfs_dir_leafblock_t *leaf;
- xfs_da_intnode_t *node;
- xfs_inode_t *dp;
- xfs_dabuf_t *bp1, *bp2;
- xfs_dablk_t blkno;
- int retval;
-
- dp = args->dp;
- retval = xfs_da_grow_inode(args, &blkno);
- ASSERT(blkno == 1);
- if (retval)
- return retval;
- retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
- XFS_DATA_FORK);
- if (retval)
- return retval;
- ASSERT(bp1 != NULL);
- retval = xfs_da_get_buf(args->trans, args->dp, 1, -1, &bp2,
- XFS_DATA_FORK);
- if (retval) {
- xfs_da_buf_done(bp1);
- return retval;
- }
- ASSERT(bp2 != NULL);
- memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount));
- xfs_da_buf_done(bp1);
- xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
-
- /*
- * Set up the new root node.
- */
- retval = xfs_da_node_create(args, 0, 1, &bp1, XFS_DATA_FORK);
- if (retval) {
- xfs_da_buf_done(bp2);
- return retval;
- }
- node = bp1->data;
- leaf = bp2->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
- INT_SET(node->btree[0].hashval, ARCH_CONVERT, INT_GET(leaf->entries[ INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT));
- xfs_da_buf_done(bp2);
- INT_SET(node->btree[0].before, ARCH_CONVERT, blkno);
- INT_SET(node->hdr.count, ARCH_CONVERT, 1);
- xfs_da_log_buf(args->trans, bp1,
- XFS_DA_LOGRANGE(node, &node->btree[0], sizeof(node->btree[0])));
- xfs_da_buf_done(bp1);
-
- return retval;
-}
-
-
-/*========================================================================
- * Routines used for growing the Btree.
- *========================================================================*/
-
-/*
- * Create the initial contents of a leaf directory
- * or a leaf in a node directory.
- */
-STATIC int
-xfs_dir_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
-{
- xfs_dir_leafblock_t *leaf;
- xfs_dir_leaf_hdr_t *hdr;
- xfs_inode_t *dp;
- xfs_dabuf_t *bp;
- int retval;
-
- dp = args->dp;
- ASSERT(dp != NULL);
- retval = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp, XFS_DATA_FORK);
- if (retval)
- return retval;
- ASSERT(bp != NULL);
- leaf = bp->data;
- memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
- hdr = &leaf->hdr;
- INT_SET(hdr->info.magic, ARCH_CONVERT, XFS_DIR_LEAF_MAGIC);
- INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount));
- if (!hdr->firstused)
- INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount) - 1);
- INT_SET(hdr->freemap[0].base, ARCH_CONVERT, sizeof(xfs_dir_leaf_hdr_t));
- INT_SET(hdr->freemap[0].size, ARCH_CONVERT, INT_GET(hdr->firstused, ARCH_CONVERT) - INT_GET(hdr->freemap[0].base, ARCH_CONVERT));
-
- xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
-
- *bpp = bp;
- return 0;
-}
-
-/*
- * Split the leaf node, rebalance, then add the new entry.
- */
-int
-xfs_dir_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
- xfs_da_state_blk_t *newblk)
-{
- xfs_dablk_t blkno;
- xfs_da_args_t *args;
- int error;
-
- /*
- * Allocate space for a new leaf node.
- */
- args = state->args;
- ASSERT(args != NULL);
- ASSERT(oldblk->magic == XFS_DIR_LEAF_MAGIC);
- error = xfs_da_grow_inode(args, &blkno);
- if (error)
- return error;
- error = xfs_dir_leaf_create(args, blkno, &newblk->bp);
- if (error)
- return error;
- newblk->blkno = blkno;
- newblk->magic = XFS_DIR_LEAF_MAGIC;
-
- /*
- * Rebalance the entries across the two leaves.
- */
- xfs_dir_leaf_rebalance(state, oldblk, newblk);
- error = xfs_da_blk_link(state, oldblk, newblk);
- if (error)
- return error;
-
- /*
- * Insert the new entry in the correct block.
- */
- if (state->inleaf) {
- error = xfs_dir_leaf_add(oldblk->bp, args, oldblk->index);
- } else {
- error = xfs_dir_leaf_add(newblk->bp, args, newblk->index);
- }
-
- /*
- * Update last hashval in each block since we added the name.
- */
- oldblk->hashval = xfs_dir_leaf_lasthash(oldblk->bp, NULL);
- newblk->hashval = xfs_dir_leaf_lasthash(newblk->bp, NULL);
- return error;
-}
-
-/*
- * Add a name to the leaf directory structure.
- *
- * Must take into account fragmented leaves and leaves where spacemap has
- * lost some freespace information (ie: holes).
- */
-int
-xfs_dir_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index)
-{
- xfs_dir_leafblock_t *leaf;
- xfs_dir_leaf_hdr_t *hdr;
- xfs_dir_leaf_map_t *map;
- int tablesize, entsize, sum, i, tmp, error;
-
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
- ASSERT((index >= 0) && (index <= INT_GET(leaf->hdr.count, ARCH_CONVERT)));
- hdr = &leaf->hdr;
- entsize = XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen);
-
- /*
- * Search through freemap for first-fit on new name length.
- * (may need to figure in size of entry struct too)
- */
- tablesize = (INT_GET(hdr->count, ARCH_CONVERT) + 1) * (uint)sizeof(xfs_dir_leaf_entry_t)
- + (uint)sizeof(xfs_dir_leaf_hdr_t);
- map = &hdr->freemap[XFS_DIR_LEAF_MAPSIZE-1];
- for (sum = 0, i = XFS_DIR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
- if (tablesize > INT_GET(hdr->firstused, ARCH_CONVERT)) {
- sum += INT_GET(map->size, ARCH_CONVERT);
- continue;
- }
- if (!map->size)
- continue; /* no space in this map */
- tmp = entsize;
- if (INT_GET(map->base, ARCH_CONVERT) < INT_GET(hdr->firstused, ARCH_CONVERT))
- tmp += (uint)sizeof(xfs_dir_leaf_entry_t);
- if (INT_GET(map->size, ARCH_CONVERT) >= tmp) {
- if (!args->justcheck)
- xfs_dir_leaf_add_work(bp, args, index, i);
- return 0;
- }
- sum += INT_GET(map->size, ARCH_CONVERT);
- }
-
- /*
- * If there are no holes in the address space of the block,
- * and we don't have enough freespace, then compaction will do us
- * no good and we should just give up.
- */
- if (!hdr->holes && (sum < entsize))
- return XFS_ERROR(ENOSPC);
-
- /*
- * Compact the entries to coalesce free space.
- * Pass the justcheck flag so the checking pass can return
- * an error, without changing anything, if it won't fit.
- */
- error = xfs_dir_leaf_compact(args->trans, bp,
- args->total == 0 ?
- entsize +
- (uint)sizeof(xfs_dir_leaf_entry_t) : 0,
- args->justcheck);
- if (error)
- return error;
- /*
- * After compaction, the block is guaranteed to have only one
- * free region, in freemap[0]. If it is not big enough, give up.
- */
- if (INT_GET(hdr->freemap[0].size, ARCH_CONVERT) <
- (entsize + (uint)sizeof(xfs_dir_leaf_entry_t)))
- return XFS_ERROR(ENOSPC);
-
- if (!args->justcheck)
- xfs_dir_leaf_add_work(bp, args, index, 0);
- return 0;
-}
-
-/*
- * Add a name to a leaf directory structure.
- */
-STATIC void
-xfs_dir_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int index,
- int mapindex)
-{
- xfs_dir_leafblock_t *leaf;
- xfs_dir_leaf_hdr_t *hdr;
- xfs_dir_leaf_entry_t *entry;
- xfs_dir_leaf_name_t *namest;
- xfs_dir_leaf_map_t *map;
- /* REFERENCED */
- xfs_mount_t *mp;
- int tmp, i;
-
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
- hdr = &leaf->hdr;
- ASSERT((mapindex >= 0) && (mapindex < XFS_DIR_LEAF_MAPSIZE));
- ASSERT((index >= 0) && (index <= INT_GET(hdr->count, ARCH_CONVERT)));
-
- /*
- * Force open some space in the entry array and fill it in.
- */
- entry = &leaf->entries[index];
- if (index < INT_GET(hdr->count, ARCH_CONVERT)) {
- tmp = INT_GET(hdr->count, ARCH_CONVERT) - index;
- tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
- memmove(entry + 1, entry, tmp);
- xfs_da_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry)));
- }
- INT_MOD(hdr->count, ARCH_CONVERT, +1);
-
- /*
- * Allocate space for the new string (at the end of the run).
- */
- map = &hdr->freemap[mapindex];
- mp = args->trans->t_mountp;
- ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
- ASSERT(INT_GET(map->size, ARCH_CONVERT) >= XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen));
- ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
- INT_MOD(map->size, ARCH_CONVERT, -(XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen)));
- INT_SET(entry->nameidx, ARCH_CONVERT, INT_GET(map->base, ARCH_CONVERT) + INT_GET(map->size, ARCH_CONVERT));
- INT_SET(entry->hashval, ARCH_CONVERT, args->hashval);
- entry->namelen = args->namelen;
- xfs_da_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
-
- /*
- * Copy the string and inode number into the new space.
- */
- namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
- XFS_DIR_SF_PUT_DIRINO(&args->inumber, &namest->inumber);
- memcpy(namest->name, args->name, args->namelen);
- xfs_da_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, namest, XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry)));
-
- /*
- * Update the control info for this leaf node
- */
- if (INT_GET(entry->nameidx, ARCH_CONVERT) < INT_GET(hdr->firstused, ARCH_CONVERT))
- INT_COPY(hdr->firstused, entry->nameidx, ARCH_CONVERT);
- ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) >= ((INT_GET(hdr->count, ARCH_CONVERT)*sizeof(*entry))+sizeof(*hdr)));
- tmp = (INT_GET(hdr->count, ARCH_CONVERT)-1) * (uint)sizeof(xfs_dir_leaf_entry_t)
- + (uint)sizeof(xfs_dir_leaf_hdr_t);
- map = &hdr->freemap[0];
- for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) {
- if (INT_GET(map->base, ARCH_CONVERT) == tmp) {
- INT_MOD(map->base, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_entry_t));
- INT_MOD(map->size, ARCH_CONVERT, -((uint)sizeof(xfs_dir_leaf_entry_t)));
- }
- }
- INT_MOD(hdr->namebytes, ARCH_CONVERT, args->namelen);
- xfs_da_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
-}
-
-/*
- * Garbage collect a leaf directory block by copying it to a new buffer.
- */
-STATIC int
-xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp, int musthave,
- int justcheck)
-{
- xfs_dir_leafblock_t *leaf_s, *leaf_d;
- xfs_dir_leaf_hdr_t *hdr_s, *hdr_d;
- xfs_mount_t *mp;
- char *tmpbuffer;
- char *tmpbuffer2=NULL;
- int rval;
- int lbsize;
-
- mp = trans->t_mountp;
- lbsize = XFS_LBSIZE(mp);
- tmpbuffer = kmem_alloc(lbsize, KM_SLEEP);
- ASSERT(tmpbuffer != NULL);
- memcpy(tmpbuffer, bp->data, lbsize);
-
- /*
- * Make a second copy in case xfs_dir_leaf_moveents()
- * below destroys the original.
- */
- if (musthave || justcheck) {
- tmpbuffer2 = kmem_alloc(lbsize, KM_SLEEP);
- memcpy(tmpbuffer2, bp->data, lbsize);
- }
- memset(bp->data, 0, lbsize);
-
- /*
- * Copy basic information
- */
- leaf_s = (xfs_dir_leafblock_t *)tmpbuffer;
- leaf_d = bp->data;
- hdr_s = &leaf_s->hdr;
- hdr_d = &leaf_d->hdr;
- hdr_d->info = hdr_s->info; /* struct copy */
- INT_SET(hdr_d->firstused, ARCH_CONVERT, lbsize);
- if (!hdr_d->firstused)
- INT_SET(hdr_d->firstused, ARCH_CONVERT, lbsize - 1);
- hdr_d->namebytes = 0;
- hdr_d->count = 0;
- hdr_d->holes = 0;
- INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, sizeof(xfs_dir_leaf_hdr_t));
- INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, INT_GET(hdr_d->firstused, ARCH_CONVERT) - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
-
- /*
- * Copy all entry's in the same (sorted) order,
- * but allocate filenames packed and in sequence.
- * This changes the source (leaf_s) as well.
- */
- xfs_dir_leaf_moveents(leaf_s, 0, leaf_d, 0, (int)INT_GET(hdr_s->count, ARCH_CONVERT), mp);
-
- if (musthave && INT_GET(hdr_d->freemap[0].size, ARCH_CONVERT) < musthave)
- rval = XFS_ERROR(ENOSPC);
- else
- rval = 0;
-
- if (justcheck || rval == ENOSPC) {
- ASSERT(tmpbuffer2);
- memcpy(bp->data, tmpbuffer2, lbsize);
- } else {
- xfs_da_log_buf(trans, bp, 0, lbsize - 1);
- }
-
- kmem_free(tmpbuffer, lbsize);
- if (musthave || justcheck)
- kmem_free(tmpbuffer2, lbsize);
- return rval;
-}
-
-/*
- * Redistribute the directory entries between two leaf nodes,
- * taking into account the size of the new entry.
- *
- * NOTE: if new block is empty, then it will get the upper half of old block.
- */
-STATIC void
-xfs_dir_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
- xfs_da_state_blk_t *blk2)
-{
- xfs_da_state_blk_t *tmp_blk;
- xfs_dir_leafblock_t *leaf1, *leaf2;
- xfs_dir_leaf_hdr_t *hdr1, *hdr2;
- int count, totallen, max, space, swap;
-
- /*
- * Set up environment.
- */
- ASSERT(blk1->magic == XFS_DIR_LEAF_MAGIC);
- ASSERT(blk2->magic == XFS_DIR_LEAF_MAGIC);
- leaf1 = blk1->bp->data;
- leaf2 = blk2->bp->data;
- ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
- ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
-
- /*
- * Check ordering of blocks, reverse if it makes things simpler.
- */
- swap = 0;
- if (xfs_dir_leaf_order(blk1->bp, blk2->bp)) {
- tmp_blk = blk1;
- blk1 = blk2;
- blk2 = tmp_blk;
- leaf1 = blk1->bp->data;
- leaf2 = blk2->bp->data;
- swap = 1;
- }
- hdr1 = &leaf1->hdr;
- hdr2 = &leaf2->hdr;
-
- /*
- * Examine entries until we reduce the absolute difference in
- * byte usage between the two blocks to a minimum. Then get
- * the direction to copy and the number of elements to move.
- */
- state->inleaf = xfs_dir_leaf_figure_balance(state, blk1, blk2,
- &count, &totallen);
- if (swap)
- state->inleaf = !state->inleaf;
-
- /*
- * Move any entries required from leaf to leaf:
- */
- if (count < INT_GET(hdr1->count, ARCH_CONVERT)) {
- /*
- * Figure the total bytes to be added to the destination leaf.
- */
- count = INT_GET(hdr1->count, ARCH_CONVERT) - count; /* number entries being moved */
- space = INT_GET(hdr1->namebytes, ARCH_CONVERT) - totallen;
- space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1);
- space += count * (uint)sizeof(xfs_dir_leaf_entry_t);
-
- /*
- * leaf2 is the destination, compact it if it looks tight.
- */
- max = INT_GET(hdr2->firstused, ARCH_CONVERT) - (uint)sizeof(xfs_dir_leaf_hdr_t);
- max -= INT_GET(hdr2->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
- if (space > max) {
- xfs_dir_leaf_compact(state->args->trans, blk2->bp,
- 0, 0);
- }
-
- /*
- * Move high entries from leaf1 to low end of leaf2.
- */
- xfs_dir_leaf_moveents(leaf1, INT_GET(hdr1->count, ARCH_CONVERT) - count,
- leaf2, 0, count, state->mp);
-
- xfs_da_log_buf(state->args->trans, blk1->bp, 0,
- state->blocksize-1);
- xfs_da_log_buf(state->args->trans, blk2->bp, 0,
- state->blocksize-1);
-
- } else if (count > INT_GET(hdr1->count, ARCH_CONVERT)) {
- /*
- * Figure the total bytes to be added to the destination leaf.
- */
- count -= INT_GET(hdr1->count, ARCH_CONVERT); /* number entries being moved */
- space = totallen - INT_GET(hdr1->namebytes, ARCH_CONVERT);
- space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1);
- space += count * (uint)sizeof(xfs_dir_leaf_entry_t);
-
- /*
- * leaf1 is the destination, compact it if it looks tight.
- */
- max = INT_GET(hdr1->firstused, ARCH_CONVERT) - (uint)sizeof(xfs_dir_leaf_hdr_t);
- max -= INT_GET(hdr1->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
- if (space > max) {
- xfs_dir_leaf_compact(state->args->trans, blk1->bp,
- 0, 0);
- }
-
- /*
- * Move low entries from leaf2 to high end of leaf1.
- */
- xfs_dir_leaf_moveents(leaf2, 0, leaf1, (int)INT_GET(hdr1->count, ARCH_CONVERT),
- count, state->mp);
-
- xfs_da_log_buf(state->args->trans, blk1->bp, 0,
- state->blocksize-1);
- xfs_da_log_buf(state->args->trans, blk2->bp, 0,
- state->blocksize-1);
- }
-
- /*
- * Copy out last hashval in each block for B-tree code.
- */
- blk1->hashval = INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
- blk2->hashval = INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
-
- /*
- * Adjust the expected index for insertion.
- * GROT: this doesn't work unless blk2 was originally empty.
- */
- if (!state->inleaf) {
- blk2->index = blk1->index - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
- }
-}
-
-/*
- * Examine entries until we reduce the absolute difference in
- * byte usage between the two blocks to a minimum.
- * GROT: Is this really necessary? With other than a 512 byte blocksize,
- * GROT: there will always be enough room in either block for a new entry.
- * GROT: Do a double-split for this case?
- */
-STATIC int
-xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
- xfs_da_state_blk_t *blk1,
- xfs_da_state_blk_t *blk2,
- int *countarg, int *namebytesarg)
-{
- xfs_dir_leafblock_t *leaf1, *leaf2;
- xfs_dir_leaf_hdr_t *hdr1, *hdr2;
- xfs_dir_leaf_entry_t *entry;
- int count, max, totallen, half;
- int lastdelta, foundit, tmp;
-
- /*
- * Set up environment.
- */
- leaf1 = blk1->bp->data;
- leaf2 = blk2->bp->data;
- hdr1 = &leaf1->hdr;
- hdr2 = &leaf2->hdr;
- foundit = 0;
- totallen = 0;
-
- /*
- * Examine entries until we reduce the absolute difference in
- * byte usage between the two blocks to a minimum.
- */
- max = INT_GET(hdr1->count, ARCH_CONVERT) + INT_GET(hdr2->count, ARCH_CONVERT);
- half = (max+1) * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1);
- half += INT_GET(hdr1->namebytes, ARCH_CONVERT) + INT_GET(hdr2->namebytes, ARCH_CONVERT) + state->args->namelen;
- half /= 2;
- lastdelta = state->blocksize;
- entry = &leaf1->entries[0];
- for (count = 0; count < max; entry++, count++) {
-
-#define XFS_DIR_ABS(A) (((A) < 0) ? -(A) : (A))
- /*
- * The new entry is in the first block, account for it.
- */
- if (count == blk1->index) {
- tmp = totallen + (uint)sizeof(*entry)
- + XFS_DIR_LEAF_ENTSIZE_BYNAME(state->args->namelen);
- if (XFS_DIR_ABS(half - tmp) > lastdelta)
- break;
- lastdelta = XFS_DIR_ABS(half - tmp);
- totallen = tmp;
- foundit = 1;
- }
-
- /*
- * Wrap around into the second block if necessary.
- */
- if (count == INT_GET(hdr1->count, ARCH_CONVERT)) {
- leaf1 = leaf2;
- entry = &leaf1->entries[0];
- }
-
- /*
- * Figure out if next leaf entry would be too much.
- */
- tmp = totallen + (uint)sizeof(*entry)
- + XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
- if (XFS_DIR_ABS(half - tmp) > lastdelta)
- break;
- lastdelta = XFS_DIR_ABS(half - tmp);
- totallen = tmp;
-#undef XFS_DIR_ABS
- }
-
- /*
- * Calculate the number of namebytes that will end up in lower block.
- * If new entry not in lower block, fix up the count.
- */
- totallen -=
- count * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1);
- if (foundit) {
- totallen -= (sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1) +
- state->args->namelen;
- }
-
- *countarg = count;
- *namebytesarg = totallen;
- return foundit;
-}
-
-/*========================================================================
- * Routines used for shrinking the Btree.
- *========================================================================*/
-
-/*
- * Check a leaf block and its neighbors to see if the block should be
- * collapsed into one or the other neighbor. Always keep the block
- * with the smaller block number.
- * If the current block is over 50% full, don't try to join it, return 0.
- * If the block is empty, fill in the state structure and return 2.
- * If it can be collapsed, fill in the state structure and return 1.
- * If nothing can be done, return 0.
- */
-int
-xfs_dir_leaf_toosmall(xfs_da_state_t *state, int *action)
-{
- xfs_dir_leafblock_t *leaf;
- xfs_da_state_blk_t *blk;
- xfs_da_blkinfo_t *info;
- int count, bytes, forward, error, retval, i;
- xfs_dablk_t blkno;
- xfs_dabuf_t *bp;
-
- /*
- * Check for the degenerate case of the block being over 50% full.
- * If so, it's not worth even looking to see if we might be able
- * to coalesce with a sibling.
- */
- blk = &state->path.blk[ state->path.active-1 ];
- info = blk->bp->data;
- ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
- leaf = (xfs_dir_leafblock_t *)info;
- count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
- bytes = (uint)sizeof(xfs_dir_leaf_hdr_t) +
- count * (uint)sizeof(xfs_dir_leaf_entry_t) +
- count * ((uint)sizeof(xfs_dir_leaf_name_t)-1) +
- INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
- if (bytes > (state->blocksize >> 1)) {
- *action = 0; /* blk over 50%, don't try to join */
- return 0;
- }
-
- /*
- * Check for the degenerate case of the block being empty.
- * If the block is empty, we'll simply delete it, no need to
- * coalesce it with a sibling block. We choose (aribtrarily)
- * to merge with the forward block unless it is NULL.
- */
- if (count == 0) {
- /*
- * Make altpath point to the block we want to keep and
- * path point to the block we want to drop (this one).
- */
- forward = info->forw;
- memcpy(&state->altpath, &state->path, sizeof(state->path));
- error = xfs_da_path_shift(state, &state->altpath, forward,
- 0, &retval);
- if (error)
- return error;
- if (retval) {
- *action = 0;
- } else {
- *action = 2;
- }
- return 0;
- }
-
- /*
- * Examine each sibling block to see if we can coalesce with
- * at least 25% free space to spare. We need to figure out
- * whether to merge with the forward or the backward block.
- * We prefer coalescing with the lower numbered sibling so as
- * to shrink a directory over time.
- */
- forward = (INT_GET(info->forw, ARCH_CONVERT) < INT_GET(info->back, ARCH_CONVERT)); /* start with smaller blk num */
- for (i = 0; i < 2; forward = !forward, i++) {
- if (forward)
- blkno = INT_GET(info->forw, ARCH_CONVERT);
- else
- blkno = INT_GET(info->back, ARCH_CONVERT);
- if (blkno == 0)
- continue;
- error = xfs_da_read_buf(state->args->trans, state->args->dp,
- blkno, -1, &bp,
- XFS_DATA_FORK);
- if (error)
- return error;
- ASSERT(bp != NULL);
-
- leaf = (xfs_dir_leafblock_t *)info;
- count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
- bytes = state->blocksize - (state->blocksize>>2);
- bytes -= INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
- count += INT_GET(leaf->hdr.count, ARCH_CONVERT);
- bytes -= INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
- bytes -= count * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
- bytes -= count * (uint)sizeof(xfs_dir_leaf_entry_t);
- bytes -= (uint)sizeof(xfs_dir_leaf_hdr_t);
- if (bytes >= 0)
- break; /* fits with at least 25% to spare */
-
- xfs_da_brelse(state->args->trans, bp);
- }
- if (i >= 2) {
- *action = 0;
- return 0;
- }
- xfs_da_buf_done(bp);
-
- /*
- * Make altpath point to the block we want to keep (the lower
- * numbered block) and path point to the block we want to drop.
- */
- memcpy(&state->altpath, &state->path, sizeof(state->path));
- if (blkno < blk->blkno) {
- error = xfs_da_path_shift(state, &state->altpath, forward,
- 0, &retval);
- } else {
- error = xfs_da_path_shift(state, &state->path, forward,
- 0, &retval);
- }
- if (error)
- return error;
- if (retval) {
- *action = 0;
- } else {
- *action = 1;
- }
- return 0;
-}
-
-/*
- * Remove a name from the leaf directory structure.
- *
- * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
- * If two leaves are 37% full, when combined they will leave 25% free.
- */
-int
-xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
-{
- xfs_dir_leafblock_t *leaf;
- xfs_dir_leaf_hdr_t *hdr;
- xfs_dir_leaf_map_t *map;
- xfs_dir_leaf_entry_t *entry;
- xfs_dir_leaf_name_t *namest;
- int before, after, smallest, entsize;
- int tablesize, tmp, i;
- xfs_mount_t *mp;
-
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
- hdr = &leaf->hdr;
- mp = trans->t_mountp;
- ASSERT((INT_GET(hdr->count, ARCH_CONVERT) > 0) && (INT_GET(hdr->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
- ASSERT((index >= 0) && (index < INT_GET(hdr->count, ARCH_CONVERT)));
- ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) >= ((INT_GET(hdr->count, ARCH_CONVERT)*sizeof(*entry))+sizeof(*hdr)));
- entry = &leaf->entries[index];
- ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= INT_GET(hdr->firstused, ARCH_CONVERT));
- ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
-
- /*
- * Scan through free region table:
- * check for adjacency of free'd entry with an existing one,
- * find smallest free region in case we need to replace it,
- * adjust any map that borders the entry table,
- */
- tablesize = INT_GET(hdr->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t)
- + (uint)sizeof(xfs_dir_leaf_hdr_t);
- map = &hdr->freemap[0];
- tmp = INT_GET(map->size, ARCH_CONVERT);
- before = after = -1;
- smallest = XFS_DIR_LEAF_MAPSIZE - 1;
- entsize = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
- for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) {
- ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
- ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
- if (INT_GET(map->base, ARCH_CONVERT) == tablesize) {
- INT_MOD(map->base, ARCH_CONVERT, -((uint)sizeof(xfs_dir_leaf_entry_t)));
- INT_MOD(map->size, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_entry_t));
- }
-
- if ((INT_GET(map->base, ARCH_CONVERT) + INT_GET(map->size, ARCH_CONVERT)) == INT_GET(entry->nameidx, ARCH_CONVERT)) {
- before = i;
- } else if (INT_GET(map->base, ARCH_CONVERT) == (INT_GET(entry->nameidx, ARCH_CONVERT) + entsize)) {
- after = i;
- } else if (INT_GET(map->size, ARCH_CONVERT) < tmp) {
- tmp = INT_GET(map->size, ARCH_CONVERT);
- smallest = i;
- }
- }
-
- /*
- * Coalesce adjacent freemap regions,
- * or replace the smallest region.
- */
- if ((before >= 0) || (after >= 0)) {
- if ((before >= 0) && (after >= 0)) {
- map = &hdr->freemap[before];
- INT_MOD(map->size, ARCH_CONVERT, entsize);
- INT_MOD(map->size, ARCH_CONVERT, INT_GET(hdr->freemap[after].size, ARCH_CONVERT));
- hdr->freemap[after].base = 0;
- hdr->freemap[after].size = 0;
- } else if (before >= 0) {
- map = &hdr->freemap[before];
- INT_MOD(map->size, ARCH_CONVERT, entsize);
- } else {
- map = &hdr->freemap[after];
- INT_COPY(map->base, entry->nameidx, ARCH_CONVERT);
- INT_MOD(map->size, ARCH_CONVERT, entsize);
- }
- } else {
- /*
- * Replace smallest region (if it is smaller than free'd entry)
- */
- map = &hdr->freemap[smallest];
- if (INT_GET(map->size, ARCH_CONVERT) < entsize) {
- INT_COPY(map->base, entry->nameidx, ARCH_CONVERT);
- INT_SET(map->size, ARCH_CONVERT, entsize);
- }
- }
-
- /*
- * Did we remove the first entry?
- */
- if (INT_GET(entry->nameidx, ARCH_CONVERT) == INT_GET(hdr->firstused, ARCH_CONVERT))
- smallest = 1;
- else
- smallest = 0;
-
- /*
- * Compress the remaining entries and zero out the removed stuff.
- */
- namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
- memset((char *)namest, 0, entsize);
- xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, namest, entsize));
-
- INT_MOD(hdr->namebytes, ARCH_CONVERT, -(entry->namelen));
- tmp = (INT_GET(hdr->count, ARCH_CONVERT) - index) * (uint)sizeof(xfs_dir_leaf_entry_t);
- memmove(entry, entry + 1, tmp);
- INT_MOD(hdr->count, ARCH_CONVERT, -1);
- xfs_da_log_buf(trans, bp,
- XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry)));
- entry = &leaf->entries[INT_GET(hdr->count, ARCH_CONVERT)];
- memset((char *)entry, 0, sizeof(xfs_dir_leaf_entry_t));
-
- /*
- * If we removed the first entry, re-find the first used byte
- * in the name area. Note that if the entry was the "firstused",
- * then we don't have a "hole" in our block resulting from
- * removing the name.
- */
- if (smallest) {
- tmp = XFS_LBSIZE(mp);
- entry = &leaf->entries[0];
- for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; i >= 0; entry++, i--) {
- ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= INT_GET(hdr->firstused, ARCH_CONVERT));
- ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
- if (INT_GET(entry->nameidx, ARCH_CONVERT) < tmp)
- tmp = INT_GET(entry->nameidx, ARCH_CONVERT);
- }
- INT_SET(hdr->firstused, ARCH_CONVERT, tmp);
- if (!hdr->firstused)
- INT_SET(hdr->firstused, ARCH_CONVERT, tmp - 1);
- } else {
- hdr->holes = 1; /* mark as needing compaction */
- }
-
- xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
-
- /*
- * Check if leaf is less than 50% full, caller may want to
- * "join" the leaf with a sibling if so.
- */
- tmp = (uint)sizeof(xfs_dir_leaf_hdr_t);
- tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
- tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
- tmp += INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
- if (tmp < mp->m_dir_magicpct)
- return 1; /* leaf is < 37% full */
- return 0;
-}
-
-/*
- * Move all the directory entries from drop_leaf into save_leaf.
- */
-void
-xfs_dir_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
- xfs_da_state_blk_t *save_blk)
-{
- xfs_dir_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf;
- xfs_dir_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr;
- xfs_mount_t *mp;
- char *tmpbuffer;
-
- /*
- * Set up environment.
- */
- mp = state->mp;
- ASSERT(drop_blk->magic == XFS_DIR_LEAF_MAGIC);
- ASSERT(save_blk->magic == XFS_DIR_LEAF_MAGIC);
- drop_leaf = drop_blk->bp->data;
- save_leaf = save_blk->bp->data;
- ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
- ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
- drop_hdr = &drop_leaf->hdr;
- save_hdr = &save_leaf->hdr;
-
- /*
- * Save last hashval from dying block for later Btree fixup.
- */
- drop_blk->hashval = INT_GET(drop_leaf->entries[ drop_leaf->hdr.count-1 ].hashval, ARCH_CONVERT);
-
- /*
- * Check if we need a temp buffer, or can we do it in place.
- * Note that we don't check "leaf" for holes because we will
- * always be dropping it, toosmall() decided that for us already.
- */
- if (save_hdr->holes == 0) {
- /*
- * dest leaf has no holes, so we add there. May need
- * to make some room in the entry array.
- */
- if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) {
- xfs_dir_leaf_moveents(drop_leaf, 0, save_leaf, 0,
- (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
- } else {
- xfs_dir_leaf_moveents(drop_leaf, 0,
- save_leaf, INT_GET(save_hdr->count, ARCH_CONVERT),
- (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
- }
- } else {
- /*
- * Destination has holes, so we make a temporary copy
- * of the leaf and add them both to that.
- */
- tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP);
- ASSERT(tmpbuffer != NULL);
- memset(tmpbuffer, 0, state->blocksize);
- tmp_leaf = (xfs_dir_leafblock_t *)tmpbuffer;
- tmp_hdr = &tmp_leaf->hdr;
- tmp_hdr->info = save_hdr->info; /* struct copy */
- tmp_hdr->count = 0;
- INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize);
- if (!tmp_hdr->firstused)
- INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize - 1);
- tmp_hdr->namebytes = 0;
- if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) {
- xfs_dir_leaf_moveents(drop_leaf, 0, tmp_leaf, 0,
- (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
- xfs_dir_leaf_moveents(save_leaf, 0,
- tmp_leaf, INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
- (int)INT_GET(save_hdr->count, ARCH_CONVERT), mp);
- } else {
- xfs_dir_leaf_moveents(save_leaf, 0, tmp_leaf, 0,
- (int)INT_GET(save_hdr->count, ARCH_CONVERT), mp);
- xfs_dir_leaf_moveents(drop_leaf, 0,
- tmp_leaf, INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
- (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
- }
- memcpy(save_leaf, tmp_leaf, state->blocksize);
- kmem_free(tmpbuffer, state->blocksize);
- }
-
- xfs_da_log_buf(state->args->trans, save_blk->bp, 0,
- state->blocksize - 1);
-
- /*
- * Copy out last hashval in each block for B-tree code.
- */
- save_blk->hashval = INT_GET(save_leaf->entries[ INT_GET(save_leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
-}
-
-/*========================================================================
- * Routines used for finding things in the Btree.
- *========================================================================*/
-
-/*
- * Look up a name in a leaf directory structure.
- * This is the internal routine, it uses the caller's buffer.
- *
- * Note that duplicate keys are allowed, but only check within the
- * current leaf node. The Btree code must check in adjacent leaf nodes.
- *
- * Return in *index the index into the entry[] array of either the found
- * entry, or where the entry should have been (insert before that entry).
- *
- * Don't change the args->inumber unless we find the filename.
- */
-int
-xfs_dir_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args, int *index)
-{
- xfs_dir_leafblock_t *leaf;
- xfs_dir_leaf_entry_t *entry;
- xfs_dir_leaf_name_t *namest;
- int probe, span;
- xfs_dahash_t hashval;
-
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
- ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) < (XFS_LBSIZE(args->dp->i_mount)/8));
-
- /*
- * Binary search. (note: small blocks will skip this loop)
- */
- hashval = args->hashval;
- probe = span = INT_GET(leaf->hdr.count, ARCH_CONVERT) / 2;
- for (entry = &leaf->entries[probe]; span > 4;
- entry = &leaf->entries[probe]) {
- span /= 2;
- if (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)
- probe += span;
- else if (INT_GET(entry->hashval, ARCH_CONVERT) > hashval)
- probe -= span;
- else
- break;
- }
- ASSERT((probe >= 0) && \
- ((!leaf->hdr.count) || (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))));
- ASSERT((span <= 4) || (INT_GET(entry->hashval, ARCH_CONVERT) == hashval));
-
- /*
- * Since we may have duplicate hashval's, find the first matching
- * hashval in the leaf.
- */
- while ((probe > 0) && (INT_GET(entry->hashval, ARCH_CONVERT) >= hashval)) {
- entry--;
- probe--;
- }
- while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) && (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)) {
- entry++;
- probe++;
- }
- if ((probe == INT_GET(leaf->hdr.count, ARCH_CONVERT)) || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) {
- *index = probe;
- ASSERT(args->oknoent);
- return XFS_ERROR(ENOENT);
- }
-
- /*
- * Duplicate keys may be present, so search all of them for a match.
- */
- while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) && (INT_GET(entry->hashval, ARCH_CONVERT) == hashval)) {
- namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
- if (entry->namelen == args->namelen &&
- namest->name[0] == args->name[0] &&
- memcmp(args->name, namest->name, args->namelen) == 0) {
- XFS_DIR_SF_GET_DIRINO(&namest->inumber, &args->inumber);
- *index = probe;
- return XFS_ERROR(EEXIST);
- }
- entry++;
- probe++;
- }
- *index = probe;
- ASSERT(probe == INT_GET(leaf->hdr.count, ARCH_CONVERT) || args->oknoent);
- return XFS_ERROR(ENOENT);
-}
-
-/*========================================================================
- * Utility routines.
- *========================================================================*/
-
-/*
- * Move the indicated entries from one leaf to another.
- * NOTE: this routine modifies both source and destination leaves.
- */
-/* ARGSUSED */
-STATIC void
-xfs_dir_leaf_moveents(xfs_dir_leafblock_t *leaf_s, int start_s,
- xfs_dir_leafblock_t *leaf_d, int start_d,
- int count, xfs_mount_t *mp)
-{
- xfs_dir_leaf_hdr_t *hdr_s, *hdr_d;
- xfs_dir_leaf_entry_t *entry_s, *entry_d;
- int tmp, i;
-
- /*
- * Check for nothing to do.
- */
- if (count == 0)
- return;
-
- /*
- * Set up environment.
- */
- ASSERT(INT_GET(leaf_s->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
- ASSERT(INT_GET(leaf_d->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
- hdr_s = &leaf_s->hdr;
- hdr_d = &leaf_d->hdr;
- ASSERT((INT_GET(hdr_s->count, ARCH_CONVERT) > 0) && (INT_GET(hdr_s->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
- ASSERT(INT_GET(hdr_s->firstused, ARCH_CONVERT) >=
- ((INT_GET(hdr_s->count, ARCH_CONVERT)*sizeof(*entry_s))+sizeof(*hdr_s)));
- ASSERT(INT_GET(hdr_d->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8));
- ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >=
- ((INT_GET(hdr_d->count, ARCH_CONVERT)*sizeof(*entry_d))+sizeof(*hdr_d)));
-
- ASSERT(start_s < INT_GET(hdr_s->count, ARCH_CONVERT));
- ASSERT(start_d <= INT_GET(hdr_d->count, ARCH_CONVERT));
- ASSERT(count <= INT_GET(hdr_s->count, ARCH_CONVERT));
-
- /*
- * Move the entries in the destination leaf up to make a hole?
- */
- if (start_d < INT_GET(hdr_d->count, ARCH_CONVERT)) {
- tmp = INT_GET(hdr_d->count, ARCH_CONVERT) - start_d;
- tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
- entry_s = &leaf_d->entries[start_d];
- entry_d = &leaf_d->entries[start_d + count];
- memcpy(entry_d, entry_s, tmp);
- }
-
- /*
- * Copy all entry's in the same (sorted) order,
- * but allocate filenames packed and in sequence.
- */
- entry_s = &leaf_s->entries[start_s];
- entry_d = &leaf_d->entries[start_d];
- for (i = 0; i < count; entry_s++, entry_d++, i++) {
- ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) >= INT_GET(hdr_s->firstused, ARCH_CONVERT));
- tmp = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry_s);
- INT_MOD(hdr_d->firstused, ARCH_CONVERT, -(tmp));
- entry_d->hashval = entry_s->hashval; /* INT_: direct copy */
- INT_COPY(entry_d->nameidx, hdr_d->firstused, ARCH_CONVERT);
- entry_d->namelen = entry_s->namelen;
- ASSERT(INT_GET(entry_d->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp));
- memcpy(XFS_DIR_LEAF_NAMESTRUCT(leaf_d, INT_GET(entry_d->nameidx, ARCH_CONVERT)),
- XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)), tmp);
- ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp));
- memset((char *)XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)),
- 0, tmp);
- INT_MOD(hdr_s->namebytes, ARCH_CONVERT, -(entry_d->namelen));
- INT_MOD(hdr_d->namebytes, ARCH_CONVERT, entry_d->namelen);
- INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
- INT_MOD(hdr_d->count, ARCH_CONVERT, +1);
- tmp = INT_GET(hdr_d->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t)
- + (uint)sizeof(xfs_dir_leaf_hdr_t);
- ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >= tmp);
-
- }
-
- /*
- * Zero out the entries we just copied.
- */
- if (start_s == INT_GET(hdr_s->count, ARCH_CONVERT)) {
- tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t);
- entry_s = &leaf_s->entries[start_s];
- ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp));
- memset((char *)entry_s, 0, tmp);
- } else {
- /*
- * Move the remaining entries down to fill the hole,
- * then zero the entries at the top.
- */
- tmp = INT_GET(hdr_s->count, ARCH_CONVERT) - count;
- tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
- entry_s = &leaf_s->entries[start_s + count];
- entry_d = &leaf_s->entries[start_s];
- memcpy(entry_d, entry_s, tmp);
-
- tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t);
- entry_s = &leaf_s->entries[INT_GET(hdr_s->count, ARCH_CONVERT)];
- ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp));
- memset((char *)entry_s, 0, tmp);
- }
-
- /*
- * Fill in the freemap information
- */
- INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_hdr_t));
- INT_MOD(hdr_d->freemap[0].base, ARCH_CONVERT, INT_GET(hdr_d->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t));
- INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, INT_GET(hdr_d->firstused, ARCH_CONVERT) - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
- INT_SET(hdr_d->freemap[1].base, ARCH_CONVERT, (hdr_d->freemap[2].base = 0));
- INT_SET(hdr_d->freemap[1].size, ARCH_CONVERT, (hdr_d->freemap[2].size = 0));
- hdr_s->holes = 1; /* leaf may not be compact */
-}
-
-/*
- * Compare two leaf blocks "order".
- */
-int
-xfs_dir_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
-{
- xfs_dir_leafblock_t *leaf1, *leaf2;
-
- leaf1 = leaf1_bp->data;
- leaf2 = leaf2_bp->data;
- ASSERT((INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) &&
- (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC));
- if ((INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0) &&
- ((INT_GET(leaf2->entries[ 0 ].hashval, ARCH_CONVERT) <
- INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT)) ||
- (INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
- INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
- return 1;
- }
- return 0;
-}
-
-/*
- * Pick up the last hashvalue from a leaf block.
- */
-xfs_dahash_t
-xfs_dir_leaf_lasthash(xfs_dabuf_t *bp, int *count)
-{
- xfs_dir_leafblock_t *leaf;
-
- leaf = bp->data;
- ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
- if (count)
- *count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
- if (!leaf->hdr.count)
- return(0);
- return(INT_GET(leaf->entries[ INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT));
-}
-
-/*
- * Copy out directory entries for getdents(), for leaf directories.
- */
-int
-xfs_dir_leaf_getdents_int(
- xfs_dabuf_t *bp,
- xfs_inode_t *dp,
- xfs_dablk_t bno,
- uio_t *uio,
- int *eobp,
- xfs_dirent_t *dbp,
- xfs_dir_put_t put,
- xfs_daddr_t nextda)
-{
- xfs_dir_leafblock_t *leaf;
- xfs_dir_leaf_entry_t *entry;
- xfs_dir_leaf_name_t *namest;
- int entno, want_entno, i, nextentno;
- xfs_mount_t *mp;
- xfs_dahash_t cookhash;
- xfs_dahash_t nexthash = 0;
-#if (BITS_PER_LONG == 32)
- xfs_dahash_t lasthash = XFS_DA_MAXHASH;
-#endif
- xfs_dir_put_args_t p;
-
- mp = dp->i_mount;
- leaf = bp->data;
- if (INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC) {
- *eobp = 1;
- return XFS_ERROR(ENOENT); /* XXX wrong code */
- }
-
- want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset);
-
- cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
-
- xfs_dir_trace_g_dul("leaf: start", dp, uio, leaf);
-
- /*
- * Re-find our place.
- */
- for (i = entno = 0, entry = &leaf->entries[0];
- i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
- entry++, i++) {
-
- namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
- INT_GET(entry->nameidx, ARCH_CONVERT));
-
- if (unlikely(
- ((char *)namest < (char *)leaf) ||
- ((char *)namest >= (char *)leaf + XFS_LBSIZE(mp)))) {
- XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(1)",
- XFS_ERRLEVEL_LOW, mp, leaf);
- xfs_dir_trace_g_du("leaf: corrupted", dp, uio);
- return XFS_ERROR(EFSCORRUPTED);
- }
- if (INT_GET(entry->hashval, ARCH_CONVERT) >= cookhash) {
- if ( entno < want_entno
- && INT_GET(entry->hashval, ARCH_CONVERT)
- == cookhash) {
- /*
- * Trying to get to a particular offset in a
- * run of equal-hashval entries.
- */
- entno++;
- } else if ( want_entno > 0
- && entno == want_entno
- && INT_GET(entry->hashval, ARCH_CONVERT)
- == cookhash) {
- break;
- } else {
- entno = 0;
- break;
- }
- }
- }
-
- if (i == INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
- xfs_dir_trace_g_du("leaf: hash not found", dp, uio);
- if (!INT_GET(leaf->hdr.info.forw, ARCH_CONVERT))
- uio->uio_offset =
- XFS_DA_MAKE_COOKIE(mp, 0, 0, XFS_DA_MAXHASH);
- /*
- * Don't set uio_offset if there's another block:
- * the node code will be setting uio_offset anyway.
- */
- *eobp = 0;
- return 0;
- }
- xfs_dir_trace_g_due("leaf: hash found", dp, uio, entry);
-
- p.dbp = dbp;
- p.put = put;
- p.uio = uio;
-
- /*
- * We're synchronized, start copying entries out to the user.
- */
- for (; entno >= 0 && i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
- entry++, i++, (entno = nextentno)) {
- int lastresid=0, retval;
- xfs_dircook_t lastoffset;
- xfs_dahash_t thishash;
-
- /*
- * Check for a damaged directory leaf block and pick up
- * the inode number from this entry.
- */
- namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
- INT_GET(entry->nameidx, ARCH_CONVERT));
-
- if (unlikely(
- ((char *)namest < (char *)leaf) ||
- ((char *)namest >= (char *)leaf + XFS_LBSIZE(mp)))) {
- XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(2)",
- XFS_ERRLEVEL_LOW, mp, leaf);
- xfs_dir_trace_g_du("leaf: corrupted", dp, uio);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- xfs_dir_trace_g_duc("leaf: middle cookie ",
- dp, uio, p.cook.o);
-
- if (i < (INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1)) {
- nexthash = INT_GET(entry[1].hashval, ARCH_CONVERT);
-
- if (nexthash == INT_GET(entry->hashval, ARCH_CONVERT))
- nextentno = entno + 1;
- else
- nextentno = 0;
- XFS_PUT_COOKIE(p.cook, mp, bno, nextentno, nexthash);
- xfs_dir_trace_g_duc("leaf: middle cookie ",
- dp, uio, p.cook.o);
-
- } else if ((thishash = INT_GET(leaf->hdr.info.forw,
- ARCH_CONVERT))) {
- xfs_dabuf_t *bp2;
- xfs_dir_leafblock_t *leaf2;
-
- ASSERT(nextda != -1);
-
- retval = xfs_da_read_buf(dp->i_transp, dp, thishash,
- nextda, &bp2, XFS_DATA_FORK);
- if (retval)
- return retval;
-
- ASSERT(bp2 != NULL);
-
- leaf2 = bp2->data;
-
- if (unlikely(
- (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
- != XFS_DIR_LEAF_MAGIC)
- || (INT_GET(leaf2->hdr.info.back, ARCH_CONVERT)
- != bno))) { /* GROT */
- XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(3)",
- XFS_ERRLEVEL_LOW, mp,
- leaf2);
- xfs_da_brelse(dp->i_transp, bp2);
-
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- nexthash = INT_GET(leaf2->entries[0].hashval,
- ARCH_CONVERT);
- nextentno = -1;
- XFS_PUT_COOKIE(p.cook, mp, thishash, 0, nexthash);
- xfs_da_brelse(dp->i_transp, bp2);
- xfs_dir_trace_g_duc("leaf: next blk cookie",
- dp, uio, p.cook.o);
- } else {
- nextentno = -1;
- XFS_PUT_COOKIE(p.cook, mp, 0, 0, XFS_DA_MAXHASH);
- }
-
- /*
- * Save off the cookie so we can fall back should the
- * 'put' into the outgoing buffer fails. To handle a run
- * of equal-hashvals, the off_t structure on 64bit
- * builds has entno built into the cookie to ID the
- * entry. On 32bit builds, we only have space for the
- * hashval so we can't ID specific entries within a group
- * of same hashval entries. For this, lastoffset is set
- * to the first in the run of equal hashvals so we don't
- * include any entries unless we can include all entries
- * that share the same hashval. Hopefully the buffer
- * provided is big enough to handle it (see pv763517).
- */
-#if (BITS_PER_LONG == 32)
- if ((thishash = INT_GET(entry->hashval, ARCH_CONVERT))
- != lasthash) {
- XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash);
- lastresid = uio->uio_resid;
- lasthash = thishash;
- } else {
- xfs_dir_trace_g_duc("leaf: DUP COOKIES, skipped",
- dp, uio, p.cook.o);
- }
-#else
- thishash = INT_GET(entry->hashval, ARCH_CONVERT);
- XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash);
- lastresid = uio->uio_resid;
-#endif /* BITS_PER_LONG == 32 */
-
- /*
- * Put the current entry into the outgoing buffer. If we fail
- * then restore the UIO to the first entry in the current
- * run of equal-hashval entries (probably one 1 entry long).
- */
- p.ino = XFS_GET_DIR_INO8(namest->inumber);
-#if XFS_BIG_INUMS
- p.ino += mp->m_inoadd;
-#endif
- p.name = (char *)namest->name;
- p.namelen = entry->namelen;
-
- retval = p.put(&p);
-
- if (!p.done) {
- uio->uio_offset = lastoffset.o;
- uio->uio_resid = lastresid;
-
- *eobp = 1;
-
- xfs_dir_trace_g_du("leaf: E-O-B", dp, uio);
-
- return retval;
- }
- }
-
- uio->uio_offset = p.cook.o;
-
- *eobp = 0;
-
- xfs_dir_trace_g_du("leaf: E-O-F", dp, uio);
-
- return 0;
-}
-
-/*
- * Format a dirent64 structure and copy it out the the user's buffer.
- */
-int
-xfs_dir_put_dirent64_direct(xfs_dir_put_args_t *pa)
-{
- iovec_t *iovp;
- int reclen, namelen;
- xfs_dirent_t *idbp;
- uio_t *uio;
-
- namelen = pa->namelen;
- reclen = DIRENTSIZE(namelen);
- uio = pa->uio;
- if (reclen > uio->uio_resid) {
- pa->done = 0;
- return 0;
- }
- iovp = uio->uio_iov;
- idbp = (xfs_dirent_t *)iovp->iov_base;
- iovp->iov_base = (char *)idbp + reclen;
- iovp->iov_len -= reclen;
- uio->uio_resid -= reclen;
- idbp->d_reclen = reclen;
- idbp->d_ino = pa->ino;
- idbp->d_off = pa->cook.o;
- idbp->d_name[namelen] = '\0';
- pa->done = 1;
- memcpy(idbp->d_name, pa->name, namelen);
- return 0;
-}
-
-/*
- * Format a dirent64 structure and copy it out the the user's buffer.
- */
-int
-xfs_dir_put_dirent64_uio(xfs_dir_put_args_t *pa)
-{
- int retval, reclen, namelen;
- xfs_dirent_t *idbp;
- uio_t *uio;
-
- namelen = pa->namelen;
- reclen = DIRENTSIZE(namelen);
- uio = pa->uio;
- if (reclen > uio->uio_resid) {
- pa->done = 0;
- return 0;
- }
- idbp = pa->dbp;
- idbp->d_reclen = reclen;
- idbp->d_ino = pa->ino;
- idbp->d_off = pa->cook.o;
- idbp->d_name[namelen] = '\0';
- memcpy(idbp->d_name, pa->name, namelen);
- retval = uio_read((caddr_t)idbp, reclen, uio);
- pa->done = (retval == 0);
- return retval;
-}
diff --git a/fs/xfs/xfs_dir_leaf.h b/fs/xfs/xfs_dir_leaf.h
deleted file mode 100644
index eb8cd9a4667..00000000000
--- a/fs/xfs/xfs_dir_leaf.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_DIR_LEAF_H__
-#define __XFS_DIR_LEAF_H__
-
-/*
- * Directory layout, internal structure, access macros, etc.
- *
- * Large directories are structured around Btrees where all the data
- * elements are in the leaf nodes. Filenames are hashed into an int,
- * then that int is used as the index into the Btree. Since the hashval
- * of a filename may not be unique, we may have duplicate keys. The
- * internal links in the Btree are logical block offsets into the file.
- */
-
-struct uio;
-struct xfs_bmap_free;
-struct xfs_dabuf;
-struct xfs_da_args;
-struct xfs_da_state;
-struct xfs_da_state_blk;
-struct xfs_dir_put_args;
-struct xfs_inode;
-struct xfs_mount;
-struct xfs_trans;
-
-/*========================================================================
- * Directory Structure when equal to XFS_LBSIZE(mp) bytes.
- *========================================================================*/
-
-/*
- * This is the structure of the leaf nodes in the Btree.
- *
- * Struct leaf_entry's are packed from the top. Names grow from the bottom
- * but are not packed. The freemap contains run-length-encoded entries
- * for the free bytes after the leaf_entry's, but only the N largest such,
- * smaller runs are dropped. When the freemap doesn't show enough space
- * for an allocation, we compact the namelist area and try again. If we
- * still don't have enough space, then we have to split the block.
- *
- * Since we have duplicate hash keys, for each key that matches, compare
- * the actual string. The root and intermediate node search always takes
- * the first-in-the-block key match found, so we should only have to work
- * "forw"ard. If none matches, continue with the "forw"ard leaf nodes
- * until the hash key changes or the filename is found.
- *
- * The parent directory and the self-pointer are explicitly represented
- * (ie: there are entries for "." and "..").
- *
- * Note that the count being a __uint16_t limits us to something like a
- * blocksize of 1.3MB in the face of worst case (short) filenames.
- */
-#define XFS_DIR_LEAF_MAPSIZE 3 /* how many freespace slots */
-
-typedef struct xfs_dir_leaf_map { /* RLE map of free bytes */
- __uint16_t base; /* base of free region */
- __uint16_t size; /* run length of free region */
-} xfs_dir_leaf_map_t;
-
-typedef struct xfs_dir_leaf_hdr { /* constant-structure header block */
- xfs_da_blkinfo_t info; /* block type, links, etc. */
- __uint16_t count; /* count of active leaf_entry's */
- __uint16_t namebytes; /* num bytes of name strings stored */
- __uint16_t firstused; /* first used byte in name area */
- __uint8_t holes; /* != 0 if blk needs compaction */
- __uint8_t pad1;
- xfs_dir_leaf_map_t freemap[XFS_DIR_LEAF_MAPSIZE];
-} xfs_dir_leaf_hdr_t;
-
-typedef struct xfs_dir_leaf_entry { /* sorted on key, not name */
- xfs_dahash_t hashval; /* hash value of name */
- __uint16_t nameidx; /* index into buffer of name */
- __uint8_t namelen; /* length of name string */
- __uint8_t pad2;
-} xfs_dir_leaf_entry_t;
-
-typedef struct xfs_dir_leaf_name {
- xfs_dir_ino_t inumber; /* inode number for this key */
- __uint8_t name[1]; /* name string itself */
-} xfs_dir_leaf_name_t;
-
-typedef struct xfs_dir_leafblock {
- xfs_dir_leaf_hdr_t hdr; /* constant-structure header block */
- xfs_dir_leaf_entry_t entries[1]; /* var sized array */
- xfs_dir_leaf_name_t namelist[1]; /* grows from bottom of buf */
-} xfs_dir_leafblock_t;
-
-/*
- * Length of name for which a 512-byte block filesystem
- * can get a double split.
- */
-#define XFS_DIR_LEAF_CAN_DOUBLE_SPLIT_LEN \
- (512 - (uint)sizeof(xfs_dir_leaf_hdr_t) - \
- (uint)sizeof(xfs_dir_leaf_entry_t) * 2 - \
- (uint)sizeof(xfs_dir_leaf_name_t) * 2 - (MAXNAMELEN - 2) + 1 + 1)
-
-typedef int (*xfs_dir_put_t)(struct xfs_dir_put_args *pa);
-
-typedef union {
- xfs_off_t o; /* offset (cookie) */
- /*
- * Watch the order here (endian-ness dependent).
- */
- struct {
-#ifndef XFS_NATIVE_HOST
- xfs_dahash_t h; /* hash value */
- __uint32_t be; /* block and entry */
-#else
- __uint32_t be; /* block and entry */
- xfs_dahash_t h; /* hash value */
-#endif /* XFS_NATIVE_HOST */
- } s;
-} xfs_dircook_t;
-
-#define XFS_PUT_COOKIE(c,mp,bno,entry,hash) \
- ((c).s.be = XFS_DA_MAKE_BNOENTRY(mp, bno, entry), (c).s.h = (hash))
-
-typedef struct xfs_dir_put_args {
- xfs_dircook_t cook; /* cookie of (next) entry */
- xfs_intino_t ino; /* inode number */
- struct xfs_dirent *dbp; /* buffer pointer */
- char *name; /* directory entry name */
- int namelen; /* length of name */
- int done; /* output: set if value was stored */
- xfs_dir_put_t put; /* put function ptr (i/o) */
- struct uio *uio; /* uio control structure */
-} xfs_dir_put_args_t;
-
-#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len) \
- xfs_dir_leaf_entsize_byname(len)
-static inline int xfs_dir_leaf_entsize_byname(int len)
-{
- return (uint)sizeof(xfs_dir_leaf_name_t)-1 + len;
-}
-
-#define XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry) \
- xfs_dir_leaf_entsize_byentry(entry)
-static inline int xfs_dir_leaf_entsize_byentry(xfs_dir_leaf_entry_t *entry)
-{
- return (uint)sizeof(xfs_dir_leaf_name_t)-1 + (entry)->namelen;
-}
-
-#define XFS_DIR_LEAF_NAMESTRUCT(leafp,offset) \
- xfs_dir_leaf_namestruct(leafp,offset)
-static inline xfs_dir_leaf_name_t *
-xfs_dir_leaf_namestruct(xfs_dir_leafblock_t *leafp, int offset)
-{
- return (xfs_dir_leaf_name_t *)&((char *)(leafp))[offset];
-}
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-/*
- * Internal routines when dirsize < XFS_LITINO(mp).
- */
-int xfs_dir_shortform_create(struct xfs_da_args *args, xfs_ino_t parent);
-int xfs_dir_shortform_addname(struct xfs_da_args *args);
-int xfs_dir_shortform_lookup(struct xfs_da_args *args);
-int xfs_dir_shortform_to_leaf(struct xfs_da_args *args);
-int xfs_dir_shortform_removename(struct xfs_da_args *args);
-int xfs_dir_shortform_getdents(struct xfs_inode *dp, struct uio *uio, int *eofp,
- struct xfs_dirent *dbp, xfs_dir_put_t put);
-int xfs_dir_shortform_replace(struct xfs_da_args *args);
-
-/*
- * Internal routines when dirsize == XFS_LBSIZE(mp).
- */
-int xfs_dir_leaf_to_node(struct xfs_da_args *args);
-int xfs_dir_leaf_to_shortform(struct xfs_da_args *args);
-
-/*
- * Routines used for growing the Btree.
- */
-int xfs_dir_leaf_split(struct xfs_da_state *state,
- struct xfs_da_state_blk *oldblk,
- struct xfs_da_state_blk *newblk);
-int xfs_dir_leaf_add(struct xfs_dabuf *leaf_buffer,
- struct xfs_da_args *args, int insertion_index);
-int xfs_dir_leaf_addname(struct xfs_da_args *args);
-int xfs_dir_leaf_lookup_int(struct xfs_dabuf *leaf_buffer,
- struct xfs_da_args *args,
- int *index_found_at);
-int xfs_dir_leaf_remove(struct xfs_trans *trans,
- struct xfs_dabuf *leaf_buffer,
- int index_to_remove);
-int xfs_dir_leaf_getdents_int(struct xfs_dabuf *bp, struct xfs_inode *dp,
- xfs_dablk_t bno, struct uio *uio,
- int *eobp, struct xfs_dirent *dbp,
- xfs_dir_put_t put, xfs_daddr_t nextda);
-
-/*
- * Routines used for shrinking the Btree.
- */
-int xfs_dir_leaf_toosmall(struct xfs_da_state *state, int *retval);
-void xfs_dir_leaf_unbalance(struct xfs_da_state *state,
- struct xfs_da_state_blk *drop_blk,
- struct xfs_da_state_blk *save_blk);
-
-/*
- * Utility routines.
- */
-uint xfs_dir_leaf_lasthash(struct xfs_dabuf *bp, int *count);
-int xfs_dir_leaf_order(struct xfs_dabuf *leaf1_bp,
- struct xfs_dabuf *leaf2_bp);
-int xfs_dir_put_dirent64_direct(xfs_dir_put_args_t *pa);
-int xfs_dir_put_dirent64_uio(xfs_dir_put_args_t *pa);
-int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
-
-/*
- * Global data.
- */
-extern xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
-
-#endif /* __XFS_DIR_LEAF_H__ */
diff --git a/fs/xfs/xfs_dir_sf.h b/fs/xfs/xfs_dir_sf.h
deleted file mode 100644
index fe44c6f4d56..00000000000
--- a/fs/xfs/xfs_dir_sf.h
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_DIR_SF_H__
-#define __XFS_DIR_SF_H__
-
-/*
- * Directory layout when stored internal to an inode.
- *
- * Small directories are packed as tightly as possible so as to
- * fit into the literal area of the inode.
- */
-
-typedef struct { __uint8_t i[sizeof(xfs_ino_t)]; } xfs_dir_ino_t;
-
-/*
- * The parent directory has a dedicated field, and the self-pointer must
- * be calculated on the fly.
- *
- * Entries are packed toward the top as tight as possible. The header
- * and the elements much be memcpy'd out into a work area to get correct
- * alignment for the inode number fields.
- */
-typedef struct xfs_dir_shortform {
- struct xfs_dir_sf_hdr { /* constant-structure header block */
- xfs_dir_ino_t parent; /* parent dir inode number */
- __uint8_t count; /* count of active entries */
- } hdr;
- struct xfs_dir_sf_entry {
- xfs_dir_ino_t inumber; /* referenced inode number */
- __uint8_t namelen; /* actual length of name (no NULL) */
- __uint8_t name[1]; /* name */
- } list[1]; /* variable sized array */
-} xfs_dir_shortform_t;
-typedef struct xfs_dir_sf_hdr xfs_dir_sf_hdr_t;
-typedef struct xfs_dir_sf_entry xfs_dir_sf_entry_t;
-
-/*
- * We generate this then sort it, so that readdirs are returned in
- * hash-order. Else seekdir won't work.
- */
-typedef struct xfs_dir_sf_sort {
- __uint8_t entno; /* .=0, ..=1, else entry# + 2 */
- __uint8_t seqno; /* sequence # with same hash value */
- __uint8_t namelen; /* length of name value (no null) */
- xfs_dahash_t hash; /* this entry's hash value */
- xfs_intino_t ino; /* this entry's inode number */
- char *name; /* name value, pointer into buffer */
-} xfs_dir_sf_sort_t;
-
-#define XFS_DIR_SF_GET_DIRINO(from,to) xfs_dir_sf_get_dirino(from, to)
-static inline void xfs_dir_sf_get_dirino(xfs_dir_ino_t *from, xfs_ino_t *to)
-{
- *(to) = XFS_GET_DIR_INO8(*from);
-}
-
-#define XFS_DIR_SF_PUT_DIRINO(from,to) xfs_dir_sf_put_dirino(from, to)
-static inline void xfs_dir_sf_put_dirino(xfs_ino_t *from, xfs_dir_ino_t *to)
-{
- XFS_PUT_DIR_INO8(*(from), *(to));
-}
-
-#define XFS_DIR_SF_ENTSIZE_BYNAME(len) xfs_dir_sf_entsize_byname(len)
-static inline int xfs_dir_sf_entsize_byname(int len)
-{
- return (uint)sizeof(xfs_dir_sf_entry_t)-1 + (len);
-}
-
-#define XFS_DIR_SF_ENTSIZE_BYENTRY(sfep) xfs_dir_sf_entsize_byentry(sfep)
-static inline int xfs_dir_sf_entsize_byentry(xfs_dir_sf_entry_t *sfep)
-{
- return (uint)sizeof(xfs_dir_sf_entry_t)-1 + (sfep)->namelen;
-}
-
-#define XFS_DIR_SF_NEXTENTRY(sfep) xfs_dir_sf_nextentry(sfep)
-static inline xfs_dir_sf_entry_t *xfs_dir_sf_nextentry(xfs_dir_sf_entry_t *sfep)
-{
- return (xfs_dir_sf_entry_t *) \
- ((char *)(sfep) + XFS_DIR_SF_ENTSIZE_BYENTRY(sfep));
-}
-
-#define XFS_DIR_SF_ALLFIT(count,totallen) \
- xfs_dir_sf_allfit(count,totallen)
-static inline int xfs_dir_sf_allfit(int count, int totallen)
-{
- return ((uint)sizeof(xfs_dir_sf_hdr_t) + \
- ((uint)sizeof(xfs_dir_sf_entry_t)-1)*(count) + (totallen));
-}
-
-#if defined(XFS_DIR_TRACE)
-
-/*
- * Kernel tracing support for directories.
- */
-struct uio;
-struct xfs_inode;
-struct xfs_da_intnode;
-struct xfs_dinode;
-struct xfs_dir_leafblock;
-struct xfs_dir_leaf_entry;
-
-#define XFS_DIR_TRACE_SIZE 4096 /* size of global trace buffer */
-extern ktrace_t *xfs_dir_trace_buf;
-
-/*
- * Trace record types.
- */
-#define XFS_DIR_KTRACE_G_DU 1 /* dp, uio */
-#define XFS_DIR_KTRACE_G_DUB 2 /* dp, uio, bno */
-#define XFS_DIR_KTRACE_G_DUN 3 /* dp, uio, node */
-#define XFS_DIR_KTRACE_G_DUL 4 /* dp, uio, leaf */
-#define XFS_DIR_KTRACE_G_DUE 5 /* dp, uio, leaf entry */
-#define XFS_DIR_KTRACE_G_DUC 6 /* dp, uio, cookie */
-
-void xfs_dir_trace_g_du(char *where, struct xfs_inode *dp, struct uio *uio);
-void xfs_dir_trace_g_dub(char *where, struct xfs_inode *dp, struct uio *uio,
- xfs_dablk_t bno);
-void xfs_dir_trace_g_dun(char *where, struct xfs_inode *dp, struct uio *uio,
- struct xfs_da_intnode *node);
-void xfs_dir_trace_g_dul(char *where, struct xfs_inode *dp, struct uio *uio,
- struct xfs_dir_leafblock *leaf);
-void xfs_dir_trace_g_due(char *where, struct xfs_inode *dp, struct uio *uio,
- struct xfs_dir_leaf_entry *entry);
-void xfs_dir_trace_g_duc(char *where, struct xfs_inode *dp, struct uio *uio,
- xfs_off_t cookie);
-void xfs_dir_trace_enter(int type, char *where,
- void *a0, void *a1, void *a2, void *a3,
- void *a4, void *a5, void *a6, void *a7,
- void *a8, void *a9, void *a10, void *a11);
-#else
-#define xfs_dir_trace_g_du(w,d,u)
-#define xfs_dir_trace_g_dub(w,d,u,b)
-#define xfs_dir_trace_g_dun(w,d,u,n)
-#define xfs_dir_trace_g_dul(w,d,u,l)
-#define xfs_dir_trace_g_due(w,d,u,e)
-#define xfs_dir_trace_g_duc(w,d,u,c)
-#endif /* DEBUG */
-
-#endif /* __XFS_DIR_SF_H__ */
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
new file mode 100644
index 00000000000..4f11ef01113
--- /dev/null
+++ b/fs/xfs/xfs_discard.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_extent_busy.h"
+#include "xfs_discard.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+
+STATIC int
+xfs_trim_extents(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_daddr_t start,
+ xfs_daddr_t end,
+ xfs_daddr_t minlen,
+ __uint64_t *blocks_trimmed)
+{
+ struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agbp;
+ struct xfs_perag *pag;
+ int error;
+ int i;
+
+ pag = xfs_perag_get(mp, agno);
+
+ error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+ if (error || !agbp)
+ goto out_put_perag;
+
+ cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
+
+ /*
+ * Force out the log. This means any transactions that might have freed
+ * space before we took the AGF buffer lock are now on disk, and the
+ * volatile disk cache is flushed.
+ */
+ xfs_log_force(mp, XFS_LOG_SYNC);
+
+ /*
+ * Look up the longest btree in the AGF and start with it.
+ */
+ error = xfs_alloc_lookup_ge(cur, 0,
+ be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i);
+ if (error)
+ goto out_del_cursor;
+
+ /*
+ * Loop until we are done with all extents that are large
+ * enough to be worth discarding.
+ */
+ while (i) {
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ xfs_daddr_t dbno;
+ xfs_extlen_t dlen;
+
+ error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
+ if (error)
+ goto out_del_cursor;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
+ ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
+
+ /*
+ * use daddr format for all range/len calculations as that is
+ * the format the range/len variables are supplied in by
+ * userspace.
+ */
+ dbno = XFS_AGB_TO_DADDR(mp, agno, fbno);
+ dlen = XFS_FSB_TO_BB(mp, flen);
+
+ /*
+ * Too small? Give up.
+ */
+ if (dlen < minlen) {
+ trace_xfs_discard_toosmall(mp, agno, fbno, flen);
+ goto out_del_cursor;
+ }
+
+ /*
+ * If the extent is entirely outside of the range we are
+ * supposed to discard skip it. Do not bother to trim
+ * down partially overlapping ranges for now.
+ */
+ if (dbno + dlen < start || dbno > end) {
+ trace_xfs_discard_exclude(mp, agno, fbno, flen);
+ goto next_extent;
+ }
+
+ /*
+ * If any blocks in the range are still busy, skip the
+ * discard and try again the next time.
+ */
+ if (xfs_extent_busy_search(mp, agno, fbno, flen)) {
+ trace_xfs_discard_busy(mp, agno, fbno, flen);
+ goto next_extent;
+ }
+
+ trace_xfs_discard_extent(mp, agno, fbno, flen);
+ error = -blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
+ if (error)
+ goto out_del_cursor;
+ *blocks_trimmed += flen;
+
+next_extent:
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto out_del_cursor;
+ }
+
+out_del_cursor:
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ xfs_buf_relse(agbp);
+out_put_perag:
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * trim a range of the filesystem.
+ *
+ * Note: the parameters passed from userspace are byte ranges into the
+ * filesystem which does not match to the format we use for filesystem block
+ * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
+ * is a linear address range. Hence we need to use DADDR based conversions and
+ * comparisons for determining the correct offset and regions to trim.
+ */
+int
+xfs_ioc_trim(
+ struct xfs_mount *mp,
+ struct fstrim_range __user *urange)
+{
+ struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev);
+ unsigned int granularity = q->limits.discard_granularity;
+ struct fstrim_range range;
+ xfs_daddr_t start, end, minlen;
+ xfs_agnumber_t start_agno, end_agno, agno;
+ __uint64_t blocks_trimmed = 0;
+ int error, last_error = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -XFS_ERROR(EPERM);
+ if (!blk_queue_discard(q))
+ return -XFS_ERROR(EOPNOTSUPP);
+ if (copy_from_user(&range, urange, sizeof(range)))
+ return -XFS_ERROR(EFAULT);
+
+ /*
+ * Truncating down the len isn't actually quite correct, but using
+ * BBTOB would mean we trivially get overflows for values
+ * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
+ * used by the fstrim application. In the end it really doesn't
+ * matter as trimming blocks is an advisory interface.
+ */
+ if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
+ range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) ||
+ range.len < mp->m_sb.sb_blocksize)
+ return -XFS_ERROR(EINVAL);
+
+ start = BTOBB(range.start);
+ end = start + BTOBBT(range.len) - 1;
+ minlen = BTOBB(max_t(u64, granularity, range.minlen));
+
+ if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
+ end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1;
+
+ start_agno = xfs_daddr_to_agno(mp, start);
+ end_agno = xfs_daddr_to_agno(mp, end);
+
+ for (agno = start_agno; agno <= end_agno; agno++) {
+ error = -xfs_trim_extents(mp, agno, start, end, minlen,
+ &blocks_trimmed);
+ if (error)
+ last_error = error;
+ }
+
+ if (last_error)
+ return last_error;
+
+ range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
+ if (copy_to_user(urange, &range, sizeof(range)))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+}
+
+int
+xfs_discard_extents(
+ struct xfs_mount *mp,
+ struct list_head *list)
+{
+ struct xfs_extent_busy *busyp;
+ int error = 0;
+
+ list_for_each_entry(busyp, list, list) {
+ trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
+ busyp->length);
+
+ error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+ XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
+ XFS_FSB_TO_BB(mp, busyp->length),
+ GFP_NOFS, 0);
+ if (error && error != EOPNOTSUPP) {
+ xfs_info(mp,
+ "discard failed for extent [0x%llu,%u], error %d",
+ (unsigned long long)busyp->bno,
+ busyp->length,
+ error);
+ return error;
+ }
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h
new file mode 100644
index 00000000000..344879aea64
--- /dev/null
+++ b/fs/xfs/xfs_discard.h
@@ -0,0 +1,10 @@
+#ifndef XFS_DISCARD_H
+#define XFS_DISCARD_H 1
+
+struct fstrim_range;
+struct list_head;
+
+extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+extern int xfs_discard_extents(struct xfs_mount *, struct list_head *);
+
+#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
deleted file mode 100644
index b4c7f2bc55a..00000000000
--- a/fs/xfs/xfs_dmapi.h
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_DMAPI_H__
-#define __XFS_DMAPI_H__
-
-#include <linux/version.h>
-/* Values used to define the on-disk version of dm_attrname_t. All
- * on-disk attribute names start with the 8-byte string "SGI_DMI_".
- *
- * In the on-disk inode, DMAPI attribute names consist of the user-provided
- * name with the DMATTR_PREFIXSTRING pre-pended. This string must NEVER be
- * changed.
- */
-
-#define DMATTR_PREFIXLEN 8
-#define DMATTR_PREFIXSTRING "SGI_DMI_"
-
-typedef enum {
- DM_EVENT_INVALID = -1,
- DM_EVENT_CANCEL = 0, /* not supported */
- DM_EVENT_MOUNT = 1,
- DM_EVENT_PREUNMOUNT = 2,
- DM_EVENT_UNMOUNT = 3,
- DM_EVENT_DEBUT = 4, /* not supported */
- DM_EVENT_CREATE = 5,
- DM_EVENT_CLOSE = 6, /* not supported */
- DM_EVENT_POSTCREATE = 7,
- DM_EVENT_REMOVE = 8,
- DM_EVENT_POSTREMOVE = 9,
- DM_EVENT_RENAME = 10,
- DM_EVENT_POSTRENAME = 11,
- DM_EVENT_LINK = 12,
- DM_EVENT_POSTLINK = 13,
- DM_EVENT_SYMLINK = 14,
- DM_EVENT_POSTSYMLINK = 15,
- DM_EVENT_READ = 16,
- DM_EVENT_WRITE = 17,
- DM_EVENT_TRUNCATE = 18,
- DM_EVENT_ATTRIBUTE = 19,
- DM_EVENT_DESTROY = 20,
- DM_EVENT_NOSPACE = 21,
- DM_EVENT_USER = 22,
- DM_EVENT_MAX = 23
-} dm_eventtype_t;
-#define HAVE_DM_EVENTTYPE_T
-
-typedef enum {
- DM_RIGHT_NULL,
- DM_RIGHT_SHARED,
- DM_RIGHT_EXCL
-} dm_right_t;
-#define HAVE_DM_RIGHT_T
-
-/* Defines for determining if an event message should be sent. */
-#define DM_EVENT_ENABLED(vfsp, ip, event) ( \
- unlikely ((vfsp)->vfs_flag & VFS_DMI) && \
- ( ((ip)->i_d.di_dmevmask & (1 << event)) || \
- ((ip)->i_mount->m_dmevmask & (1 << event)) ) \
- )
-
-#define DM_EVENT_ENABLED_IO(vfsp, io, event) ( \
- unlikely ((vfsp)->vfs_flag & VFS_DMI) && \
- ( ((io)->io_dmevmask & (1 << event)) || \
- ((io)->io_mount->m_dmevmask & (1 << event)) ) \
- )
-
-#define DM_XFS_VALID_FS_EVENTS ( \
- (1 << DM_EVENT_PREUNMOUNT) | \
- (1 << DM_EVENT_UNMOUNT) | \
- (1 << DM_EVENT_NOSPACE) | \
- (1 << DM_EVENT_DEBUT) | \
- (1 << DM_EVENT_CREATE) | \
- (1 << DM_EVENT_POSTCREATE) | \
- (1 << DM_EVENT_REMOVE) | \
- (1 << DM_EVENT_POSTREMOVE) | \
- (1 << DM_EVENT_RENAME) | \
- (1 << DM_EVENT_POSTRENAME) | \
- (1 << DM_EVENT_LINK) | \
- (1 << DM_EVENT_POSTLINK) | \
- (1 << DM_EVENT_SYMLINK) | \
- (1 << DM_EVENT_POSTSYMLINK) | \
- (1 << DM_EVENT_ATTRIBUTE) | \
- (1 << DM_EVENT_DESTROY) )
-
-/* Events valid in dm_set_eventlist() when called with a file handle for
- a regular file or a symlink. These events are persistent.
-*/
-
-#define DM_XFS_VALID_FILE_EVENTS ( \
- (1 << DM_EVENT_ATTRIBUTE) | \
- (1 << DM_EVENT_DESTROY) )
-
-/* Events valid in dm_set_eventlist() when called with a file handle for
- a directory. These events are persistent.
-*/
-
-#define DM_XFS_VALID_DIRECTORY_EVENTS ( \
- (1 << DM_EVENT_CREATE) | \
- (1 << DM_EVENT_POSTCREATE) | \
- (1 << DM_EVENT_REMOVE) | \
- (1 << DM_EVENT_POSTREMOVE) | \
- (1 << DM_EVENT_RENAME) | \
- (1 << DM_EVENT_POSTRENAME) | \
- (1 << DM_EVENT_LINK) | \
- (1 << DM_EVENT_POSTLINK) | \
- (1 << DM_EVENT_SYMLINK) | \
- (1 << DM_EVENT_POSTSYMLINK) | \
- (1 << DM_EVENT_ATTRIBUTE) | \
- (1 << DM_EVENT_DESTROY) )
-
-/* Events supported by the XFS filesystem. */
-#define DM_XFS_SUPPORTED_EVENTS ( \
- (1 << DM_EVENT_MOUNT) | \
- (1 << DM_EVENT_PREUNMOUNT) | \
- (1 << DM_EVENT_UNMOUNT) | \
- (1 << DM_EVENT_NOSPACE) | \
- (1 << DM_EVENT_CREATE) | \
- (1 << DM_EVENT_POSTCREATE) | \
- (1 << DM_EVENT_REMOVE) | \
- (1 << DM_EVENT_POSTREMOVE) | \
- (1 << DM_EVENT_RENAME) | \
- (1 << DM_EVENT_POSTRENAME) | \
- (1 << DM_EVENT_LINK) | \
- (1 << DM_EVENT_POSTLINK) | \
- (1 << DM_EVENT_SYMLINK) | \
- (1 << DM_EVENT_POSTSYMLINK) | \
- (1 << DM_EVENT_READ) | \
- (1 << DM_EVENT_WRITE) | \
- (1 << DM_EVENT_TRUNCATE) | \
- (1 << DM_EVENT_ATTRIBUTE) | \
- (1 << DM_EVENT_DESTROY) )
-
-
-/*
- * Definitions used for the flags field on dm_send_*_event().
- */
-
-#define DM_FLAGS_NDELAY 0x001 /* return EAGAIN after dm_pending() */
-#define DM_FLAGS_UNWANTED 0x002 /* event not in fsys dm_eventset_t */
-#define DM_FLAGS_IMUX 0x004 /* thread holds i_mutex */
-#define DM_FLAGS_IALLOCSEM_RD 0x010 /* thread holds i_alloc_sem rd */
-#define DM_FLAGS_IALLOCSEM_WR 0x020 /* thread holds i_alloc_sem wr */
-
-/*
- * Based on IO_ISDIRECT, decide which i_ flag is set.
- */
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0)
-#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
- DM_FLAGS_IMUX : 0)
-#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX)
-#endif
-
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) && \
- (LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,22))
-#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
- DM_FLAGS_IALLOCSEM_RD : DM_FLAGS_IMUX)
-#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX)
-#endif
-
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,21)
-#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
- 0 : DM_FLAGS_IMUX)
-#define DM_SEM_FLAG_WR (DM_FLAGS_IMUX)
-#endif
-
-
-/*
- * Macros to turn caller specified delay/block flags into
- * dm_send_xxxx_event flag DM_FLAGS_NDELAY.
- */
-
-#define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \
- DM_FLAGS_NDELAY : 0)
-#define AT_DELAY_FLAG(f) ((f&ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0)
-
-
-extern struct bhv_vfsops xfs_dmops;
-
-#ifdef CONFIG_XFS_DMAPI
-void xfs_dm_init(struct file_system_type *);
-void xfs_dm_exit(struct file_system_type *);
-#define XFS_DM_INIT(fstype) xfs_dm_init(fstype)
-#define XFS_DM_EXIT(fstype) xfs_dm_exit(fstype)
-#else
-#define XFS_DM_INIT(fstype)
-#define XFS_DM_EXIT(fstype)
-#endif
-
-#endif /* __XFS_DMAPI_H__ */
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
new file mode 100644
index 00000000000..3ee0cd43edc
--- /dev/null
+++ b/fs/xfs/xfs_dquot.c
@@ -0,0 +1,1105 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_space.h"
+#include "xfs_trans_priv.h"
+#include "xfs_qm.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_bmap_btree.h"
+
+/*
+ * Lock order:
+ *
+ * ip->i_lock
+ * qi->qi_tree_lock
+ * dquot->q_qlock (xfs_dqlock() and friends)
+ * dquot->q_flush (xfs_dqflock() and friends)
+ * qi->qi_lru_lock
+ *
+ * If two dquots need to be locked the order is user before group/project,
+ * otherwise by the lowest id first, see xfs_dqlock2.
+ */
+
+#ifdef DEBUG
+xfs_buftarg_t *xfs_dqerror_target;
+int xfs_do_dqerror;
+int xfs_dqreq_num;
+int xfs_dqerror_mod = 33;
+#endif
+
+struct kmem_zone *xfs_qm_dqtrxzone;
+static struct kmem_zone *xfs_qm_dqzone;
+
+static struct lock_class_key xfs_dquot_group_class;
+static struct lock_class_key xfs_dquot_project_class;
+
+/*
+ * This is called to free all the memory associated with a dquot
+ */
+void
+xfs_qm_dqdestroy(
+ xfs_dquot_t *dqp)
+{
+ ASSERT(list_empty(&dqp->q_lru));
+
+ mutex_destroy(&dqp->q_qlock);
+ kmem_zone_free(xfs_qm_dqzone, dqp);
+
+ XFS_STATS_DEC(xs_qm_dquot);
+}
+
+/*
+ * If default limits are in force, push them into the dquot now.
+ * We overwrite the dquot limits only if they are zero and this
+ * is not the root dquot.
+ */
+void
+xfs_qm_adjust_dqlimits(
+ struct xfs_mount *mp,
+ struct xfs_dquot *dq)
+{
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ struct xfs_disk_dquot *d = &dq->q_core;
+ int prealloc = 0;
+
+ ASSERT(d->d_id);
+
+ if (q->qi_bsoftlimit && !d->d_blk_softlimit) {
+ d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit);
+ prealloc = 1;
+ }
+ if (q->qi_bhardlimit && !d->d_blk_hardlimit) {
+ d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit);
+ prealloc = 1;
+ }
+ if (q->qi_isoftlimit && !d->d_ino_softlimit)
+ d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit);
+ if (q->qi_ihardlimit && !d->d_ino_hardlimit)
+ d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit);
+ if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit)
+ d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit);
+ if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit)
+ d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit);
+
+ if (prealloc)
+ xfs_dquot_set_prealloc_limits(dq);
+}
+
+/*
+ * Check the limits and timers of a dquot and start or reset timers
+ * if necessary.
+ * This gets called even when quota enforcement is OFF, which makes our
+ * life a little less complicated. (We just don't reject any quota
+ * reservations in that case, when enforcement is off).
+ * We also return 0 as the values of the timers in Q_GETQUOTA calls, when
+ * enforcement's off.
+ * In contrast, warnings are a little different in that they don't
+ * 'automatically' get started when limits get exceeded. They do
+ * get reset to zero, however, when we find the count to be under
+ * the soft limit (they are only ever set non-zero via userspace).
+ */
+void
+xfs_qm_adjust_dqtimers(
+ xfs_mount_t *mp,
+ xfs_disk_dquot_t *d)
+{
+ ASSERT(d->d_id);
+
+#ifdef DEBUG
+ if (d->d_blk_hardlimit)
+ ASSERT(be64_to_cpu(d->d_blk_softlimit) <=
+ be64_to_cpu(d->d_blk_hardlimit));
+ if (d->d_ino_hardlimit)
+ ASSERT(be64_to_cpu(d->d_ino_softlimit) <=
+ be64_to_cpu(d->d_ino_hardlimit));
+ if (d->d_rtb_hardlimit)
+ ASSERT(be64_to_cpu(d->d_rtb_softlimit) <=
+ be64_to_cpu(d->d_rtb_hardlimit));
+#endif
+
+ if (!d->d_btimer) {
+ if ((d->d_blk_softlimit &&
+ (be64_to_cpu(d->d_bcount) >
+ be64_to_cpu(d->d_blk_softlimit))) ||
+ (d->d_blk_hardlimit &&
+ (be64_to_cpu(d->d_bcount) >
+ be64_to_cpu(d->d_blk_hardlimit)))) {
+ d->d_btimer = cpu_to_be32(get_seconds() +
+ mp->m_quotainfo->qi_btimelimit);
+ } else {
+ d->d_bwarns = 0;
+ }
+ } else {
+ if ((!d->d_blk_softlimit ||
+ (be64_to_cpu(d->d_bcount) <=
+ be64_to_cpu(d->d_blk_softlimit))) &&
+ (!d->d_blk_hardlimit ||
+ (be64_to_cpu(d->d_bcount) <=
+ be64_to_cpu(d->d_blk_hardlimit)))) {
+ d->d_btimer = 0;
+ }
+ }
+
+ if (!d->d_itimer) {
+ if ((d->d_ino_softlimit &&
+ (be64_to_cpu(d->d_icount) >
+ be64_to_cpu(d->d_ino_softlimit))) ||
+ (d->d_ino_hardlimit &&
+ (be64_to_cpu(d->d_icount) >
+ be64_to_cpu(d->d_ino_hardlimit)))) {
+ d->d_itimer = cpu_to_be32(get_seconds() +
+ mp->m_quotainfo->qi_itimelimit);
+ } else {
+ d->d_iwarns = 0;
+ }
+ } else {
+ if ((!d->d_ino_softlimit ||
+ (be64_to_cpu(d->d_icount) <=
+ be64_to_cpu(d->d_ino_softlimit))) &&
+ (!d->d_ino_hardlimit ||
+ (be64_to_cpu(d->d_icount) <=
+ be64_to_cpu(d->d_ino_hardlimit)))) {
+ d->d_itimer = 0;
+ }
+ }
+
+ if (!d->d_rtbtimer) {
+ if ((d->d_rtb_softlimit &&
+ (be64_to_cpu(d->d_rtbcount) >
+ be64_to_cpu(d->d_rtb_softlimit))) ||
+ (d->d_rtb_hardlimit &&
+ (be64_to_cpu(d->d_rtbcount) >
+ be64_to_cpu(d->d_rtb_hardlimit)))) {
+ d->d_rtbtimer = cpu_to_be32(get_seconds() +
+ mp->m_quotainfo->qi_rtbtimelimit);
+ } else {
+ d->d_rtbwarns = 0;
+ }
+ } else {
+ if ((!d->d_rtb_softlimit ||
+ (be64_to_cpu(d->d_rtbcount) <=
+ be64_to_cpu(d->d_rtb_softlimit))) &&
+ (!d->d_rtb_hardlimit ||
+ (be64_to_cpu(d->d_rtbcount) <=
+ be64_to_cpu(d->d_rtb_hardlimit)))) {
+ d->d_rtbtimer = 0;
+ }
+ }
+}
+
+/*
+ * initialize a buffer full of dquots and log the whole thing
+ */
+STATIC void
+xfs_qm_init_dquot_blk(
+ xfs_trans_t *tp,
+ xfs_mount_t *mp,
+ xfs_dqid_t id,
+ uint type,
+ xfs_buf_t *bp)
+{
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ xfs_dqblk_t *d;
+ int curid, i;
+
+ ASSERT(tp);
+ ASSERT(xfs_buf_islocked(bp));
+
+ d = bp->b_addr;
+
+ /*
+ * ID of the first dquot in the block - id's are zero based.
+ */
+ curid = id - (id % q->qi_dqperchunk);
+ ASSERT(curid >= 0);
+ memset(d, 0, BBTOB(q->qi_dqchunklen));
+ for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) {
+ d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+ d->dd_diskdq.d_id = cpu_to_be32(curid);
+ d->dd_diskdq.d_flags = type;
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+ xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
+ }
+
+ xfs_trans_dquot_buf(tp, bp,
+ (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
+ ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
+ XFS_BLF_GDQUOT_BUF)));
+ xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
+}
+
+/*
+ * Initialize the dynamic speculative preallocation thresholds. The lo/hi
+ * watermarks correspond to the soft and hard limits by default. If a soft limit
+ * is not specified, we use 95% of the hard limit.
+ */
+void
+xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
+{
+ __uint64_t space;
+
+ dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
+ dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit);
+ if (!dqp->q_prealloc_lo_wmark) {
+ dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark;
+ do_div(dqp->q_prealloc_lo_wmark, 100);
+ dqp->q_prealloc_lo_wmark *= 95;
+ }
+
+ space = dqp->q_prealloc_hi_wmark;
+
+ do_div(space, 100);
+ dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space;
+ dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3;
+ dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5;
+}
+
+/*
+ * Allocate a block and fill it with dquots.
+ * This is called when the bmapi finds a hole.
+ */
+STATIC int
+xfs_qm_dqalloc(
+ xfs_trans_t **tpp,
+ xfs_mount_t *mp,
+ xfs_dquot_t *dqp,
+ xfs_inode_t *quotip,
+ xfs_fileoff_t offset_fsb,
+ xfs_buf_t **O_bpp)
+{
+ xfs_fsblock_t firstblock;
+ xfs_bmap_free_t flist;
+ xfs_bmbt_irec_t map;
+ int nmaps, error, committed;
+ xfs_buf_t *bp;
+ xfs_trans_t *tp = *tpp;
+
+ ASSERT(tp != NULL);
+
+ trace_xfs_dqalloc(dqp);
+
+ /*
+ * Initialize the bmap freelist prior to calling bmapi code.
+ */
+ xfs_bmap_init(&flist, &firstblock);
+ xfs_ilock(quotip, XFS_ILOCK_EXCL);
+ /*
+ * Return if this type of quotas is turned off while we didn't
+ * have an inode lock
+ */
+ if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
+ xfs_iunlock(quotip, XFS_ILOCK_EXCL);
+ return (ESRCH);
+ }
+
+ xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
+ nmaps = 1;
+ error = xfs_bmapi_write(tp, quotip, offset_fsb,
+ XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
+ &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp),
+ &map, &nmaps, &flist);
+ if (error)
+ goto error0;
+ ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
+ ASSERT(nmaps == 1);
+ ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+ (map.br_startblock != HOLESTARTBLOCK));
+
+ /*
+ * Keep track of the blkno to save a lookup later
+ */
+ dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+
+ /* now we can just get the buffer (there's nothing to read yet) */
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen,
+ 0);
+ if (!bp) {
+ error = ENOMEM;
+ goto error1;
+ }
+ bp->b_ops = &xfs_dquot_buf_ops;
+
+ /*
+ * Make a chunk of dquots out of this buffer and log
+ * the entire thing.
+ */
+ xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id),
+ dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
+
+ /*
+ * xfs_bmap_finish() may commit the current transaction and
+ * start a second transaction if the freelist is not empty.
+ *
+ * Since we still want to modify this buffer, we need to
+ * ensure that the buffer is not released on commit of
+ * the first transaction and ensure the buffer is added to the
+ * second transaction.
+ *
+ * If there is only one transaction then don't stop the buffer
+ * from being released when it commits later on.
+ */
+
+ xfs_trans_bhold(tp, bp);
+
+ if ((error = xfs_bmap_finish(tpp, &flist, &committed))) {
+ goto error1;
+ }
+
+ if (committed) {
+ tp = *tpp;
+ xfs_trans_bjoin(tp, bp);
+ } else {
+ xfs_trans_bhold_release(tp, bp);
+ }
+
+ *O_bpp = bp;
+ return 0;
+
+ error1:
+ xfs_bmap_cancel(&flist);
+ error0:
+ xfs_iunlock(quotip, XFS_ILOCK_EXCL);
+
+ return (error);
+}
+
+STATIC int
+xfs_qm_dqrepair(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp,
+ xfs_dqid_t firstid,
+ struct xfs_buf **bpp)
+{
+ int error;
+ struct xfs_disk_dquot *ddq;
+ struct xfs_dqblk *d;
+ int i;
+
+ /*
+ * Read the buffer without verification so we get the corrupted
+ * buffer returned to us. make sure we verify it on write, though.
+ */
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen,
+ 0, bpp, NULL);
+
+ if (error) {
+ ASSERT(*bpp == NULL);
+ return XFS_ERROR(error);
+ }
+ (*bpp)->b_ops = &xfs_dquot_buf_ops;
+
+ ASSERT(xfs_buf_islocked(*bpp));
+ d = (struct xfs_dqblk *)(*bpp)->b_addr;
+
+ /* Do the actual repair of dquots in this buffer */
+ for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
+ ddq = &d[i].dd_diskdq;
+ error = xfs_dqcheck(mp, ddq, firstid + i,
+ dqp->dq_flags & XFS_DQ_ALLTYPES,
+ XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair");
+ if (error) {
+ /* repair failed, we're screwed */
+ xfs_trans_brelse(tp, *bpp);
+ return XFS_ERROR(EIO);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Maps a dquot to the buffer containing its on-disk version.
+ * This returns a ptr to the buffer containing the on-disk dquot
+ * in the bpp param, and a ptr to the on-disk dquot within that buffer
+ */
+STATIC int
+xfs_qm_dqtobp(
+ xfs_trans_t **tpp,
+ xfs_dquot_t *dqp,
+ xfs_disk_dquot_t **O_ddpp,
+ xfs_buf_t **O_bpp,
+ uint flags)
+{
+ struct xfs_bmbt_irec map;
+ int nmaps = 1, error;
+ struct xfs_buf *bp;
+ struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp);
+ struct xfs_mount *mp = dqp->q_mount;
+ xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
+ struct xfs_trans *tp = (tpp ? *tpp : NULL);
+ uint lock_mode;
+
+ dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
+
+ lock_mode = xfs_ilock_data_map_shared(quotip);
+ if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
+ /*
+ * Return if this type of quotas is turned off while we
+ * didn't have the quota inode lock.
+ */
+ xfs_iunlock(quotip, lock_mode);
+ return ESRCH;
+ }
+
+ /*
+ * Find the block map; no allocations yet
+ */
+ error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
+ XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
+
+ xfs_iunlock(quotip, lock_mode);
+ if (error)
+ return error;
+
+ ASSERT(nmaps == 1);
+ ASSERT(map.br_blockcount == 1);
+
+ /*
+ * Offset of dquot in the (fixed sized) dquot chunk.
+ */
+ dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
+ sizeof(xfs_dqblk_t);
+
+ ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+ if (map.br_startblock == HOLESTARTBLOCK) {
+ /*
+ * We don't allocate unless we're asked to
+ */
+ if (!(flags & XFS_QMOPT_DQALLOC))
+ return ENOENT;
+
+ ASSERT(tp);
+ error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
+ dqp->q_fileoffset, &bp);
+ if (error)
+ return error;
+ tp = *tpp;
+ } else {
+ trace_xfs_dqtobp_read(dqp);
+
+ /*
+ * store the blkno etc so that we don't have to do the
+ * mapping all the time
+ */
+ dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen,
+ 0, &bp, &xfs_dquot_buf_ops);
+
+ if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
+ xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
+ mp->m_quotainfo->qi_dqperchunk;
+ ASSERT(bp == NULL);
+ error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp);
+ }
+
+ if (error) {
+ ASSERT(bp == NULL);
+ return XFS_ERROR(error);
+ }
+ }
+
+ ASSERT(xfs_buf_islocked(bp));
+ *O_bpp = bp;
+ *O_ddpp = bp->b_addr + dqp->q_bufoffset;
+
+ return (0);
+}
+
+
+/*
+ * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
+ * and release the buffer immediately.
+ *
+ * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed.
+ */
+int
+xfs_qm_dqread(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ uint type,
+ uint flags,
+ struct xfs_dquot **O_dqpp)
+{
+ struct xfs_dquot *dqp;
+ struct xfs_disk_dquot *ddqp;
+ struct xfs_buf *bp;
+ struct xfs_trans *tp = NULL;
+ int error;
+ int cancelflags = 0;
+
+
+ dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
+
+ dqp->dq_flags = type;
+ dqp->q_core.d_id = cpu_to_be32(id);
+ dqp->q_mount = mp;
+ INIT_LIST_HEAD(&dqp->q_lru);
+ mutex_init(&dqp->q_qlock);
+ init_waitqueue_head(&dqp->q_pinwait);
+
+ /*
+ * Because we want to use a counting completion, complete
+ * the flush completion once to allow a single access to
+ * the flush completion without blocking.
+ */
+ init_completion(&dqp->q_flush);
+ complete(&dqp->q_flush);
+
+ /*
+ * Make sure group quotas have a different lock class than user
+ * quotas.
+ */
+ switch (type) {
+ case XFS_DQ_USER:
+ /* uses the default lock class */
+ break;
+ case XFS_DQ_GROUP:
+ lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class);
+ break;
+ case XFS_DQ_PROJ:
+ lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class);
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ XFS_STATS_INC(xs_qm_dquot);
+
+ trace_xfs_dqread(dqp);
+
+ if (flags & XFS_QMOPT_DQALLOC) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc,
+ XFS_QM_DQALLOC_SPACE_RES(mp), 0);
+ if (error)
+ goto error1;
+ cancelflags = XFS_TRANS_RELEASE_LOG_RES;
+ }
+
+ /*
+ * get a pointer to the on-disk dquot and the buffer containing it
+ * dqp already knows its own type (GROUP/USER).
+ */
+ error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags);
+ if (error) {
+ /*
+ * This can happen if quotas got turned off (ESRCH),
+ * or if the dquot didn't exist on disk and we ask to
+ * allocate (ENOENT).
+ */
+ trace_xfs_dqread_fail(dqp);
+ cancelflags |= XFS_TRANS_ABORT;
+ goto error1;
+ }
+
+ /* copy everything from disk dquot to the incore dquot */
+ memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
+ xfs_qm_dquot_logitem_init(dqp);
+
+ /*
+ * Reservation counters are defined as reservation plus current usage
+ * to avoid having to add every time.
+ */
+ dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
+ dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
+ dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
+
+ /* initialize the dquot speculative prealloc thresholds */
+ xfs_dquot_set_prealloc_limits(dqp);
+
+ /* Mark the buf so that this will stay incore a little longer */
+ xfs_buf_set_ref(bp, XFS_DQUOT_REF);
+
+ /*
+ * We got the buffer with a xfs_trans_read_buf() (in dqtobp())
+ * So we need to release with xfs_trans_brelse().
+ * The strategy here is identical to that of inodes; we lock
+ * the dquot in xfs_qm_dqget() before making it accessible to
+ * others. This is because dquots, like inodes, need a good level of
+ * concurrency, and we don't want to take locks on the entire buffers
+ * for dquot accesses.
+ * Note also that the dquot buffer may even be dirty at this point, if
+ * this particular dquot was repaired. We still aren't afraid to
+ * brelse it because we have the changes incore.
+ */
+ ASSERT(xfs_buf_islocked(bp));
+ xfs_trans_brelse(tp, bp);
+
+ if (tp) {
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto error0;
+ }
+
+ *O_dqpp = dqp;
+ return error;
+
+error1:
+ if (tp)
+ xfs_trans_cancel(tp, cancelflags);
+error0:
+ xfs_qm_dqdestroy(dqp);
+ *O_dqpp = NULL;
+ return error;
+}
+
+/*
+ * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
+ * a locked dquot, doing an allocation (if requested) as needed.
+ * When both an inode and an id are given, the inode's id takes precedence.
+ * That is, if the id changes while we don't hold the ilock inside this
+ * function, the new dquot is returned, not necessarily the one requested
+ * in the id argument.
+ */
+int
+xfs_qm_dqget(
+ xfs_mount_t *mp,
+ xfs_inode_t *ip, /* locked inode (optional) */
+ xfs_dqid_t id, /* uid/projid/gid depending on type */
+ uint type, /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */
+ uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
+ xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */
+{
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
+ struct xfs_dquot *dqp;
+ int error;
+
+ ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+ if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
+ (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) ||
+ (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
+ return (ESRCH);
+ }
+
+#ifdef DEBUG
+ if (xfs_do_dqerror) {
+ if ((xfs_dqerror_target == mp->m_ddev_targp) &&
+ (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
+ xfs_debug(mp, "Returning error in dqget");
+ return (EIO);
+ }
+ }
+
+ ASSERT(type == XFS_DQ_USER ||
+ type == XFS_DQ_PROJ ||
+ type == XFS_DQ_GROUP);
+ if (ip) {
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(xfs_inode_dquot(ip, type) == NULL);
+ }
+#endif
+
+restart:
+ mutex_lock(&qi->qi_tree_lock);
+ dqp = radix_tree_lookup(tree, id);
+ if (dqp) {
+ xfs_dqlock(dqp);
+ if (dqp->dq_flags & XFS_DQ_FREEING) {
+ xfs_dqunlock(dqp);
+ mutex_unlock(&qi->qi_tree_lock);
+ trace_xfs_dqget_freeing(dqp);
+ delay(1);
+ goto restart;
+ }
+
+ dqp->q_nrefs++;
+ mutex_unlock(&qi->qi_tree_lock);
+
+ trace_xfs_dqget_hit(dqp);
+ XFS_STATS_INC(xs_qm_dqcachehits);
+ *O_dqpp = dqp;
+ return 0;
+ }
+ mutex_unlock(&qi->qi_tree_lock);
+ XFS_STATS_INC(xs_qm_dqcachemisses);
+
+ /*
+ * Dquot cache miss. We don't want to keep the inode lock across
+ * a (potential) disk read. Also we don't want to deal with the lock
+ * ordering between quotainode and this inode. OTOH, dropping the inode
+ * lock here means dealing with a chown that can happen before
+ * we re-acquire the lock.
+ */
+ if (ip)
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ error = xfs_qm_dqread(mp, id, type, flags, &dqp);
+
+ if (ip)
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ if (error)
+ return error;
+
+ if (ip) {
+ /*
+ * A dquot could be attached to this inode by now, since
+ * we had dropped the ilock.
+ */
+ if (xfs_this_quota_on(mp, type)) {
+ struct xfs_dquot *dqp1;
+
+ dqp1 = xfs_inode_dquot(ip, type);
+ if (dqp1) {
+ xfs_qm_dqdestroy(dqp);
+ dqp = dqp1;
+ xfs_dqlock(dqp);
+ goto dqret;
+ }
+ } else {
+ /* inode stays locked on return */
+ xfs_qm_dqdestroy(dqp);
+ return XFS_ERROR(ESRCH);
+ }
+ }
+
+ mutex_lock(&qi->qi_tree_lock);
+ error = -radix_tree_insert(tree, id, dqp);
+ if (unlikely(error)) {
+ WARN_ON(error != EEXIST);
+
+ /*
+ * Duplicate found. Just throw away the new dquot and start
+ * over.
+ */
+ mutex_unlock(&qi->qi_tree_lock);
+ trace_xfs_dqget_dup(dqp);
+ xfs_qm_dqdestroy(dqp);
+ XFS_STATS_INC(xs_qm_dquot_dups);
+ goto restart;
+ }
+
+ /*
+ * We return a locked dquot to the caller, with a reference taken
+ */
+ xfs_dqlock(dqp);
+ dqp->q_nrefs = 1;
+
+ qi->qi_dquots++;
+ mutex_unlock(&qi->qi_tree_lock);
+
+ dqret:
+ ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ trace_xfs_dqget_miss(dqp);
+ *O_dqpp = dqp;
+ return (0);
+}
+
+/*
+ * Release a reference to the dquot (decrement ref-count) and unlock it.
+ *
+ * If there is a group quota attached to this dquot, carefully release that
+ * too without tripping over deadlocks'n'stuff.
+ */
+void
+xfs_qm_dqput(
+ struct xfs_dquot *dqp)
+{
+ ASSERT(dqp->q_nrefs > 0);
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+ trace_xfs_dqput(dqp);
+
+ if (--dqp->q_nrefs == 0) {
+ struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
+ trace_xfs_dqput_free(dqp);
+
+ if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
+ XFS_STATS_INC(xs_qm_dquot_unused);
+ }
+ xfs_dqunlock(dqp);
+}
+
+/*
+ * Release a dquot. Flush it if dirty, then dqput() it.
+ * dquot must not be locked.
+ */
+void
+xfs_qm_dqrele(
+ xfs_dquot_t *dqp)
+{
+ if (!dqp)
+ return;
+
+ trace_xfs_dqrele(dqp);
+
+ xfs_dqlock(dqp);
+ /*
+ * We don't care to flush it if the dquot is dirty here.
+ * That will create stutters that we want to avoid.
+ * Instead we do a delayed write when we try to reclaim
+ * a dirty dquot. Also xfs_sync will take part of the burden...
+ */
+ xfs_qm_dqput(dqp);
+}
+
+/*
+ * This is the dquot flushing I/O completion routine. It is called
+ * from interrupt level when the buffer containing the dquot is
+ * flushed to disk. It is responsible for removing the dquot logitem
+ * from the AIL if it has not been re-logged, and unlocking the dquot's
+ * flush lock. This behavior is very similar to that of inodes..
+ */
+STATIC void
+xfs_qm_dqflush_done(
+ struct xfs_buf *bp,
+ struct xfs_log_item *lip)
+{
+ xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip;
+ xfs_dquot_t *dqp = qip->qli_dquot;
+ struct xfs_ail *ailp = lip->li_ailp;
+
+ /*
+ * We only want to pull the item from the AIL if its
+ * location in the log has not changed since we started the flush.
+ * Thus, we only bother if the dquot's lsn has
+ * not changed. First we check the lsn outside the lock
+ * since it's cheaper, and then we recheck while
+ * holding the lock before removing the dquot from the AIL.
+ */
+ if ((lip->li_flags & XFS_LI_IN_AIL) &&
+ lip->li_lsn == qip->qli_flush_lsn) {
+
+ /* xfs_trans_ail_delete() drops the AIL lock. */
+ spin_lock(&ailp->xa_lock);
+ if (lip->li_lsn == qip->qli_flush_lsn)
+ xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
+ else
+ spin_unlock(&ailp->xa_lock);
+ }
+
+ /*
+ * Release the dq's flush lock since we're done with it.
+ */
+ xfs_dqfunlock(dqp);
+}
+
+/*
+ * Write a modified dquot to disk.
+ * The dquot must be locked and the flush lock too taken by caller.
+ * The flush lock will not be unlocked until the dquot reaches the disk,
+ * but the dquot is free to be unlocked and modified by the caller
+ * in the interim. Dquot is still locked on return. This behavior is
+ * identical to that of inodes.
+ */
+int
+xfs_qm_dqflush(
+ struct xfs_dquot *dqp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = dqp->q_mount;
+ struct xfs_buf *bp;
+ struct xfs_disk_dquot *ddqp;
+ int error;
+
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ ASSERT(!completion_done(&dqp->q_flush));
+
+ trace_xfs_dqflush(dqp);
+
+ *bpp = NULL;
+
+ xfs_qm_dqunpin_wait(dqp);
+
+ /*
+ * This may have been unpinned because the filesystem is shutting
+ * down forcibly. If that's the case we must not write this dquot
+ * to disk, because the log record didn't make it to disk.
+ *
+ * We also have to remove the log item from the AIL in this case,
+ * as we wait for an emptry AIL as part of the unmount process.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ struct xfs_log_item *lip = &dqp->q_logitem.qli_item;
+ dqp->dq_flags &= ~XFS_DQ_DIRTY;
+
+ spin_lock(&mp->m_ail->xa_lock);
+ if (lip->li_flags & XFS_LI_IN_AIL)
+ xfs_trans_ail_delete(mp->m_ail, lip,
+ SHUTDOWN_CORRUPT_INCORE);
+ else
+ spin_unlock(&mp->m_ail->xa_lock);
+ error = XFS_ERROR(EIO);
+ goto out_unlock;
+ }
+
+ /*
+ * Get the buffer containing the on-disk dquot
+ */
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * Calculate the location of the dquot inside the buffer.
+ */
+ ddqp = bp->b_addr + dqp->q_bufoffset;
+
+ /*
+ * A simple sanity check in case we got a corrupted dquot..
+ */
+ error = xfs_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
+ XFS_QMOPT_DOWARN, "dqflush (incore copy)");
+ if (error) {
+ xfs_buf_relse(bp);
+ xfs_dqfunlock(dqp);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return XFS_ERROR(EIO);
+ }
+
+ /* This is the only portion of data that needs to persist */
+ memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
+
+ /*
+ * Clear the dirty field and remember the flush lsn for later use.
+ */
+ dqp->dq_flags &= ~XFS_DQ_DIRTY;
+
+ xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
+ &dqp->q_logitem.qli_item.li_lsn);
+
+ /*
+ * copy the lsn into the on-disk dquot now while we have the in memory
+ * dquot here. This can't be done later in the write verifier as we
+ * can't get access to the log item at that point in time.
+ *
+ * We also calculate the CRC here so that the on-disk dquot in the
+ * buffer always has a valid CRC. This ensures there is no possibility
+ * of a dquot without an up-to-date CRC getting to disk.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp;
+
+ dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
+ xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
+
+ /*
+ * Attach an iodone routine so that we can remove this dquot from the
+ * AIL and release the flush lock once the dquot is synced to disk.
+ */
+ xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done,
+ &dqp->q_logitem.qli_item);
+
+ /*
+ * If the buffer is pinned then push on the log so we won't
+ * get stuck waiting in the write for too long.
+ */
+ if (xfs_buf_ispinned(bp)) {
+ trace_xfs_dqflush_force(dqp);
+ xfs_log_force(mp, 0);
+ }
+
+ trace_xfs_dqflush_done(dqp);
+ *bpp = bp;
+ return 0;
+
+out_unlock:
+ xfs_dqfunlock(dqp);
+ return XFS_ERROR(EIO);
+}
+
+/*
+ * Lock two xfs_dquot structures.
+ *
+ * To avoid deadlocks we always lock the quota structure with
+ * the lowerd id first.
+ */
+void
+xfs_dqlock2(
+ xfs_dquot_t *d1,
+ xfs_dquot_t *d2)
+{
+ if (d1 && d2) {
+ ASSERT(d1 != d2);
+ if (be32_to_cpu(d1->q_core.d_id) >
+ be32_to_cpu(d2->q_core.d_id)) {
+ mutex_lock(&d2->q_qlock);
+ mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED);
+ } else {
+ mutex_lock(&d1->q_qlock);
+ mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED);
+ }
+ } else if (d1) {
+ mutex_lock(&d1->q_qlock);
+ } else if (d2) {
+ mutex_lock(&d2->q_qlock);
+ }
+}
+
+int __init
+xfs_qm_init(void)
+{
+ xfs_qm_dqzone =
+ kmem_zone_init(sizeof(struct xfs_dquot), "xfs_dquot");
+ if (!xfs_qm_dqzone)
+ goto out;
+
+ xfs_qm_dqtrxzone =
+ kmem_zone_init(sizeof(struct xfs_dquot_acct), "xfs_dqtrx");
+ if (!xfs_qm_dqtrxzone)
+ goto out_free_dqzone;
+
+ return 0;
+
+out_free_dqzone:
+ kmem_zone_destroy(xfs_qm_dqzone);
+out:
+ return -ENOMEM;
+}
+
+void
+xfs_qm_exit(void)
+{
+ kmem_zone_destroy(xfs_qm_dqtrxzone);
+ kmem_zone_destroy(xfs_qm_dqzone);
+}
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
new file mode 100644
index 00000000000..68a68f70483
--- /dev/null
+++ b/fs/xfs/xfs_dquot.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_DQUOT_H__
+#define __XFS_DQUOT_H__
+
+/*
+ * Dquots are structures that hold quota information about a user or a group,
+ * much like inodes are for files. In fact, dquots share many characteristics
+ * with inodes. However, dquots can also be a centralized resource, relative
+ * to a collection of inodes. In this respect, dquots share some characteristics
+ * of the superblock.
+ * XFS dquots exploit both those in its algorithms. They make every attempt
+ * to not be a bottleneck when quotas are on and have minimal impact, if any,
+ * when quotas are off.
+ */
+
+struct xfs_mount;
+struct xfs_trans;
+
+enum {
+ XFS_QLOWSP_1_PCNT = 0,
+ XFS_QLOWSP_3_PCNT,
+ XFS_QLOWSP_5_PCNT,
+ XFS_QLOWSP_MAX
+};
+
+/*
+ * The incore dquot structure
+ */
+typedef struct xfs_dquot {
+ uint dq_flags; /* various flags (XFS_DQ_*) */
+ struct list_head q_lru; /* global free list of dquots */
+ struct xfs_mount*q_mount; /* filesystem this relates to */
+ struct xfs_trans*q_transp; /* trans this belongs to currently */
+ uint q_nrefs; /* # active refs from inodes */
+ xfs_daddr_t q_blkno; /* blkno of dquot buffer */
+ int q_bufoffset; /* off of dq in buffer (# dquots) */
+ xfs_fileoff_t q_fileoffset; /* offset in quotas file */
+
+ xfs_disk_dquot_t q_core; /* actual usage & quotas */
+ xfs_dq_logitem_t q_logitem; /* dquot log item */
+ xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
+ xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */
+ xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
+ xfs_qcnt_t q_prealloc_lo_wmark;/* prealloc throttle wmark */
+ xfs_qcnt_t q_prealloc_hi_wmark;/* prealloc disabled wmark */
+ int64_t q_low_space[XFS_QLOWSP_MAX];
+ struct mutex q_qlock; /* quota lock */
+ struct completion q_flush; /* flush completion queue */
+ atomic_t q_pincount; /* dquot pin count */
+ wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
+} xfs_dquot_t;
+
+/*
+ * Lock hierarchy for q_qlock:
+ * XFS_QLOCK_NORMAL is the implicit default,
+ * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
+ */
+enum {
+ XFS_QLOCK_NORMAL = 0,
+ XFS_QLOCK_NESTED,
+};
+
+/*
+ * Manage the q_flush completion queue embedded in the dquot. This completion
+ * queue synchronizes processes attempting to flush the in-core dquot back to
+ * disk.
+ */
+static inline void xfs_dqflock(xfs_dquot_t *dqp)
+{
+ wait_for_completion(&dqp->q_flush);
+}
+
+static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp)
+{
+ return try_wait_for_completion(&dqp->q_flush);
+}
+
+static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
+{
+ complete(&dqp->q_flush);
+}
+
+static inline int xfs_dqlock_nowait(struct xfs_dquot *dqp)
+{
+ return mutex_trylock(&dqp->q_qlock);
+}
+
+static inline void xfs_dqlock(struct xfs_dquot *dqp)
+{
+ mutex_lock(&dqp->q_qlock);
+}
+
+static inline void xfs_dqunlock(struct xfs_dquot *dqp)
+{
+ mutex_unlock(&dqp->q_qlock);
+}
+
+static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
+{
+ switch (type & XFS_DQ_ALLTYPES) {
+ case XFS_DQ_USER:
+ return XFS_IS_UQUOTA_ON(mp);
+ case XFS_DQ_GROUP:
+ return XFS_IS_GQUOTA_ON(mp);
+ case XFS_DQ_PROJ:
+ return XFS_IS_PQUOTA_ON(mp);
+ default:
+ return 0;
+ }
+}
+
+static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
+{
+ switch (type & XFS_DQ_ALLTYPES) {
+ case XFS_DQ_USER:
+ return ip->i_udquot;
+ case XFS_DQ_GROUP:
+ return ip->i_gdquot;
+ case XFS_DQ_PROJ:
+ return ip->i_pdquot;
+ default:
+ return NULL;
+ }
+}
+
+#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
+#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
+#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
+#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
+#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP)
+
+extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
+ uint, struct xfs_dquot **);
+extern void xfs_qm_dqdestroy(xfs_dquot_t *);
+extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **);
+extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
+extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
+ xfs_disk_dquot_t *);
+extern void xfs_qm_adjust_dqlimits(struct xfs_mount *,
+ struct xfs_dquot *);
+extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
+ xfs_dqid_t, uint, uint, xfs_dquot_t **);
+extern void xfs_qm_dqput(xfs_dquot_t *);
+
+extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
+
+extern void xfs_dquot_set_prealloc_limits(struct xfs_dquot *);
+
+static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
+{
+ xfs_dqlock(dqp);
+ dqp->q_nrefs++;
+ xfs_dqunlock(dqp);
+ return dqp;
+}
+
+#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c
new file mode 100644
index 00000000000..c2ac0c611ad
--- /dev/null
+++ b/fs/xfs/xfs_dquot_buf.c
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_qm.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+
+int
+xfs_calc_dquots_per_chunk(
+ unsigned int nbblks) /* basic block units */
+{
+ unsigned int ndquots;
+
+ ASSERT(nbblks > 0);
+ ndquots = BBTOB(nbblks);
+ do_div(ndquots, sizeof(xfs_dqblk_t));
+
+ return ndquots;
+}
+
+/*
+ * Do some primitive error checking on ondisk dquot data structures.
+ */
+int
+xfs_dqcheck(
+ struct xfs_mount *mp,
+ xfs_disk_dquot_t *ddq,
+ xfs_dqid_t id,
+ uint type, /* used only when IO_dorepair is true */
+ uint flags,
+ char *str)
+{
+ xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
+ int errs = 0;
+
+ /*
+ * We can encounter an uninitialized dquot buffer for 2 reasons:
+ * 1. If we crash while deleting the quotainode(s), and those blks got
+ * used for user data. This is because we take the path of regular
+ * file deletion; however, the size field of quotainodes is never
+ * updated, so all the tricks that we play in itruncate_finish
+ * don't quite matter.
+ *
+ * 2. We don't play the quota buffers when there's a quotaoff logitem.
+ * But the allocation will be replayed so we'll end up with an
+ * uninitialized quota block.
+ *
+ * This is all fine; things are still consistent, and we haven't lost
+ * any quota information. Just don't complain about bad dquot blks.
+ */
+ if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
+ str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
+ errs++;
+ }
+ if (ddq->d_version != XFS_DQUOT_VERSION) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
+ str, id, ddq->d_version, XFS_DQUOT_VERSION);
+ errs++;
+ }
+
+ if (ddq->d_flags != XFS_DQ_USER &&
+ ddq->d_flags != XFS_DQ_PROJ &&
+ ddq->d_flags != XFS_DQ_GROUP) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
+ str, id, ddq->d_flags);
+ errs++;
+ }
+
+ if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : ondisk-dquot 0x%p, ID mismatch: "
+ "0x%x expected, found id 0x%x",
+ str, ddq, id, be32_to_cpu(ddq->d_id));
+ errs++;
+ }
+
+ if (!errs && ddq->d_id) {
+ if (ddq->d_blk_softlimit &&
+ be64_to_cpu(ddq->d_bcount) >
+ be64_to_cpu(ddq->d_blk_softlimit)) {
+ if (!ddq->d_btimer) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
+ str, (int)be32_to_cpu(ddq->d_id), ddq);
+ errs++;
+ }
+ }
+ if (ddq->d_ino_softlimit &&
+ be64_to_cpu(ddq->d_icount) >
+ be64_to_cpu(ddq->d_ino_softlimit)) {
+ if (!ddq->d_itimer) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
+ str, (int)be32_to_cpu(ddq->d_id), ddq);
+ errs++;
+ }
+ }
+ if (ddq->d_rtb_softlimit &&
+ be64_to_cpu(ddq->d_rtbcount) >
+ be64_to_cpu(ddq->d_rtb_softlimit)) {
+ if (!ddq->d_rtbtimer) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
+ str, (int)be32_to_cpu(ddq->d_id), ddq);
+ errs++;
+ }
+ }
+ }
+
+ if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
+ return errs;
+
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
+
+ /*
+ * Typically, a repair is only requested by quotacheck.
+ */
+ ASSERT(id != -1);
+ ASSERT(flags & XFS_QMOPT_DQREPAIR);
+ memset(d, 0, sizeof(xfs_dqblk_t));
+
+ d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+ d->dd_diskdq.d_flags = type;
+ d->dd_diskdq.d_id = cpu_to_be32(id);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+ xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
+
+ return errs;
+}
+
+STATIC bool
+xfs_dquot_buf_verify_crc(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp)
+{
+ struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
+ int ndquots;
+ int i;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return true;
+
+ /*
+ * if we are in log recovery, the quota subsystem has not been
+ * initialised so we have no quotainfo structure. In that case, we need
+ * to manually calculate the number of dquots in the buffer.
+ */
+ if (mp->m_quotainfo)
+ ndquots = mp->m_quotainfo->qi_dqperchunk;
+ else
+ ndquots = xfs_calc_dquots_per_chunk(
+ XFS_BB_TO_FSB(mp, bp->b_length));
+
+ for (i = 0; i < ndquots; i++, d++) {
+ if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF))
+ return false;
+ if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ }
+ return true;
+}
+
+STATIC bool
+xfs_dquot_buf_verify(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp)
+{
+ struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
+ xfs_dqid_t id = 0;
+ int ndquots;
+ int i;
+
+ /*
+ * if we are in log recovery, the quota subsystem has not been
+ * initialised so we have no quotainfo structure. In that case, we need
+ * to manually calculate the number of dquots in the buffer.
+ */
+ if (mp->m_quotainfo)
+ ndquots = mp->m_quotainfo->qi_dqperchunk;
+ else
+ ndquots = xfs_calc_dquots_per_chunk(bp->b_length);
+
+ /*
+ * On the first read of the buffer, verify that each dquot is valid.
+ * We don't know what the id of the dquot is supposed to be, just that
+ * they should be increasing monotonically within the buffer. If the
+ * first id is corrupt, then it will fail on the second dquot in the
+ * buffer so corruptions could point to the wrong dquot in this case.
+ */
+ for (i = 0; i < ndquots; i++) {
+ struct xfs_disk_dquot *ddq;
+ int error;
+
+ ddq = &d[i].dd_diskdq;
+
+ if (i == 0)
+ id = be32_to_cpu(ddq->d_id);
+
+ error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
+ "xfs_dquot_buf_verify");
+ if (error)
+ return false;
+ }
+ return true;
+}
+
+static void
+xfs_dquot_buf_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (!xfs_dquot_buf_verify_crc(mp, bp))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_dquot_buf_verify(mp, bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
+}
+
+/*
+ * we don't calculate the CRC here as that is done when the dquot is flushed to
+ * the buffer after the update is done. This ensures that the dquot in the
+ * buffer always has an up-to-date CRC value.
+ */
+static void
+xfs_dquot_buf_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (!xfs_dquot_buf_verify(mp, bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+}
+
+const struct xfs_buf_ops xfs_dquot_buf_ops = {
+ .verify_read = xfs_dquot_buf_read_verify,
+ .verify_write = xfs_dquot_buf_write_verify,
+};
+
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
new file mode 100644
index 00000000000..f33fbaaa4d8
--- /dev/null
+++ b/fs/xfs/xfs_dquot_item.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_qm.h"
+#include "xfs_log.h"
+
+static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_dq_logitem, qli_item);
+}
+
+/*
+ * returns the number of iovecs needed to log the given dquot item.
+ */
+STATIC void
+xfs_qm_dquot_logitem_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 2;
+ *nbytes += sizeof(struct xfs_dq_logformat) +
+ sizeof(struct xfs_disk_dquot);
+}
+
+/*
+ * fills in the vector of log iovecs for the given dquot log item.
+ */
+STATIC void
+xfs_qm_dquot_logitem_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+ struct xfs_dq_logformat *qlf;
+
+ qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT);
+ qlf->qlf_type = XFS_LI_DQUOT;
+ qlf->qlf_size = 2;
+ qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id);
+ qlf->qlf_blkno = qlip->qli_dquot->q_blkno;
+ qlf->qlf_len = 1;
+ qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset;
+ xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat));
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT,
+ &qlip->qli_dquot->q_core,
+ sizeof(struct xfs_disk_dquot));
+}
+
+/*
+ * Increment the pin count of the given dquot.
+ */
+STATIC void
+xfs_qm_dquot_logitem_pin(
+ struct xfs_log_item *lip)
+{
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
+
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ atomic_inc(&dqp->q_pincount);
+}
+
+/*
+ * Decrement the pin count of the given dquot, and wake up
+ * anyone in xfs_dqwait_unpin() if the count goes to 0. The
+ * dquot must have been previously pinned with a call to
+ * xfs_qm_dquot_logitem_pin().
+ */
+STATIC void
+xfs_qm_dquot_logitem_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
+
+ ASSERT(atomic_read(&dqp->q_pincount) > 0);
+ if (atomic_dec_and_test(&dqp->q_pincount))
+ wake_up(&dqp->q_pinwait);
+}
+
+STATIC xfs_lsn_t
+xfs_qm_dquot_logitem_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ /*
+ * We always re-log the entire dquot when it becomes dirty,
+ * so, the latest copy _is_ the only one that matters.
+ */
+ return lsn;
+}
+
+/*
+ * This is called to wait for the given dquot to be unpinned.
+ * Most of these pin/unpin routines are plagiarized from inode code.
+ */
+void
+xfs_qm_dqunpin_wait(
+ struct xfs_dquot *dqp)
+{
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ if (atomic_read(&dqp->q_pincount) == 0)
+ return;
+
+ /*
+ * Give the log a push so we don't wait here too long.
+ */
+ xfs_log_force(dqp->q_mount, 0);
+ wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
+}
+
+STATIC uint
+xfs_qm_dquot_logitem_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list) __releases(&lip->li_ailp->xa_lock)
+ __acquires(&lip->li_ailp->xa_lock)
+{
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
+ struct xfs_buf *bp = NULL;
+ uint rval = XFS_ITEM_SUCCESS;
+ int error;
+
+ if (atomic_read(&dqp->q_pincount) > 0)
+ return XFS_ITEM_PINNED;
+
+ if (!xfs_dqlock_nowait(dqp))
+ return XFS_ITEM_LOCKED;
+
+ /*
+ * Re-check the pincount now that we stabilized the value by
+ * taking the quota lock.
+ */
+ if (atomic_read(&dqp->q_pincount) > 0) {
+ rval = XFS_ITEM_PINNED;
+ goto out_unlock;
+ }
+
+ /*
+ * Someone else is already flushing the dquot. Nothing we can do
+ * here but wait for the flush to finish and remove the item from
+ * the AIL.
+ */
+ if (!xfs_dqflock_nowait(dqp)) {
+ rval = XFS_ITEM_FLUSHING;
+ goto out_unlock;
+ }
+
+ spin_unlock(&lip->li_ailp->xa_lock);
+
+ error = xfs_qm_dqflush(dqp, &bp);
+ if (error) {
+ xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
+ __func__, error, dqp);
+ } else {
+ if (!xfs_buf_delwri_queue(bp, buffer_list))
+ rval = XFS_ITEM_FLUSHING;
+ xfs_buf_relse(bp);
+ }
+
+ spin_lock(&lip->li_ailp->xa_lock);
+out_unlock:
+ xfs_dqunlock(dqp);
+ return rval;
+}
+
+/*
+ * Unlock the dquot associated with the log item.
+ * Clear the fields of the dquot and dquot log item that
+ * are specific to the current transaction. If the
+ * hold flags is set, do not unlock the dquot.
+ */
+STATIC void
+xfs_qm_dquot_logitem_unlock(
+ struct xfs_log_item *lip)
+{
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
+
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+ /*
+ * Clear the transaction pointer in the dquot
+ */
+ dqp->q_transp = NULL;
+
+ /*
+ * dquots are never 'held' from getting unlocked at the end of
+ * a transaction. Their locking and unlocking is hidden inside the
+ * transaction layer, within trans_commit. Hence, no LI_HOLD flag
+ * for the logitem.
+ */
+ xfs_dqunlock(dqp);
+}
+
+/*
+ * this needs to stamp an lsn into the dquot, I think.
+ * rpc's that look at user dquot's would then have to
+ * push on the dependency recorded in the dquot
+ */
+STATIC void
+xfs_qm_dquot_logitem_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+}
+
+/*
+ * This is the ops vector for dquots
+ */
+static const struct xfs_item_ops xfs_dquot_item_ops = {
+ .iop_size = xfs_qm_dquot_logitem_size,
+ .iop_format = xfs_qm_dquot_logitem_format,
+ .iop_pin = xfs_qm_dquot_logitem_pin,
+ .iop_unpin = xfs_qm_dquot_logitem_unpin,
+ .iop_unlock = xfs_qm_dquot_logitem_unlock,
+ .iop_committed = xfs_qm_dquot_logitem_committed,
+ .iop_push = xfs_qm_dquot_logitem_push,
+ .iop_committing = xfs_qm_dquot_logitem_committing
+};
+
+/*
+ * Initialize the dquot log item for a newly allocated dquot.
+ * The dquot isn't locked at this point, but it isn't on any of the lists
+ * either, so we don't care.
+ */
+void
+xfs_qm_dquot_logitem_init(
+ struct xfs_dquot *dqp)
+{
+ struct xfs_dq_logitem *lp = &dqp->q_logitem;
+
+ xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
+ &xfs_dquot_item_ops);
+ lp->qli_dquot = dqp;
+}
+
+/*------------------ QUOTAOFF LOG ITEMS -------------------*/
+
+static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_qoff_logitem, qql_item);
+}
+
+
+/*
+ * This returns the number of iovecs needed to log the given quotaoff item.
+ * We only need 1 iovec for an quotaoff item. It just logs the
+ * quotaoff_log_format structure.
+ */
+STATIC void
+xfs_qm_qoff_logitem_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_qoff_logitem);
+}
+
+STATIC void
+xfs_qm_qoff_logitem_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+ struct xfs_qoff_logformat *qlf;
+
+ qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF);
+ qlf->qf_type = XFS_LI_QUOTAOFF;
+ qlf->qf_size = 1;
+ qlf->qf_flags = qflip->qql_flags;
+ xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem));
+}
+
+/*
+ * Pinning has no meaning for an quotaoff item, so just return.
+ */
+STATIC void
+xfs_qm_qoff_logitem_pin(
+ struct xfs_log_item *lip)
+{
+}
+
+/*
+ * Since pinning has no meaning for an quotaoff item, unpinning does
+ * not either.
+ */
+STATIC void
+xfs_qm_qoff_logitem_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+}
+
+/*
+ * There isn't much you can do to push a quotaoff item. It is simply
+ * stuck waiting for the log to be flushed to disk.
+ */
+STATIC uint
+xfs_qm_qoff_logitem_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+{
+ return XFS_ITEM_LOCKED;
+}
+
+/*
+ * Quotaoff items have no locking or pushing, so return failure
+ * so that the caller doesn't bother with us.
+ */
+STATIC void
+xfs_qm_qoff_logitem_unlock(
+ struct xfs_log_item *lip)
+{
+}
+
+/*
+ * The quotaoff-start-item is logged only once and cannot be moved in the log,
+ * so simply return the lsn at which it's been logged.
+ */
+STATIC xfs_lsn_t
+xfs_qm_qoff_logitem_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ return lsn;
+}
+
+STATIC xfs_lsn_t
+xfs_qm_qoffend_logitem_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip);
+ struct xfs_qoff_logitem *qfs = qfe->qql_start_lip;
+ struct xfs_ail *ailp = qfs->qql_item.li_ailp;
+
+ /*
+ * Delete the qoff-start logitem from the AIL.
+ * xfs_trans_ail_delete() drops the AIL lock.
+ */
+ spin_lock(&ailp->xa_lock);
+ xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
+
+ kmem_free(qfs);
+ kmem_free(qfe);
+ return (xfs_lsn_t)-1;
+}
+
+/*
+ * XXX rcc - don't know quite what to do with this. I think we can
+ * just ignore it. The only time that isn't the case is if we allow
+ * the client to somehow see that quotas have been turned off in which
+ * we can't allow that to get back until the quotaoff hits the disk.
+ * So how would that happen? Also, do we need different routines for
+ * quotaoff start and quotaoff end? I suspect the answer is yes but
+ * to be sure, I need to look at the recovery code and see how quota off
+ * recovery is handled (do we roll forward or back or do something else).
+ * If we roll forwards or backwards, then we need two separate routines,
+ * one that does nothing and one that stamps in the lsn that matters
+ * (truly makes the quotaoff irrevocable). If we do something else,
+ * then maybe we don't need two.
+ */
+STATIC void
+xfs_qm_qoff_logitem_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t commit_lsn)
+{
+}
+
+static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
+ .iop_size = xfs_qm_qoff_logitem_size,
+ .iop_format = xfs_qm_qoff_logitem_format,
+ .iop_pin = xfs_qm_qoff_logitem_pin,
+ .iop_unpin = xfs_qm_qoff_logitem_unpin,
+ .iop_unlock = xfs_qm_qoff_logitem_unlock,
+ .iop_committed = xfs_qm_qoffend_logitem_committed,
+ .iop_push = xfs_qm_qoff_logitem_push,
+ .iop_committing = xfs_qm_qoff_logitem_committing
+};
+
+/*
+ * This is the ops vector shared by all quotaoff-start log items.
+ */
+static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
+ .iop_size = xfs_qm_qoff_logitem_size,
+ .iop_format = xfs_qm_qoff_logitem_format,
+ .iop_pin = xfs_qm_qoff_logitem_pin,
+ .iop_unpin = xfs_qm_qoff_logitem_unpin,
+ .iop_unlock = xfs_qm_qoff_logitem_unlock,
+ .iop_committed = xfs_qm_qoff_logitem_committed,
+ .iop_push = xfs_qm_qoff_logitem_push,
+ .iop_committing = xfs_qm_qoff_logitem_committing
+};
+
+/*
+ * Allocate and initialize an quotaoff item of the correct quota type(s).
+ */
+struct xfs_qoff_logitem *
+xfs_qm_qoff_logitem_init(
+ struct xfs_mount *mp,
+ struct xfs_qoff_logitem *start,
+ uint flags)
+{
+ struct xfs_qoff_logitem *qf;
+
+ qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP);
+
+ xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
+ &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
+ qf->qql_item.li_mountp = mp;
+ qf->qql_start_lip = start;
+ qf->qql_flags = flags;
+ return qf;
+}
diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index 5a632531f84..502e9464634 100644
--- a/fs/xfs/quota/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
@@ -27,17 +27,12 @@ typedef struct xfs_dq_logitem {
xfs_log_item_t qli_item; /* common portion */
struct xfs_dquot *qli_dquot; /* dquot ptr */
xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
- unsigned short qli_pushbuf_flag; /* 1 bit used in push_ail */
-#ifdef DEBUG
- uint64_t qli_push_owner;
-#endif
- xfs_dq_logformat_t qli_format; /* logged structure */
} xfs_dq_logitem_t;
typedef struct xfs_qoff_logitem {
xfs_log_item_t qql_item; /* common portion */
struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
- xfs_qoff_logformat_t qql_format; /* logged structure */
+ unsigned int qql_flags;
} xfs_qoff_logitem_t;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 2a21c502401..edac5b057d2 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -16,23 +16,13 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
+#include "xfs_format.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
+#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_utils.h"
#include "xfs_error.h"
#ifdef DEBUG
@@ -53,27 +43,17 @@ xfs_error_trap(int e)
break;
if (e != xfs_etrap[i])
continue;
- cmn_err(CE_NOTE, "xfs_error_trap: error %d", e);
+ xfs_notice(NULL, "%s: error %d", __func__, e);
BUG();
break;
}
return e;
}
-#endif
-
-#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
int xfs_etest[XFS_NUM_INJECT_ERROR];
int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
-
-void
-xfs_error_test_init(void)
-{
- memset(xfs_etest, 0, sizeof(xfs_etest));
- memset(xfs_etest_fsid, 0, sizeof(xfs_etest_fsid));
- memset(xfs_etest_fsname, 0, sizeof(xfs_etest_fsname));
-}
+int xfs_error_test_active;
int
xfs_error_test(int error_tag, int *fsidp, char *expression,
@@ -82,14 +62,14 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
int i;
int64_t fsid;
- if (random() % randfactor)
+ if (prandom_u32() % randfactor)
return 0;
memcpy(&fsid, fsidp, sizeof(xfs_fsid_t));
for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) {
- cmn_err(CE_WARN,
+ xfs_warn(NULL,
"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
expression, file, line, xfs_etest_fsname[i]);
return 1;
@@ -110,201 +90,116 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
- cmn_err(CE_WARN, "XFS error tag #%d on", error_tag);
+ xfs_warn(mp, "error tag #%d on", error_tag);
return 0;
}
}
for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
if (xfs_etest[i] == 0) {
- cmn_err(CE_WARN, "Turned on XFS error tag #%d",
+ xfs_warn(mp, "Turned on XFS error tag #%d",
error_tag);
xfs_etest[i] = error_tag;
xfs_etest_fsid[i] = fsid;
len = strlen(mp->m_fsname);
xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
strcpy(xfs_etest_fsname[i], mp->m_fsname);
+ xfs_error_test_active++;
return 0;
}
}
- cmn_err(CE_WARN, "error tag overflow, too many turned on");
+ xfs_warn(mp, "error tag overflow, too many turned on");
return 1;
}
int
-xfs_errortag_clear(int error_tag, xfs_mount_t *mp)
+xfs_errortag_clearall(xfs_mount_t *mp, int loud)
{
- int i;
int64_t fsid;
+ int cleared = 0;
+ int i;
memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
- for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
- if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
- xfs_etest[i] = 0;
- xfs_etest_fsid[i] = 0LL;
- kmem_free(xfs_etest_fsname[i],
- strlen(xfs_etest_fsname[i]) + 1);
- xfs_etest_fsname[i] = NULL;
- cmn_err(CE_WARN, "Cleared XFS error tag #%d",
- error_tag);
- return 0;
- }
- }
-
- cmn_err(CE_WARN, "XFS error tag %d not on", error_tag);
-
- return 1;
-}
-
-int
-xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud)
-{
- int i;
- int cleared = 0;
for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) &&
xfs_etest[i] != 0) {
cleared = 1;
- cmn_err(CE_WARN, "Clearing XFS error tag #%d",
+ xfs_warn(mp, "Clearing XFS error tag #%d",
xfs_etest[i]);
xfs_etest[i] = 0;
xfs_etest_fsid[i] = 0LL;
- kmem_free(xfs_etest_fsname[i],
- strlen(xfs_etest_fsname[i]) + 1);
+ kmem_free(xfs_etest_fsname[i]);
xfs_etest_fsname[i] = NULL;
+ xfs_error_test_active--;
}
}
if (loud || cleared)
- cmn_err(CE_WARN,
- "Cleared all XFS error tags for filesystem \"%s\"",
- fsname);
+ xfs_warn(mp, "Cleared all XFS error tags for filesystem");
return 0;
}
-
-int
-xfs_errortag_clearall(xfs_mount_t *mp)
-{
- int64_t fsid;
-
- memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
-
- return xfs_errortag_clearall_umount(fsid, mp->m_fsname, 1);
-}
-#endif /* DEBUG || INDUCE_IO_ERROR */
-
-static void
-xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap)
-{
- if (mp != NULL) {
- char *newfmt;
- int len = 16 + mp->m_fsname_len + strlen(fmt);
-
- newfmt = kmem_alloc(len, KM_SLEEP);
- sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt);
- icmn_err(level, newfmt, ap);
- kmem_free(newfmt, len);
- } else {
- icmn_err(level, fmt, ap);
- }
-}
+#endif /* DEBUG */
void
-xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
+xfs_error_report(
+ const char *tag,
+ int level,
+ struct xfs_mount *mp,
+ const char *filename,
+ int linenum,
+ inst_t *ra)
{
- va_list ap;
+ if (level <= xfs_error_level) {
+ xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
+ "Internal error %s at line %d of file %s. Caller %pF",
+ tag, linenum, filename, ra);
- va_start(ap, fmt);
- xfs_fs_vcmn_err(level, mp, fmt, ap);
- va_end(ap);
+ xfs_stack_trace();
+ }
}
void
-xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
+xfs_corruption_error(
+ const char *tag,
+ int level,
+ struct xfs_mount *mp,
+ void *p,
+ const char *filename,
+ int linenum,
+ inst_t *ra)
{
- va_list ap;
-
-#ifdef DEBUG
- xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT;
-#endif
-
- if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
- && (level & CE_ALERT)) {
- level &= ~CE_ALERT;
- level |= CE_PANIC;
- cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG.");
- }
- va_start(ap, fmt);
- xfs_fs_vcmn_err(level, mp, fmt, ap);
- va_end(ap);
+ if (level <= xfs_error_level)
+ xfs_hex_dump(p, 64);
+ xfs_error_report(tag, level, mp, filename, linenum, ra);
+ xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
}
+/*
+ * Warnings specifically for verifier errors. Differentiate CRC vs. invalid
+ * values, and omit the stack trace unless the error level is tuned high.
+ */
void
-xfs_error_report(
- char *tag,
- int level,
- xfs_mount_t *mp,
- char *fname,
- int linenum,
- inst_t *ra)
+xfs_verifier_error(
+ struct xfs_buf *bp)
{
- if (level <= xfs_error_level) {
- xfs_cmn_err(XFS_PTAG_ERROR_REPORT,
- CE_ALERT, mp,
- "XFS internal error %s at line %d of file %s. Caller 0x%p\n",
- tag, linenum, fname, ra);
-
- xfs_stack_trace();
- }
-}
+ struct xfs_mount *mp = bp->b_target->bt_mount;
-STATIC void
-xfs_hex_dump(void *p, int length)
-{
- __uint8_t *uip = (__uint8_t*)p;
- int i;
- char sbuf[128], *s;
+ xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx",
+ bp->b_error == EFSBADCRC ? "CRC error" : "corruption",
+ __return_address, bp->b_bn);
- s = sbuf;
- *s = '\0';
- for (i=0; i<length; i++, uip++) {
- if ((i % 16) == 0) {
- if (*s != '\0')
- cmn_err(CE_ALERT, "%s\n", sbuf);
- s = sbuf;
- sprintf(s, "0x%x: ", i);
- while( *s != '\0')
- s++;
- }
- sprintf(s, "%02x ", *uip);
+ xfs_alert(mp, "Unmount and run xfs_repair");
- /*
- * the kernel sprintf is a void; user sprintf returns
- * the sprintf'ed string's length. Find the new end-
- * of-string
- */
- while( *s != '\0')
- s++;
+ if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
+ xfs_alert(mp, "First 64 bytes of corrupted metadata buffer:");
+ xfs_hex_dump(xfs_buf_offset(bp, 0), 64);
}
- cmn_err(CE_ALERT, "%s\n", sbuf);
-}
-void
-xfs_corruption_error(
- char *tag,
- int level,
- xfs_mount_t *mp,
- void *p,
- char *fname,
- int linenum,
- inst_t *ra)
-{
- if (level <= xfs_error_level)
- xfs_hex_dump(p, 16);
- xfs_error_report(tag, level, mp, fname, linenum, ra);
+ if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 26b8e709a56..c1c57d4a4b5 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -18,14 +18,6 @@
#ifndef __XFS_ERROR_H__
#define __XFS_ERROR_H__
-#define XFS_ERECOVER 1 /* Failure to recover log */
-#define XFS_ELOGSTAT 2 /* Failure to stat log in user space */
-#define XFS_ENOLOGSPACE 3 /* Reservation too large */
-#define XFS_ENOTSUP 4 /* Operation not supported */
-#define XFS_ENOLSN 5 /* Can't find the lsn you asked for */
-#define XFS_ENOTFOUND 6
-#define XFS_ENOTXFS 7 /* Not XFS filesystem */
-
#ifdef DEBUG
#define XFS_ERROR_NTRAP 10
extern int xfs_etrap[XFS_ERROR_NTRAP];
@@ -37,10 +29,12 @@ extern int xfs_error_trap(int);
struct xfs_mount;
-extern void xfs_error_report(char *tag, int level, struct xfs_mount *mp,
- char *fname, int linenum, inst_t *ra);
-extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp,
- void *p, char *fname, int linenum, inst_t *ra);
+extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
+ const char *filename, int linenum, inst_t *ra);
+extern void xfs_corruption_error(const char *tag, int level,
+ struct xfs_mount *mp, void *p, const char *filename,
+ int linenum, inst_t *ra);
+extern void xfs_verifier_error(struct xfs_buf *bp);
#define XFS_ERROR_REPORT(e, lvl, mp) \
xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
@@ -133,39 +127,27 @@ extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp,
#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10)
#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
-#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
+#ifdef DEBUG
+extern int xfs_error_test_active;
extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
-extern void xfs_error_test_init(void);
#define XFS_NUM_INJECT_ERROR 10
-
-#ifdef __ANSI_CPP__
-#define XFS_TEST_ERROR(expr, mp, tag, rf) \
- ((expr) || \
- xfs_error_test((tag), (mp)->m_fixedfsid, #expr, __LINE__, __FILE__, \
- (rf)))
-#else
#define XFS_TEST_ERROR(expr, mp, tag, rf) \
- ((expr) || \
+ ((expr) || (xfs_error_test_active && \
xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
- (rf)))
-#endif /* __ANSI_CPP__ */
+ (rf))))
-extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
-extern int xfs_errortag_clear(int error_tag, xfs_mount_t *mp);
-extern int xfs_errortag_clearall(xfs_mount_t *mp);
-extern int xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud);
+extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
+extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
#else
#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
#define xfs_errortag_add(tag, mp) (ENOSYS)
-#define xfs_errortag_clearall(mp) (ENOSYS)
-#endif /* (DEBUG || INDUCE_IO_ERROR) */
+#define xfs_errortag_clearall(mp, loud) (ENOSYS)
+#endif /* DEBUG */
/*
- * XFS panic tags -- allow a call to xfs_cmn_err() be turned into
- * a panic by setting xfs_panic_mask in a
- * sysctl. update xfs_max[XFS_PARAM] if
- * more are added.
+ * XFS panic tags -- allow a call to xfs_alert_tag() be turned into
+ * a panic by setting xfs_panic_mask in a sysctl.
*/
#define XFS_NO_PTAG 0
#define XFS_PTAG_IFLUSH 0x00000001
@@ -175,15 +157,6 @@ extern int xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud);
#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010
#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
-
-struct xfs_mount;
-/* PRINTFLIKE4 */
-extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
- char *fmt, ...);
-/* PRINTFLIKE3 */
-extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...);
-
-#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
- xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args)
+#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
#endif /* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
new file mode 100644
index 00000000000..753e467aa1a
--- /dev/null
+++ b/fs/xfs/xfs_export.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2004-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_dir2.h"
+#include "xfs_export.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_log.h"
+
+/*
+ * Note that we only accept fileids which are long enough rather than allow
+ * the parent generation number to default to zero. XFS considers zero a
+ * valid generation number not an invalid/wildcard value.
+ */
+static int xfs_fileid_length(int fileid_type)
+{
+ switch (fileid_type) {
+ case FILEID_INO32_GEN:
+ return 2;
+ case FILEID_INO32_GEN_PARENT:
+ return 4;
+ case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
+ return 3;
+ case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+ return 6;
+ }
+ return FILEID_INVALID;
+}
+
+STATIC int
+xfs_fs_encode_fh(
+ struct inode *inode,
+ __u32 *fh,
+ int *max_len,
+ struct inode *parent)
+{
+ struct fid *fid = (struct fid *)fh;
+ struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh;
+ int fileid_type;
+ int len;
+
+ /* Directories don't need their parent encoded, they have ".." */
+ if (!parent)
+ fileid_type = FILEID_INO32_GEN;
+ else
+ fileid_type = FILEID_INO32_GEN_PARENT;
+
+ /*
+ * If the the filesystem may contain 64bit inode numbers, we need
+ * to use larger file handles that can represent them.
+ *
+ * While we only allocate inodes that do not fit into 32 bits any
+ * large enough filesystem may contain them, thus the slightly
+ * confusing looking conditional below.
+ */
+ if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
+ (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
+ fileid_type |= XFS_FILEID_TYPE_64FLAG;
+
+ /*
+ * Only encode if there is enough space given. In practice
+ * this means we can't export a filesystem with 64bit inodes
+ * over NFSv2 with the subtree_check export option; the other
+ * seven combinations work. The real answer is "don't use v2".
+ */
+ len = xfs_fileid_length(fileid_type);
+ if (*max_len < len) {
+ *max_len = len;
+ return FILEID_INVALID;
+ }
+ *max_len = len;
+
+ switch (fileid_type) {
+ case FILEID_INO32_GEN_PARENT:
+ fid->i32.parent_ino = XFS_I(parent)->i_ino;
+ fid->i32.parent_gen = parent->i_generation;
+ /*FALLTHRU*/
+ case FILEID_INO32_GEN:
+ fid->i32.ino = XFS_I(inode)->i_ino;
+ fid->i32.gen = inode->i_generation;
+ break;
+ case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+ fid64->parent_ino = XFS_I(parent)->i_ino;
+ fid64->parent_gen = parent->i_generation;
+ /*FALLTHRU*/
+ case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
+ fid64->ino = XFS_I(inode)->i_ino;
+ fid64->gen = inode->i_generation;
+ break;
+ }
+
+ return fileid_type;
+}
+
+STATIC struct inode *
+xfs_nfs_get_inode(
+ struct super_block *sb,
+ u64 ino,
+ u32 generation)
+ {
+ xfs_mount_t *mp = XFS_M(sb);
+ xfs_inode_t *ip;
+ int error;
+
+ /*
+ * NFS can sometimes send requests for ino 0. Fail them gracefully.
+ */
+ if (ino == 0)
+ return ERR_PTR(-ESTALE);
+
+ /*
+ * The XFS_IGET_UNTRUSTED means that an invalid inode number is just
+ * fine and not an indication of a corrupted filesystem as clients can
+ * send invalid file handles and we have to handle it gracefully..
+ */
+ error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip);
+ if (error) {
+ /*
+ * EINVAL means the inode cluster doesn't exist anymore.
+ * This implies the filehandle is stale, so we should
+ * translate it here.
+ * We don't use ESTALE directly down the chain to not
+ * confuse applications using bulkstat that expect EINVAL.
+ */
+ if (error == EINVAL || error == ENOENT)
+ error = ESTALE;
+ return ERR_PTR(-error);
+ }
+
+ if (ip->i_d.di_gen != generation) {
+ IRELE(ip);
+ return ERR_PTR(-ESTALE);
+ }
+
+ return VFS_I(ip);
+}
+
+STATIC struct dentry *
+xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+ int fh_len, int fileid_type)
+{
+ struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid;
+ struct inode *inode = NULL;
+
+ if (fh_len < xfs_fileid_length(fileid_type))
+ return NULL;
+
+ switch (fileid_type) {
+ case FILEID_INO32_GEN_PARENT:
+ case FILEID_INO32_GEN:
+ inode = xfs_nfs_get_inode(sb, fid->i32.ino, fid->i32.gen);
+ break;
+ case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+ case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
+ inode = xfs_nfs_get_inode(sb, fid64->ino, fid64->gen);
+ break;
+ }
+
+ return d_obtain_alias(inode);
+}
+
+STATIC struct dentry *
+xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
+ int fh_len, int fileid_type)
+{
+ struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid;
+ struct inode *inode = NULL;
+
+ if (fh_len < xfs_fileid_length(fileid_type))
+ return NULL;
+
+ switch (fileid_type) {
+ case FILEID_INO32_GEN_PARENT:
+ inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino,
+ fid->i32.parent_gen);
+ break;
+ case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+ inode = xfs_nfs_get_inode(sb, fid64->parent_ino,
+ fid64->parent_gen);
+ break;
+ }
+
+ return d_obtain_alias(inode);
+}
+
+STATIC struct dentry *
+xfs_fs_get_parent(
+ struct dentry *child)
+{
+ int error;
+ struct xfs_inode *cip;
+
+ error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL);
+ if (unlikely(error))
+ return ERR_PTR(-error);
+
+ return d_obtain_alias(VFS_I(cip));
+}
+
+STATIC int
+xfs_fs_nfs_commit_metadata(
+ struct inode *inode)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_lsn_t lsn = 0;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(ip))
+ lsn = ip->i_itemp->ili_last_lsn;
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!lsn)
+ return 0;
+ return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+}
+
+const struct export_operations xfs_export_operations = {
+ .encode_fh = xfs_fs_encode_fh,
+ .fh_to_dentry = xfs_fs_fh_to_dentry,
+ .fh_to_parent = xfs_fs_fh_to_parent,
+ .get_parent = xfs_fs_get_parent,
+ .commit_metadata = xfs_fs_nfs_commit_metadata,
+};
diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/xfs_export.h
index e5b0559700a..3272b6ae7a3 100644
--- a/fs/xfs/linux-2.6/xfs_export.h
+++ b/fs/xfs/xfs_export.h
@@ -54,55 +54,19 @@
* Note, the NFS filehandle also includes an fsid portion which
* may have an inode number in it. That number is hardcoded to
* 32bits and there is no way for XFS to intercept it. In
- * practice this means when exporting an XFS filesytem with 64bit
+ * practice this means when exporting an XFS filesystem with 64bit
* inodes you should either export the mountpoint (rather than
* a subdirectory) or use the "fsid" export option.
*/
+struct xfs_fid64 {
+ u64 ino;
+ u32 gen;
+ u64 parent_ino;
+ u32 parent_gen;
+} __attribute__((packed));
+
/* This flag goes on the wire. Don't play with it. */
#define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */
-/* Calculate the length in u32 units of the fileid data */
-static inline int
-xfs_fileid_length(int hasparent, int is64)
-{
- return hasparent ? (is64 ? 6 : 4) : (is64 ? 3 : 2);
-}
-
-/*
- * Decode encoded inode information (either for the inode itself
- * or the parent) into an xfs_fid2_t structure. Advances and
- * returns the new data pointer
- */
-static inline __u32 *
-xfs_fileid_decode_fid2(__u32 *p, xfs_fid2_t *fid, int is64)
-{
- fid->fid_len = sizeof(xfs_fid2_t) - sizeof(fid->fid_len);
- fid->fid_pad = 0;
- fid->fid_ino = *p++;
-#if XFS_BIG_INUMS
- if (is64)
- fid->fid_ino |= (((__u64)(*p++)) << 32);
-#endif
- fid->fid_gen = *p++;
- return p;
-}
-
-/*
- * Encode inode information (either for the inode itself or the
- * parent) into a fileid buffer. Advances and returns the new
- * data pointer.
- */
-static inline __u32 *
-xfs_fileid_encode_inode(__u32 *p, struct inode *inode, int is64)
-{
- *p++ = (__u32)inode->i_ino;
-#if XFS_BIG_INUMS
- if (is64)
- *p++ = (__u32)(inode->i_ino >> 32);
-#endif
- *p++ = inode->i_generation;
- return p;
-}
-
#endif /* __XFS_EXPORT_H__ */
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
new file mode 100644
index 00000000000..fd22f69049d
--- /dev/null
+++ b/fs/xfs/xfs_extent_busy.c
@@ -0,0 +1,605 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2010 David Chinner.
+ * Copyright (c) 2011 Christoph Hellwig.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
+
+void
+xfs_extent_busy_insert(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ unsigned int flags)
+{
+ struct xfs_extent_busy *new;
+ struct xfs_extent_busy *busyp;
+ struct xfs_perag *pag;
+ struct rb_node **rbp;
+ struct rb_node *parent = NULL;
+
+ new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL);
+ if (!new) {
+ /*
+ * No Memory! Since it is now not possible to track the free
+ * block, make this a synchronous transaction to insure that
+ * the block is not reused before this transaction commits.
+ */
+ trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len);
+ xfs_trans_set_sync(tp);
+ return;
+ }
+
+ new->agno = agno;
+ new->bno = bno;
+ new->length = len;
+ INIT_LIST_HEAD(&new->list);
+ new->flags = flags;
+
+ /* trace before insert to be able to see failed inserts */
+ trace_xfs_extent_busy(tp->t_mountp, agno, bno, len);
+
+ pag = xfs_perag_get(tp->t_mountp, new->agno);
+ spin_lock(&pag->pagb_lock);
+ rbp = &pag->pagb_tree.rb_node;
+ while (*rbp) {
+ parent = *rbp;
+ busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);
+
+ if (new->bno < busyp->bno) {
+ rbp = &(*rbp)->rb_left;
+ ASSERT(new->bno + new->length <= busyp->bno);
+ } else if (new->bno > busyp->bno) {
+ rbp = &(*rbp)->rb_right;
+ ASSERT(bno >= busyp->bno + busyp->length);
+ } else {
+ ASSERT(0);
+ }
+ }
+
+ rb_link_node(&new->rb_node, parent, rbp);
+ rb_insert_color(&new->rb_node, &pag->pagb_tree);
+
+ list_add(&new->list, &tp->t_busy);
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+}
+
+/*
+ * Search for a busy extent within the range of the extent we are about to
+ * allocate. You need to be holding the busy extent tree lock when calling
+ * xfs_extent_busy_search(). This function returns 0 for no overlapping busy
+ * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
+ * match. This is done so that a non-zero return indicates an overlap that
+ * will require a synchronous transaction, but it can still be
+ * used to distinguish between a partial or exact match.
+ */
+int
+xfs_extent_busy_search(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len)
+{
+ struct xfs_perag *pag;
+ struct rb_node *rbp;
+ struct xfs_extent_busy *busyp;
+ int match = 0;
+
+ pag = xfs_perag_get(mp, agno);
+ spin_lock(&pag->pagb_lock);
+
+ rbp = pag->pagb_tree.rb_node;
+
+ /* find closest start bno overlap */
+ while (rbp) {
+ busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
+ if (bno < busyp->bno) {
+ /* may overlap, but exact start block is lower */
+ if (bno + len > busyp->bno)
+ match = -1;
+ rbp = rbp->rb_left;
+ } else if (bno > busyp->bno) {
+ /* may overlap, but exact start block is higher */
+ if (bno < busyp->bno + busyp->length)
+ match = -1;
+ rbp = rbp->rb_right;
+ } else {
+ /* bno matches busyp, length determines exact match */
+ match = (busyp->length == len) ? 1 : -1;
+ break;
+ }
+ }
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+ return match;
+}
+
+/*
+ * The found free extent [fbno, fend] overlaps part or all of the given busy
+ * extent. If the overlap covers the beginning, the end, or all of the busy
+ * extent, the overlapping portion can be made unbusy and used for the
+ * allocation. We can't split a busy extent because we can't modify a
+ * transaction/CIL context busy list, but we can update an entry's block
+ * number or length.
+ *
+ * Returns true if the extent can safely be reused, or false if the search
+ * needs to be restarted.
+ */
+STATIC bool
+xfs_extent_busy_update_extent(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ struct xfs_extent_busy *busyp,
+ xfs_agblock_t fbno,
+ xfs_extlen_t flen,
+ bool userdata) __releases(&pag->pagb_lock)
+ __acquires(&pag->pagb_lock)
+{
+ xfs_agblock_t fend = fbno + flen;
+ xfs_agblock_t bbno = busyp->bno;
+ xfs_agblock_t bend = bbno + busyp->length;
+
+ /*
+ * This extent is currently being discarded. Give the thread
+ * performing the discard a chance to mark the extent unbusy
+ * and retry.
+ */
+ if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
+ spin_unlock(&pag->pagb_lock);
+ delay(1);
+ spin_lock(&pag->pagb_lock);
+ return false;
+ }
+
+ /*
+ * If there is a busy extent overlapping a user allocation, we have
+ * no choice but to force the log and retry the search.
+ *
+ * Fortunately this does not happen during normal operation, but
+ * only if the filesystem is very low on space and has to dip into
+ * the AGFL for normal allocations.
+ */
+ if (userdata)
+ goto out_force_log;
+
+ if (bbno < fbno && bend > fend) {
+ /*
+ * Case 1:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +---------+
+ * fbno fend
+ */
+
+ /*
+ * We would have to split the busy extent to be able to track
+ * it correct, which we cannot do because we would have to
+ * modify the list of busy extents attached to the transaction
+ * or CIL context, which is immutable.
+ *
+ * Force out the log to clear the busy extent and retry the
+ * search.
+ */
+ goto out_force_log;
+ } else if (bbno >= fbno && bend <= fend) {
+ /*
+ * Case 2:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-----------------+
+ * fbno fend
+ *
+ * Case 3:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +--------------------------+
+ * fbno fend
+ *
+ * Case 4:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +--------------------------+
+ * fbno fend
+ *
+ * Case 5:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-----------------------------------+
+ * fbno fend
+ *
+ */
+
+ /*
+ * The busy extent is fully covered by the extent we are
+ * allocating, and can simply be removed from the rbtree.
+ * However we cannot remove it from the immutable list
+ * tracking busy extents in the transaction or CIL context,
+ * so set the length to zero to mark it invalid.
+ *
+ * We also need to restart the busy extent search from the
+ * tree root, because erasing the node can rearrange the
+ * tree topology.
+ */
+ rb_erase(&busyp->rb_node, &pag->pagb_tree);
+ busyp->length = 0;
+ return false;
+ } else if (fend < bend) {
+ /*
+ * Case 6:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +---------+
+ * fbno fend
+ *
+ * Case 7:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +------------------+
+ * fbno fend
+ *
+ */
+ busyp->bno = fend;
+ } else if (bbno < fbno) {
+ /*
+ * Case 8:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-------------+
+ * fbno fend
+ *
+ * Case 9:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +----------------------+
+ * fbno fend
+ */
+ busyp->length = fbno - busyp->bno;
+ } else {
+ ASSERT(0);
+ }
+
+ trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
+ return true;
+
+out_force_log:
+ spin_unlock(&pag->pagb_lock);
+ xfs_log_force(mp, XFS_LOG_SYNC);
+ trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
+ spin_lock(&pag->pagb_lock);
+ return false;
+}
+
+
+/*
+ * For a given extent [fbno, flen], make sure we can reuse it safely.
+ */
+void
+xfs_extent_busy_reuse(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t fbno,
+ xfs_extlen_t flen,
+ bool userdata)
+{
+ struct xfs_perag *pag;
+ struct rb_node *rbp;
+
+ ASSERT(flen > 0);
+
+ pag = xfs_perag_get(mp, agno);
+ spin_lock(&pag->pagb_lock);
+restart:
+ rbp = pag->pagb_tree.rb_node;
+ while (rbp) {
+ struct xfs_extent_busy *busyp =
+ rb_entry(rbp, struct xfs_extent_busy, rb_node);
+ xfs_agblock_t bbno = busyp->bno;
+ xfs_agblock_t bend = bbno + busyp->length;
+
+ if (fbno + flen <= bbno) {
+ rbp = rbp->rb_left;
+ continue;
+ } else if (fbno >= bend) {
+ rbp = rbp->rb_right;
+ continue;
+ }
+
+ if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
+ userdata))
+ goto restart;
+ }
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+}
+
+/*
+ * For a given extent [fbno, flen], search the busy extent list to find a
+ * subset of the extent that is not busy. If *rlen is smaller than
+ * args->minlen no suitable extent could be found, and the higher level
+ * code needs to force out the log and retry the allocation.
+ */
+void
+xfs_extent_busy_trim(
+ struct xfs_alloc_arg *args,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ xfs_agblock_t *rbno,
+ xfs_extlen_t *rlen)
+{
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ struct rb_node *rbp;
+
+ ASSERT(len > 0);
+
+ spin_lock(&args->pag->pagb_lock);
+restart:
+ fbno = bno;
+ flen = len;
+ rbp = args->pag->pagb_tree.rb_node;
+ while (rbp && flen >= args->minlen) {
+ struct xfs_extent_busy *busyp =
+ rb_entry(rbp, struct xfs_extent_busy, rb_node);
+ xfs_agblock_t fend = fbno + flen;
+ xfs_agblock_t bbno = busyp->bno;
+ xfs_agblock_t bend = bbno + busyp->length;
+
+ if (fend <= bbno) {
+ rbp = rbp->rb_left;
+ continue;
+ } else if (fbno >= bend) {
+ rbp = rbp->rb_right;
+ continue;
+ }
+
+ /*
+ * If this is a metadata allocation, try to reuse the busy
+ * extent instead of trimming the allocation.
+ */
+ if (!args->userdata &&
+ !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
+ if (!xfs_extent_busy_update_extent(args->mp, args->pag,
+ busyp, fbno, flen,
+ false))
+ goto restart;
+ continue;
+ }
+
+ if (bbno <= fbno) {
+ /* start overlap */
+
+ /*
+ * Case 1:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +---------+
+ * fbno fend
+ *
+ * Case 2:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-------------+
+ * fbno fend
+ *
+ * Case 3:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-------------+
+ * fbno fend
+ *
+ * Case 4:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-----------------+
+ * fbno fend
+ *
+ * No unbusy region in extent, return failure.
+ */
+ if (fend <= bend)
+ goto fail;
+
+ /*
+ * Case 5:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +----------------------+
+ * fbno fend
+ *
+ * Case 6:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +--------------------------+
+ * fbno fend
+ *
+ * Needs to be trimmed to:
+ * +-------+
+ * fbno fend
+ */
+ fbno = bend;
+ } else if (bend >= fend) {
+ /* end overlap */
+
+ /*
+ * Case 7:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +------------------+
+ * fbno fend
+ *
+ * Case 8:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +--------------------------+
+ * fbno fend
+ *
+ * Needs to be trimmed to:
+ * +-------+
+ * fbno fend
+ */
+ fend = bbno;
+ } else {
+ /* middle overlap */
+
+ /*
+ * Case 9:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-----------------------------------+
+ * fbno fend
+ *
+ * Can be trimmed to:
+ * +-------+ OR +-------+
+ * fbno fend fbno fend
+ *
+ * Backward allocation leads to significant
+ * fragmentation of directories, which degrades
+ * directory performance, therefore we always want to
+ * choose the option that produces forward allocation
+ * patterns.
+ * Preferring the lower bno extent will make the next
+ * request use "fend" as the start of the next
+ * allocation; if the segment is no longer busy at
+ * that point, we'll get a contiguous allocation, but
+ * even if it is still busy, we will get a forward
+ * allocation.
+ * We try to avoid choosing the segment at "bend",
+ * because that can lead to the next allocation
+ * taking the segment at "fbno", which would be a
+ * backward allocation. We only use the segment at
+ * "fbno" if it is much larger than the current
+ * requested size, because in that case there's a
+ * good chance subsequent allocations will be
+ * contiguous.
+ */
+ if (bbno - fbno >= args->maxlen) {
+ /* left candidate fits perfect */
+ fend = bbno;
+ } else if (fend - bend >= args->maxlen * 4) {
+ /* right candidate has enough free space */
+ fbno = bend;
+ } else if (bbno - fbno >= args->minlen) {
+ /* left candidate fits minimum requirement */
+ fend = bbno;
+ } else {
+ goto fail;
+ }
+ }
+
+ flen = fend - fbno;
+ }
+ spin_unlock(&args->pag->pagb_lock);
+
+ if (fbno != bno || flen != len) {
+ trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len,
+ fbno, flen);
+ }
+ *rbno = fbno;
+ *rlen = flen;
+ return;
+fail:
+ /*
+ * Return a zero extent length as failure indications. All callers
+ * re-check if the trimmed extent satisfies the minlen requirement.
+ */
+ spin_unlock(&args->pag->pagb_lock);
+ trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
+ *rbno = fbno;
+ *rlen = 0;
+}
+
+STATIC void
+xfs_extent_busy_clear_one(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ struct xfs_extent_busy *busyp)
+{
+ if (busyp->length) {
+ trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
+ busyp->length);
+ rb_erase(&busyp->rb_node, &pag->pagb_tree);
+ }
+
+ list_del_init(&busyp->list);
+ kmem_free(busyp);
+}
+
+/*
+ * Remove all extents on the passed in list from the busy extents tree.
+ * If do_discard is set skip extents that need to be discarded, and mark
+ * these as undergoing a discard operation instead.
+ */
+void
+xfs_extent_busy_clear(
+ struct xfs_mount *mp,
+ struct list_head *list,
+ bool do_discard)
+{
+ struct xfs_extent_busy *busyp, *n;
+ struct xfs_perag *pag = NULL;
+ xfs_agnumber_t agno = NULLAGNUMBER;
+
+ list_for_each_entry_safe(busyp, n, list, list) {
+ if (busyp->agno != agno) {
+ if (pag) {
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+ }
+ pag = xfs_perag_get(mp, busyp->agno);
+ spin_lock(&pag->pagb_lock);
+ agno = busyp->agno;
+ }
+
+ if (do_discard && busyp->length &&
+ !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD))
+ busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
+ else
+ xfs_extent_busy_clear_one(mp, pag, busyp);
+ }
+
+ if (pag) {
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+ }
+}
+
+/*
+ * Callback for list_sort to sort busy extents by the AG they reside in.
+ */
+int
+xfs_extent_busy_ag_cmp(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ return container_of(a, struct xfs_extent_busy, list)->agno -
+ container_of(b, struct xfs_extent_busy, list)->agno;
+}
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
new file mode 100644
index 00000000000..bfff284d2dc
--- /dev/null
+++ b/fs/xfs/xfs_extent_busy.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2010 David Chinner.
+ * Copyright (c) 2011 Christoph Hellwig.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_EXTENT_BUSY_H__
+#define __XFS_EXTENT_BUSY_H__
+
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_alloc_arg;
+
+/*
+ * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
+ * have been freed but whose transactions aren't committed to disk yet.
+ *
+ * Note that we use the transaction ID to record the transaction, not the
+ * transaction structure itself. See xfs_extent_busy_insert() for details.
+ */
+struct xfs_extent_busy {
+ struct rb_node rb_node; /* ag by-bno indexed search tree */
+ struct list_head list; /* transaction busy extent list */
+ xfs_agnumber_t agno;
+ xfs_agblock_t bno;
+ xfs_extlen_t length;
+ unsigned int flags;
+#define XFS_EXTENT_BUSY_DISCARDED 0x01 /* undergoing a discard op. */
+#define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */
+};
+
+void
+xfs_extent_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
+ xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
+
+void
+xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list,
+ bool do_discard);
+
+int
+xfs_extent_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t bno, xfs_extlen_t len);
+
+void
+xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
+
+void
+xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno,
+ xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen);
+
+int
+xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
+
+static inline void xfs_extent_busy_sort(struct list_head *list)
+{
+ list_sort(NULL, list, xfs_extent_busy_ag_cmp);
+}
+
+#endif /* __XFS_EXTENT_BUSY_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index f19282ec854..fb7a4c1ce1c 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -17,37 +17,55 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
-#include "xfs_dir.h"
-#include "xfs_dmapi.h"
+#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_trans.h"
#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
#include "xfs_extfree_item.h"
+#include "xfs_log.h"
kmem_zone_t *xfs_efi_zone;
kmem_zone_t *xfs_efd_zone;
-STATIC void xfs_efi_item_unlock(xfs_efi_log_item_t *);
-STATIC void xfs_efi_item_abort(xfs_efi_log_item_t *);
-STATIC void xfs_efd_item_abort(xfs_efd_log_item_t *);
-
+static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_efi_log_item, efi_item);
+}
void
-xfs_efi_item_free(xfs_efi_log_item_t *efip)
+xfs_efi_item_free(
+ struct xfs_efi_log_item *efip)
{
- int nexts = efip->efi_format.efi_nextents;
-
- if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
- kmem_free(efip, sizeof(xfs_efi_log_item_t) +
- (nexts - 1) * sizeof(xfs_extent_t));
- } else {
+ if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
+ kmem_free(efip);
+ else
kmem_zone_free(xfs_efi_zone, efip);
+}
+
+/*
+ * Freeing the efi requires that we remove it from the AIL if it has already
+ * been placed there. However, the EFI may not yet have been placed in the AIL
+ * when called by xfs_efi_release() from EFD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the EFI.
+ */
+STATIC void
+__xfs_efi_release(
+ struct xfs_efi_log_item *efip)
+{
+ struct xfs_ail *ailp = efip->efi_item.li_ailp;
+
+ if (atomic_dec_and_test(&efip->efi_refcount)) {
+ spin_lock(&ailp->xa_lock);
+ /* xfs_trans_ail_delete() drops the AIL lock. */
+ xfs_trans_ail_delete(ailp, &efip->efi_item,
+ SHUTDOWN_LOG_IO_ERROR);
+ xfs_efi_item_free(efip);
}
}
@@ -56,11 +74,22 @@ xfs_efi_item_free(xfs_efi_log_item_t *efip)
* We only need 1 iovec for an efi item. It just logs the efi_log_format
* structure.
*/
-/*ARGSUSED*/
-STATIC uint
-xfs_efi_item_size(xfs_efi_log_item_t *efip)
+static inline int
+xfs_efi_item_sizeof(
+ struct xfs_efi_log_item *efip)
{
- return 1;
+ return sizeof(struct xfs_efi_log_format) +
+ (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
+}
+
+STATIC void
+xfs_efi_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip));
}
/*
@@ -71,304 +100,240 @@ xfs_efi_item_size(xfs_efi_log_item_t *efip)
* slots in the efi item have been filled.
*/
STATIC void
-xfs_efi_item_format(xfs_efi_log_item_t *efip,
- xfs_log_iovec_t *log_vector)
+xfs_efi_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
{
- uint size;
+ struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
- ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents);
+ ASSERT(atomic_read(&efip->efi_next_extent) ==
+ efip->efi_format.efi_nextents);
efip->efi_format.efi_type = XFS_LI_EFI;
-
- size = sizeof(xfs_efi_log_format_t);
- size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
efip->efi_format.efi_size = 1;
- log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
- log_vector->i_len = size;
- XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFI_FORMAT);
- ASSERT(size >= sizeof(xfs_efi_log_format_t));
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT,
+ &efip->efi_format,
+ xfs_efi_item_sizeof(efip));
}
/*
* Pinning has no meaning for an efi item, so just return.
*/
-/*ARGSUSED*/
STATIC void
-xfs_efi_item_pin(xfs_efi_log_item_t *efip)
+xfs_efi_item_pin(
+ struct xfs_log_item *lip)
{
- return;
}
-
/*
- * While EFIs cannot really be pinned, the unpin operation is the
- * last place at which the EFI is manipulated during a transaction.
- * Here we coordinate with xfs_efi_cancel() to determine who gets to
- * free the EFI.
+ * While EFIs cannot really be pinned, the unpin operation is the last place at
+ * which the EFI is manipulated during a transaction. If we are being asked to
+ * remove the EFI it's because the transaction has been cancelled and by
+ * definition that means the EFI cannot be in the AIL so remove it from the
+ * transaction and free it. Otherwise coordinate with xfs_efi_release()
+ * to determine who gets to free the EFI.
*/
-/*ARGSUSED*/
STATIC void
-xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
+xfs_efi_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
{
- xfs_mount_t *mp;
- SPLDECL(s);
-
- mp = efip->efi_item.li_mountp;
- AIL_LOCK(mp, s);
- if (efip->efi_flags & XFS_EFI_CANCELED) {
- /*
- * xfs_trans_delete_ail() drops the AIL lock.
- */
- xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
- xfs_efi_item_free(efip);
- } else {
- efip->efi_flags |= XFS_EFI_COMMITTED;
- AIL_UNLOCK(mp, s);
- }
-}
+ struct xfs_efi_log_item *efip = EFI_ITEM(lip);
-/*
- * like unpin only we have to also clear the xaction descriptor
- * pointing the log item if we free the item. This routine duplicates
- * unpin because efi_flags is protected by the AIL lock. Freeing
- * the descriptor and then calling unpin would force us to drop the AIL
- * lock which would open up a race condition.
- */
-STATIC void
-xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
-{
- xfs_mount_t *mp;
- xfs_log_item_desc_t *lidp;
- SPLDECL(s);
-
- mp = efip->efi_item.li_mountp;
- AIL_LOCK(mp, s);
- if (efip->efi_flags & XFS_EFI_CANCELED) {
- /*
- * free the xaction descriptor pointing to this item
- */
- lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip);
- xfs_trans_free_item(tp, lidp);
- /*
- * pull the item off the AIL.
- * xfs_trans_delete_ail() drops the AIL lock.
- */
- xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
+ if (remove) {
+ ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
+ if (lip->li_desc)
+ xfs_trans_del_item(lip);
xfs_efi_item_free(efip);
- } else {
- efip->efi_flags |= XFS_EFI_COMMITTED;
- AIL_UNLOCK(mp, s);
+ return;
}
+ __xfs_efi_release(efip);
}
/*
- * Efi items have no locking or pushing. However, since EFIs are
- * pulled from the AIL when their corresponding EFDs are committed
- * to disk, their situation is very similar to being pinned. Return
- * XFS_ITEM_PINNED so that the caller will eventually flush the log.
- * This should help in getting the EFI out of the AIL.
+ * Efi items have no locking or pushing. However, since EFIs are pulled from
+ * the AIL when their corresponding EFDs are committed to disk, their situation
+ * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller
+ * will eventually flush the log. This should help in getting the EFI out of
+ * the AIL.
*/
-/*ARGSUSED*/
STATIC uint
-xfs_efi_item_trylock(xfs_efi_log_item_t *efip)
+xfs_efi_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
{
return XFS_ITEM_PINNED;
}
-/*
- * Efi items have no locking, so just return.
- */
-/*ARGSUSED*/
STATIC void
-xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
+xfs_efi_item_unlock(
+ struct xfs_log_item *lip)
{
- if (efip->efi_item.li_flags & XFS_LI_ABORTED)
- xfs_efi_item_abort(efip);
- return;
+ if (lip->li_flags & XFS_LI_ABORTED)
+ xfs_efi_item_free(EFI_ITEM(lip));
}
/*
- * The EFI is logged only once and cannot be moved in the log, so
- * simply return the lsn at which it's been logged. The canceled
- * flag is not paid any attention here. Checking for that is delayed
- * until the EFI is unpinned.
+ * The EFI is logged only once and cannot be moved in the log, so simply return
+ * the lsn at which it's been logged.
*/
-/*ARGSUSED*/
STATIC xfs_lsn_t
-xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efi_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
{
return lsn;
}
/*
- * This is called when the transaction logging the EFI is aborted.
- * Free up the EFI and return. No need to clean up the slot for
- * the item in the transaction. That was done by the unpin code
- * which is called prior to this routine in the abort/fs-shutdown path.
- */
-STATIC void
-xfs_efi_item_abort(xfs_efi_log_item_t *efip)
-{
- xfs_efi_item_free(efip);
-}
-
-/*
- * There isn't much you can do to push on an efi item. It is simply
- * stuck waiting for all of its corresponding efd items to be
- * committed to disk.
- */
-/*ARGSUSED*/
-STATIC void
-xfs_efi_item_push(xfs_efi_log_item_t *efip)
-{
- return;
-}
-
-/*
* The EFI dependency tracking op doesn't do squat. It can't because
* it doesn't know where the free extent is coming from. The dependency
* tracking has to be handled by the "enclosing" metadata object. For
* example, for inodes, the inode is locked throughout the extent freeing
* so the dependency should be recorded there.
*/
-/*ARGSUSED*/
STATIC void
-xfs_efi_item_committing(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efi_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
{
- return;
}
/*
* This is the ops vector shared by all efi log items.
*/
-STATIC struct xfs_item_ops xfs_efi_item_ops = {
- .iop_size = (uint(*)(xfs_log_item_t*))xfs_efi_item_size,
- .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
- xfs_efi_item_format,
- .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
- .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin,
- .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
- xfs_efi_item_unpin_remove,
- .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
- .iop_unlock = (void(*)(xfs_log_item_t*))xfs_efi_item_unlock,
- .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_efi_item_committed,
- .iop_push = (void(*)(xfs_log_item_t*))xfs_efi_item_push,
- .iop_abort = (void(*)(xfs_log_item_t*))xfs_efi_item_abort,
- .iop_pushbuf = NULL,
- .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_efi_item_committing
+static const struct xfs_item_ops xfs_efi_item_ops = {
+ .iop_size = xfs_efi_item_size,
+ .iop_format = xfs_efi_item_format,
+ .iop_pin = xfs_efi_item_pin,
+ .iop_unpin = xfs_efi_item_unpin,
+ .iop_unlock = xfs_efi_item_unlock,
+ .iop_committed = xfs_efi_item_committed,
+ .iop_push = xfs_efi_item_push,
+ .iop_committing = xfs_efi_item_committing
};
/*
* Allocate and initialize an efi item with the given number of extents.
*/
-xfs_efi_log_item_t *
-xfs_efi_init(xfs_mount_t *mp,
- uint nextents)
+struct xfs_efi_log_item *
+xfs_efi_init(
+ struct xfs_mount *mp,
+ uint nextents)
{
- xfs_efi_log_item_t *efip;
+ struct xfs_efi_log_item *efip;
uint size;
ASSERT(nextents > 0);
if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
size = (uint)(sizeof(xfs_efi_log_item_t) +
((nextents - 1) * sizeof(xfs_extent_t)));
- efip = (xfs_efi_log_item_t*)kmem_zalloc(size, KM_SLEEP);
+ efip = kmem_zalloc(size, KM_SLEEP);
} else {
- efip = (xfs_efi_log_item_t*)kmem_zone_zalloc(xfs_efi_zone,
- KM_SLEEP);
+ efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP);
}
- efip->efi_item.li_type = XFS_LI_EFI;
- efip->efi_item.li_ops = &xfs_efi_item_ops;
- efip->efi_item.li_mountp = mp;
+ xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
efip->efi_format.efi_nextents = nextents;
efip->efi_format.efi_id = (__psint_t)(void*)efip;
+ atomic_set(&efip->efi_next_extent, 0);
+ atomic_set(&efip->efi_refcount, 2);
- return (efip);
+ return efip;
}
/*
- * This is called by the efd item code below to release references to
- * the given efi item. Each efd calls this with the number of
- * extents that it has logged, and when the sum of these reaches
- * the total number of extents logged by this efi item we can free
- * the efi item.
- *
- * Freeing the efi item requires that we remove it from the AIL.
- * We'll use the AIL lock to protect our counters as well as
- * the removal from the AIL.
+ * Copy an EFI format buffer from the given buf, and into the destination
+ * EFI format structure.
+ * The given buffer can be in 32 bit or 64 bit form (which has different padding),
+ * one of which will be the native format for this kernel.
+ * It will handle the conversion of formats if necessary.
*/
-void
-xfs_efi_release(xfs_efi_log_item_t *efip,
- uint nextents)
+int
+xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
{
- xfs_mount_t *mp;
- int extents_left;
- SPLDECL(s);
-
- mp = efip->efi_item.li_mountp;
- ASSERT(efip->efi_next_extent > 0);
- ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
-
- AIL_LOCK(mp, s);
- ASSERT(efip->efi_next_extent >= nextents);
- efip->efi_next_extent -= nextents;
- extents_left = efip->efi_next_extent;
- if (extents_left == 0) {
- /*
- * xfs_trans_delete_ail() drops the AIL lock.
- */
- xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
- xfs_efi_item_free(efip);
- } else {
- AIL_UNLOCK(mp, s);
+ xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
+ uint i;
+ uint len = sizeof(xfs_efi_log_format_t) +
+ (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);
+ uint len32 = sizeof(xfs_efi_log_format_32_t) +
+ (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t);
+ uint len64 = sizeof(xfs_efi_log_format_64_t) +
+ (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t);
+
+ if (buf->i_len == len) {
+ memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len);
+ return 0;
+ } else if (buf->i_len == len32) {
+ xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr;
+
+ dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type;
+ dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size;
+ dst_efi_fmt->efi_nextents = src_efi_fmt_32->efi_nextents;
+ dst_efi_fmt->efi_id = src_efi_fmt_32->efi_id;
+ for (i = 0; i < dst_efi_fmt->efi_nextents; i++) {
+ dst_efi_fmt->efi_extents[i].ext_start =
+ src_efi_fmt_32->efi_extents[i].ext_start;
+ dst_efi_fmt->efi_extents[i].ext_len =
+ src_efi_fmt_32->efi_extents[i].ext_len;
+ }
+ return 0;
+ } else if (buf->i_len == len64) {
+ xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr;
+
+ dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type;
+ dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size;
+ dst_efi_fmt->efi_nextents = src_efi_fmt_64->efi_nextents;
+ dst_efi_fmt->efi_id = src_efi_fmt_64->efi_id;
+ for (i = 0; i < dst_efi_fmt->efi_nextents; i++) {
+ dst_efi_fmt->efi_extents[i].ext_start =
+ src_efi_fmt_64->efi_extents[i].ext_start;
+ dst_efi_fmt->efi_extents[i].ext_len =
+ src_efi_fmt_64->efi_extents[i].ext_len;
+ }
+ return 0;
}
+ return EFSCORRUPTED;
}
/*
- * This is called when the transaction that should be committing the
- * EFD corresponding to the given EFI is aborted. The committed and
- * canceled flags are used to coordinate the freeing of the EFI and
- * the references by the transaction that committed it.
+ * This is called by the efd item code below to release references to the given
+ * efi item. Each efd calls this with the number of extents that it has
+ * logged, and when the sum of these reaches the total number of extents logged
+ * by this efi item we can free the efi item.
*/
-STATIC void
-xfs_efi_cancel(
- xfs_efi_log_item_t *efip)
+void
+xfs_efi_release(xfs_efi_log_item_t *efip,
+ uint nextents)
{
- xfs_mount_t *mp;
- SPLDECL(s);
-
- mp = efip->efi_item.li_mountp;
- AIL_LOCK(mp, s);
- if (efip->efi_flags & XFS_EFI_COMMITTED) {
- /*
- * xfs_trans_delete_ail() drops the AIL lock.
- */
- xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
- xfs_efi_item_free(efip);
- } else {
- efip->efi_flags |= XFS_EFI_CANCELED;
- AIL_UNLOCK(mp, s);
+ ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
+ if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) {
+ /* recovery needs us to drop the EFI reference, too */
+ if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
+ __xfs_efi_release(efip);
+
+ __xfs_efi_release(efip);
+ /* efip may now have been freed, do not reference it again. */
}
}
-STATIC void
-xfs_efd_item_free(xfs_efd_log_item_t *efdp)
+static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
{
- int nexts = efdp->efd_format.efd_nextents;
+ return container_of(lip, struct xfs_efd_log_item, efd_item);
+}
- if (nexts > XFS_EFD_MAX_FAST_EXTENTS) {
- kmem_free(efdp, sizeof(xfs_efd_log_item_t) +
- (nexts - 1) * sizeof(xfs_extent_t));
- } else {
+STATIC void
+xfs_efd_item_free(struct xfs_efd_log_item *efdp)
+{
+ if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
+ kmem_free(efdp);
+ else
kmem_zone_free(xfs_efd_zone, efdp);
- }
}
/*
@@ -376,11 +341,22 @@ xfs_efd_item_free(xfs_efd_log_item_t *efdp)
* We only need 1 iovec for an efd item. It just logs the efd_log_format
* structure.
*/
-/*ARGSUSED*/
-STATIC uint
-xfs_efd_item_size(xfs_efd_log_item_t *efdp)
+static inline int
+xfs_efd_item_sizeof(
+ struct xfs_efd_log_item *efdp)
+{
+ return sizeof(xfs_efd_log_format_t) +
+ (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
+}
+
+STATIC void
+xfs_efd_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
{
- return 1;
+ *nvecs += 1;
+ *nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip));
}
/*
@@ -391,76 +367,61 @@ xfs_efd_item_size(xfs_efd_log_item_t *efdp)
* slots in the efd item have been filled.
*/
STATIC void
-xfs_efd_item_format(xfs_efd_log_item_t *efdp,
- xfs_log_iovec_t *log_vector)
+xfs_efd_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
{
- uint size;
+ struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
efdp->efd_format.efd_type = XFS_LI_EFD;
-
- size = sizeof(xfs_efd_log_format_t);
- size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
efdp->efd_format.efd_size = 1;
- log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
- log_vector->i_len = size;
- XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFD_FORMAT);
- ASSERT(size >= sizeof(xfs_efd_log_format_t));
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT,
+ &efdp->efd_format,
+ xfs_efd_item_sizeof(efdp));
}
-
/*
* Pinning has no meaning for an efd item, so just return.
*/
-/*ARGSUSED*/
STATIC void
-xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
+xfs_efd_item_pin(
+ struct xfs_log_item *lip)
{
- return;
}
-
/*
* Since pinning has no meaning for an efd item, unpinning does
* not either.
*/
-/*ARGSUSED*/
-STATIC void
-xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale)
-{
- return;
-}
-
-/*ARGSUSED*/
STATIC void
-xfs_efd_item_unpin_remove(xfs_efd_log_item_t *efdp, xfs_trans_t *tp)
+xfs_efd_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
{
- return;
}
/*
- * Efd items have no locking, so just return success.
+ * There isn't much you can do to push on an efd item. It is simply stuck
+ * waiting for the log to be flushed to disk.
*/
-/*ARGSUSED*/
STATIC uint
-xfs_efd_item_trylock(xfs_efd_log_item_t *efdp)
+xfs_efd_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
{
- return XFS_ITEM_LOCKED;
+ return XFS_ITEM_PINNED;
}
-/*
- * Efd items have no locking or pushing, so return failure
- * so that the caller doesn't bother with us.
- */
-/*ARGSUSED*/
STATIC void
-xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
+xfs_efd_item_unlock(
+ struct xfs_log_item *lip)
{
- if (efdp->efd_item.li_flags & XFS_LI_ABORTED)
- xfs_efd_item_abort(efdp);
- return;
+ if (lip->li_flags & XFS_LI_ABORTED)
+ xfs_efd_item_free(EFD_ITEM(lip));
}
/*
@@ -470,15 +431,18 @@ xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
* return -1 to keep the transaction code from further referencing
* this item.
*/
-/*ARGSUSED*/
STATIC xfs_lsn_t
-xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
+xfs_efd_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
{
+ struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+
/*
* If we got a log I/O error, it's always the case that the LR with the
* EFI got unpinned and freed before the EFD got aborted.
*/
- if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0)
+ if (!(lip->li_flags & XFS_LI_ABORTED))
xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents);
xfs_efd_item_free(efdp);
@@ -486,102 +450,59 @@ xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
}
/*
- * The transaction of which this EFD is a part has been aborted.
- * Inform its companion EFI of this fact and then clean up after
- * ourselves. No need to clean up the slot for the item in the
- * transaction. That was done by the unpin code which is called
- * prior to this routine in the abort/fs-shutdown path.
- */
-STATIC void
-xfs_efd_item_abort(xfs_efd_log_item_t *efdp)
-{
- /*
- * If we got a log I/O error, it's always the case that the LR with the
- * EFI got unpinned and freed before the EFD got aborted. So don't
- * reference the EFI at all in that case.
- */
- if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0)
- xfs_efi_cancel(efdp->efd_efip);
-
- xfs_efd_item_free(efdp);
-}
-
-/*
- * There isn't much you can do to push on an efd item. It is simply
- * stuck waiting for the log to be flushed to disk.
- */
-/*ARGSUSED*/
-STATIC void
-xfs_efd_item_push(xfs_efd_log_item_t *efdp)
-{
- return;
-}
-
-/*
* The EFD dependency tracking op doesn't do squat. It can't because
* it doesn't know where the free extent is coming from. The dependency
* tracking has to be handled by the "enclosing" metadata object. For
* example, for inodes, the inode is locked throughout the extent freeing
* so the dependency should be recorded there.
*/
-/*ARGSUSED*/
STATIC void
-xfs_efd_item_committing(xfs_efd_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efd_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
{
- return;
}
/*
* This is the ops vector shared by all efd log items.
*/
-STATIC struct xfs_item_ops xfs_efd_item_ops = {
- .iop_size = (uint(*)(xfs_log_item_t*))xfs_efd_item_size,
- .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
- xfs_efd_item_format,
- .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
- .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin,
- .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
- xfs_efd_item_unpin_remove,
- .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
- .iop_unlock = (void(*)(xfs_log_item_t*))xfs_efd_item_unlock,
- .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_efd_item_committed,
- .iop_push = (void(*)(xfs_log_item_t*))xfs_efd_item_push,
- .iop_abort = (void(*)(xfs_log_item_t*))xfs_efd_item_abort,
- .iop_pushbuf = NULL,
- .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
- xfs_efd_item_committing
+static const struct xfs_item_ops xfs_efd_item_ops = {
+ .iop_size = xfs_efd_item_size,
+ .iop_format = xfs_efd_item_format,
+ .iop_pin = xfs_efd_item_pin,
+ .iop_unpin = xfs_efd_item_unpin,
+ .iop_unlock = xfs_efd_item_unlock,
+ .iop_committed = xfs_efd_item_committed,
+ .iop_push = xfs_efd_item_push,
+ .iop_committing = xfs_efd_item_committing
};
-
/*
* Allocate and initialize an efd item with the given number of extents.
*/
-xfs_efd_log_item_t *
-xfs_efd_init(xfs_mount_t *mp,
- xfs_efi_log_item_t *efip,
- uint nextents)
+struct xfs_efd_log_item *
+xfs_efd_init(
+ struct xfs_mount *mp,
+ struct xfs_efi_log_item *efip,
+ uint nextents)
{
- xfs_efd_log_item_t *efdp;
+ struct xfs_efd_log_item *efdp;
uint size;
ASSERT(nextents > 0);
if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
size = (uint)(sizeof(xfs_efd_log_item_t) +
((nextents - 1) * sizeof(xfs_extent_t)));
- efdp = (xfs_efd_log_item_t*)kmem_zalloc(size, KM_SLEEP);
+ efdp = kmem_zalloc(size, KM_SLEEP);
} else {
- efdp = (xfs_efd_log_item_t*)kmem_zone_zalloc(xfs_efd_zone,
- KM_SLEEP);
+ efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP);
}
- efdp->efd_item.li_type = XFS_LI_EFD;
- efdp->efd_item.li_ops = &xfs_efd_item_ops;
- efdp->efd_item.li_mountp = mp;
+ xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
efdp->efd_efip = efip;
efdp->efd_format.efd_nextents = nextents;
efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
- return (efdp);
+ return efdp;
}
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 5bf681708fe..0ffbce32d56 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -18,65 +18,36 @@
#ifndef __XFS_EXTFREE_ITEM_H__
#define __XFS_EXTFREE_ITEM_H__
+/* kernel only EFI/EFD definitions */
+
struct xfs_mount;
struct kmem_zone;
-typedef struct xfs_extent {
- xfs_dfsbno_t ext_start;
- xfs_extlen_t ext_len;
-} xfs_extent_t;
-
-/*
- * This is the structure used to lay out an efi log item in the
- * log. The efi_extents field is a variable size array whose
- * size is given by efi_nextents.
- */
-typedef struct xfs_efi_log_format {
- unsigned short efi_type; /* efi log item type */
- unsigned short efi_size; /* size of this item */
- uint efi_nextents; /* # extents to free */
- __uint64_t efi_id; /* efi identifier */
- xfs_extent_t efi_extents[1]; /* array of extents to free */
-} xfs_efi_log_format_t;
-
-/*
- * This is the structure used to lay out an efd log item in the
- * log. The efd_extents array is a variable size array whose
- * size is given by efd_nextents;
- */
-typedef struct xfs_efd_log_format {
- unsigned short efd_type; /* efd log item type */
- unsigned short efd_size; /* size of this item */
- uint efd_nextents; /* # of extents freed */
- __uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_t efd_extents[1]; /* array of extents freed */
-} xfs_efd_log_format_t;
-
-
-#ifdef __KERNEL__
-
/*
* Max number of extents in fast allocation path.
*/
#define XFS_EFI_MAX_FAST_EXTENTS 16
/*
- * Define EFI flags.
+ * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
*/
-#define XFS_EFI_RECOVERED 0x1
-#define XFS_EFI_COMMITTED 0x2
-#define XFS_EFI_CANCELED 0x4
+#define XFS_EFI_RECOVERED 1
/*
- * This is the "extent free intention" log item. It is used
- * to log the fact that some extents need to be free. It is
- * used in conjunction with the "extent free done" log item
- * described below.
+ * This is the "extent free intention" log item. It is used to log the fact
+ * that some extents need to be free. It is used in conjunction with the
+ * "extent free done" log item described below.
+ *
+ * The EFI is reference counted so that it is not freed prior to both the EFI
+ * and EFD being committed and unpinned. This ensures that when the last
+ * reference goes away the EFI will always be in the AIL as it has been
+ * unpinned, regardless of whether the EFD is processed before or after the EFI.
*/
typedef struct xfs_efi_log_item {
xfs_log_item_t efi_item;
- uint efi_flags; /* misc flags */
- uint efi_next_extent;
+ atomic_t efi_refcount;
+ atomic_t efi_next_extent;
+ unsigned long efi_flags; /* misc flags */
xfs_efi_log_format_t efi_format;
} xfs_efi_log_item_t;
@@ -103,9 +74,8 @@ extern struct kmem_zone *xfs_efd_zone;
xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint);
xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *,
uint);
-
+int xfs_efi_copy_format(xfs_log_iovec_t *buf,
+ xfs_efi_log_format_t *dst_efi_fmt);
void xfs_efi_item_free(xfs_efi_log_item_t *);
-#endif /* __KERNEL__ */
-
#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
new file mode 100644
index 00000000000..1f66779d7a4
--- /dev/null
+++ b/fs/xfs/xfs_file.c
@@ -0,0 +1,1433 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_error.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_ioctl.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_dinode.h"
+
+#include <linux/aio.h>
+#include <linux/dcache.h>
+#include <linux/falloc.h>
+#include <linux/pagevec.h>
+
+static const struct vm_operations_struct xfs_file_vm_ops;
+
+/*
+ * Locking primitives for read and write IO paths to ensure we consistently use
+ * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
+ */
+static inline void
+xfs_rw_ilock(
+ struct xfs_inode *ip,
+ int type)
+{
+ if (type & XFS_IOLOCK_EXCL)
+ mutex_lock(&VFS_I(ip)->i_mutex);
+ xfs_ilock(ip, type);
+}
+
+static inline void
+xfs_rw_iunlock(
+ struct xfs_inode *ip,
+ int type)
+{
+ xfs_iunlock(ip, type);
+ if (type & XFS_IOLOCK_EXCL)
+ mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+
+static inline void
+xfs_rw_ilock_demote(
+ struct xfs_inode *ip,
+ int type)
+{
+ xfs_ilock_demote(ip, type);
+ if (type & XFS_IOLOCK_EXCL)
+ mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+
+/*
+ * xfs_iozero
+ *
+ * xfs_iozero clears the specified range of buffer supplied,
+ * and marks all the affected blocks as valid and modified. If
+ * an affected block is not allocated, it will be allocated. If
+ * an affected block is not completely overwritten, and is not
+ * valid before the operation, it will be read from disk before
+ * being partially zeroed.
+ */
+int
+xfs_iozero(
+ struct xfs_inode *ip, /* inode */
+ loff_t pos, /* offset in file */
+ size_t count) /* size of data to zero */
+{
+ struct page *page;
+ struct address_space *mapping;
+ int status;
+
+ mapping = VFS_I(ip)->i_mapping;
+ do {
+ unsigned offset, bytes;
+ void *fsdata;
+
+ offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+ bytes = PAGE_CACHE_SIZE - offset;
+ if (bytes > count)
+ bytes = count;
+
+ status = pagecache_write_begin(NULL, mapping, pos, bytes,
+ AOP_FLAG_UNINTERRUPTIBLE,
+ &page, &fsdata);
+ if (status)
+ break;
+
+ zero_user(page, offset, bytes);
+
+ status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
+ page, fsdata);
+ WARN_ON(status <= 0); /* can't return less than zero! */
+ pos += bytes;
+ count -= bytes;
+ status = 0;
+ } while (count);
+
+ return (-status);
+}
+
+/*
+ * Fsync operations on directories are much simpler than on regular files,
+ * as there is no file data to flush, and thus also no need for explicit
+ * cache flush operations, and there are no non-transaction metadata updates
+ * on directories either.
+ */
+STATIC int
+xfs_dir_fsync(
+ struct file *file,
+ loff_t start,
+ loff_t end,
+ int datasync)
+{
+ struct xfs_inode *ip = XFS_I(file->f_mapping->host);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_lsn_t lsn = 0;
+
+ trace_xfs_dir_fsync(ip);
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(ip))
+ lsn = ip->i_itemp->ili_last_lsn;
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!lsn)
+ return 0;
+ return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+}
+
+STATIC int
+xfs_file_fsync(
+ struct file *file,
+ loff_t start,
+ loff_t end,
+ int datasync)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ int error = 0;
+ int log_flushed = 0;
+ xfs_lsn_t lsn = 0;
+
+ trace_xfs_file_fsync(ip);
+
+ error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (error)
+ return error;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -XFS_ERROR(EIO);
+
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+
+ if (mp->m_flags & XFS_MOUNT_BARRIER) {
+ /*
+ * If we have an RT and/or log subvolume we need to make sure
+ * to flush the write cache the device used for file data
+ * first. This is to ensure newly written file data make
+ * it to disk before logging the new inode size in case of
+ * an extending write.
+ */
+ if (XFS_IS_REALTIME_INODE(ip))
+ xfs_blkdev_issue_flush(mp->m_rtdev_targp);
+ else if (mp->m_logdev_targp != mp->m_ddev_targp)
+ xfs_blkdev_issue_flush(mp->m_ddev_targp);
+ }
+
+ /*
+ * All metadata updates are logged, which means that we just have
+ * to flush the log up to the latest LSN that touched the inode.
+ */
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(ip)) {
+ if (!datasync ||
+ (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
+ lsn = ip->i_itemp->ili_last_lsn;
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (lsn)
+ error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+
+ /*
+ * If we only have a single device, and the log force about was
+ * a no-op we might have to flush the data device cache here.
+ * This can only happen for fdatasync/O_DSYNC if we were overwriting
+ * an already allocated file and thus do not have any metadata to
+ * commit.
+ */
+ if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
+ mp->m_logdev_targp == mp->m_ddev_targp &&
+ !XFS_IS_REALTIME_INODE(ip) &&
+ !log_flushed)
+ xfs_blkdev_issue_flush(mp->m_ddev_targp);
+
+ return -error;
+}
+
+STATIC ssize_t
+xfs_file_read_iter(
+ struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ size_t size = iov_iter_count(to);
+ ssize_t ret = 0;
+ int ioflags = 0;
+ xfs_fsize_t n;
+ loff_t pos = iocb->ki_pos;
+
+ XFS_STATS_INC(xs_read_calls);
+
+ if (unlikely(file->f_flags & O_DIRECT))
+ ioflags |= IO_ISDIRECT;
+ if (file->f_mode & FMODE_NOCMTIME)
+ ioflags |= IO_INVIS;
+
+ if (unlikely(ioflags & IO_ISDIRECT)) {
+ xfs_buftarg_t *target =
+ XFS_IS_REALTIME_INODE(ip) ?
+ mp->m_rtdev_targp : mp->m_ddev_targp;
+ /* DIO must be aligned to device logical sector size */
+ if ((pos | size) & target->bt_logical_sectormask) {
+ if (pos == i_size_read(inode))
+ return 0;
+ return -XFS_ERROR(EINVAL);
+ }
+ }
+
+ n = mp->m_super->s_maxbytes - pos;
+ if (n <= 0 || size == 0)
+ return 0;
+
+ if (n < size)
+ size = n;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ /*
+ * Locking is a bit tricky here. If we take an exclusive lock
+ * for direct IO, we effectively serialise all new concurrent
+ * read IO to this file and block it behind IO that is currently in
+ * progress because IO in progress holds the IO lock shared. We only
+ * need to hold the lock exclusive to blow away the page cache, so
+ * only take lock exclusively if the page cache needs invalidation.
+ * This allows the normal direct IO case of no page cache pages to
+ * proceeed concurrently without serialisation.
+ */
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+ if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) {
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+ xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+
+ if (inode->i_mapping->nrpages) {
+ ret = filemap_write_and_wait_range(
+ VFS_I(ip)->i_mapping,
+ pos, -1);
+ if (ret) {
+ xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
+ return ret;
+ }
+ truncate_pagecache_range(VFS_I(ip), pos, -1);
+ }
+ xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ }
+
+ trace_xfs_file_read(ip, size, pos, ioflags);
+
+ ret = generic_file_read_iter(iocb, to);
+ if (ret > 0)
+ XFS_STATS_ADD(xs_read_bytes, ret);
+
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_splice_read(
+ struct file *infilp,
+ loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t count,
+ unsigned int flags)
+{
+ struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
+ int ioflags = 0;
+ ssize_t ret;
+
+ XFS_STATS_INC(xs_read_calls);
+
+ if (infilp->f_mode & FMODE_NOCMTIME)
+ ioflags |= IO_INVIS;
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return -EIO;
+
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+
+ trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+
+ ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+ if (ret > 0)
+ XFS_STATS_ADD(xs_read_bytes, ret);
+
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+ return ret;
+}
+
+/*
+ * This routine is called to handle zeroing any space in the last block of the
+ * file that is beyond the EOF. We do this since the size is being increased
+ * without writing anything to that block and we don't want to read the
+ * garbage on the disk.
+ */
+STATIC int /* error (positive) */
+xfs_zero_last_block(
+ struct xfs_inode *ip,
+ xfs_fsize_t offset,
+ xfs_fsize_t isize)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize);
+ int zero_offset = XFS_B_FSB_OFFSET(mp, isize);
+ int zero_len;
+ int nimaps = 1;
+ int error = 0;
+ struct xfs_bmbt_irec imap;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ ASSERT(nimaps > 0);
+
+ /*
+ * If the block underlying isize is just a hole, then there
+ * is nothing to zero.
+ */
+ if (imap.br_startblock == HOLESTARTBLOCK)
+ return 0;
+
+ zero_len = mp->m_sb.sb_blocksize - zero_offset;
+ if (isize + zero_len > offset)
+ zero_len = offset - isize;
+ return xfs_iozero(ip, isize, zero_len);
+}
+
+/*
+ * Zero any on disk space between the current EOF and the new, larger EOF.
+ *
+ * This handles the normal case of zeroing the remainder of the last block in
+ * the file and the unusual case of zeroing blocks out beyond the size of the
+ * file. This second case only happens with fixed size extents and when the
+ * system crashes before the inode size was updated but after blocks were
+ * allocated.
+ *
+ * Expects the iolock to be held exclusive, and will take the ilock internally.
+ */
+int /* error (positive) */
+xfs_zero_eof(
+ struct xfs_inode *ip,
+ xfs_off_t offset, /* starting I/O offset */
+ xfs_fsize_t isize) /* current inode size */
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t start_zero_fsb;
+ xfs_fileoff_t end_zero_fsb;
+ xfs_fileoff_t zero_count_fsb;
+ xfs_fileoff_t last_fsb;
+ xfs_fileoff_t zero_off;
+ xfs_fsize_t zero_len;
+ int nimaps;
+ int error = 0;
+ struct xfs_bmbt_irec imap;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ ASSERT(offset > isize);
+
+ /*
+ * First handle zeroing the block on which isize resides.
+ *
+ * We only zero a part of that block so it is handled specially.
+ */
+ if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
+ error = xfs_zero_last_block(ip, offset, isize);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Calculate the range between the new size and the old where blocks
+ * needing to be zeroed may exist.
+ *
+ * To get the block where the last byte in the file currently resides,
+ * we need to subtract one from the size and truncate back to a block
+ * boundary. We subtract 1 in case the size is exactly on a block
+ * boundary.
+ */
+ last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
+ start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
+ end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
+ ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
+ if (last_fsb == end_zero_fsb) {
+ /*
+ * The size was only incremented on its last block.
+ * We took care of that above, so just return.
+ */
+ return 0;
+ }
+
+ ASSERT(start_zero_fsb <= end_zero_fsb);
+ while (start_zero_fsb <= end_zero_fsb) {
+ nimaps = 1;
+ zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
+ &imap, &nimaps, 0);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ ASSERT(nimaps > 0);
+
+ if (imap.br_state == XFS_EXT_UNWRITTEN ||
+ imap.br_startblock == HOLESTARTBLOCK) {
+ start_zero_fsb = imap.br_startoff + imap.br_blockcount;
+ ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+ continue;
+ }
+
+ /*
+ * There are blocks we need to zero.
+ */
+ zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
+ zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
+
+ if ((zero_off + zero_len) > offset)
+ zero_len = offset - zero_off;
+
+ error = xfs_iozero(ip, zero_off, zero_len);
+ if (error)
+ return error;
+
+ start_zero_fsb = imap.br_startoff + imap.br_blockcount;
+ ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+ }
+
+ return 0;
+}
+
+/*
+ * Common pre-write limit and setup checks.
+ *
+ * Called with the iolocked held either shared and exclusive according to
+ * @iolock, and returns with it held. Might upgrade the iolock to exclusive
+ * if called for a direct write beyond i_size.
+ */
+STATIC ssize_t
+xfs_file_aio_write_checks(
+ struct file *file,
+ loff_t *pos,
+ size_t *count,
+ int *iolock)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ int error = 0;
+
+restart:
+ error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
+ if (error)
+ return error;
+
+ /*
+ * If the offset is beyond the size of the file, we need to zero any
+ * blocks that fall between the existing EOF and the start of this
+ * write. If zeroing is needed and we are currently holding the
+ * iolock shared, we need to update it to exclusive which implies
+ * having to redo all checks before.
+ */
+ if (*pos > i_size_read(inode)) {
+ if (*iolock == XFS_IOLOCK_SHARED) {
+ xfs_rw_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, *iolock);
+ goto restart;
+ }
+ error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
+ if (error)
+ return error;
+ }
+
+ /*
+ * Updating the timestamps will grab the ilock again from
+ * xfs_fs_dirty_inode, so we have to call it after dropping the
+ * lock above. Eventually we should look into a way to avoid
+ * the pointless lock roundtrip.
+ */
+ if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
+ error = file_update_time(file);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If we're writing the file then make sure to clear the setuid and
+ * setgid bits if the process is not being run by root. This keeps
+ * people from modifying setuid and setgid binaries.
+ */
+ return file_remove_suid(file);
+}
+
+/*
+ * xfs_file_dio_aio_write - handle direct IO writes
+ *
+ * Lock the inode appropriately to prepare for and issue a direct IO write.
+ * By separating it from the buffered write path we remove all the tricky to
+ * follow locking changes and looping.
+ *
+ * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
+ * until we're sure the bytes at the new EOF have been zeroed and/or the cached
+ * pages are flushed out.
+ *
+ * In most cases the direct IO writes will be done holding IOLOCK_SHARED
+ * allowing them to be done in parallel with reads and other direct IO writes.
+ * However, if the IO is not aligned to filesystem blocks, the direct IO layer
+ * needs to do sub-block zeroing and that requires serialisation against other
+ * direct IOs to the same block. In this case we need to serialise the
+ * submission of the unaligned IOs so that we don't get racing block zeroing in
+ * the dio layer. To avoid the problem with aio, we also need to wait for
+ * outstanding IOs to complete so that unwritten extent conversion is completed
+ * before we try to map the overlapping block. This is currently implemented by
+ * hitting it with a big hammer (i.e. inode_dio_wait()).
+ *
+ * Returns with locks held indicated by @iolock and errors indicated by
+ * negative return values.
+ */
+STATIC ssize_t
+xfs_file_dio_aio_write(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ ssize_t ret = 0;
+ int unaligned_io = 0;
+ int iolock;
+ size_t count = iov_iter_count(from);
+ loff_t pos = iocb->ki_pos;
+ struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
+ mp->m_rtdev_targp : mp->m_ddev_targp;
+
+ /* DIO must be aligned to device logical sector size */
+ if ((pos | count) & target->bt_logical_sectormask)
+ return -XFS_ERROR(EINVAL);
+
+ /* "unaligned" here means not aligned to a filesystem block */
+ if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
+ unaligned_io = 1;
+
+ /*
+ * We don't need to take an exclusive lock unless there page cache needs
+ * to be invalidated or unaligned IO is being executed. We don't need to
+ * consider the EOF extension case here because
+ * xfs_file_aio_write_checks() will relock the inode as necessary for
+ * EOF zeroing cases and fill out the new inode size as appropriate.
+ */
+ if (unaligned_io || mapping->nrpages)
+ iolock = XFS_IOLOCK_EXCL;
+ else
+ iolock = XFS_IOLOCK_SHARED;
+ xfs_rw_ilock(ip, iolock);
+
+ /*
+ * Recheck if there are cached pages that need invalidate after we got
+ * the iolock to protect against other threads adding new pages while
+ * we were waiting for the iolock.
+ */
+ if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
+ xfs_rw_iunlock(ip, iolock);
+ iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, iolock);
+ }
+
+ ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
+ if (ret)
+ goto out;
+ iov_iter_truncate(from, count);
+
+ if (mapping->nrpages) {
+ ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ pos, -1);
+ if (ret)
+ goto out;
+ truncate_pagecache_range(VFS_I(ip), pos, -1);
+ }
+
+ /*
+ * If we are doing unaligned IO, wait for all other IO to drain,
+ * otherwise demote the lock if we had to flush cached pages
+ */
+ if (unaligned_io)
+ inode_dio_wait(inode);
+ else if (iolock == XFS_IOLOCK_EXCL) {
+ xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ iolock = XFS_IOLOCK_SHARED;
+ }
+
+ trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+ ret = generic_file_direct_write(iocb, from, pos);
+
+out:
+ xfs_rw_iunlock(ip, iolock);
+
+ /* No fallback to buffered IO on errors for XFS. */
+ ASSERT(ret < 0 || ret == count);
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_buffered_aio_write(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ ssize_t ret;
+ int enospc = 0;
+ int iolock = XFS_IOLOCK_EXCL;
+ loff_t pos = iocb->ki_pos;
+ size_t count = iov_iter_count(from);
+
+ xfs_rw_ilock(ip, iolock);
+
+ ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
+ if (ret)
+ goto out;
+
+ iov_iter_truncate(from, count);
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = mapping->backing_dev_info;
+
+write_retry:
+ trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
+ ret = generic_perform_write(file, from, pos);
+ if (likely(ret >= 0))
+ iocb->ki_pos = pos + ret;
+ /*
+ * If we just got an ENOSPC, try to write back all dirty inodes to
+ * convert delalloc space to free up some of the excess reserved
+ * metadata space.
+ */
+ if (ret == -ENOSPC && !enospc) {
+ enospc = 1;
+ xfs_flush_inodes(ip->i_mount);
+ goto write_retry;
+ }
+
+ current->backing_dev_info = NULL;
+out:
+ xfs_rw_iunlock(ip, iolock);
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_write_iter(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ ssize_t ret;
+ size_t ocount = iov_iter_count(from);
+
+ XFS_STATS_INC(xs_write_calls);
+
+ if (ocount == 0)
+ return 0;
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return -EIO;
+
+ if (unlikely(file->f_flags & O_DIRECT))
+ ret = xfs_file_dio_aio_write(iocb, from);
+ else
+ ret = xfs_file_buffered_aio_write(iocb, from);
+
+ if (ret > 0) {
+ ssize_t err;
+
+ XFS_STATS_ADD(xs_write_bytes, ret);
+
+ /* Handle various SYNC-type writes */
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+ if (err < 0)
+ ret = err;
+ }
+ return ret;
+}
+
+STATIC long
+xfs_file_fallocate(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_trans *tp;
+ long error;
+ loff_t new_size = 0;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+ return -EOPNOTSUPP;
+
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ error = xfs_free_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+ unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
+
+ if (offset & blksize_mask || len & blksize_mask) {
+ error = EINVAL;
+ goto out_unlock;
+ }
+
+ /*
+ * There is no need to overlap collapse range with EOF,
+ * in which case it is effectively a truncate operation
+ */
+ if (offset + len >= i_size_read(inode)) {
+ error = EINVAL;
+ goto out_unlock;
+ }
+
+ new_size = i_size_read(inode) - len;
+
+ error = xfs_collapse_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ } else {
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ error = -inode_newsize_ok(inode, new_size);
+ if (error)
+ goto out_unlock;
+ }
+
+ if (mode & FALLOC_FL_ZERO_RANGE)
+ error = xfs_zero_file_space(ip, offset, len);
+ else
+ error = xfs_alloc_file_space(ip, offset, len,
+ XFS_BMAPI_PREALLOC);
+ if (error)
+ goto out_unlock;
+ }
+
+ tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
+ error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto out_unlock;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ ip->i_d.di_mode &= ~S_ISUID;
+ if (ip->i_d.di_mode & S_IXGRP)
+ ip->i_d.di_mode &= ~S_ISGID;
+
+ if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)))
+ ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ if (file->f_flags & O_DSYNC)
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp, 0);
+ if (error)
+ goto out_unlock;
+
+ /* Change file size if needed */
+ if (new_size) {
+ struct iattr iattr;
+
+ iattr.ia_valid = ATTR_SIZE;
+ iattr.ia_size = new_size;
+ error = xfs_setattr_size(ip, &iattr);
+ }
+
+out_unlock:
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return -error;
+}
+
+
+STATIC int
+xfs_file_open(
+ struct inode *inode,
+ struct file *file)
+{
+ if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
+ return -EFBIG;
+ if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
+ return -EIO;
+ return 0;
+}
+
+STATIC int
+xfs_dir_open(
+ struct inode *inode,
+ struct file *file)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ int mode;
+ int error;
+
+ error = xfs_file_open(inode, file);
+ if (error)
+ return error;
+
+ /*
+ * If there are any blocks, read-ahead block 0 as we're almost
+ * certain to have the next operation be a read there.
+ */
+ mode = xfs_ilock_data_map_shared(ip);
+ if (ip->i_d.di_nextents > 0)
+ xfs_dir3_data_readahead(ip, 0, -1);
+ xfs_iunlock(ip, mode);
+ return 0;
+}
+
+STATIC int
+xfs_file_release(
+ struct inode *inode,
+ struct file *filp)
+{
+ return -xfs_release(XFS_I(inode));
+}
+
+STATIC int
+xfs_file_readdir(
+ struct file *file,
+ struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
+ xfs_inode_t *ip = XFS_I(inode);
+ int error;
+ size_t bufsize;
+
+ /*
+ * The Linux API doesn't pass down the total size of the buffer
+ * we read into down to the filesystem. With the filldir concept
+ * it's not needed for correct information, but the XFS dir2 leaf
+ * code wants an estimate of the buffer size to calculate it's
+ * readahead window and size the buffers used for mapping to
+ * physical blocks.
+ *
+ * Try to give it an estimate that's good enough, maybe at some
+ * point we can change the ->readdir prototype to include the
+ * buffer size. For now we use the current glibc buffer size.
+ */
+ bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
+
+ error = xfs_readdir(ip, ctx, bufsize);
+ if (error)
+ return -error;
+ return 0;
+}
+
+STATIC int
+xfs_file_mmap(
+ struct file *filp,
+ struct vm_area_struct *vma)
+{
+ vma->vm_ops = &xfs_file_vm_ops;
+
+ file_accessed(filp);
+ return 0;
+}
+
+/*
+ * mmap()d file has taken write protection fault and is being made
+ * writable. We can set the page state up correctly for a writable
+ * page, which means we can do correct delalloc accounting (ENOSPC
+ * checking!) and unwritten extent mapping.
+ */
+STATIC int
+xfs_vm_page_mkwrite(
+ struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ return block_page_mkwrite(vma, vmf, xfs_get_blocks);
+}
+
+/*
+ * This type is designed to indicate the type of offset we would like
+ * to search from page cache for either xfs_seek_data() or xfs_seek_hole().
+ */
+enum {
+ HOLE_OFF = 0,
+ DATA_OFF,
+};
+
+/*
+ * Lookup the desired type of offset from the given page.
+ *
+ * On success, return true and the offset argument will point to the
+ * start of the region that was found. Otherwise this function will
+ * return false and keep the offset argument unchanged.
+ */
+STATIC bool
+xfs_lookup_buffer_offset(
+ struct page *page,
+ loff_t *offset,
+ unsigned int type)
+{
+ loff_t lastoff = page_offset(page);
+ bool found = false;
+ struct buffer_head *bh, *head;
+
+ bh = head = page_buffers(page);
+ do {
+ /*
+ * Unwritten extents that have data in the page
+ * cache covering them can be identified by the
+ * BH_Unwritten state flag. Pages with multiple
+ * buffers might have a mix of holes, data and
+ * unwritten extents - any buffer with valid
+ * data in it should have BH_Uptodate flag set
+ * on it.
+ */
+ if (buffer_unwritten(bh) ||
+ buffer_uptodate(bh)) {
+ if (type == DATA_OFF)
+ found = true;
+ } else {
+ if (type == HOLE_OFF)
+ found = true;
+ }
+
+ if (found) {
+ *offset = lastoff;
+ break;
+ }
+ lastoff += bh->b_size;
+ } while ((bh = bh->b_this_page) != head);
+
+ return found;
+}
+
+/*
+ * This routine is called to find out and return a data or hole offset
+ * from the page cache for unwritten extents according to the desired
+ * type for xfs_seek_data() or xfs_seek_hole().
+ *
+ * The argument offset is used to tell where we start to search from the
+ * page cache. Map is used to figure out the end points of the range to
+ * lookup pages.
+ *
+ * Return true if the desired type of offset was found, and the argument
+ * offset is filled with that address. Otherwise, return false and keep
+ * offset unchanged.
+ */
+STATIC bool
+xfs_find_get_desired_pgoff(
+ struct inode *inode,
+ struct xfs_bmbt_irec *map,
+ unsigned int type,
+ loff_t *offset)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct pagevec pvec;
+ pgoff_t index;
+ pgoff_t end;
+ loff_t endoff;
+ loff_t startoff = *offset;
+ loff_t lastoff = startoff;
+ bool found = false;
+
+ pagevec_init(&pvec, 0);
+
+ index = startoff >> PAGE_CACHE_SHIFT;
+ endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount);
+ end = endoff >> PAGE_CACHE_SHIFT;
+ do {
+ int want;
+ unsigned nr_pages;
+ unsigned int i;
+
+ want = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
+ want);
+ /*
+ * No page mapped into given range. If we are searching holes
+ * and if this is the first time we got into the loop, it means
+ * that the given offset is landed in a hole, return it.
+ *
+ * If we have already stepped through some block buffers to find
+ * holes but they all contains data. In this case, the last
+ * offset is already updated and pointed to the end of the last
+ * mapped page, if it does not reach the endpoint to search,
+ * that means there should be a hole between them.
+ */
+ if (nr_pages == 0) {
+ /* Data search found nothing */
+ if (type == DATA_OFF)
+ break;
+
+ ASSERT(type == HOLE_OFF);
+ if (lastoff == startoff || lastoff < endoff) {
+ found = true;
+ *offset = lastoff;
+ }
+ break;
+ }
+
+ /*
+ * At lease we found one page. If this is the first time we
+ * step into the loop, and if the first page index offset is
+ * greater than the given search offset, a hole was found.
+ */
+ if (type == HOLE_OFF && lastoff == startoff &&
+ lastoff < page_offset(pvec.pages[0])) {
+ found = true;
+ break;
+ }
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ loff_t b_offset;
+
+ /*
+ * At this point, the page may be truncated or
+ * invalidated (changing page->mapping to NULL),
+ * or even swizzled back from swapper_space to tmpfs
+ * file mapping. However, page->index will not change
+ * because we have a reference on the page.
+ *
+ * Searching done if the page index is out of range.
+ * If the current offset is not reaches the end of
+ * the specified search range, there should be a hole
+ * between them.
+ */
+ if (page->index > end) {
+ if (type == HOLE_OFF && lastoff < endoff) {
+ *offset = lastoff;
+ found = true;
+ }
+ goto out;
+ }
+
+ lock_page(page);
+ /*
+ * Page truncated or invalidated(page->mapping == NULL).
+ * We can freely skip it and proceed to check the next
+ * page.
+ */
+ if (unlikely(page->mapping != inode->i_mapping)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (!page_has_buffers(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ found = xfs_lookup_buffer_offset(page, &b_offset, type);
+ if (found) {
+ /*
+ * The found offset may be less than the start
+ * point to search if this is the first time to
+ * come here.
+ */
+ *offset = max_t(loff_t, startoff, b_offset);
+ unlock_page(page);
+ goto out;
+ }
+
+ /*
+ * We either searching data but nothing was found, or
+ * searching hole but found a data buffer. In either
+ * case, probably the next page contains the desired
+ * things, update the last offset to it so.
+ */
+ lastoff = page_offset(page) + PAGE_SIZE;
+ unlock_page(page);
+ }
+
+ /*
+ * The number of returned pages less than our desired, search
+ * done. In this case, nothing was found for searching data,
+ * but we found a hole behind the last offset.
+ */
+ if (nr_pages < want) {
+ if (type == HOLE_OFF) {
+ *offset = lastoff;
+ found = true;
+ }
+ break;
+ }
+
+ index = pvec.pages[i - 1]->index + 1;
+ pagevec_release(&pvec);
+ } while (index <= end);
+
+out:
+ pagevec_release(&pvec);
+ return found;
+}
+
+STATIC loff_t
+xfs_seek_data(
+ struct file *file,
+ loff_t start)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ loff_t uninitialized_var(offset);
+ xfs_fsize_t isize;
+ xfs_fileoff_t fsbno;
+ xfs_filblks_t end;
+ uint lock;
+ int error;
+
+ lock = xfs_ilock_data_map_shared(ip);
+
+ isize = i_size_read(inode);
+ if (start >= isize) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+
+ /*
+ * Try to read extents from the first block indicated
+ * by fsbno to the end block of the file.
+ */
+ fsbno = XFS_B_TO_FSBT(mp, start);
+ end = XFS_B_TO_FSB(mp, isize);
+ for (;;) {
+ struct xfs_bmbt_irec map[2];
+ int nmap = 2;
+ unsigned int i;
+
+ error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
+ XFS_BMAPI_ENTIRE);
+ if (error)
+ goto out_unlock;
+
+ /* No extents at given offset, must be beyond EOF */
+ if (nmap == 0) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < nmap; i++) {
+ offset = max_t(loff_t, start,
+ XFS_FSB_TO_B(mp, map[i].br_startoff));
+
+ /* Landed in a data extent */
+ if (map[i].br_startblock == DELAYSTARTBLOCK ||
+ (map[i].br_state == XFS_EXT_NORM &&
+ !isnullstartblock(map[i].br_startblock)))
+ goto out;
+
+ /*
+ * Landed in an unwritten extent, try to search data
+ * from page cache.
+ */
+ if (map[i].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_find_get_desired_pgoff(inode, &map[i],
+ DATA_OFF, &offset))
+ goto out;
+ }
+ }
+
+ /*
+ * map[0] is hole or its an unwritten extent but
+ * without data in page cache. Probably means that
+ * we are reading after EOF if nothing in map[1].
+ */
+ if (nmap == 1) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+
+ ASSERT(i > 1);
+
+ /*
+ * Nothing was found, proceed to the next round of search
+ * if reading offset not beyond or hit EOF.
+ */
+ fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
+ start = XFS_FSB_TO_B(mp, fsbno);
+ if (start >= isize) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+ }
+
+out:
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+
+out_unlock:
+ xfs_iunlock(ip, lock);
+
+ if (error)
+ return -error;
+ return offset;
+}
+
+STATIC loff_t
+xfs_seek_hole(
+ struct file *file,
+ loff_t start)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ loff_t uninitialized_var(offset);
+ xfs_fsize_t isize;
+ xfs_fileoff_t fsbno;
+ xfs_filblks_t end;
+ uint lock;
+ int error;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -XFS_ERROR(EIO);
+
+ lock = xfs_ilock_data_map_shared(ip);
+
+ isize = i_size_read(inode);
+ if (start >= isize) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+
+ fsbno = XFS_B_TO_FSBT(mp, start);
+ end = XFS_B_TO_FSB(mp, isize);
+
+ for (;;) {
+ struct xfs_bmbt_irec map[2];
+ int nmap = 2;
+ unsigned int i;
+
+ error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
+ XFS_BMAPI_ENTIRE);
+ if (error)
+ goto out_unlock;
+
+ /* No extents at given offset, must be beyond EOF */
+ if (nmap == 0) {
+ error = ENXIO;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < nmap; i++) {
+ offset = max_t(loff_t, start,
+ XFS_FSB_TO_B(mp, map[i].br_startoff));
+
+ /* Landed in a hole */
+ if (map[i].br_startblock == HOLESTARTBLOCK)
+ goto out;
+
+ /*
+ * Landed in an unwritten extent, try to search hole
+ * from page cache.
+ */
+ if (map[i].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_find_get_desired_pgoff(inode, &map[i],
+ HOLE_OFF, &offset))
+ goto out;
+ }
+ }
+
+ /*
+ * map[0] contains data or its unwritten but contains
+ * data in page cache, probably means that we are
+ * reading after EOF. We should fix offset to point
+ * to the end of the file(i.e., there is an implicit
+ * hole at the end of any file).
+ */
+ if (nmap == 1) {
+ offset = isize;
+ break;
+ }
+
+ ASSERT(i > 1);
+
+ /*
+ * Both mappings contains data, proceed to the next round of
+ * search if the current reading offset not beyond or hit EOF.
+ */
+ fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
+ start = XFS_FSB_TO_B(mp, fsbno);
+ if (start >= isize) {
+ offset = isize;
+ break;
+ }
+ }
+
+out:
+ /*
+ * At this point, we must have found a hole. However, the returned
+ * offset may be bigger than the file size as it may be aligned to
+ * page boundary for unwritten extents, we need to deal with this
+ * situation in particular.
+ */
+ offset = min_t(loff_t, offset, isize);
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+
+out_unlock:
+ xfs_iunlock(ip, lock);
+
+ if (error)
+ return -error;
+ return offset;
+}
+
+STATIC loff_t
+xfs_file_llseek(
+ struct file *file,
+ loff_t offset,
+ int origin)
+{
+ switch (origin) {
+ case SEEK_END:
+ case SEEK_CUR:
+ case SEEK_SET:
+ return generic_file_llseek(file, offset, origin);
+ case SEEK_DATA:
+ return xfs_seek_data(file, offset);
+ case SEEK_HOLE:
+ return xfs_seek_hole(file, offset);
+ default:
+ return -EINVAL;
+ }
+}
+
+const struct file_operations xfs_file_operations = {
+ .llseek = xfs_file_llseek,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = xfs_file_read_iter,
+ .write_iter = xfs_file_write_iter,
+ .splice_read = xfs_file_splice_read,
+ .splice_write = iter_file_splice_write,
+ .unlocked_ioctl = xfs_file_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = xfs_file_compat_ioctl,
+#endif
+ .mmap = xfs_file_mmap,
+ .open = xfs_file_open,
+ .release = xfs_file_release,
+ .fsync = xfs_file_fsync,
+ .fallocate = xfs_file_fallocate,
+};
+
+const struct file_operations xfs_dir_file_operations = {
+ .open = xfs_dir_open,
+ .read = generic_read_dir,
+ .iterate = xfs_file_readdir,
+ .llseek = generic_file_llseek,
+ .unlocked_ioctl = xfs_file_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = xfs_file_compat_ioctl,
+#endif
+ .fsync = xfs_dir_fsync,
+};
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = xfs_vm_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
+};
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
new file mode 100644
index 00000000000..8ec81bed799
--- /dev/null
+++ b/fs/xfs/xfs_filestream.c
@@ -0,0 +1,434 @@
+/*
+ * Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * Copyright (c) 2014 Christoph Hellwig.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_ag.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inum.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_alloc.h"
+#include "xfs_mru_cache.h"
+#include "xfs_dinode.h"
+#include "xfs_filestream.h"
+#include "xfs_trace.h"
+
+struct xfs_fstrm_item {
+ struct xfs_mru_cache_elem mru;
+ struct xfs_inode *ip;
+ xfs_agnumber_t ag; /* AG in use for this directory */
+};
+
+enum xfs_fstrm_alloc {
+ XFS_PICK_USERDATA = 1,
+ XFS_PICK_LOWSPACE = 2,
+};
+
+/*
+ * Allocation group filestream associations are tracked with per-ag atomic
+ * counters. These counters allow xfs_filestream_pick_ag() to tell whether a
+ * particular AG already has active filestreams associated with it. The mount
+ * point's m_peraglock is used to protect these counters from per-ag array
+ * re-allocation during a growfs operation. When xfs_growfs_data_private() is
+ * about to reallocate the array, it calls xfs_filestream_flush() with the
+ * m_peraglock held in write mode.
+ *
+ * Since xfs_mru_cache_flush() guarantees that all the free functions for all
+ * the cache elements have finished executing before it returns, it's safe for
+ * the free functions to use the atomic counters without m_peraglock protection.
+ * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
+ * whether it was called with the m_peraglock held in read mode, write mode or
+ * not held at all. The race condition this addresses is the following:
+ *
+ * - The work queue scheduler fires and pulls a filestream directory cache
+ * element off the LRU end of the cache for deletion, then gets pre-empted.
+ * - A growfs operation grabs the m_peraglock in write mode, flushes all the
+ * remaining items from the cache and reallocates the mount point's per-ag
+ * array, resetting all the counters to zero.
+ * - The work queue thread resumes and calls the free function for the element
+ * it started cleaning up earlier. In the process it decrements the
+ * filestreams counter for an AG that now has no references.
+ *
+ * With a shrinkfs feature, the above scenario could panic the system.
+ *
+ * All other uses of the following macros should be protected by either the
+ * m_peraglock held in read mode, or the cache's internal locking exposed by the
+ * interval between a call to xfs_mru_cache_lookup() and a call to
+ * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode
+ * when new elements are added to the cache.
+ *
+ * Combined, these locking rules ensure that no associations will ever exist in
+ * the cache that reference per-ag array elements that have since been
+ * reallocated.
+ */
+int
+xfs_filestream_peek_ag(
+ xfs_mount_t *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+ int ret;
+
+ pag = xfs_perag_get(mp, agno);
+ ret = atomic_read(&pag->pagf_fstrms);
+ xfs_perag_put(pag);
+ return ret;
+}
+
+static int
+xfs_filestream_get_ag(
+ xfs_mount_t *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+ int ret;
+
+ pag = xfs_perag_get(mp, agno);
+ ret = atomic_inc_return(&pag->pagf_fstrms);
+ xfs_perag_put(pag);
+ return ret;
+}
+
+static void
+xfs_filestream_put_ag(
+ xfs_mount_t *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, agno);
+ atomic_dec(&pag->pagf_fstrms);
+ xfs_perag_put(pag);
+}
+
+static void
+xfs_fstrm_free_func(
+ struct xfs_mru_cache_elem *mru)
+{
+ struct xfs_fstrm_item *item =
+ container_of(mru, struct xfs_fstrm_item, mru);
+
+ xfs_filestream_put_ag(item->ip->i_mount, item->ag);
+
+ trace_xfs_filestream_free(item->ip, item->ag);
+
+ kmem_free(item);
+}
+
+/*
+ * Scan the AGs starting at startag looking for an AG that isn't in use and has
+ * at least minlen blocks free.
+ */
+static int
+xfs_filestream_pick_ag(
+ struct xfs_inode *ip,
+ xfs_agnumber_t startag,
+ xfs_agnumber_t *agp,
+ int flags,
+ xfs_extlen_t minlen)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_fstrm_item *item;
+ struct xfs_perag *pag;
+ xfs_extlen_t longest, free = 0, minfree, maxfree = 0;
+ xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
+ int err, trylock, nscan;
+
+ ASSERT(S_ISDIR(ip->i_d.di_mode));
+
+ /* 2% of an AG's blocks must be free for it to be chosen. */
+ minfree = mp->m_sb.sb_agblocks / 50;
+
+ ag = startag;
+ *agp = NULLAGNUMBER;
+
+ /* For the first pass, don't sleep trying to init the per-AG. */
+ trylock = XFS_ALLOC_FLAG_TRYLOCK;
+
+ for (nscan = 0; 1; nscan++) {
+ trace_xfs_filestream_scan(ip, ag);
+
+ pag = xfs_perag_get(mp, ag);
+
+ if (!pag->pagf_init) {
+ err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
+ if (err && !trylock) {
+ xfs_perag_put(pag);
+ return err;
+ }
+ }
+
+ /* Might fail sometimes during the 1st pass with trylock set. */
+ if (!pag->pagf_init)
+ goto next_ag;
+
+ /* Keep track of the AG with the most free blocks. */
+ if (pag->pagf_freeblks > maxfree) {
+ maxfree = pag->pagf_freeblks;
+ max_ag = ag;
+ }
+
+ /*
+ * The AG reference count does two things: it enforces mutual
+ * exclusion when examining the suitability of an AG in this
+ * loop, and it guards against two filestreams being established
+ * in the same AG as each other.
+ */
+ if (xfs_filestream_get_ag(mp, ag) > 1) {
+ xfs_filestream_put_ag(mp, ag);
+ goto next_ag;
+ }
+
+ longest = xfs_alloc_longest_free_extent(mp, pag);
+ if (((minlen && longest >= minlen) ||
+ (!minlen && pag->pagf_freeblks >= minfree)) &&
+ (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
+ (flags & XFS_PICK_LOWSPACE))) {
+
+ /* Break out, retaining the reference on the AG. */
+ free = pag->pagf_freeblks;
+ xfs_perag_put(pag);
+ *agp = ag;
+ break;
+ }
+
+ /* Drop the reference on this AG, it's not usable. */
+ xfs_filestream_put_ag(mp, ag);
+next_ag:
+ xfs_perag_put(pag);
+ /* Move to the next AG, wrapping to AG 0 if necessary. */
+ if (++ag >= mp->m_sb.sb_agcount)
+ ag = 0;
+
+ /* If a full pass of the AGs hasn't been done yet, continue. */
+ if (ag != startag)
+ continue;
+
+ /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */
+ if (trylock != 0) {
+ trylock = 0;
+ continue;
+ }
+
+ /* Finally, if lowspace wasn't set, set it for the 3rd pass. */
+ if (!(flags & XFS_PICK_LOWSPACE)) {
+ flags |= XFS_PICK_LOWSPACE;
+ continue;
+ }
+
+ /*
+ * Take the AG with the most free space, regardless of whether
+ * it's already in use by another filestream.
+ */
+ if (max_ag != NULLAGNUMBER) {
+ xfs_filestream_get_ag(mp, max_ag);
+ free = maxfree;
+ *agp = max_ag;
+ break;
+ }
+
+ /* take AG 0 if none matched */
+ trace_xfs_filestream_pick(ip, *agp, free, nscan);
+ *agp = 0;
+ return 0;
+ }
+
+ trace_xfs_filestream_pick(ip, *agp, free, nscan);
+
+ if (*agp == NULLAGNUMBER)
+ return 0;
+
+ err = ENOMEM;
+ item = kmem_alloc(sizeof(*item), KM_MAYFAIL);
+ if (!item)
+ goto out_put_ag;
+
+ item->ag = *agp;
+ item->ip = ip;
+
+ err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru);
+ if (err) {
+ if (err == EEXIST)
+ err = 0;
+ goto out_free_item;
+ }
+
+ return 0;
+
+out_free_item:
+ kmem_free(item);
+out_put_ag:
+ xfs_filestream_put_ag(mp, *agp);
+ return err;
+}
+
+static struct xfs_inode *
+xfs_filestream_get_parent(
+ struct xfs_inode *ip)
+{
+ struct inode *inode = VFS_I(ip), *dir = NULL;
+ struct dentry *dentry, *parent;
+
+ dentry = d_find_alias(inode);
+ if (!dentry)
+ goto out;
+
+ parent = dget_parent(dentry);
+ if (!parent)
+ goto out_dput;
+
+ dir = igrab(parent->d_inode);
+ dput(parent);
+
+out_dput:
+ dput(dentry);
+out:
+ return dir ? XFS_I(dir) : NULL;
+}
+
+/*
+ * Find the right allocation group for a file, either by finding an
+ * existing file stream or creating a new one.
+ *
+ * Returns NULLAGNUMBER in case of an error.
+ */
+xfs_agnumber_t
+xfs_filestream_lookup_ag(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_inode *pip = NULL;
+ xfs_agnumber_t startag, ag = NULLAGNUMBER;
+ struct xfs_mru_cache_elem *mru;
+
+ ASSERT(S_ISREG(ip->i_d.di_mode));
+
+ pip = xfs_filestream_get_parent(ip);
+ if (!pip)
+ goto out;
+
+ mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino);
+ if (mru) {
+ ag = container_of(mru, struct xfs_fstrm_item, mru)->ag;
+ xfs_mru_cache_done(mp->m_filestream);
+
+ trace_xfs_filestream_lookup(ip, ag);
+ goto out;
+ }
+
+ /*
+ * Set the starting AG using the rotor for inode32, otherwise
+ * use the directory inode's AG.
+ */
+ if (mp->m_flags & XFS_MOUNT_32BITINODES) {
+ xfs_agnumber_t rotorstep = xfs_rotorstep;
+ startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
+ mp->m_agfrotor = (mp->m_agfrotor + 1) %
+ (mp->m_sb.sb_agcount * rotorstep);
+ } else
+ startag = XFS_INO_TO_AGNO(mp, pip->i_ino);
+
+ if (xfs_filestream_pick_ag(pip, startag, &ag, 0, 0))
+ ag = NULLAGNUMBER;
+out:
+ IRELE(pip);
+ return ag;
+}
+
+/*
+ * Pick a new allocation group for the current file and its file stream.
+ *
+ * This is called when the allocator can't find a suitable extent in the
+ * current AG, and we have to move the stream into a new AG with more space.
+ */
+int
+xfs_filestream_new_ag(
+ struct xfs_bmalloca *ap,
+ xfs_agnumber_t *agp)
+{
+ struct xfs_inode *ip = ap->ip, *pip;
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_extlen_t minlen = ap->length;
+ xfs_agnumber_t startag = 0;
+ int flags, err = 0;
+ struct xfs_mru_cache_elem *mru;
+
+ *agp = NULLAGNUMBER;
+
+ pip = xfs_filestream_get_parent(ip);
+ if (!pip)
+ goto exit;
+
+ mru = xfs_mru_cache_remove(mp->m_filestream, pip->i_ino);
+ if (mru) {
+ struct xfs_fstrm_item *item =
+ container_of(mru, struct xfs_fstrm_item, mru);
+ startag = (item->ag + 1) % mp->m_sb.sb_agcount;
+ }
+
+ flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
+ (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0);
+
+ err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen);
+
+ /*
+ * Only free the item here so we skip over the old AG earlier.
+ */
+ if (mru)
+ xfs_fstrm_free_func(mru);
+
+ IRELE(pip);
+exit:
+ if (*agp == NULLAGNUMBER)
+ *agp = 0;
+ return err;
+}
+
+void
+xfs_filestream_deassociate(
+ struct xfs_inode *ip)
+{
+ xfs_mru_cache_delete(ip->i_mount->m_filestream, ip->i_ino);
+}
+
+int
+xfs_filestream_mount(
+ xfs_mount_t *mp)
+{
+ /*
+ * The filestream timer tunable is currently fixed within the range of
+ * one second to four minutes, with five seconds being the default. The
+ * group count is somewhat arbitrary, but it'd be nice to adhere to the
+ * timer tunable to within about 10 percent. This requires at least 10
+ * groups.
+ */
+ return xfs_mru_cache_create(&mp->m_filestream, xfs_fstrm_centisecs * 10,
+ 10, xfs_fstrm_free_func);
+}
+
+void
+xfs_filestream_unmount(
+ xfs_mount_t *mp)
+{
+ xfs_mru_cache_destroy(mp->m_filestream);
+}
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
new file mode 100644
index 00000000000..2ef43406e53
--- /dev/null
+++ b/fs/xfs/xfs_filestream.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_FILESTREAM_H__
+#define __XFS_FILESTREAM_H__
+
+struct xfs_mount;
+struct xfs_inode;
+struct xfs_bmalloca;
+
+int xfs_filestream_mount(struct xfs_mount *mp);
+void xfs_filestream_unmount(struct xfs_mount *mp);
+void xfs_filestream_deassociate(struct xfs_inode *ip);
+xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
+int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp);
+int xfs_filestream_peek_ag(struct xfs_mount *mp, xfs_agnumber_t agno);
+
+static inline int
+xfs_inode_is_filestream(
+ struct xfs_inode *ip)
+{
+ return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) ||
+ (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM);
+}
+
+#endif /* __XFS_FILESTREAM_H__ */
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h
new file mode 100644
index 00000000000..34d85aca305
--- /dev/null
+++ b/fs/xfs/xfs_format.h
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_FORMAT_H__
+#define __XFS_FORMAT_H__
+
+/*
+ * XFS On Disk Format Definitions
+ *
+ * This header file defines all the on-disk format definitions for
+ * general XFS objects. Directory and attribute related objects are defined in
+ * xfs_da_format.h, which log and log item formats are defined in
+ * xfs_log_format.h. Everything else goes here.
+ */
+
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+struct xfs_buf;
+struct xfs_ifork;
+
+/*
+ * RealTime Device format definitions
+ */
+
+/* Min and max rt extent sizes, specified in bytes */
+#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */
+#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */
+#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
+
+#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
+#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
+#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize)
+#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask)
+
+/*
+ * RT Summary and bit manipulation macros.
+ */
+#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
+#define XFS_SUMOFFSTOBLOCK(mp,s) \
+ (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
+#define XFS_SUMPTR(mp,bp,so) \
+ ((xfs_suminfo_t *)((bp)->b_addr + \
+ (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
+
+#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log)
+#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log)
+#define XFS_BITTOWORD(mp,bi) \
+ ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
+
+#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
+#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
+
+#define XFS_RTLOBIT(w) xfs_lowbit32(w)
+#define XFS_RTHIBIT(w) xfs_highbit32(w)
+
+#if XFS_BIG_BLKNOS
+#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
+#else
+#define XFS_RTBLOCKLOG(b) xfs_highbit32(b)
+#endif
+
+/*
+ * Dquot and dquot block format definitions
+ */
+#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
+#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */
+
+/*
+ * This is the main portion of the on-disk representation of quota
+ * information for a user. This is the q_core of the xfs_dquot_t that
+ * is kept in kernel memory. We pad this with some more expansion room
+ * to construct the on disk structure.
+ */
+typedef struct xfs_disk_dquot {
+ __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */
+ __u8 d_version; /* dquot version */
+ __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */
+ __be32 d_id; /* user,project,group id */
+ __be64 d_blk_hardlimit;/* absolute limit on disk blks */
+ __be64 d_blk_softlimit;/* preferred limit on disk blks */
+ __be64 d_ino_hardlimit;/* maximum # allocated inodes */
+ __be64 d_ino_softlimit;/* preferred inode limit */
+ __be64 d_bcount; /* disk blocks owned by the user */
+ __be64 d_icount; /* inodes owned by the user */
+ __be32 d_itimer; /* zero if within inode limits if not,
+ this is when we refuse service */
+ __be32 d_btimer; /* similar to above; for disk blocks */
+ __be16 d_iwarns; /* warnings issued wrt num inodes */
+ __be16 d_bwarns; /* warnings issued wrt disk blocks */
+ __be32 d_pad0; /* 64 bit align */
+ __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */
+ __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */
+ __be64 d_rtbcount; /* realtime blocks owned */
+ __be32 d_rtbtimer; /* similar to above; for RT disk blocks */
+ __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */
+ __be16 d_pad;
+} xfs_disk_dquot_t;
+
+/*
+ * This is what goes on disk. This is separated from the xfs_disk_dquot because
+ * carrying the unnecessary padding would be a waste of memory.
+ */
+typedef struct xfs_dqblk {
+ xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */
+ char dd_fill[4]; /* filling for posterity */
+
+ /*
+ * These two are only present on filesystems with the CRC bits set.
+ */
+ __be32 dd_crc; /* checksum */
+ __be64 dd_lsn; /* last modification in log */
+ uuid_t dd_uuid; /* location information */
+} xfs_dqblk_t;
+
+#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc)
+
+/*
+ * Remote symlink format and access functions.
+ */
+#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */
+
+struct xfs_dsymlink_hdr {
+ __be32 sl_magic;
+ __be32 sl_offset;
+ __be32 sl_bytes;
+ __be32 sl_crc;
+ uuid_t sl_uuid;
+ __be64 sl_owner;
+ __be64 sl_blkno;
+ __be64 sl_lsn;
+};
+
+#define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc)
+
+/*
+ * The maximum pathlen is 1024 bytes. Since the minimum file system
+ * blocksize is 512 bytes, we can get a max of 3 extents back from
+ * bmapi when crc headers are taken into account.
+ */
+#define XFS_SYMLINK_MAPS 3
+
+#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \
+ ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+ sizeof(struct xfs_dsymlink_hdr) : 0))
+
+
+/*
+ * Allocation Btree format definitions
+ *
+ * There are two on-disk btrees, one sorted by blockno and one sorted
+ * by blockcount and blockno. All blocks look the same to make the code
+ * simpler; if we have time later, we'll make the optimizations.
+ */
+#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */
+#define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */
+#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */
+#define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */
+
+/*
+ * Data record/key structure
+ */
+typedef struct xfs_alloc_rec {
+ __be32 ar_startblock; /* starting block number */
+ __be32 ar_blockcount; /* count of free blocks */
+} xfs_alloc_rec_t, xfs_alloc_key_t;
+
+typedef struct xfs_alloc_rec_incore {
+ xfs_agblock_t ar_startblock; /* starting block number */
+ xfs_extlen_t ar_blockcount; /* count of free blocks */
+} xfs_alloc_rec_incore_t;
+
+/* btree pointer type */
+typedef __be32 xfs_alloc_ptr_t;
+
+/*
+ * Block numbers in the AG:
+ * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
+ */
+#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
+#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
+
+
+/*
+ * Inode Allocation Btree format definitions
+ *
+ * There is a btree for the inode map per allocation group.
+ */
+#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
+#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */
+#define XFS_FIBT_MAGIC 0x46494254 /* 'FIBT' */
+#define XFS_FIBT_CRC_MAGIC 0x46494233 /* 'FIB3' */
+
+typedef __uint64_t xfs_inofree_t;
+#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
+#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
+#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
+#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
+
+static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
+{
+ return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
+}
+
+/*
+ * Data record structure
+ */
+typedef struct xfs_inobt_rec {
+ __be32 ir_startino; /* starting inode number */
+ __be32 ir_freecount; /* count of free inodes (set bits) */
+ __be64 ir_free; /* free inode mask */
+} xfs_inobt_rec_t;
+
+typedef struct xfs_inobt_rec_incore {
+ xfs_agino_t ir_startino; /* starting inode number */
+ __int32_t ir_freecount; /* count of free inodes (set bits) */
+ xfs_inofree_t ir_free; /* free inode mask */
+} xfs_inobt_rec_incore_t;
+
+
+/*
+ * Key structure
+ */
+typedef struct xfs_inobt_key {
+ __be32 ir_startino; /* starting inode number */
+} xfs_inobt_key_t;
+
+/* btree pointer type */
+typedef __be32 xfs_inobt_ptr_t;
+
+/*
+ * block numbers in the AG.
+ */
+#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
+#define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
+
+/*
+ * The first data block of an AG depends on whether the filesystem was formatted
+ * with the finobt feature. If so, account for the finobt reserved root btree
+ * block.
+ */
+#define XFS_PREALLOC_BLOCKS(mp) \
+ (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \
+ XFS_FIBT_BLOCK(mp) + 1 : \
+ XFS_IBT_BLOCK(mp) + 1)
+
+
+
+/*
+ * BMAP Btree format definitions
+ *
+ * This includes both the root block definition that sits inside an inode fork
+ * and the record/pointer formats for the leaf/node in the blocks.
+ */
+#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
+#define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */
+
+/*
+ * Bmap root header, on-disk form only.
+ */
+typedef struct xfs_bmdr_block {
+ __be16 bb_level; /* 0 is a leaf */
+ __be16 bb_numrecs; /* current # of data records */
+} xfs_bmdr_block_t;
+
+/*
+ * Bmap btree record and extent descriptor.
+ * l0:63 is an extent flag (value 1 indicates non-normal).
+ * l0:9-62 are startoff.
+ * l0:0-8 and l1:21-63 are startblock.
+ * l1:0-20 are blockcount.
+ */
+#define BMBT_EXNTFLAG_BITLEN 1
+#define BMBT_STARTOFF_BITLEN 54
+#define BMBT_STARTBLOCK_BITLEN 52
+#define BMBT_BLOCKCOUNT_BITLEN 21
+
+typedef struct xfs_bmbt_rec {
+ __be64 l0, l1;
+} xfs_bmbt_rec_t;
+
+typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */
+typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
+
+typedef struct xfs_bmbt_rec_host {
+ __uint64_t l0, l1;
+} xfs_bmbt_rec_host_t;
+
+/*
+ * Values and macros for delayed-allocation startblock fields.
+ */
+#define STARTBLOCKVALBITS 17
+#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20)
+#define DSTARTBLOCKMASKBITS (15 + 20)
+#define STARTBLOCKMASK \
+ (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+#define DSTARTBLOCKMASK \
+ (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+
+static inline int isnullstartblock(xfs_fsblock_t x)
+{
+ return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
+}
+
+static inline int isnulldstartblock(xfs_dfsbno_t x)
+{
+ return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
+}
+
+static inline xfs_fsblock_t nullstartblock(int k)
+{
+ ASSERT(k < (1 << STARTBLOCKVALBITS));
+ return STARTBLOCKMASK | (k);
+}
+
+static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
+{
+ return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
+}
+
+/*
+ * Possible extent formats.
+ */
+typedef enum {
+ XFS_EXTFMT_NOSTATE = 0,
+ XFS_EXTFMT_HASSTATE
+} xfs_exntfmt_t;
+
+/*
+ * Possible extent states.
+ */
+typedef enum {
+ XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
+ XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
+} xfs_exntst_t;
+
+/*
+ * Incore version of above.
+ */
+typedef struct xfs_bmbt_irec
+{
+ xfs_fileoff_t br_startoff; /* starting file offset */
+ xfs_fsblock_t br_startblock; /* starting block number */
+ xfs_filblks_t br_blockcount; /* number of blocks */
+ xfs_exntst_t br_state; /* extent state */
+} xfs_bmbt_irec_t;
+
+/*
+ * Key structure for non-leaf levels of the tree.
+ */
+typedef struct xfs_bmbt_key {
+ __be64 br_startoff; /* starting file offset */
+} xfs_bmbt_key_t, xfs_bmdr_key_t;
+
+/* btree pointer type */
+typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
+
+
+/*
+ * Generic Btree block format definitions
+ *
+ * This is a combination of the actual format used on disk for short and long
+ * format btrees. The first three fields are shared by both format, but the
+ * pointers are different and should be used with care.
+ *
+ * To get the size of the actual short or long form headers please use the size
+ * macros below. Never use sizeof(xfs_btree_block).
+ *
+ * The blkno, crc, lsn, owner and uuid fields are only available in filesystems
+ * with the crc feature bit, and all accesses to them must be conditional on
+ * that flag.
+ */
+struct xfs_btree_block {
+ __be32 bb_magic; /* magic number for block type */
+ __be16 bb_level; /* 0 is a leaf */
+ __be16 bb_numrecs; /* current # of data records */
+ union {
+ struct {
+ __be32 bb_leftsib;
+ __be32 bb_rightsib;
+
+ __be64 bb_blkno;
+ __be64 bb_lsn;
+ uuid_t bb_uuid;
+ __be32 bb_owner;
+ __le32 bb_crc;
+ } s; /* short form pointers */
+ struct {
+ __be64 bb_leftsib;
+ __be64 bb_rightsib;
+
+ __be64 bb_blkno;
+ __be64 bb_lsn;
+ uuid_t bb_uuid;
+ __be64 bb_owner;
+ __le32 bb_crc;
+ __be32 bb_pad; /* padding for alignment */
+ } l; /* long form pointers */
+ } bb_u; /* rest */
+};
+
+#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */
+#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */
+
+/* sizes of CRC enabled btree blocks */
+#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40)
+#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48)
+
+#define XFS_BTREE_SBLOCK_CRC_OFF \
+ offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
+#define XFS_BTREE_LBLOCK_CRC_OFF \
+ offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
+
+#endif /* __XFS_FORMAT_H__ */
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 14010f1fa82..d34703dbcb4 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -22,8 +22,6 @@
* SGI's XFS filesystem's major stuff (constants, structures)
*/
-#define XFS_NAME "xfs"
-
/*
* Direct I/O attribute record used with XFS_IOC_DIOINFO
* d_miniosz is the min xfer size, xfer size multiple and file seek offset
@@ -67,14 +65,16 @@ struct fsxattr {
#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */
#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */
#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */
+#define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */
+#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */
#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */
/*
* Structure for XFS_IOC_GETBMAP.
* On input, fill in bmv_offset and bmv_length of the first structure
- * to indicate the area of interest in the file, and bmv_entry with the
- * number of array elements given. The first structure is updated on
- * return to give the offset and length for the next call.
+ * to indicate the area of interest in the file, and bmv_entries with
+ * the number of array elements given back. The first structure is
+ * updated on return to give the offset and length for the next call.
*/
#ifndef HAVE_GETBMAP
struct getbmap {
@@ -113,22 +113,16 @@ struct getbmapx {
#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */
#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */
#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */
-#define BMV_IF_VALID (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC)
-#ifdef __KERNEL__
-#define BMV_IF_EXTENDED 0x40000000 /* getpmapx if set */
-#endif
+#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */
+#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */
+#define BMV_IF_VALID \
+ (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \
+ BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
-/* bmv_oflags values - returned for for each non-header segment */
+/* bmv_oflags values - returned for each non-header segment */
#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
-
-/* Convert getbmap <-> getbmapx - move fields from p1 to p2. */
-#define GETBMAP_CONVERT(p1,p2) { \
- p2.bmv_offset = p1.bmv_offset; \
- p2.bmv_block = p1.bmv_block; \
- p2.bmv_length = p1.bmv_length; \
- p2.bmv_count = p1.bmv_count; \
- p2.bmv_entries = p1.bmv_entries; }
-
+#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */
+#define BMV_OF_LAST 0x4 /* segment is the last in the file */
/*
* Structure for XFS_IOC_FSSETDM.
@@ -239,16 +233,31 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */
#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */
#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */
-
+#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */
+#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */
+#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */
+#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */
+#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */
+#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */
/*
- * Minimum and maximum sizes need for growth checks
+ * Minimum and maximum sizes need for growth checks.
+ *
+ * Block counts are in units of filesystem blocks, not basic blocks.
*/
#define XFS_MIN_AG_BLOCKS 64
-#define XFS_MIN_LOG_BLOCKS 512
-#define XFS_MAX_LOG_BLOCKS (64 * 1024)
-#define XFS_MIN_LOG_BYTES (256 * 1024)
-#define XFS_MAX_LOG_BYTES (128 * 1024 * 1024)
+#define XFS_MIN_LOG_BLOCKS 512ULL
+#define XFS_MAX_LOG_BLOCKS (1024 * 1024ULL)
+#define XFS_MIN_LOG_BYTES (10 * 1024 * 1024ULL)
+
+/* keep the maximum size under 2^31 by a small amount */
+#define XFS_MAX_LOG_BYTES \
+ ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES)
+
+/* Used for sanity checks on superblock */
+#define XFS_MAX_DBLOCKS(s) ((xfs_drfsbno_t)(s)->sb_agcount * (s)->sb_agblocks)
+#define XFS_MIN_DBLOCKS(s) ((xfs_drfsbno_t)((s)->sb_agcount - 1) * \
+ (s)->sb_agblocks + XFS_MIN_AG_BLOCKS)
/*
* Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
@@ -294,14 +303,28 @@ typedef struct xfs_bstat {
__s32 bs_extsize; /* extent size */
__s32 bs_extents; /* number of extents */
__u32 bs_gen; /* generation count */
- __u16 bs_projid; /* project id */
- unsigned char bs_pad[14]; /* pad space, unused */
+ __u16 bs_projid_lo; /* lower part of project id */
+#define bs_projid bs_projid_lo /* (previously just bs_projid) */
+ __u16 bs_forkoff; /* inode fork offset in bytes */
+ __u16 bs_projid_hi; /* higher part of project id */
+ unsigned char bs_pad[10]; /* pad space, unused */
__u32 bs_dmevmask; /* DMIG event mask */
__u16 bs_dmstate; /* DMIG state info */
__u16 bs_aextents; /* attribute number of extents */
} xfs_bstat_t;
/*
+ * Project quota id helpers (previously projid was 16bit only
+ * and using two 16bit values to hold new 32bit projid was choosen
+ * to retain compatibility with "old" filesystems).
+ */
+static inline __uint32_t
+bstat_get_projid(struct xfs_bstat *bs)
+{
+ return (__uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo;
+}
+
+/*
* The user-level BulkStat Request interface structure.
*/
typedef struct xfs_fsop_bulkreq {
@@ -332,6 +355,35 @@ typedef struct xfs_error_injection {
/*
+ * Speculative preallocation trimming.
+ */
+#define XFS_EOFBLOCKS_VERSION 1
+struct xfs_fs_eofblocks {
+ __u32 eof_version;
+ __u32 eof_flags;
+ uid_t eof_uid;
+ gid_t eof_gid;
+ prid_t eof_prid;
+ __u32 pad32;
+ __u64 eof_min_file_size;
+ __u64 pad64[12];
+};
+
+/* eof_flags values */
+#define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */
+#define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */
+#define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */
+#define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */
+#define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */
+#define XFS_EOF_FLAGS_VALID \
+ (XFS_EOF_FLAGS_SYNC | \
+ XFS_EOF_FLAGS_UID | \
+ XFS_EOF_FLAGS_GID | \
+ XFS_EOF_FLAGS_PRID | \
+ XFS_EOF_FLAGS_MINFILESIZE)
+
+
+/*
* The user-level Handle Request interface structure.
*/
typedef struct xfs_fsop_handlereq {
@@ -370,6 +422,9 @@ typedef struct xfs_fsop_attrlist_handlereq {
typedef struct xfs_attr_multiop {
__u32 am_opcode;
+#define ATTR_OP_GET 1 /* return the indicated attr's value */
+#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */
+#define ATTR_OP_REMOVE 3 /* remove the indicated attr */
__s32 am_error;
void __user *am_attrname;
void __user *am_attrvalue;
@@ -388,30 +443,13 @@ typedef struct xfs_fsop_attrmulti_handlereq {
*/
typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */
-
-#ifndef HAVE_FID
-#define MAXFIDSZ 46
-
-typedef struct fid {
- __u16 fid_len; /* length of data in bytes */
- unsigned char fid_data[MAXFIDSZ]; /* data (fid_len worth) */
-} fid_t;
-#endif
-
typedef struct xfs_fid {
- __u16 xfs_fid_len; /* length of remainder */
- __u16 xfs_fid_pad;
- __u32 xfs_fid_gen; /* generation number */
- __u64 xfs_fid_ino; /* 64 bits inode number */
+ __u16 fid_len; /* length of remainder */
+ __u16 fid_pad;
+ __u32 fid_gen; /* generation number */
+ __u64 fid_ino; /* 64 bits inode number */
} xfs_fid_t;
-typedef struct xfs_fid2 {
- __u16 fid_len; /* length of remainder */
- __u16 fid_pad; /* padding, must be zero */
- __u32 fid_gen; /* generation number */
- __u64 fid_ino; /* inode number */
-} xfs_fid2_t;
-
typedef struct xfs_handle {
union {
__s64 align; /* force alignment of ha_fid */
@@ -421,15 +459,26 @@ typedef struct xfs_handle {
} xfs_handle_t;
#define ha_fsid ha_u._ha_fsid
-#define XFS_HSIZE(handle) (((char *) &(handle).ha_fid.xfs_fid_pad \
+#define XFS_HSIZE(handle) (((char *) &(handle).ha_fid.fid_pad \
- (char *) &(handle)) \
- + (handle).ha_fid.xfs_fid_len)
+ + (handle).ha_fid.fid_len)
-#define XFS_HANDLE_CMP(h1, h2) memcmp(h1, h2, sizeof(xfs_handle_t))
-
-#define FSHSIZE sizeof(fsid_t)
+/*
+ * Structure passed to XFS_IOC_SWAPEXT
+ */
+typedef struct xfs_swapext
+{
+ __int64_t sx_version; /* version */
+#define XFS_SX_VERSION 0
+ __int64_t sx_fdtarget; /* fd of target file */
+ __int64_t sx_fdtmp; /* fd of tmp file */
+ xfs_off_t sx_offset; /* offset into file */
+ xfs_off_t sx_length; /* leng from offset */
+ char sx_pad[16]; /* pad space, unused */
+ xfs_bstat_t sx_stat; /* stat of target b4 copy */
+} xfs_swapext_t;
-/*
+/*
* Flags for going down operation
*/
#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
@@ -439,9 +488,9 @@ typedef struct xfs_handle {
/*
* ioctl commands that are used by Linux filesystems
*/
-#define XFS_IOC_GETXFLAGS _IOR('f', 1, long)
-#define XFS_IOC_SETXFLAGS _IOW('f', 2, long)
-#define XFS_IOC_GETVERSION _IOR('v', 1, long)
+#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS
+#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS
+#define XFS_IOC_GETVERSION FS_IOC_GETVERSION
/*
* ioctl commands that replace IRIX fcntl()'s
@@ -466,6 +515,8 @@ typedef struct xfs_handle {
/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */
/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */
#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
+#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
+#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks)
/*
* ioctl commands that replace IRIX syssgi()'s
@@ -489,8 +540,14 @@ typedef struct xfs_handle {
#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection)
#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection)
/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */
+
+/* XFS_IOC_FREEZE -- FIFREEZE 119 */
+/* XFS_IOC_THAW -- FITHAW 120 */
+#ifndef FIFREEZE
#define XFS_IOC_FREEZE _IOWR('X', 119, int)
#define XFS_IOC_THAW _IOWR('X', 120, int)
+#endif
+
#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index b4d971b0158..d2295561570 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -17,35 +17,31 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_inum.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
-#include "xfs_btree.h"
#include "xfs_error.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_fsops.h"
#include "xfs_itable.h"
#include "xfs_trans_space.h"
#include "xfs_rtalloc.h"
-#include "xfs_rw.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_dinode.h"
+#include "xfs_filestream.h"
/*
* File system operations
@@ -57,6 +53,9 @@ xfs_fs_geometry(
xfs_fsop_geom_t *geo,
int new_version)
{
+
+ memset(geo, 0, sizeof(*geo));
+
geo->blocksize = mp->m_sb.sb_blocksize;
geo->rtextsize = mp->m_sb.sb_rextsize;
geo->agblocks = mp->m_sb.sb_agblocks;
@@ -77,57 +76,86 @@ xfs_fs_geometry(
}
if (new_version >= 3) {
geo->version = XFS_FSOP_GEOM_VERSION;
- geo->flags =
- (XFS_SB_VERSION_HASATTR(&mp->m_sb) ?
+ geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
+ XFS_FSOP_GEOM_FLAGS_DIRV2 |
+ (xfs_sb_version_hasattr(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_ATTR : 0) |
- (XFS_SB_VERSION_HASNLINK(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_NLINK : 0) |
- (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) ?
+ (xfs_sb_version_hasquota(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_QUOTA : 0) |
- (XFS_SB_VERSION_HASALIGN(&mp->m_sb) ?
+ (xfs_sb_version_hasalign(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_IALIGN : 0) |
- (XFS_SB_VERSION_HASDALIGN(&mp->m_sb) ?
+ (xfs_sb_version_hasdalign(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_DALIGN : 0) |
- (XFS_SB_VERSION_HASSHARED(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_SHARED : 0) |
- (XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) ?
+ (xfs_sb_version_hasextflgbit(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) |
- (XFS_SB_VERSION_HASDIRV2(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) |
- (XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ?
+ (xfs_sb_version_hassector(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_SECTOR : 0) |
- (XFS_SB_VERSION_HASATTR2(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_ATTR2 : 0);
- geo->logsectsize = XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ?
+ (xfs_sb_version_hasasciici(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_DIRV2CI : 0) |
+ (xfs_sb_version_haslazysbcount(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) |
+ (xfs_sb_version_hasattr2(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) |
+ (xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) |
+ (xfs_sb_version_hascrc(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_V5SB : 0) |
+ (xfs_sb_version_hasftype(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_FTYPE : 0) |
+ (xfs_sb_version_hasfinobt(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_FINOBT : 0);
+ geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
mp->m_sb.sb_logsectsize : BBSIZE;
geo->rtsectsize = mp->m_sb.sb_blocksize;
- geo->dirblocksize = mp->m_dirblksize;
+ geo->dirblocksize = mp->m_dir_geo->blksize;
}
if (new_version >= 4) {
geo->flags |=
- (XFS_SB_VERSION_HASLOGV2(&mp->m_sb) ?
+ (xfs_sb_version_haslogv2(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_LOGV2 : 0);
geo->logsunit = mp->m_sb.sb_logsunit;
}
return 0;
}
+static struct xfs_buf *
+xfs_growfs_get_hdr_buf(
+ struct xfs_mount *mp,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ int flags,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf *bp;
+
+ bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+ if (!bp)
+ return NULL;
+
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+ bp->b_bn = blkno;
+ bp->b_maps[0].bm_bn = blkno;
+ bp->b_ops = ops;
+
+ return bp;
+}
+
static int
xfs_growfs_data_private(
xfs_mount_t *mp, /* mount point for filesystem */
xfs_growfs_data_t *in) /* growfs data input struct */
{
xfs_agf_t *agf;
+ struct xfs_agfl *agfl;
xfs_agi_t *agi;
xfs_agnumber_t agno;
xfs_extlen_t agsize;
xfs_extlen_t tmpsize;
xfs_alloc_rec_t *arec;
- xfs_btree_sblock_t *block;
xfs_buf_t *bp;
int bucket;
int dpct;
- int error;
+ int error, saved_error = 0;
xfs_agnumber_t nagcount;
xfs_agnumber_t nagimax = 0;
xfs_rfsblock_t nb, nb_mod;
@@ -135,20 +163,25 @@ xfs_growfs_data_private(
xfs_rfsblock_t nfree;
xfs_agnumber_t oagcount;
int pct;
- xfs_sb_t *sbp;
xfs_trans_t *tp;
nb = in->newblocks;
pct = in->imaxpct;
if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
return XFS_ERROR(EINVAL);
+ if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
+ return error;
dpct = pct - mp->m_sb.sb_imax_pct;
- error = xfs_read_buf(mp, mp->m_ddev_targp,
- XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
- XFS_FSS_TO_BB(mp, 1), 0, &bp);
- if (error)
+ bp = xfs_buf_read_uncached(mp->m_ddev_targp,
+ XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
+ XFS_FSS_TO_BB(mp, 1), 0, NULL);
+ if (!bp)
+ return EIO;
+ if (bp->b_error) {
+ error = bp->b_error;
+ xfs_buf_relse(bp);
return error;
- ASSERT(bp);
+ }
xfs_buf_relse(bp);
new = nb; /* use new as a temporary here */
@@ -156,41 +189,51 @@ xfs_growfs_data_private(
nagcount = new + (nb_mod != 0);
if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
nagcount--;
- nb = nagcount * mp->m_sb.sb_agblocks;
+ nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
if (nb < mp->m_sb.sb_dblocks)
return XFS_ERROR(EINVAL);
}
new = nb - mp->m_sb.sb_dblocks;
oagcount = mp->m_sb.sb_agcount;
+
+ /* allocate the new per-ag structures */
if (nagcount > oagcount) {
- down_write(&mp->m_peraglock);
- mp->m_perag = kmem_realloc(mp->m_perag,
- sizeof(xfs_perag_t) * nagcount,
- sizeof(xfs_perag_t) * oagcount,
- KM_SLEEP);
- memset(&mp->m_perag[oagcount], 0,
- (nagcount - oagcount) * sizeof(xfs_perag_t));
- mp->m_flags |= XFS_MOUNT_32BITINODES;
- nagimax = xfs_initialize_perag(XFS_MTOVFS(mp), mp, nagcount);
- up_write(&mp->m_peraglock);
+ error = xfs_initialize_perag(mp, nagcount, &nagimax);
+ if (error)
+ return error;
}
+
tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
- if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
- XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) {
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
+ XFS_GROWFS_SPACE_RES(mp), 0);
+ if (error) {
xfs_trans_cancel(tp, 0);
return error;
}
+ /*
+ * Write new AG headers to disk. Non-transactional, but written
+ * synchronously so they are completed prior to the growfs transaction
+ * being logged.
+ */
nfree = 0;
for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
+ __be32 *agfl_bno;
+
/*
- * AG freelist header block
+ * AG freespace header block
*/
- bp = xfs_buf_get(mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0);
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0,
+ &xfs_agf_buf_ops);
+ if (!bp) {
+ error = ENOMEM;
+ goto error0;
+ }
+
agf = XFS_BUF_TO_AGF(bp);
- memset(agf, 0, mp->m_sb.sb_sectsize);
agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
agf->agf_seqno = cpu_to_be32(agno);
@@ -211,18 +254,55 @@ xfs_growfs_data_private(
tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
agf->agf_freeblks = cpu_to_be32(tmpsize);
agf->agf_longest = cpu_to_be32(tmpsize);
- error = xfs_bwrite(mp, bp);
- if (error) {
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid);
+
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
+ goto error0;
+
+ /*
+ * AG freelist header block
+ */
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0,
+ &xfs_agfl_buf_ops);
+ if (!bp) {
+ error = ENOMEM;
goto error0;
}
+
+ agfl = XFS_BUF_TO_AGFL(bp);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
+ agfl->agfl_seqno = cpu_to_be32(agno);
+ uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid);
+ }
+
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
+ for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
+ agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
+ goto error0;
+
/*
* AG inode header block
*/
- bp = xfs_buf_get(mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0);
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0,
+ &xfs_agi_buf_ops);
+ if (!bp) {
+ error = ENOMEM;
+ goto error0;
+ }
+
agi = XFS_BUF_TO_AGI(bp);
- memset(agi, 0, mp->m_sb.sb_sectsize);
agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
agi->agi_seqno = cpu_to_be32(agno);
@@ -233,74 +313,131 @@ xfs_growfs_data_private(
agi->agi_freecount = 0;
agi->agi_newino = cpu_to_be32(NULLAGINO);
agi->agi_dirino = cpu_to_be32(NULLAGINO);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid);
+ if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
+ agi->agi_free_level = cpu_to_be32(1);
+ }
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
- error = xfs_bwrite(mp, bp);
- if (error) {
+
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
goto error0;
- }
+
/*
* BNO btree root block
*/
- bp = xfs_buf_get(mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0);
- block = XFS_BUF_TO_SBLOCK(bp);
- memset(block, 0, mp->m_sb.sb_blocksize);
- block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
- block->bb_level = 0;
- block->bb_numrecs = cpu_to_be16(1);
- block->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
- block->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
- arec = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_alloc,
- block, 1, mp->m_alloc_mxr[0]);
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
+ BTOBB(mp->m_sb.sb_blocksize), 0,
+ &xfs_allocbt_buf_ops);
+
+ if (!bp) {
+ error = ENOMEM;
+ goto error0;
+ }
+
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1,
+ agno, XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1,
+ agno, 0);
+
+ arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
arec->ar_blockcount = cpu_to_be32(
agsize - be32_to_cpu(arec->ar_startblock));
- error = xfs_bwrite(mp, bp);
- if (error) {
+
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
goto error0;
- }
+
/*
* CNT btree root block
*/
- bp = xfs_buf_get(mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0);
- block = XFS_BUF_TO_SBLOCK(bp);
- memset(block, 0, mp->m_sb.sb_blocksize);
- block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
- block->bb_level = 0;
- block->bb_numrecs = cpu_to_be16(1);
- block->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
- block->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
- arec = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_alloc,
- block, 1, mp->m_alloc_mxr[0]);
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
+ BTOBB(mp->m_sb.sb_blocksize), 0,
+ &xfs_allocbt_buf_ops);
+ if (!bp) {
+ error = ENOMEM;
+ goto error0;
+ }
+
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1,
+ agno, XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1,
+ agno, 0);
+
+ arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
arec->ar_blockcount = cpu_to_be32(
agsize - be32_to_cpu(arec->ar_startblock));
nfree += be32_to_cpu(arec->ar_blockcount);
- error = xfs_bwrite(mp, bp);
- if (error) {
+
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
goto error0;
- }
+
/*
* INO btree root block
*/
- bp = xfs_buf_get(mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0);
- block = XFS_BUF_TO_SBLOCK(bp);
- memset(block, 0, mp->m_sb.sb_blocksize);
- block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
- block->bb_level = 0;
- block->bb_numrecs = 0;
- block->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
- block->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
- error = xfs_bwrite(mp, bp);
- if (error) {
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
+ BTOBB(mp->m_sb.sb_blocksize), 0,
+ &xfs_inobt_buf_ops);
+ if (!bp) {
+ error = ENOMEM;
goto error0;
}
+
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0,
+ agno, XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0,
+ agno, 0);
+
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
+ goto error0;
+
+ /*
+ * FINO btree root block
+ */
+ if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)),
+ BTOBB(mp->m_sb.sb_blocksize), 0,
+ &xfs_inobt_buf_ops);
+ if (!bp) {
+ error = ENOMEM;
+ goto error0;
+ }
+
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC,
+ 0, 0, agno,
+ XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0,
+ 0, agno, 0);
+
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
+ goto error0;
+ }
+
}
xfs_trans_agblocks_delta(tp, nfree);
/*
@@ -316,7 +453,7 @@ xfs_growfs_data_private(
}
ASSERT(bp);
agi = XFS_BUF_TO_AGI(bp);
- be32_add(&agi->agi_length, new);
+ be32_add_cpu(&agi->agi_length, new);
ASSERT(nagcount == oagcount ||
be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
@@ -329,9 +466,11 @@ xfs_growfs_data_private(
}
ASSERT(bp);
agf = XFS_BUF_TO_AGF(bp);
- be32_add(&agf->agf_length, new);
+ be32_add_cpu(&agf->agf_length, new);
ASSERT(be32_to_cpu(agf->agf_length) ==
be32_to_cpu(agi->agi_length));
+
+ xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
/*
* Free the new space.
*/
@@ -341,6 +480,12 @@ xfs_growfs_data_private(
goto error0;
}
}
+
+ /*
+ * Update changed superblock fields transactionally. These are not
+ * seen by the rest of the world until the transaction commit applies
+ * them atomically to the superblock.
+ */
if (nagcount > oagcount)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
if (nb > mp->m_sb.sb_dblocks)
@@ -350,10 +495,10 @@ xfs_growfs_data_private(
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree);
if (dpct)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
- error = xfs_trans_commit(tp, 0, NULL);
- if (error) {
+ error = xfs_trans_commit(tp, 0);
+ if (error)
return error;
- }
+
/* New allocation groups fully initialized, so update mount struct */
if (nagimax)
mp->m_maxagi = nagimax;
@@ -363,33 +508,59 @@ xfs_growfs_data_private(
mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
} else
mp->m_maxicount = 0;
+ xfs_set_low_space_thresholds(mp);
+
+ /* update secondary superblocks. */
for (agno = 1; agno < nagcount; agno++) {
- error = xfs_read_buf(mp, mp->m_ddev_targp,
+ error = 0;
+ /*
+ * new secondary superblocks need to be zeroed, not read from
+ * disk as the contents of the new area we are growing into is
+ * completely unknown.
+ */
+ if (agno < oagcount) {
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &bp);
- if (error) {
- xfs_fs_cmn_err(CE_WARN, mp,
- "error %d reading secondary superblock for ag %d",
- error, agno);
- break;
+ XFS_FSS_TO_BB(mp, 1), 0, &bp,
+ &xfs_sb_buf_ops);
+ } else {
+ bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0);
+ if (bp) {
+ bp->b_ops = &xfs_sb_buf_ops;
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+ } else
+ error = ENOMEM;
}
- sbp = XFS_BUF_TO_SBP(bp);
- xfs_xlatesb(sbp, &mp->m_sb, -1, XFS_SB_ALL_BITS);
+
/*
- * If we get an error writing out the alternate superblocks,
- * just issue a warning and continue. The real work is
- * already done and committed.
+ * If we get an error reading or writing alternate superblocks,
+ * continue. xfs_repair chooses the "best" superblock based
+ * on most matches; if we break early, we'll leave more
+ * superblocks un-updated than updated, and xfs_repair may
+ * pick them over the properly-updated primary.
*/
- if (!(error = xfs_bwrite(mp, bp))) {
+ if (error) {
+ xfs_warn(mp,
+ "error %d reading secondary superblock for ag %d",
+ error, agno);
+ saved_error = error;
continue;
- } else {
- xfs_fs_cmn_err(CE_WARN, mp,
+ }
+ xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
+
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error) {
+ xfs_warn(mp,
"write error %d updating secondary superblock for ag %d",
error, agno);
- break; /* no point in continuing */
+ saved_error = error;
+ continue;
}
}
- return 0;
+ return saved_error ? saved_error : error;
error0:
xfs_trans_cancel(tp, XFS_TRANS_ABORT);
@@ -431,10 +602,13 @@ xfs_growfs_data(
xfs_growfs_data_t *in)
{
int error;
- if (!cpsema(&mp->m_growlock))
+
+ if (!capable(CAP_SYS_ADMIN))
+ return XFS_ERROR(EPERM);
+ if (!mutex_trylock(&mp->m_growlock))
return XFS_ERROR(EWOULDBLOCK);
error = xfs_growfs_data_private(mp, in);
- vsema(&mp->m_growlock);
+ mutex_unlock(&mp->m_growlock);
return error;
}
@@ -444,10 +618,13 @@ xfs_growfs_log(
xfs_growfs_log_t *in)
{
int error;
- if (!cpsema(&mp->m_growlock))
+
+ if (!capable(CAP_SYS_ADMIN))
+ return XFS_ERROR(EPERM);
+ if (!mutex_trylock(&mp->m_growlock))
return XFS_ERROR(EWOULDBLOCK);
error = xfs_growfs_log_private(mp, in);
- vsema(&mp->m_growlock);
+ mutex_unlock(&mp->m_growlock);
return error;
}
@@ -460,14 +637,13 @@ xfs_fs_counts(
xfs_mount_t *mp,
xfs_fsop_counts_t *cnt)
{
- unsigned long s;
-
- s = XFS_SB_LOCK(mp);
- cnt->freedata = mp->m_sb.sb_fdblocks;
+ xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+ spin_lock(&mp->m_sb_lock);
+ cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
cnt->freertx = mp->m_sb.sb_frextents;
cnt->freeino = mp->m_sb.sb_ifree;
cnt->allocino = mp->m_sb.sb_icount;
- XFS_SB_UNLOCK(mp, s);
+ spin_unlock(&mp->m_sb_lock);
return 0;
}
@@ -476,7 +652,7 @@ xfs_fs_counts(
*
* xfs_reserve_blocks is called to set m_resblks
* in the in-core mount table. The number of unused reserved blocks
- * is kept in m_resbls_avail.
+ * is kept in m_resblks_avail.
*
* Reserve the requested number of blocks if available. Otherwise return
* as many as possible to satisfy the request. The actual number
@@ -492,78 +668,130 @@ xfs_reserve_blocks(
__uint64_t *inval,
xfs_fsop_resblks_t *outval)
{
- __int64_t lcounter, delta;
+ __int64_t lcounter, delta, fdblks_delta;
__uint64_t request;
- unsigned long s;
/* If inval is null, report current values and return */
-
if (inval == (__uint64_t *)NULL) {
+ if (!outval)
+ return EINVAL;
outval->resblks = mp->m_resblks;
outval->resblks_avail = mp->m_resblks_avail;
return 0;
}
request = *inval;
- s = XFS_SB_LOCK(mp);
+
+ /*
+ * With per-cpu counters, this becomes an interesting
+ * problem. we needto work out if we are freeing or allocation
+ * blocks first, then we can do the modification as necessary.
+ *
+ * We do this under the m_sb_lock so that if we are near
+ * ENOSPC, we will hold out any changes while we work out
+ * what to do. This means that the amount of free space can
+ * change while we do this, so we need to retry if we end up
+ * trying to reserve more space than is available.
+ *
+ * We also use the xfs_mod_incore_sb() interface so that we
+ * don't have to care about whether per cpu counter are
+ * enabled, disabled or even compiled in....
+ */
+retry:
+ spin_lock(&mp->m_sb_lock);
+ xfs_icsb_sync_counters_locked(mp, 0);
/*
* If our previous reservation was larger than the current value,
* then move any unused blocks back to the free pool.
*/
-
+ fdblks_delta = 0;
if (mp->m_resblks > request) {
lcounter = mp->m_resblks_avail - request;
if (lcounter > 0) { /* release unused blocks */
- mp->m_sb.sb_fdblocks += lcounter;
+ fdblks_delta = lcounter;
mp->m_resblks_avail -= lcounter;
}
mp->m_resblks = request;
} else {
+ __int64_t free;
+
+ free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
+ if (!free)
+ goto out; /* ENOSPC and fdblks_delta = 0 */
+
delta = request - mp->m_resblks;
- lcounter = mp->m_sb.sb_fdblocks - delta;
+ lcounter = free - delta;
if (lcounter < 0) {
/* We can't satisfy the request, just get what we can */
- mp->m_resblks += mp->m_sb.sb_fdblocks;
- mp->m_resblks_avail += mp->m_sb.sb_fdblocks;
- mp->m_sb.sb_fdblocks = 0;
+ mp->m_resblks += free;
+ mp->m_resblks_avail += free;
+ fdblks_delta = -free;
} else {
- mp->m_sb.sb_fdblocks = lcounter;
+ fdblks_delta = -delta;
mp->m_resblks = request;
mp->m_resblks_avail += delta;
}
}
+out:
+ if (outval) {
+ outval->resblks = mp->m_resblks;
+ outval->resblks_avail = mp->m_resblks_avail;
+ }
+ spin_unlock(&mp->m_sb_lock);
- outval->resblks = mp->m_resblks;
- outval->resblks_avail = mp->m_resblks_avail;
- XFS_SB_UNLOCK(mp, s);
+ if (fdblks_delta) {
+ /*
+ * If we are putting blocks back here, m_resblks_avail is
+ * already at its max so this will put it in the free pool.
+ *
+ * If we need space, we'll either succeed in getting it
+ * from the free block count or we'll get an enospc. If
+ * we get a ENOSPC, it means things changed while we were
+ * calculating fdblks_delta and so we should try again to
+ * see if there is anything left to reserve.
+ *
+ * Don't set the reserved flag here - we don't want to reserve
+ * the extra reserve blocks from the reserve.....
+ */
+ int error;
+ error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+ fdblks_delta, 0);
+ if (error == ENOSPC)
+ goto retry;
+ }
return 0;
}
-void
-xfs_fs_log_dummy(xfs_mount_t *mp)
+/*
+ * Dump a transaction into the log that contains no real change. This is needed
+ * to be able to make the log dirty or stamp the current tail LSN into the log
+ * during the covering operation.
+ *
+ * We cannot use an inode here for this - that will push dirty state back up
+ * into the VFS and then periodic inode flushing will prevent log covering from
+ * making progress. Hence we log a field in the superblock instead and use a
+ * synchronous transaction to ensure the superblock is immediately unpinned
+ * and can be written back.
+ */
+int
+xfs_fs_log_dummy(
+ xfs_mount_t *mp)
{
- xfs_trans_t *tp;
- xfs_inode_t *ip;
-
+ xfs_trans_t *tp;
+ int error;
- tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
- atomic_inc(&mp->m_active_trans);
- if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) {
+ tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
+ if (error) {
xfs_trans_cancel(tp, 0);
- return;
+ return error;
}
- ip = mp->m_rootip;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ /* log the UUID because it is an unchanging field */
+ xfs_mod_sb(tp, XFS_SB_UUID);
xfs_trans_set_sync(tp);
- xfs_trans_commit(tp, 0, NULL);
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return xfs_trans_commit(tp, 0);
}
int
@@ -573,21 +801,21 @@ xfs_fs_goingdown(
{
switch (inflags) {
case XFS_FSOP_GOING_FLAGS_DEFAULT: {
- struct vfs *vfsp = XFS_MTOVFS(mp);
- struct super_block *sb = freeze_bdev(vfsp->vfs_super->s_bdev);
+ struct super_block *sb = freeze_bdev(mp->m_super->s_bdev);
if (sb && !IS_ERR(sb)) {
- xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
+ xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
thaw_bdev(sb->s_bdev, sb);
}
-
+
break;
}
case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
- xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
+ xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
break;
case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH:
- xfs_force_shutdown(mp, XFS_FORCE_UMOUNT|XFS_LOG_IO_ERROR);
+ xfs_force_shutdown(mp,
+ SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
break;
default:
return XFS_ERROR(EINVAL);
@@ -595,3 +823,63 @@ xfs_fs_goingdown(
return 0;
}
+
+/*
+ * Force a shutdown of the filesystem instantly while keeping the filesystem
+ * consistent. We don't do an unmount here; just shutdown the shop, make sure
+ * that absolutely nothing persistent happens to this filesystem after this
+ * point.
+ */
+void
+xfs_do_force_shutdown(
+ xfs_mount_t *mp,
+ int flags,
+ char *fname,
+ int lnnum)
+{
+ int logerror;
+
+ logerror = flags & SHUTDOWN_LOG_IO_ERROR;
+
+ if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+ xfs_notice(mp,
+ "%s(0x%x) called from line %d of file %s. Return address = 0x%p",
+ __func__, flags, lnnum, fname, __return_address);
+ }
+ /*
+ * No need to duplicate efforts.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
+ return;
+
+ /*
+ * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
+ * queue up anybody new on the log reservations, and wakes up
+ * everybody who's sleeping on log reservations to tell them
+ * the bad news.
+ */
+ if (xfs_log_force_umount(mp, logerror))
+ return;
+
+ if (flags & SHUTDOWN_CORRUPT_INCORE) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
+ "Corruption of in-memory data detected. Shutting down filesystem");
+ if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
+ xfs_stack_trace();
+ } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+ if (logerror) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
+ "Log I/O Error Detected. Shutting down filesystem");
+ } else if (flags & SHUTDOWN_DEVICE_REQ) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+ "All device paths lost. Shutting down filesystem");
+ } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+ "I/O Error Detected. Shutting down filesystem");
+ }
+ }
+ if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+ xfs_alert(mp,
+ "Please umount the filesystem and rectify the problem(s)");
+ }
+}
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 300d0c9d61a..1b6a98b6688 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
xfs_fsop_resblks_t *outval);
extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
-extern void xfs_fs_log_dummy(xfs_mount_t *mp);
+extern int xfs_fs_log_dummy(struct xfs_mount *mp);
#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/xfs_globals.c
index 6e8085f3463..5399ef222dd 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -16,25 +16,19 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
-#include "xfs_cred.h"
#include "xfs_sysctl.h"
/*
- * System memory size - used to scale certain data structures in XFS.
- */
-unsigned long xfs_physmem;
-
-/*
* Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n,
* other XFS code uses these values. Times are measured in centisecs (i.e.
- * 100ths of a second).
+ * 100ths of a second) with the exception of eofb_timer, which is measured in
+ * seconds.
*/
xfs_param_t xfs_params = {
/* MIN DFLT MAX */
- .restrict_chown = { 0, 1, 1 },
.sgid_inherit = { 0, 0, 1 },
.symlink_mode = { 0, 0, 1 },
- .panic_mask = { 0, 0, 127 },
+ .panic_mask = { 0, 0, 255 },
.error_level = { 0, 3, 11 },
.syncd_timer = { 1*100, 30*100, 7200*100},
.stats_clear = { 0, 0, 1 },
@@ -45,10 +39,7 @@ xfs_param_t xfs_params = {
.xfs_buf_age = { 1*100, 15*100, 7200*100},
.inherit_nosym = { 0, 0, 1 },
.rotorstep = { 1, 1, 255 },
+ .inherit_nodfrg = { 0, 1, 1 },
+ .fstrm_timer = { 1, 30*100, 3600*100},
+ .eofb_timer = { 1, 300, 3600*24},
};
-
-/*
- * Global system credential structure.
- */
-cred_t sys_cred_val, *sys_cred = &sys_cred_val;
-
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 8f3fae1aa98..5960e5593fe 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -17,98 +17,337 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_bmap.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_icreate_item.h"
+#include "xfs_icache.h"
+#include "xfs_dinode.h"
+#include "xfs_trace.h"
+
/*
- * Log specified fields for the inode given by bp and off.
+ * Allocation group level functions.
*/
-STATIC void
-xfs_ialloc_log_di(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_buf_t *bp, /* inode buffer */
- int off, /* index of inode in buffer */
- int fields) /* bitmask of fields to log */
+static inline int
+xfs_ialloc_cluster_alignment(
+ xfs_alloc_arg_t *args)
{
- int first; /* first byte number */
- int ioffset; /* off in bytes */
- int last; /* last byte number */
- xfs_mount_t *mp; /* mount point structure */
- static const short offsets[] = { /* field offsets */
- /* keep in sync with bits */
- offsetof(xfs_dinode_core_t, di_magic),
- offsetof(xfs_dinode_core_t, di_mode),
- offsetof(xfs_dinode_core_t, di_version),
- offsetof(xfs_dinode_core_t, di_format),
- offsetof(xfs_dinode_core_t, di_onlink),
- offsetof(xfs_dinode_core_t, di_uid),
- offsetof(xfs_dinode_core_t, di_gid),
- offsetof(xfs_dinode_core_t, di_nlink),
- offsetof(xfs_dinode_core_t, di_projid),
- offsetof(xfs_dinode_core_t, di_pad),
- offsetof(xfs_dinode_core_t, di_atime),
- offsetof(xfs_dinode_core_t, di_mtime),
- offsetof(xfs_dinode_core_t, di_ctime),
- offsetof(xfs_dinode_core_t, di_size),
- offsetof(xfs_dinode_core_t, di_nblocks),
- offsetof(xfs_dinode_core_t, di_extsize),
- offsetof(xfs_dinode_core_t, di_nextents),
- offsetof(xfs_dinode_core_t, di_anextents),
- offsetof(xfs_dinode_core_t, di_forkoff),
- offsetof(xfs_dinode_core_t, di_aformat),
- offsetof(xfs_dinode_core_t, di_dmevmask),
- offsetof(xfs_dinode_core_t, di_dmstate),
- offsetof(xfs_dinode_core_t, di_flags),
- offsetof(xfs_dinode_core_t, di_gen),
- offsetof(xfs_dinode_t, di_next_unlinked),
- offsetof(xfs_dinode_t, di_u),
- offsetof(xfs_dinode_t, di_a),
- sizeof(xfs_dinode_t)
- };
+ if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
+ args->mp->m_sb.sb_inoalignmt >=
+ XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size))
+ return args->mp->m_sb.sb_inoalignmt;
+ return 1;
+}
+/*
+ * Lookup a record by ino in the btree given by cur.
+ */
+int /* error */
+xfs_inobt_lookup(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agino_t ino, /* starting inode of chunk */
+ xfs_lookup_t dir, /* <=, >=, == */
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.i.ir_startino = ino;
+ cur->bc_rec.i.ir_freecount = 0;
+ cur->bc_rec.i.ir_free = 0;
+ return xfs_btree_lookup(cur, dir, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given.
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int /* error */
+xfs_inobt_update(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_inobt_rec_incore_t *irec) /* btree record */
+{
+ union xfs_btree_rec rec;
+
+ rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
+ rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+ rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
+ return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int /* error */
+xfs_inobt_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_inobt_rec_incore_t *irec, /* btree record */
+ int *stat) /* output: success/failure */
+{
+ union xfs_btree_rec *rec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (!error && *stat == 1) {
+ irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+ irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
+ irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+ }
+ return error;
+}
+
+/*
+ * Insert a single inobt record. Cursor must already point to desired location.
+ */
+STATIC int
+xfs_inobt_insert_rec(
+ struct xfs_btree_cur *cur,
+ __int32_t freecount,
+ xfs_inofree_t free,
+ int *stat)
+{
+ cur->bc_rec.i.ir_freecount = freecount;
+ cur->bc_rec.i.ir_free = free;
+ return xfs_btree_insert(cur, stat);
+}
+
+/*
+ * Insert records describing a newly allocated inode chunk into the inobt.
+ */
+STATIC int
+xfs_inobt_insert(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agino_t newino,
+ xfs_agino_t newlen,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ xfs_agino_t thisino;
+ int i;
+ int error;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+ for (thisino = newino;
+ thisino < newino + newlen;
+ thisino += XFS_INODES_PER_CHUNK) {
+ error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
+ if (error) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ ASSERT(i == 0);
+
+ error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
+ XFS_INOBT_ALL_FREE, &i);
+ if (error) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ ASSERT(i == 1);
+ }
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+ return 0;
+}
+
+/*
+ * Verify that the number of free inodes in the AGI is correct.
+ */
+#ifdef DEBUG
+STATIC int
+xfs_check_agi_freecount(
+ struct xfs_btree_cur *cur,
+ struct xfs_agi *agi)
+{
+ if (cur->bc_nlevels == 1) {
+ xfs_inobt_rec_incore_t rec;
+ int freecount = 0;
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ return error;
+
+ do {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ return error;
+
+ if (i) {
+ freecount += rec.ir_freecount;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ return error;
+ }
+ } while (i == 1);
+
+ if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
+ ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
+ }
+ return 0;
+}
+#else
+#define xfs_check_agi_freecount(cur, agi) 0
+#endif
+
+/*
+ * Initialise a new set of inodes. When called without a transaction context
+ * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
+ * than logging them (which in a transaction context puts them into the AIL
+ * for writeback rather than the xfsbufd queue).
+ */
+int
+xfs_ialloc_inode_init(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct list_head *buffer_list,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ xfs_agblock_t length,
+ unsigned int gen)
+{
+ struct xfs_buf *fbuf;
+ struct xfs_dinode *free;
+ int nbufs, blks_per_cluster, inodes_per_cluster;
+ int version;
+ int i, j;
+ xfs_daddr_t d;
+ xfs_ino_t ino = 0;
- ASSERT(offsetof(xfs_dinode_t, di_core) == 0);
- ASSERT((fields & (XFS_DI_U|XFS_DI_A)) == 0);
- mp = tp->t_mountp;
/*
- * Get the inode-relative first and last bytes for these fields
+ * Loop over the new block(s), filling in the inodes. For small block
+ * sizes, manipulate the inodes in buffers which are multiples of the
+ * blocks size.
*/
- xfs_btree_offsets(fields, offsets, XFS_DI_NUM_BITS, &first, &last);
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
+ inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+ nbufs = length / blks_per_cluster;
+
/*
- * Convert to buffer offsets and log it.
+ * Figure out what version number to use in the inodes we create. If
+ * the superblock version has caught up to the one that supports the new
+ * inode format, then use the new inode version. Otherwise use the old
+ * version so that old kernels will continue to be able to use the file
+ * system.
+ *
+ * For v3 inodes, we also need to write the inode number into the inode,
+ * so calculate the first inode number of the chunk here as
+ * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
+ * across multiple filesystem blocks (such as a cluster) and so cannot
+ * be used in the cluster buffer loop below.
+ *
+ * Further, because we are writing the inode directly into the buffer
+ * and calculating a CRC on the entire inode, we have ot log the entire
+ * inode so that the entire range the CRC covers is present in the log.
+ * That means for v3 inode we log the entire buffer rather than just the
+ * inode cores.
*/
- ioffset = off << mp->m_sb.sb_inodelog;
- first += ioffset;
- last += ioffset;
- xfs_trans_log_buf(tp, bp, first, last);
-}
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ version = 3;
+ ino = XFS_AGINO_TO_INO(mp, agno,
+ XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
-/*
- * Allocation group level functions.
- */
+ /*
+ * log the initialisation that is about to take place as an
+ * logical operation. This means the transaction does not
+ * need to log the physical changes to the inode buffers as log
+ * recovery will know what initialisation is actually needed.
+ * Hence we only need to log the buffers as "ordered" buffers so
+ * they track in the AIL as if they were physically logged.
+ */
+ if (tp)
+ xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
+ mp->m_sb.sb_inodesize, length, gen);
+ } else
+ version = 2;
+
+ for (j = 0; j < nbufs; j++) {
+ /*
+ * Get the block.
+ */
+ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
+ fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ mp->m_bsize * blks_per_cluster,
+ XBF_UNMAPPED);
+ if (!fbuf)
+ return ENOMEM;
+
+ /* Initialize the inode buffers and log them appropriately. */
+ fbuf->b_ops = &xfs_inode_buf_ops;
+ xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
+ for (i = 0; i < inodes_per_cluster; i++) {
+ int ioffset = i << mp->m_sb.sb_inodelog;
+ uint isize = xfs_dinode_size(version);
+
+ free = xfs_make_iptr(mp, fbuf, i);
+ free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ free->di_version = version;
+ free->di_gen = cpu_to_be32(gen);
+ free->di_next_unlinked = cpu_to_be32(NULLAGINO);
+
+ if (version == 3) {
+ free->di_ino = cpu_to_be64(ino);
+ ino++;
+ uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
+ xfs_dinode_calc_crc(mp, free);
+ } else if (tp) {
+ /* just log the inode core */
+ xfs_trans_log_buf(tp, fbuf, ioffset,
+ ioffset + isize - 1);
+ }
+ }
+
+ if (tp) {
+ /*
+ * Mark the buffer as an inode allocation buffer so it
+ * sticks in AIL at the point of this allocation
+ * transaction. This ensures the they are on disk before
+ * the tail of the log can be moved past this
+ * transaction (i.e. by preventing relogging from moving
+ * it forward in the log).
+ */
+ xfs_trans_inode_alloc_buf(tp, fbuf);
+ if (version == 3) {
+ /*
+ * Mark the buffer as ordered so that they are
+ * not physically logged in the transaction but
+ * still tracked in the AIL as part of the
+ * transaction and pin the log appropriately.
+ */
+ xfs_trans_ordered_buf(tp, fbuf);
+ xfs_trans_log_buf(tp, fbuf, 0,
+ BBTOB(fbuf->b_length) - 1);
+ }
+ } else {
+ fbuf->b_flags |= XBF_DONE;
+ xfs_buf_delwri_queue(fbuf, buffer_list);
+ xfs_buf_relse(fbuf);
+ }
+ }
+ return 0;
+}
/*
* Allocate new inodes in the allocation group specified by agbp.
@@ -122,25 +361,15 @@ xfs_ialloc_ag_alloc(
{
xfs_agi_t *agi; /* allocation group header */
xfs_alloc_arg_t args; /* allocation argument structure */
- int blks_per_cluster; /* fs blocks per inode cluster */
- xfs_btree_cur_t *cur; /* inode btree cursor */
- xfs_daddr_t d; /* disk addr of buffer */
+ xfs_agnumber_t agno;
int error;
- xfs_buf_t *fbuf; /* new free inodes' buffer */
- xfs_dinode_t *free; /* new free inode structure */
- int i; /* inode counter */
- int j; /* block counter */
- int nbufs; /* num bufs of new inodes */
xfs_agino_t newino; /* new first inode's number */
xfs_agino_t newlen; /* new number of inodes */
- int ninodes; /* num inodes per buf */
- xfs_agino_t thisino; /* current inode number, for loop */
- int version; /* inode version number to use */
- int isaligned; /* inode allocation at stripe unit */
+ int isaligned = 0; /* inode allocation at stripe unit */
/* boundary */
- xfs_dinode_core_t dic; /* a dinode_core to copy to new */
- /* inodes */
+ struct xfs_perag *pag;
+ memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = tp->t_mountp;
@@ -148,52 +377,97 @@ xfs_ialloc_ag_alloc(
* Locking will ensure that we don't have two callers in here
* at one time.
*/
- newlen = XFS_IALLOC_INODES(args.mp);
+ newlen = args.mp->m_ialloc_inos;
if (args.mp->m_maxicount &&
args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
return XFS_ERROR(ENOSPC);
- args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp);
+ args.minlen = args.maxlen = args.mp->m_ialloc_blks;
/*
- * Set the alignment for the allocation.
- * If stripe alignment is turned on then align at stripe unit
- * boundary.
- * If the cluster size is smaller than a filesystem block
- * then we're doing I/O for inodes in filesystem block size pieces,
- * so don't need alignment anyway.
+ * First try to allocate inodes contiguous with the last-allocated
+ * chunk of inodes. If the filesystem is striped, this will fill
+ * an entire stripe unit with inodes.
*/
- isaligned = 0;
- if (args.mp->m_sinoalign) {
- ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
- args.alignment = args.mp->m_dalign;
- isaligned = 1;
- } else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) &&
- args.mp->m_sb.sb_inoalignmt >=
- XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp)))
- args.alignment = args.mp->m_sb.sb_inoalignmt;
- else
- args.alignment = 1;
agi = XFS_BUF_TO_AGI(agbp);
- /*
- * Need to figure out where to allocate the inode blocks.
- * Ideally they should be spaced out through the a.g.
- * For now, just allocate blocks up front.
- */
- args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp, be32_to_cpu(agi->agi_seqno),
- args.agbno);
- /*
- * Allocate a fixed-size extent of inodes.
- */
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.mod = args.total = args.wasdel = args.isfl = args.userdata =
+ newino = be32_to_cpu(agi->agi_newino);
+ agno = be32_to_cpu(agi->agi_seqno);
+ args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
+ args.mp->m_ialloc_blks;
+ if (likely(newino != NULLAGINO &&
+ (args.agbno < be32_to_cpu(agi->agi_length)))) {
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ args.type = XFS_ALLOCTYPE_THIS_BNO;
+ args.prod = 1;
+
+ /*
+ * We need to take into account alignment here to ensure that
+ * we don't modify the free list if we fail to have an exact
+ * block. If we don't have an exact match, and every oher
+ * attempt allocation attempt fails, we'll end up cancelling
+ * a dirty transaction and shutting down.
+ *
+ * For an exact allocation, alignment must be 1,
+ * however we need to take cluster alignment into account when
+ * fixing up the freelist. Use the minalignslop field to
+ * indicate that extra blocks might be required for alignment,
+ * but not to use them in the actual exact allocation.
+ */
+ args.alignment = 1;
+ args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
+
+ /* Allow space for the inode btree to split. */
+ args.minleft = args.mp->m_in_maxlevels - 1;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+
+ /*
+ * This request might have dirtied the transaction if the AG can
+ * satisfy the request, but the exact block was not available.
+ * If the allocation did fail, subsequent requests will relax
+ * the exact agbno requirement and increase the alignment
+ * instead. It is critical that the total size of the request
+ * (len + alignment + slop) does not increase from this point
+ * on, so reset minalignslop to ensure it is not included in
+ * subsequent requests.
+ */
args.minalignslop = 0;
- args.prod = 1;
- /*
- * Allow space for the inode btree to split.
- */
- args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
- if ((error = xfs_alloc_vextent(&args)))
- return error;
+ } else
+ args.fsbno = NULLFSBLOCK;
+
+ if (unlikely(args.fsbno == NULLFSBLOCK)) {
+ /*
+ * Set the alignment for the allocation.
+ * If stripe alignment is turned on then align at stripe unit
+ * boundary.
+ * If the cluster size is smaller than a filesystem block
+ * then we're doing I/O for inodes in filesystem block size
+ * pieces, so don't need alignment anyway.
+ */
+ isaligned = 0;
+ if (args.mp->m_sinoalign) {
+ ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
+ args.alignment = args.mp->m_dalign;
+ isaligned = 1;
+ } else
+ args.alignment = xfs_ialloc_cluster_alignment(&args);
+ /*
+ * Need to figure out where to allocate the inode blocks.
+ * Ideally they should be spaced out through the a.g.
+ * For now, just allocate blocks up front.
+ */
+ args.agbno = be32_to_cpu(agi->agi_root);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ /*
+ * Allocate a fixed-size extent of inodes.
+ */
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.prod = 1;
+ /*
+ * Allow space for the inode btree to split.
+ */
+ args.minleft = args.mp->m_in_maxlevels - 1;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ }
/*
* If stripe alignment is turned on, then try again with cluster
@@ -202,14 +476,8 @@ xfs_ialloc_ag_alloc(
if (isaligned && args.fsbno == NULLFSBLOCK) {
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp,
- be32_to_cpu(agi->agi_seqno), args.agbno);
- if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) &&
- args.mp->m_sb.sb_inoalignmt >=
- XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp)))
- args.alignment = args.mp->m_sb.sb_inoalignmt;
- else
- args.alignment = 1;
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ args.alignment = xfs_ialloc_cluster_alignment(&args);
if ((error = xfs_alloc_vextent(&args)))
return error;
}
@@ -219,93 +487,46 @@ xfs_ialloc_ag_alloc(
return 0;
}
ASSERT(args.len == args.minlen);
+
/*
- * Convert the results.
- */
- newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
- /*
- * Loop over the new block(s), filling in the inodes.
- * For small block sizes, manipulate the inodes in buffers
- * which are multiples of the blocks size.
+ * Stamp and write the inode buffers.
+ *
+ * Seed the new inode cluster with a random generation number. This
+ * prevents short-term reuse of generation numbers if a chunk is
+ * freed and then immediately reallocated. We use random numbers
+ * rather than a linear progression to prevent the next generation
+ * number from being easily guessable.
*/
- if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) {
- blks_per_cluster = 1;
- nbufs = (int)args.len;
- ninodes = args.mp->m_sb.sb_inopblock;
- } else {
- blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) /
- args.mp->m_sb.sb_blocksize;
- nbufs = (int)args.len / blks_per_cluster;
- ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock;
- }
+ error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
+ args.len, prandom_u32());
+
+ if (error)
+ return error;
/*
- * Figure out what version number to use in the inodes we create.
- * If the superblock version has caught up to the one that supports
- * the new inode format, then use the new inode version. Otherwise
- * use the old version so that old kernels will continue to be
- * able to use the file system.
+ * Convert the results.
*/
- if (XFS_SB_VERSION_HASNLINK(&args.mp->m_sb))
- version = XFS_DINODE_VERSION_2;
- else
- version = XFS_DINODE_VERSION_1;
-
- memset(&dic, 0, sizeof(xfs_dinode_core_t));
- INT_SET(dic.di_magic, ARCH_CONVERT, XFS_DINODE_MAGIC);
- INT_SET(dic.di_version, ARCH_CONVERT, version);
-
- for (j = 0; j < nbufs; j++) {
- /*
- * Get the block.
- */
- d = XFS_AGB_TO_DADDR(args.mp, be32_to_cpu(agi->agi_seqno),
- args.agbno + (j * blks_per_cluster));
- fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d,
- args.mp->m_bsize * blks_per_cluster,
- XFS_BUF_LOCK);
- ASSERT(fbuf);
- ASSERT(!XFS_BUF_GETERROR(fbuf));
- /*
- * Loop over the inodes in this buffer.
- */
-
- for (i = 0; i < ninodes; i++) {
- free = XFS_MAKE_IPTR(args.mp, fbuf, i);
- memcpy(&(free->di_core), &dic, sizeof(xfs_dinode_core_t));
- INT_SET(free->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
- xfs_ialloc_log_di(tp, fbuf, i,
- XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED);
- }
- xfs_trans_inode_alloc_buf(tp, fbuf);
- }
- be32_add(&agi->agi_count, newlen);
- be32_add(&agi->agi_freecount, newlen);
- down_read(&args.mp->m_peraglock);
- args.mp->m_perag[be32_to_cpu(agi->agi_seqno)].pagi_freecount += newlen;
- up_read(&args.mp->m_peraglock);
+ newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+ be32_add_cpu(&agi->agi_count, newlen);
+ be32_add_cpu(&agi->agi_freecount, newlen);
+ pag = xfs_perag_get(args.mp, agno);
+ pag->pagi_freecount += newlen;
+ xfs_perag_put(pag);
agi->agi_newino = cpu_to_be32(newino);
+
/*
- * Insert records describing the new inode chunk into the btree.
+ * Insert records describing the new inode chunk into the btrees.
*/
- cur = xfs_btree_init_cursor(args.mp, tp, agbp,
- be32_to_cpu(agi->agi_seqno),
- XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
- for (thisino = newino;
- thisino < newino + newlen;
- thisino += XFS_INODES_PER_CHUNK) {
- if ((error = xfs_inobt_lookup_eq(cur, thisino,
- XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) {
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- return error;
- }
- ASSERT(i == 0);
- if ((error = xfs_inobt_insert(cur, &i))) {
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+ XFS_BTNUM_INO);
+ if (error)
+ return error;
+
+ if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+ error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+ XFS_BTNUM_FINO);
+ if (error)
return error;
- }
- ASSERT(i == 1);
}
- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
/*
* Log allocation group header fields
*/
@@ -320,7 +541,7 @@ xfs_ialloc_ag_alloc(
return 0;
}
-STATIC __inline xfs_agnumber_t
+STATIC xfs_agnumber_t
xfs_ialloc_next_ag(
xfs_mount_t *mp)
{
@@ -328,7 +549,7 @@ xfs_ialloc_next_ag(
spin_lock(&mp->m_agirotor_lock);
agno = mp->m_agirotor;
- if (++mp->m_agirotor == mp->m_maxagi)
+ if (++mp->m_agirotor >= mp->m_maxagi)
mp->m_agirotor = 0;
spin_unlock(&mp->m_agirotor_lock);
@@ -337,16 +558,15 @@ xfs_ialloc_next_ag(
/*
* Select an allocation group to look for a free inode in, based on the parent
- * inode and then mode. Return the allocation group buffer.
+ * inode and the mode. Return the allocation group buffer.
*/
-STATIC xfs_buf_t * /* allocation group buffer */
+STATIC xfs_agnumber_t
xfs_ialloc_ag_select(
xfs_trans_t *tp, /* transaction pointer */
xfs_ino_t parent, /* parent directory inode number */
- mode_t mode, /* bits set to indicate file type */
+ umode_t mode, /* bits set to indicate file type */
int okalloc) /* ok to allocate more space */
{
- xfs_buf_t *agbp; /* allocation group header buffer */
xfs_agnumber_t agcount; /* number of ag's in the filesystem */
xfs_agnumber_t agno; /* current ag number */
int flags; /* alloc buffer locking flags */
@@ -356,6 +576,7 @@ xfs_ialloc_ag_select(
int needspace; /* file mode implies space allocated */
xfs_perag_t *pag; /* per allocation group data */
xfs_agnumber_t pagno; /* parent (starting) ag number */
+ int error;
/*
* Files of these types need at least one block if length > 0
@@ -371,7 +592,9 @@ xfs_ialloc_ag_select(
if (pagno >= agcount)
pagno = 0;
}
+
ASSERT(pagno < agcount);
+
/*
* Loop through allocation groups, looking for one with a little
* free space in it. Note we don't look for free inodes, exactly.
@@ -381,637 +604,876 @@ xfs_ialloc_ag_select(
*/
agno = pagno;
flags = XFS_ALLOC_FLAG_TRYLOCK;
- down_read(&mp->m_peraglock);
for (;;) {
- pag = &mp->m_perag[agno];
+ pag = xfs_perag_get(mp, agno);
+ if (!pag->pagi_inodeok) {
+ xfs_ialloc_next_ag(mp);
+ goto nextag;
+ }
+
if (!pag->pagi_init) {
- if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
- agbp = NULL;
+ error = xfs_ialloc_pagi_init(mp, tp, agno);
+ if (error)
goto nextag;
- }
- } else
- agbp = NULL;
+ }
- if (!pag->pagi_inodeok) {
- xfs_ialloc_next_ag(mp);
- goto unlock_nextag;
+ if (pag->pagi_freecount) {
+ xfs_perag_put(pag);
+ return agno;
}
- /*
- * Is there enough free space for the file plus a block
- * of inodes (if we need to allocate some)?
- */
- ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp);
- if (ineed && !pag->pagf_init) {
- if (agbp == NULL &&
- xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
- agbp = NULL;
+ if (!okalloc)
+ goto nextag;
+
+ if (!pag->pagf_init) {
+ error = xfs_alloc_pagf_init(mp, tp, agno, flags);
+ if (error)
goto nextag;
- }
- (void)xfs_alloc_pagf_init(mp, tp, agno, flags);
}
- if (!ineed || pag->pagf_init) {
- if (ineed && !(longest = pag->pagf_longest))
- longest = pag->pagf_flcount > 0;
- if (!ineed ||
- (pag->pagf_freeblks >= needspace + ineed &&
- longest >= ineed &&
- okalloc)) {
- if (agbp == NULL &&
- xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
- agbp = NULL;
- goto nextag;
- }
- up_read(&mp->m_peraglock);
- return agbp;
- }
+
+ /*
+ * Is there enough free space for the file plus a block of
+ * inodes? (if we need to allocate some)?
+ */
+ ineed = mp->m_ialloc_blks;
+ longest = pag->pagf_longest;
+ if (!longest)
+ longest = pag->pagf_flcount > 0;
+
+ if (pag->pagf_freeblks >= needspace + ineed &&
+ longest >= ineed) {
+ xfs_perag_put(pag);
+ return agno;
}
-unlock_nextag:
- if (agbp)
- xfs_trans_brelse(tp, agbp);
nextag:
+ xfs_perag_put(pag);
/*
* No point in iterating over the rest, if we're shutting
* down.
*/
- if (XFS_FORCED_SHUTDOWN(mp)) {
- up_read(&mp->m_peraglock);
- return (xfs_buf_t *)0;
- }
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return NULLAGNUMBER;
agno++;
if (agno >= agcount)
agno = 0;
if (agno == pagno) {
- if (flags == 0) {
- up_read(&mp->m_peraglock);
- return (xfs_buf_t *)0;
- }
+ if (flags == 0)
+ return NULLAGNUMBER;
flags = 0;
}
}
}
/*
- * Visible inode allocation functions.
- */
-
-/*
- * Allocate an inode on disk.
- * Mode is used to tell whether the new inode will need space, and whether
- * it is a directory.
- *
- * The arguments IO_agbp and alloc_done are defined to work within
- * the constraint of one allocation per transaction.
- * xfs_dialloc() is designed to be called twice if it has to do an
- * allocation to make more free inodes. On the first call,
- * IO_agbp should be set to NULL. If an inode is available,
- * i.e., xfs_dialloc() did not need to do an allocation, an inode
- * number is returned. In this case, IO_agbp would be set to the
- * current ag_buf and alloc_done set to false.
- * If an allocation needed to be done, xfs_dialloc would return
- * the current ag_buf in IO_agbp and set alloc_done to true.
- * The caller should then commit the current transaction, allocate a new
- * transaction, and call xfs_dialloc() again, passing in the previous
- * value of IO_agbp. IO_agbp should be held across the transactions.
- * Since the agbp is locked across the two calls, the second call is
- * guaranteed to have a free inode available.
- *
- * Once we successfully pick an inode its number is returned and the
- * on-disk data structures are updated. The inode itself is not read
- * in, since doing so would break ordering constraints with xfs_reclaim.
+ * Try to retrieve the next record to the left/right from the current one.
*/
-int
-xfs_dialloc(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_ino_t parent, /* parent inode (directory) */
- mode_t mode, /* mode bits for new inode */
- int okalloc, /* ok to allocate more space */
- xfs_buf_t **IO_agbp, /* in/out ag header's buffer */
- boolean_t *alloc_done, /* true if we needed to replenish
- inode freelist */
- xfs_ino_t *inop) /* inode number allocated */
+STATIC int
+xfs_ialloc_next_rec(
+ struct xfs_btree_cur *cur,
+ xfs_inobt_rec_incore_t *rec,
+ int *done,
+ int left)
{
- xfs_agnumber_t agcount; /* number of allocation groups */
- xfs_buf_t *agbp; /* allocation group header's buffer */
- xfs_agnumber_t agno; /* allocation group number */
- xfs_agi_t *agi; /* allocation group header structure */
- xfs_btree_cur_t *cur; /* inode allocation btree cursor */
- int error; /* error return value */
- int i; /* result code */
- int ialloced; /* inode allocation status */
- int noroom = 0; /* no space for inode blk allocation */
- xfs_ino_t ino; /* fs-relative inode to be returned */
- /* REFERENCED */
- int j; /* result code */
- xfs_mount_t *mp; /* file system mount structure */
- int offset; /* index of inode in chunk */
- xfs_agino_t pagino; /* parent's a.g. relative inode # */
- xfs_agnumber_t pagno; /* parent's allocation group number */
- xfs_inobt_rec_t rec; /* inode allocation record */
- xfs_agnumber_t tagno; /* testing allocation group number */
- xfs_btree_cur_t *tcur; /* temp cursor */
- xfs_inobt_rec_t trec; /* temp inode allocation record */
-
-
- if (*IO_agbp == NULL) {
- /*
- * We do not have an agbp, so select an initial allocation
- * group for inode allocation.
- */
- agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
- /*
- * Couldn't find an allocation group satisfying the
- * criteria, give up.
- */
- if (!agbp) {
- *inop = NULLFSINO;
- return 0;
- }
- agi = XFS_BUF_TO_AGI(agbp);
- ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
- } else {
- /*
- * Continue where we left off before. In this case, we
- * know that the allocation group has free inodes.
- */
- agbp = *IO_agbp;
- agi = XFS_BUF_TO_AGI(agbp);
- ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
- ASSERT(be32_to_cpu(agi->agi_freecount) > 0);
- }
- mp = tp->t_mountp;
- agcount = mp->m_sb.sb_agcount;
- agno = be32_to_cpu(agi->agi_seqno);
- tagno = agno;
- pagno = XFS_INO_TO_AGNO(mp, parent);
- pagino = XFS_INO_TO_AGINO(mp, parent);
+ int error;
+ int i;
- /*
- * If we have already hit the ceiling of inode blocks then clear
- * okalloc so we scan all available agi structures for a free
- * inode.
- */
+ if (left)
+ error = xfs_btree_decrement(cur, 0, &i);
+ else
+ error = xfs_btree_increment(cur, 0, &i);
- if (mp->m_maxicount &&
- mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) {
- noroom = 1;
- okalloc = 0;
+ if (error)
+ return error;
+ *done = !i;
+ if (i) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
}
- /*
- * Loop until we find an allocation group that either has free inodes
- * or in which we can allocate some inodes. Iterate through the
- * allocation groups upward, wrapping at the end.
- */
- *alloc_done = B_FALSE;
- while (!agi->agi_freecount) {
- /*
- * Don't do anything if we're not supposed to allocate
- * any blocks, just go on to the next ag.
- */
- if (okalloc) {
- /*
- * Try to allocate some new inodes in the allocation
- * group.
- */
- if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) {
- xfs_trans_brelse(tp, agbp);
- if (error == ENOSPC) {
- *inop = NULLFSINO;
- return 0;
- } else
- return error;
- }
- if (ialloced) {
- /*
- * We successfully allocated some inodes, return
- * the current context to the caller so that it
- * can commit the current transaction and call
- * us again where we left off.
- */
- ASSERT(be32_to_cpu(agi->agi_freecount) > 0);
- *alloc_done = B_TRUE;
- *IO_agbp = agbp;
- *inop = NULLFSINO;
- return 0;
- }
- }
- /*
- * If it failed, give up on this ag.
- */
- xfs_trans_brelse(tp, agbp);
- /*
- * Go on to the next ag: get its ag header.
- */
-nextag:
- if (++tagno == agcount)
- tagno = 0;
- if (tagno == agno) {
- *inop = NULLFSINO;
- return noroom ? ENOSPC : 0;
- }
- down_read(&mp->m_peraglock);
- if (mp->m_perag[tagno].pagi_inodeok == 0) {
- up_read(&mp->m_peraglock);
- goto nextag;
- }
- error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
- up_read(&mp->m_peraglock);
+ return 0;
+}
+
+STATIC int
+xfs_ialloc_get_rec(
+ struct xfs_btree_cur *cur,
+ xfs_agino_t agino,
+ xfs_inobt_rec_incore_t *rec,
+ int *done)
+{
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ return error;
+ *done = !i;
+ if (i) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
- goto nextag;
- agi = XFS_BUF_TO_AGI(agbp);
- ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
}
- /*
- * Here with an allocation group that has a free inode.
- * Reset agno since we may have chosen a new ag in the
- * loop above.
- */
- agno = tagno;
- *IO_agbp = NULL;
- cur = xfs_btree_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno),
- XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
+
+ return 0;
+}
+
+/*
+ * Allocate an inode using the inobt-only algorithm.
+ */
+STATIC int
+xfs_dialloc_ag_inobt(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_ino_t parent,
+ xfs_ino_t *inop)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
+ xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
+ struct xfs_perag *pag;
+ struct xfs_btree_cur *cur, *tcur;
+ struct xfs_inobt_rec_incore rec, trec;
+ xfs_ino_t ino;
+ int error;
+ int offset;
+ int i, j;
+
+ pag = xfs_perag_get(mp, agno);
+
+ ASSERT(pag->pagi_init);
+ ASSERT(pag->pagi_inodeok);
+ ASSERT(pag->pagi_freecount > 0);
+
+ restart_pagno:
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
/*
* If pagino is 0 (this is the root inode allocation) use newino.
* This must work because we've just allocated some.
*/
if (!pagino)
pagino = be32_to_cpu(agi->agi_newino);
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- do {
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- freecount += rec.ir_freecount;
- if ((error = xfs_inobt_increment(cur, 0, &i)))
- goto error0;
- } while (i == 1);
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
/*
- * If in the same a.g. as the parent, try to get near the parent.
+ * If in the same AG as the parent, try to get near the parent.
*/
if (pagno == agno) {
- if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i)))
+ int doneleft; /* done, to the left */
+ int doneright; /* done, to the right */
+ int searchdistance = 10;
+
+ error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ error = xfs_inobt_get_rec(cur, &rec, &j);
+ if (error)
goto error0;
- if (i != 0 &&
- (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
- j == 1 &&
- rec.ir_freecount > 0) {
+ XFS_WANT_CORRUPTED_GOTO(j == 1, error0);
+
+ if (rec.ir_freecount > 0) {
/*
* Found a free inode in the same chunk
- * as parent, done.
+ * as the parent, done.
*/
+ goto alloc_inode;
}
+
+
/*
- * In the same a.g. as parent, but parent's chunk is full.
+ * In the same AG as parent, but parent's chunk is full.
*/
- else {
- int doneleft; /* done, to the left */
- int doneright; /* done, to the right */
+ /* duplicate the cursor, search left & right simultaneously */
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+
+ /*
+ * Skip to last blocks looked up if same parent inode.
+ */
+ if (pagino != NULLAGINO &&
+ pag->pagl_pagino == pagino &&
+ pag->pagl_leftrec != NULLAGINO &&
+ pag->pagl_rightrec != NULLAGINO) {
+ error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
+ &trec, &doneleft);
if (error)
- goto error0;
- ASSERT(i == 1);
- ASSERT(j == 1);
- /*
- * Duplicate the cursor, search left & right
- * simultaneously.
- */
- if ((error = xfs_btree_dup_cursor(cur, &tcur)))
- goto error0;
- /*
- * Search left with tcur, back up 1 record.
- */
- if ((error = xfs_inobt_decrement(tcur, 0, &i)))
goto error1;
- doneleft = !i;
- if (!doneleft) {
- if ((error = xfs_inobt_get_rec(tcur,
- &trec.ir_startino,
- &trec.ir_freecount,
- &trec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
- }
- /*
- * Search right with cur, go forward 1 record.
- */
- if ((error = xfs_inobt_increment(cur, 0, &i)))
+
+ error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
+ &rec, &doneright);
+ if (error)
+ goto error1;
+ } else {
+ /* search left with tcur, back up 1 record */
+ error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
+ if (error)
goto error1;
- doneright = !i;
- if (!doneright) {
- if ((error = xfs_inobt_get_rec(cur,
- &rec.ir_startino,
- &rec.ir_freecount,
- &rec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
- }
- /*
- * Loop until we find the closest inode chunk
- * with a free one.
- */
- while (!doneleft || !doneright) {
- int useleft; /* using left inode
- chunk this time */
+ /* search right with cur, go forward 1 record. */
+ error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
+ if (error)
+ goto error1;
+ }
+
+ /*
+ * Loop until we find an inode chunk with a free inode.
+ */
+ while (!doneleft || !doneright) {
+ int useleft; /* using left inode chunk this time */
+
+ if (!--searchdistance) {
/*
- * Figure out which block is closer,
- * if both are valid.
- */
- if (!doneleft && !doneright)
- useleft =
- pagino -
- (trec.ir_startino +
- XFS_INODES_PER_CHUNK - 1) <
- rec.ir_startino - pagino;
- else
- useleft = !doneleft;
- /*
- * If checking the left, does it have
- * free inodes?
- */
- if (useleft && trec.ir_freecount) {
- /*
- * Yes, set it up as the chunk to use.
- */
- rec = trec;
- xfs_btree_del_cursor(cur,
- XFS_BTREE_NOERROR);
- cur = tcur;
- break;
- }
- /*
- * If checking the right, does it have
- * free inodes?
- */
- if (!useleft && rec.ir_freecount) {
- /*
- * Yes, it's already set up.
- */
- xfs_btree_del_cursor(tcur,
- XFS_BTREE_NOERROR);
- break;
- }
- /*
- * If used the left, get another one
- * further left.
- */
- if (useleft) {
- if ((error = xfs_inobt_decrement(tcur, 0,
- &i)))
- goto error1;
- doneleft = !i;
- if (!doneleft) {
- if ((error = xfs_inobt_get_rec(
- tcur,
- &trec.ir_startino,
- &trec.ir_freecount,
- &trec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1,
- error1);
- }
- }
- /*
- * If used the right, get another one
- * further right.
+ * Not in range - save last search
+ * location and allocate a new inode
*/
- else {
- if ((error = xfs_inobt_increment(cur, 0,
- &i)))
- goto error1;
- doneright = !i;
- if (!doneright) {
- if ((error = xfs_inobt_get_rec(
- cur,
- &rec.ir_startino,
- &rec.ir_freecount,
- &rec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1,
- error1);
- }
- }
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto newino;
+ }
+
+ /* figure out the closer block if both are valid. */
+ if (!doneleft && !doneright) {
+ useleft = pagino -
+ (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
+ rec.ir_startino - pagino;
+ } else {
+ useleft = !doneleft;
+ }
+
+ /* free inodes to the left? */
+ if (useleft && trec.ir_freecount) {
+ rec = trec;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ cur = tcur;
+
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto alloc_inode;
}
- ASSERT(!doneleft || !doneright);
+
+ /* free inodes to the right? */
+ if (!useleft && rec.ir_freecount) {
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto alloc_inode;
+ }
+
+ /* get next record to check */
+ if (useleft) {
+ error = xfs_ialloc_next_rec(tcur, &trec,
+ &doneleft, 1);
+ } else {
+ error = xfs_ialloc_next_rec(cur, &rec,
+ &doneright, 0);
+ }
+ if (error)
+ goto error1;
}
+
+ /*
+ * We've reached the end of the btree. because
+ * we are only searching a small chunk of the
+ * btree each search, there is obviously free
+ * inodes closer to the parent inode than we
+ * are now. restart the search again.
+ */
+ pag->pagl_pagino = NULLAGINO;
+ pag->pagl_leftrec = NULLAGINO;
+ pag->pagl_rightrec = NULLAGINO;
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ goto restart_pagno;
}
+
/*
- * In a different a.g. from the parent.
+ * In a different AG from the parent.
* See if the most recently allocated block has any free.
*/
- else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
- if ((error = xfs_inobt_lookup_eq(cur,
- be32_to_cpu(agi->agi_newino), 0, 0, &i)))
+newino:
+ if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+ error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+ XFS_LOOKUP_EQ, &i);
+ if (error)
goto error0;
- if (i == 1 &&
- (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
- j == 1 &&
- rec.ir_freecount > 0) {
- /*
- * The last chunk allocated in the group still has
- * a free inode.
- */
- }
- /*
- * None left in the last group, search the whole a.g.
- */
- else {
+
+ if (i == 1) {
+ error = xfs_inobt_get_rec(cur, &rec, &j);
if (error)
goto error0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- ASSERT(i == 1);
- for (;;) {
- if ((error = xfs_inobt_get_rec(cur,
- &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free,
- &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if (rec.ir_freecount > 0)
- break;
- if ((error = xfs_inobt_increment(cur, 0, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ if (j == 1 && rec.ir_freecount > 0) {
+ /*
+ * The last chunk allocated in the group
+ * still has a free inode.
+ */
+ goto alloc_inode;
}
}
}
- offset = XFS_IALLOC_FIND_FREE(&rec.ir_free);
+
+ /*
+ * None left in the last group, search the whole AG
+ */
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ for (;;) {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ if (rec.ir_freecount > 0)
+ break;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ }
+
+alloc_inode:
+ offset = xfs_lowbit64(rec.ir_free);
ASSERT(offset >= 0);
ASSERT(offset < XFS_INODES_PER_CHUNK);
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
- XFS_INOBT_CLR_FREE(&rec, offset);
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
rec.ir_freecount--;
- if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
- rec.ir_free)))
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
goto error0;
- be32_add(&agi->agi_freecount, -1);
+ be32_add_cpu(&agi->agi_freecount, -1);
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
- down_read(&mp->m_peraglock);
- mp->m_perag[tagno].pagi_freecount--;
- up_read(&mp->m_peraglock);
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
+ pag->pagi_freecount--;
+
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- do {
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- freecount += rec.ir_freecount;
- if ((error = xfs_inobt_increment(cur, 0, &i)))
- goto error0;
- } while (i == 1);
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+ xfs_perag_put(pag);
*inop = ino;
return 0;
error1:
xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
error0:
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ xfs_perag_put(pag);
return error;
}
/*
- * Free disk inode. Carefully avoids touching the incore inode, all
- * manipulations incore are the caller's responsibility.
- * The on-disk inode is not changed by this operation, only the
- * btree (free inode mask) is changed.
+ * Use the free inode btree to allocate an inode based on distance from the
+ * parent. Note that the provided cursor may be deleted and replaced.
*/
-int
-xfs_difree(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_ino_t inode, /* inode to be freed */
- xfs_bmap_free_t *flist, /* extents to free */
- int *delete, /* set if inode cluster was deleted */
- xfs_ino_t *first_ino) /* first inode in deleted cluster */
+STATIC int
+xfs_dialloc_ag_finobt_near(
+ xfs_agino_t pagino,
+ struct xfs_btree_cur **ocur,
+ struct xfs_inobt_rec_incore *rec)
{
- /* REFERENCED */
- xfs_agblock_t agbno; /* block number containing inode */
- xfs_buf_t *agbp; /* buffer containing allocation group header */
- xfs_agino_t agino; /* inode number relative to allocation group */
- xfs_agnumber_t agno; /* allocation group number */
- xfs_agi_t *agi; /* allocation group header */
- xfs_btree_cur_t *cur; /* inode btree cursor */
- int error; /* error return value */
- int i; /* result code */
- int ilen; /* inodes in an inode cluster */
- xfs_mount_t *mp; /* mount structure for filesystem */
- int off; /* offset of inode in inode chunk */
- xfs_inobt_rec_t rec; /* btree record */
+ struct xfs_btree_cur *lcur = *ocur; /* left search cursor */
+ struct xfs_btree_cur *rcur; /* right search cursor */
+ struct xfs_inobt_rec_incore rrec;
+ int error;
+ int i, j;
- mp = tp->t_mountp;
+ error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
+ if (error)
+ return error;
+
+ if (i == 1) {
+ error = xfs_inobt_get_rec(lcur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ /*
+ * See if we've landed in the parent inode record. The finobt
+ * only tracks chunks with at least one free inode, so record
+ * existence is enough.
+ */
+ if (pagino >= rec->ir_startino &&
+ pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
+ return 0;
+ }
+
+ error = xfs_btree_dup_cursor(lcur, &rcur);
+ if (error)
+ return error;
+
+ error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
+ if (error)
+ goto error_rcur;
+ if (j == 1) {
+ error = xfs_inobt_get_rec(rcur, &rrec, &j);
+ if (error)
+ goto error_rcur;
+ XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur);
+ }
+
+ XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur);
+ if (i == 1 && j == 1) {
+ /*
+ * Both the left and right records are valid. Choose the closer
+ * inode chunk to the target.
+ */
+ if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
+ (rrec.ir_startino - pagino)) {
+ *rec = rrec;
+ xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+ *ocur = rcur;
+ } else {
+ xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+ }
+ } else if (j == 1) {
+ /* only the right record is valid */
+ *rec = rrec;
+ xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+ *ocur = rcur;
+ } else if (i == 1) {
+ /* only the left record is valid */
+ xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+ }
+
+ return 0;
+
+error_rcur:
+ xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Use the free inode btree to find a free inode based on a newino hint. If
+ * the hint is NULL, find the first free inode in the AG.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_newino(
+ struct xfs_agi *agi,
+ struct xfs_btree_cur *cur,
+ struct xfs_inobt_rec_incore *rec)
+{
+ int error;
+ int i;
+
+ if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+ error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ,
+ &i);
+ if (error)
+ return error;
+ if (i == 1) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ return 0;
+ }
+ }
/*
- * Break up inode number into its components.
+ * Find the first inode available in the AG.
*/
- agno = XFS_INO_TO_AGNO(mp, inode);
- if (agno >= mp->m_sb.sb_agcount) {
- cmn_err(CE_WARN,
- "xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s. Returning EINVAL.",
- agno, mp->m_sb.sb_agcount, mp->m_fsname);
- ASSERT(0);
- return XFS_ERROR(EINVAL);
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ return 0;
+}
+
+/*
+ * Update the inobt based on a modification made to the finobt. Also ensure that
+ * the records from both trees are equivalent post-modification.
+ */
+STATIC int
+xfs_dialloc_ag_update_inobt(
+ struct xfs_btree_cur *cur, /* inobt cursor */
+ struct xfs_inobt_rec_incore *frec, /* finobt record */
+ int offset) /* inode offset */
+{
+ struct xfs_inobt_rec_incore rec;
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
+ XFS_INODES_PER_CHUNK) == 0);
+
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
+ rec.ir_freecount--;
+
+ XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) &&
+ (rec.ir_freecount == frec->ir_freecount));
+
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
+ return error;
+
+ return 0;
+}
+
+/*
+ * Allocate an inode using the free inode btree, if available. Otherwise, fall
+ * back to the inobt search algorithm.
+ *
+ * The caller selected an AG for us, and made sure that free inodes are
+ * available.
+ */
+STATIC int
+xfs_dialloc_ag(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_ino_t parent,
+ xfs_ino_t *inop)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
+ xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
+ struct xfs_perag *pag;
+ struct xfs_btree_cur *cur; /* finobt cursor */
+ struct xfs_btree_cur *icur; /* inobt cursor */
+ struct xfs_inobt_rec_incore rec;
+ xfs_ino_t ino;
+ int error;
+ int offset;
+ int i;
+
+ if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+ return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
+
+ pag = xfs_perag_get(mp, agno);
+
+ /*
+ * If pagino is 0 (this is the root inode allocation) use newino.
+ * This must work because we've just allocated some.
+ */
+ if (!pagino)
+ pagino = be32_to_cpu(agi->agi_newino);
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error_cur;
+
+ /*
+ * The search algorithm depends on whether we're in the same AG as the
+ * parent. If so, find the closest available inode to the parent. If
+ * not, consider the agi hint or find the first free inode in the AG.
+ */
+ if (agno == pagno)
+ error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
+ else
+ error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
+ if (error)
+ goto error_cur;
+
+ offset = xfs_lowbit64(rec.ir_free);
+ ASSERT(offset >= 0);
+ ASSERT(offset < XFS_INODES_PER_CHUNK);
+ ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+ XFS_INODES_PER_CHUNK) == 0);
+ ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+
+ /*
+ * Modify or remove the finobt record.
+ */
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
+ rec.ir_freecount--;
+ if (rec.ir_freecount)
+ error = xfs_inobt_update(cur, &rec);
+ else
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto error_cur;
+
+ /*
+ * The finobt has now been updated appropriately. We haven't updated the
+ * agi and superblock yet, so we can create an inobt cursor and validate
+ * the original freecount. If all is well, make the equivalent update to
+ * the inobt using the finobt record and offset information.
+ */
+ icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+
+ error = xfs_check_agi_freecount(icur, agi);
+ if (error)
+ goto error_icur;
+
+ error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
+ if (error)
+ goto error_icur;
+
+ /*
+ * Both trees have now been updated. We must update the perag and
+ * superblock before we can check the freecount for each btree.
+ */
+ be32_add_cpu(&agi->agi_freecount, -1);
+ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+ pag->pagi_freecount--;
+
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+
+ error = xfs_check_agi_freecount(icur, agi);
+ if (error)
+ goto error_icur;
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error_icur;
+
+ xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ xfs_perag_put(pag);
+ *inop = ino;
+ return 0;
+
+error_icur:
+ xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
+error_cur:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * Allocate an inode on disk.
+ *
+ * Mode is used to tell whether the new inode will need space, and whether it
+ * is a directory.
+ *
+ * This function is designed to be called twice if it has to do an allocation
+ * to make more free inodes. On the first call, *IO_agbp should be set to NULL.
+ * If an inode is available without having to performn an allocation, an inode
+ * number is returned. In this case, *IO_agbp is set to NULL. If an allocation
+ * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
+ * The caller should then commit the current transaction, allocate a
+ * new transaction, and call xfs_dialloc() again, passing in the previous value
+ * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI
+ * buffer is locked across the two calls, the second call is guaranteed to have
+ * a free inode available.
+ *
+ * Once we successfully pick an inode its number is returned and the on-disk
+ * data structures are updated. The inode itself is not read in, since doing so
+ * would break ordering constraints with xfs_reclaim.
+ */
+int
+xfs_dialloc(
+ struct xfs_trans *tp,
+ xfs_ino_t parent,
+ umode_t mode,
+ int okalloc,
+ struct xfs_buf **IO_agbp,
+ xfs_ino_t *inop)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *agbp;
+ xfs_agnumber_t agno;
+ int error;
+ int ialloced;
+ int noroom = 0;
+ xfs_agnumber_t start_agno;
+ struct xfs_perag *pag;
+
+ if (*IO_agbp) {
+ /*
+ * If the caller passes in a pointer to the AGI buffer,
+ * continue where we left off before. In this case, we
+ * know that the allocation group has free inodes.
+ */
+ agbp = *IO_agbp;
+ goto out_alloc;
}
- agino = XFS_INO_TO_AGINO(mp, inode);
- if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
- cmn_err(CE_WARN,
- "xfs_difree: inode != XFS_AGINO_TO_INO() "
- "(%llu != %llu) on %s. Returning EINVAL.",
- (unsigned long long)inode,
- (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino),
- mp->m_fsname);
- ASSERT(0);
- return XFS_ERROR(EINVAL);
+
+ /*
+ * We do not have an agbp, so select an initial allocation
+ * group for inode allocation.
+ */
+ start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
+ if (start_agno == NULLAGNUMBER) {
+ *inop = NULLFSINO;
+ return 0;
}
- agbno = XFS_AGINO_TO_AGBNO(mp, agino);
- if (agbno >= mp->m_sb.sb_agblocks) {
- cmn_err(CE_WARN,
- "xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s. Returning EINVAL.",
- agbno, mp->m_sb.sb_agblocks, mp->m_fsname);
- ASSERT(0);
- return XFS_ERROR(EINVAL);
+
+ /*
+ * If we have already hit the ceiling of inode blocks then clear
+ * okalloc so we scan all available agi structures for a free
+ * inode.
+ */
+ if (mp->m_maxicount &&
+ mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) {
+ noroom = 1;
+ okalloc = 0;
}
+
/*
- * Get the allocation group header.
+ * Loop until we find an allocation group that either has free inodes
+ * or in which we can allocate some inodes. Iterate through the
+ * allocation groups upward, wrapping at the end.
*/
- down_read(&mp->m_peraglock);
- error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
- up_read(&mp->m_peraglock);
- if (error) {
- cmn_err(CE_WARN,
- "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.",
- error, mp->m_fsname);
- return error;
+ agno = start_agno;
+ for (;;) {
+ pag = xfs_perag_get(mp, agno);
+ if (!pag->pagi_inodeok) {
+ xfs_ialloc_next_ag(mp);
+ goto nextag;
+ }
+
+ if (!pag->pagi_init) {
+ error = xfs_ialloc_pagi_init(mp, tp, agno);
+ if (error)
+ goto out_error;
+ }
+
+ /*
+ * Do a first racy fast path check if this AG is usable.
+ */
+ if (!pag->pagi_freecount && !okalloc)
+ goto nextag;
+
+ /*
+ * Then read in the AGI buffer and recheck with the AGI buffer
+ * lock held.
+ */
+ error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ if (error)
+ goto out_error;
+
+ if (pag->pagi_freecount) {
+ xfs_perag_put(pag);
+ goto out_alloc;
+ }
+
+ if (!okalloc)
+ goto nextag_relse_buffer;
+
+
+ error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
+ if (error) {
+ xfs_trans_brelse(tp, agbp);
+
+ if (error != ENOSPC)
+ goto out_error;
+
+ xfs_perag_put(pag);
+ *inop = NULLFSINO;
+ return 0;
+ }
+
+ if (ialloced) {
+ /*
+ * We successfully allocated some inodes, return
+ * the current context to the caller so that it
+ * can commit the current transaction and call
+ * us again where we left off.
+ */
+ ASSERT(pag->pagi_freecount > 0);
+ xfs_perag_put(pag);
+
+ *IO_agbp = agbp;
+ *inop = NULLFSINO;
+ return 0;
+ }
+
+nextag_relse_buffer:
+ xfs_trans_brelse(tp, agbp);
+nextag:
+ xfs_perag_put(pag);
+ if (++agno == mp->m_sb.sb_agcount)
+ agno = 0;
+ if (agno == start_agno) {
+ *inop = NULLFSINO;
+ return noroom ? ENOSPC : 0;
+ }
}
- agi = XFS_BUF_TO_AGI(agbp);
- ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
- ASSERT(agbno < be32_to_cpu(agi->agi_length));
+
+out_alloc:
+ *IO_agbp = NULL;
+ return xfs_dialloc_ag(tp, agbp, parent, inop);
+out_error:
+ xfs_perag_put(pag);
+ return XFS_ERROR(error);
+}
+
+STATIC int
+xfs_difree_inobt(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agino_t agino,
+ struct xfs_bmap_free *flist,
+ int *deleted,
+ xfs_ino_t *first_ino,
+ struct xfs_inobt_rec_incore *orec)
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ struct xfs_perag *pag;
+ struct xfs_btree_cur *cur;
+ struct xfs_inobt_rec_incore rec;
+ int ilen;
+ int error;
+ int i;
+ int off;
+
+ ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
+ ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
+
/*
* Initialize the cursor.
*/
- cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
- (xfs_inode_t *)0, 0);
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- do {
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &i)))
- goto error0;
- if (i) {
- freecount += rec.ir_freecount;
- if ((error = xfs_inobt_increment(cur, 0, &i)))
- goto error0;
- }
- } while (i == 1);
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
/*
* Look for the entry describing this inode.
*/
- if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
- cmn_err(CE_WARN,
- "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.",
- error, mp->m_fsname);
+ if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
+ xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
+ __func__, error);
goto error0;
}
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount,
- &rec.ir_free, &i))) {
- cmn_err(CE_WARN,
- "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.",
- error, mp->m_fsname);
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
+ __func__, error);
goto error0;
}
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
@@ -1020,20 +1482,20 @@ xfs_difree(
*/
off = agino - rec.ir_startino;
ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
- ASSERT(!XFS_INOBT_IS_FREE(&rec, off));
+ ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
/*
* Mark the inode free & increment the count.
*/
- XFS_INOBT_SET_FREE(&rec, off);
+ rec.ir_free |= XFS_INOBT_MASK(off);
rec.ir_freecount++;
/*
- * When an inode cluster is free, it becomes elgible for removal
+ * When an inode cluster is free, it becomes eligible for removal
*/
- if ((mp->m_flags & XFS_MOUNT_IDELETE) &&
- (rec.ir_freecount == XFS_IALLOC_INODES(mp))) {
+ if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
+ (rec.ir_freecount == mp->m_ialloc_inos)) {
- *delete = 1;
+ *deleted = 1;
*first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
/*
@@ -1041,67 +1503,51 @@ xfs_difree(
* AGI and Superblock inode counts, and mark the disk space
* to be freed when the transaction is committed.
*/
- ilen = XFS_IALLOC_INODES(mp);
- be32_add(&agi->agi_count, -ilen);
- be32_add(&agi->agi_freecount, -(ilen - 1));
+ ilen = mp->m_ialloc_inos;
+ be32_add_cpu(&agi->agi_count, -ilen);
+ be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
- down_read(&mp->m_peraglock);
- mp->m_perag[agno].pagi_freecount -= ilen - 1;
- up_read(&mp->m_peraglock);
+ pag = xfs_perag_get(mp, agno);
+ pag->pagi_freecount -= ilen - 1;
+ xfs_perag_put(pag);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
- if ((error = xfs_inobt_delete(cur, &i))) {
- cmn_err(CE_WARN, "xfs_difree: xfs_inobt_delete returned an error %d on %s.\n",
- error, mp->m_fsname);
+ if ((error = xfs_btree_delete(cur, &i))) {
+ xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
+ __func__, error);
goto error0;
}
- xfs_bmap_add_free(XFS_AGB_TO_FSB(mp,
- agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)),
- XFS_IALLOC_BLOCKS(mp), flist, mp);
+ xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
+ XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
+ mp->m_ialloc_blks, flist, mp);
} else {
- *delete = 0;
+ *deleted = 0;
- if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) {
- cmn_err(CE_WARN,
- "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.",
- error, mp->m_fsname);
+ error = xfs_inobt_update(cur, &rec);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
+ __func__, error);
goto error0;
}
+
/*
* Change the inode free counts and log the ag/sb changes.
*/
- be32_add(&agi->agi_freecount, 1);
+ be32_add_cpu(&agi->agi_freecount, 1);
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
- down_read(&mp->m_peraglock);
- mp->m_perag[agno].pagi_freecount++;
- up_read(&mp->m_peraglock);
+ pag = xfs_perag_get(mp, agno);
+ pag->pagi_freecount++;
+ xfs_perag_put(pag);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
}
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- do {
- if ((error = xfs_inobt_get_rec(cur,
- &rec.ir_startino,
- &rec.ir_freecount,
- &rec.ir_free, &i)))
- goto error0;
- if (i) {
- freecount += rec.ir_freecount;
- if ((error = xfs_inobt_increment(cur, 0, &i)))
- goto error0;
- }
- } while (i == 1);
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
+ *orec = rec;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return 0;
@@ -1111,36 +1557,264 @@ error0:
}
/*
- * Return the location of the inode in bno/off, for mapping it into a buffer.
+ * Free an inode in the free inode btree.
+ */
+STATIC int
+xfs_difree_finobt(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agino_t agino,
+ struct xfs_inobt_rec_incore *ibtrec) /* inobt record */
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ struct xfs_btree_cur *cur;
+ struct xfs_inobt_rec_incore rec;
+ int offset = agino - ibtrec->ir_startino;
+ int error;
+ int i;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+
+ error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error;
+ if (i == 0) {
+ /*
+ * If the record does not exist in the finobt, we must have just
+ * freed an inode in a previously fully allocated chunk. If not,
+ * something is out of sync.
+ */
+ XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error);
+
+ error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+ ibtrec->ir_free, &i);
+ if (error)
+ goto error;
+ ASSERT(i == 1);
+
+ goto out;
+ }
+
+ /*
+ * Read and update the existing record. We could just copy the ibtrec
+ * across here, but that would defeat the purpose of having redundant
+ * metadata. By making the modifications independently, we can catch
+ * corruptions that we wouldn't see if we just copied from one record
+ * to another.
+ */
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error);
+
+ rec.ir_free |= XFS_INOBT_MASK(offset);
+ rec.ir_freecount++;
+
+ XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) &&
+ (rec.ir_freecount == ibtrec->ir_freecount),
+ error);
+
+ /*
+ * The content of inobt records should always match between the inobt
+ * and finobt. The lifecycle of records in the finobt is different from
+ * the inobt in that the finobt only tracks records with at least one
+ * free inode. Hence, if all of the inodes are free and we aren't
+ * keeping inode chunks permanently on disk, remove the record.
+ * Otherwise, update the record with the new information.
+ */
+ if (rec.ir_freecount == mp->m_ialloc_inos &&
+ !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto error;
+ ASSERT(i == 1);
+ } else {
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
+ goto error;
+ }
+
+out:
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error;
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+
+error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Free disk inode. Carefully avoids touching the incore inode, all
+ * manipulations incore are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
*/
-/*ARGSUSED*/
int
-xfs_dilocate(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
+xfs_difree(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_ino_t inode, /* inode to be freed */
+ struct xfs_bmap_free *flist, /* extents to free */
+ int *deleted,/* set if inode cluster was deleted */
+ xfs_ino_t *first_ino)/* first inode in deleted cluster */
+{
+ /* REFERENCED */
+ xfs_agblock_t agbno; /* block number containing inode */
+ struct xfs_buf *agbp; /* buffer for allocation group header */
+ xfs_agino_t agino; /* allocation group inode number */
+ xfs_agnumber_t agno; /* allocation group number */
+ int error; /* error return value */
+ struct xfs_mount *mp; /* mount structure for filesystem */
+ struct xfs_inobt_rec_incore rec;/* btree record */
+
+ mp = tp->t_mountp;
+
+ /*
+ * Break up inode number into its components.
+ */
+ agno = XFS_INO_TO_AGNO(mp, inode);
+ if (agno >= mp->m_sb.sb_agcount) {
+ xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
+ __func__, agno, mp->m_sb.sb_agcount);
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ agino = XFS_INO_TO_AGINO(mp, inode);
+ if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
+ xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
+ __func__, (unsigned long long)inode,
+ (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ if (agbno >= mp->m_sb.sb_agblocks) {
+ xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
+ __func__, agbno, mp->m_sb.sb_agblocks);
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ /*
+ * Get the allocation group header.
+ */
+ error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
+ __func__, error);
+ return error;
+ }
+
+ /*
+ * Fix up the inode allocation btree.
+ */
+ error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
+ &rec);
+ if (error)
+ goto error0;
+
+ /*
+ * Fix up the free inode btree.
+ */
+ if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ error = xfs_difree_finobt(mp, tp, agbp, agino, &rec);
+ if (error)
+ goto error0;
+ }
+
+ return 0;
+
+error0:
+ return error;
+}
+
+STATIC int
+xfs_imap_lookup(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ xfs_agblock_t agbno,
+ xfs_agblock_t *chunk_agbno,
+ xfs_agblock_t *offset_agbno,
+ int flags)
+{
+ struct xfs_inobt_rec_incore rec;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agbp;
+ int error;
+ int i;
+
+ error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ if (error) {
+ xfs_alert(mp,
+ "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
+ __func__, error, agno);
+ return error;
+ }
+
+ /*
+ * Lookup the inode record for the given agino. If the record cannot be
+ * found, then it's an invalid inode number and we should abort. Once
+ * we have a record, we need to ensure it contains the inode number
+ * we are looking up.
+ */
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
+ if (!error) {
+ if (i)
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (!error && i == 0)
+ error = EINVAL;
+ }
+
+ xfs_trans_brelse(tp, agbp);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ if (error)
+ return error;
+
+ /* check that the returned record contains the required inode */
+ if (rec.ir_startino > agino ||
+ rec.ir_startino + mp->m_ialloc_inos <= agino)
+ return EINVAL;
+
+ /* for untrusted inodes check it is allocated first */
+ if ((flags & XFS_IGET_UNTRUSTED) &&
+ (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
+ return EINVAL;
+
+ *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
+ *offset_agbno = agbno - *chunk_agbno;
+ return 0;
+}
+
+/*
+ * Return the location of the inode in imap, for mapping it into a buffer.
+ */
+int
+xfs_imap(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
xfs_ino_t ino, /* inode to locate */
- xfs_fsblock_t *bno, /* output: block containing inode */
- int *len, /* output: num blocks in inode cluster */
- int *off, /* output: index in block of inode */
- uint flags) /* flags concerning inode lookup */
+ struct xfs_imap *imap, /* location map structure */
+ uint flags) /* flags for inode btree lookup */
{
xfs_agblock_t agbno; /* block number of inode in the alloc group */
- xfs_buf_t *agbp; /* agi buffer */
xfs_agino_t agino; /* inode number within alloc group */
xfs_agnumber_t agno; /* allocation group number */
int blks_per_cluster; /* num blocks per inode cluster */
xfs_agblock_t chunk_agbno; /* first block in inode chunk */
- xfs_agino_t chunk_agino; /* first agino in inode chunk */
- __int32_t chunk_cnt; /* count of free inodes in chunk */
- xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
xfs_agblock_t cluster_agbno; /* first block in inode cluster */
- xfs_btree_cur_t *cur; /* inode btree cursor */
int error; /* error code */
- int i; /* temp state */
int offset; /* index of inode in its buffer */
- int offset_agbno; /* blks from chunk start to inode */
+ xfs_agblock_t offset_agbno; /* blks from chunk start to inode */
ASSERT(ino != NULLFSINO);
+
/*
* Split up the inode number into its parts.
*/
@@ -1150,108 +1824,107 @@ xfs_dilocate(
if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
#ifdef DEBUG
+ /*
+ * Don't output diagnostic information for untrusted inodes
+ * as they can be invalid without implying corruption.
+ */
+ if (flags & XFS_IGET_UNTRUSTED)
+ return XFS_ERROR(EINVAL);
if (agno >= mp->m_sb.sb_agcount) {
- xfs_fs_cmn_err(CE_ALERT, mp,
- "xfs_dilocate: agno (%d) >= "
- "mp->m_sb.sb_agcount (%d)",
- agno, mp->m_sb.sb_agcount);
+ xfs_alert(mp,
+ "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
+ __func__, agno, mp->m_sb.sb_agcount);
}
if (agbno >= mp->m_sb.sb_agblocks) {
- xfs_fs_cmn_err(CE_ALERT, mp,
- "xfs_dilocate: agbno (0x%llx) >= "
- "mp->m_sb.sb_agblocks (0x%lx)",
- (unsigned long long) agbno,
- (unsigned long) mp->m_sb.sb_agblocks);
+ xfs_alert(mp,
+ "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
+ __func__, (unsigned long long)agbno,
+ (unsigned long)mp->m_sb.sb_agblocks);
}
if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
- xfs_fs_cmn_err(CE_ALERT, mp,
- "xfs_dilocate: ino (0x%llx) != "
- "XFS_AGINO_TO_INO(mp, agno, agino) "
- "(0x%llx)",
- ino, XFS_AGINO_TO_INO(mp, agno, agino));
+ xfs_alert(mp,
+ "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
+ __func__, ino,
+ XFS_AGINO_TO_INO(mp, agno, agino));
}
+ xfs_stack_trace();
#endif /* DEBUG */
return XFS_ERROR(EINVAL);
}
- if ((mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) ||
- !(flags & XFS_IMAP_LOOKUP)) {
- offset = XFS_INO_TO_OFFSET(mp, ino);
- ASSERT(offset < mp->m_sb.sb_inopblock);
- *bno = XFS_AGB_TO_FSB(mp, agno, agbno);
- *off = offset;
- *len = 1;
- return 0;
+
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
+
+ /*
+ * For bulkstat and handle lookups, we have an untrusted inode number
+ * that we have to verify is valid. We cannot do this just by reading
+ * the inode buffer as it may have been unlinked and removed leaving
+ * inodes in stale state on disk. Hence we have to do a btree lookup
+ * in all cases where an untrusted inode number is passed.
+ */
+ if (flags & XFS_IGET_UNTRUSTED) {
+ error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+ &chunk_agbno, &offset_agbno, flags);
+ if (error)
+ return error;
+ goto out_map;
}
- blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
- if (*bno != NULLFSBLOCK) {
+
+ /*
+ * If the inode cluster size is the same as the blocksize or
+ * smaller we get to the buffer by simple arithmetics.
+ */
+ if (blks_per_cluster == 1) {
offset = XFS_INO_TO_OFFSET(mp, ino);
ASSERT(offset < mp->m_sb.sb_inopblock);
- cluster_agbno = XFS_FSB_TO_AGBNO(mp, *bno);
- *off = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
- offset;
- *len = blks_per_cluster;
+
+ imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ imap->im_len = XFS_FSB_TO_BB(mp, 1);
+ imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
return 0;
}
+
+ /*
+ * If the inode chunks are aligned then use simple maths to
+ * find the location. Otherwise we have to do a btree
+ * lookup to find the location.
+ */
if (mp->m_inoalign_mask) {
offset_agbno = agbno & mp->m_inoalign_mask;
chunk_agbno = agbno - offset_agbno;
} else {
- down_read(&mp->m_peraglock);
- error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
- up_read(&mp->m_peraglock);
- if (error) {
-#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
- "xfs_ialloc_read_agi() returned "
- "error %d, agno %d",
- error, agno);
-#endif /* DEBUG */
- return error;
- }
- cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
- (xfs_inode_t *)0, 0);
- if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
-#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
- "xfs_inobt_lookup_le() failed");
-#endif /* DEBUG */
- goto error0;
- }
- if ((error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
- &chunk_free, &i))) {
-#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
- "xfs_inobt_get_rec() failed");
-#endif /* DEBUG */
- goto error0;
- }
- if (i == 0) {
-#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
- "xfs_inobt_get_rec() failed");
-#endif /* DEBUG */
- error = XFS_ERROR(EINVAL);
- }
- xfs_trans_brelse(tp, agbp);
- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+ &chunk_agbno, &offset_agbno, flags);
if (error)
return error;
- chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino);
- offset_agbno = agbno - chunk_agbno;
}
+
+out_map:
ASSERT(agbno >= chunk_agbno);
cluster_agbno = chunk_agbno +
((offset_agbno / blks_per_cluster) * blks_per_cluster);
offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
XFS_INO_TO_OFFSET(mp, ino);
- *bno = XFS_AGB_TO_FSB(mp, agno, cluster_agbno);
- *off = offset;
- *len = blks_per_cluster;
+
+ imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
+ imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+ imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+
+ /*
+ * If the inode number maps to a block outside the bounds
+ * of the file system then return NULL rather than calling
+ * read_buf and panicing when we get an error from the
+ * driver.
+ */
+ if ((imap->im_blkno + imap->im_len) >
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+ xfs_alert(mp,
+ "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
+ __func__, (unsigned long long) imap->im_blkno,
+ (unsigned long long) imap->im_len,
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+ return XFS_ERROR(EINVAL);
+ }
return 0;
-error0:
- xfs_trans_brelse(tp, agbp);
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- return error;
}
/*
@@ -1278,7 +1951,16 @@ xfs_ialloc_compute_maxlevels(
}
/*
- * Log specified fields for the ag hdr (inode section)
+ * Log specified fields for the ag hdr (inode section). The growth of the agi
+ * structure over time requires that we interpret the buffer as two logical
+ * regions delineated by the end of the unlinked list. This is due to the size
+ * of the hash table and its location in the middle of the agi.
+ *
+ * For example, a request to log a field before agi_unlinked and a field after
+ * agi_unlinked could cause us to log the entire hash table and use an excessive
+ * amount of log space. To avoid this behavior, log the region up through
+ * agi_unlinked in one call and the region after agi_unlinked through the end of
+ * the structure in another.
*/
void
xfs_ialloc_log_agi(
@@ -1301,86 +1983,207 @@ xfs_ialloc_log_agi(
offsetof(xfs_agi_t, agi_newino),
offsetof(xfs_agi_t, agi_dirino),
offsetof(xfs_agi_t, agi_unlinked),
+ offsetof(xfs_agi_t, agi_free_root),
+ offsetof(xfs_agi_t, agi_free_level),
sizeof(xfs_agi_t)
};
#ifdef DEBUG
xfs_agi_t *agi; /* allocation group header */
agi = XFS_BUF_TO_AGI(bp);
- ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
+ ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
#endif
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
+
/*
- * Compute byte offsets for the first and last fields.
+ * Compute byte offsets for the first and last fields in the first
+ * region and log the agi buffer. This only logs up through
+ * agi_unlinked.
*/
- xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last);
+ if (fields & XFS_AGI_ALL_BITS_R1) {
+ xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
+ &first, &last);
+ xfs_trans_log_buf(tp, bp, first, last);
+ }
+
+ /*
+ * Mask off the bits in the first region and calculate the first and
+ * last field offsets for any bits in the second region.
+ */
+ fields &= ~XFS_AGI_ALL_BITS_R1;
+ if (fields) {
+ xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
+ &first, &last);
+ xfs_trans_log_buf(tp, bp, first, last);
+ }
+}
+
+#ifdef DEBUG
+STATIC void
+xfs_check_agi_unlinked(
+ struct xfs_agi *agi)
+{
+ int i;
+
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
+ ASSERT(agi->agi_unlinked[i]);
+}
+#else
+#define xfs_check_agi_unlinked(agi)
+#endif
+
+static bool
+xfs_agi_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ /*
+ * Validate the magic number of the agi block.
+ */
+ if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
+ return false;
+ if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
+ return false;
+
/*
- * Log the allocation group inode header buffer.
+ * during growfs operations, the perag is not fully initialised,
+ * so we can't use it for any useful checking. growfs ensures we can't
+ * use it by using uncached buffers that don't have the perag attached
+ * so we can detect and avoid this problem.
*/
- xfs_trans_log_buf(tp, bp, first, last);
+ if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno)
+ return false;
+
+ xfs_check_agi_unlinked(agi);
+ return true;
+}
+
+static void
+xfs_agi_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
+ XFS_ERRTAG_IALLOC_READ_AGI,
+ XFS_RANDOM_IALLOC_READ_AGI))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
}
+static void
+xfs_agi_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ if (!xfs_agi_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agi_buf_ops = {
+ .verify_read = xfs_agi_read_verify,
+ .verify_write = xfs_agi_write_verify,
+};
+
/*
* Read in the allocation group header (inode allocation section)
*/
int
-xfs_ialloc_read_agi(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_buf_t **bpp) /* allocation group hdr buf */
+xfs_read_agi(
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ struct xfs_buf **bpp) /* allocation group hdr buf */
{
- xfs_agi_t *agi; /* allocation group header */
- int agi_ok; /* agi is consistent */
- xfs_buf_t *bp; /* allocation group hdr buf */
- xfs_perag_t *pag; /* per allocation group data */
- int error;
+ int error;
+
+ trace_xfs_read_agi(mp, agno);
ASSERT(agno != NULLAGNUMBER);
- error = xfs_trans_read_buf(
- mp, tp, mp->m_ddev_targp,
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
if (error)
return error;
- ASSERT(bp && !XFS_BUF_GETERROR(bp));
- /*
- * Validate the magic number of the agi block.
- */
- agi = XFS_BUF_TO_AGI(bp);
- agi_ok =
- be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
- XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
- if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
- XFS_RANDOM_IALLOC_READ_AGI))) {
- XFS_CORRUPTION_ERROR("xfs_ialloc_read_agi", XFS_ERRLEVEL_LOW,
- mp, agi);
- xfs_trans_brelse(tp, bp);
- return XFS_ERROR(EFSCORRUPTED);
- }
- pag = &mp->m_perag[agno];
+ xfs_buf_set_ref(*bpp, XFS_AGI_REF);
+ return 0;
+}
+
+int
+xfs_ialloc_read_agi(
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ struct xfs_buf **bpp) /* allocation group hdr buf */
+{
+ struct xfs_agi *agi; /* allocation group header */
+ struct xfs_perag *pag; /* per allocation group data */
+ int error;
+
+ trace_xfs_ialloc_read_agi(mp, agno);
+
+ error = xfs_read_agi(mp, tp, agno, bpp);
+ if (error)
+ return error;
+
+ agi = XFS_BUF_TO_AGI(*bpp);
+ pag = xfs_perag_get(mp, agno);
if (!pag->pagi_init) {
pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
+ pag->pagi_count = be32_to_cpu(agi->agi_count);
pag->pagi_init = 1;
- } else {
- /*
- * It's possible for these to be out of sync if
- * we are in the middle of a forced shutdown.
- */
- ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
}
-#ifdef DEBUG
- {
- int i;
+ /*
+ * It's possible for these to be out of sync if
+ * we are in the middle of a forced shutdown.
+ */
+ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
+ XFS_FORCED_SHUTDOWN(mp));
+ xfs_perag_put(pag);
+ return 0;
+}
- for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
- ASSERT(agi->agi_unlinked[i]);
- }
-#endif
+/*
+ * Read in the agi to initialise the per-ag data in the mount structure
+ */
+int
+xfs_ialloc_pagi_init(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_agnumber_t agno) /* allocation group number */
+{
+ xfs_buf_t *bp = NULL;
+ int error;
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGI, XFS_AGI_REF);
- *bpp = bp;
+ error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
+ if (error)
+ return error;
+ if (bp)
+ xfs_trans_brelse(tp, bp);
return 0;
}
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 7f5debe1acb..95ad1c002d6 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -20,49 +20,35 @@
struct xfs_buf;
struct xfs_dinode;
+struct xfs_imap;
struct xfs_mount;
struct xfs_trans;
+struct xfs_btree_cur;
-/*
- * Allocation parameters for inode allocation.
- */
-#define XFS_IALLOC_INODES(mp) (mp)->m_ialloc_inos
-#define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks
-
-/*
- * For small block file systems, move inodes in clusters of this size.
- * When we don't have a lot of memory, however, we go a bit smaller
- * to reduce the number of AGI and ialloc btree blocks we need to keep
- * around for xfs_dilocate(). We choose which one to use in
- * xfs_mount_int().
- */
+/* Move inodes in clusters of this size */
#define XFS_INODE_BIG_CLUSTER_SIZE 8192
-#define XFS_INODE_SMALL_CLUSTER_SIZE 4096
-#define XFS_INODE_CLUSTER_SIZE(mp) (mp)->m_inode_cluster_size
+
+/* Calculate and return the number of filesystem blocks per inode cluster */
+static inline int
+xfs_icluster_size_fsb(
+ struct xfs_mount *mp)
+{
+ if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size)
+ return 1;
+ return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
+}
/*
* Make an inode pointer out of the buffer/offset.
*/
-#define XFS_MAKE_IPTR(mp,b,o) xfs_make_iptr(mp,b,o)
static inline struct xfs_dinode *
xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
{
- return (xfs_dinode_t *)
+ return (struct xfs_dinode *)
(xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog));
}
/*
- * Find a free (set) bit in the inode bitmask.
- */
-#define XFS_IALLOC_FIND_FREE(fp) xfs_ialloc_find_free(fp)
-static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
-{
- return xfs_lowbit64(*fp);
-}
-
-
-#ifdef __KERNEL__
-/*
* Allocate an inode on disk.
* Mode is used to tell whether the new inode will need space, and whether
* it is a directory.
@@ -88,11 +74,9 @@ int /* error */
xfs_dialloc(
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t parent, /* parent inode (directory) */
- mode_t mode, /* mode bits for new inode */
+ umode_t mode, /* mode bits for new inode */
int okalloc, /* ok to allocate more space */
struct xfs_buf **agbp, /* buf for a.g. inode header */
- boolean_t *alloc_done, /* an allocation was done to replenish
- the free inodes */
xfs_ino_t *inop); /* inode number allocated */
/*
@@ -106,21 +90,18 @@ xfs_difree(
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t inode, /* inode to be freed */
struct xfs_bmap_free *flist, /* extents to free */
- int *delete, /* set if inode cluster was deleted */
+ int *deleted, /* set if inode cluster was deleted */
xfs_ino_t *first_ino); /* first inode in deleted cluster */
/*
- * Return the location of the inode in bno/len/off,
- * for mapping it into a buffer.
+ * Return the location of the inode in imap, for mapping it into a buffer.
*/
int
-xfs_dilocate(
+xfs_imap(
struct xfs_mount *mp, /* file system mount structure */
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t ino, /* inode to locate */
- xfs_fsblock_t *bno, /* output: block containing inode */
- int *len, /* output: num blocks in cluster*/
- int *off, /* output: index in block of inode */
+ struct xfs_imap *imap, /* location map structure */
uint flags); /* flags for inode btree lookup */
/*
@@ -149,6 +130,34 @@ xfs_ialloc_read_agi(
xfs_agnumber_t agno, /* allocation group number */
struct xfs_buf **bpp); /* allocation group hdr buf */
-#endif /* __KERNEL__ */
+/*
+ * Read in the allocation group header to initialise the per-ag data
+ * in the mount structure
+ */
+int
+xfs_ialloc_pagi_init(
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno); /* allocation group number */
+
+/*
+ * Lookup a record by ino in the btree given by cur.
+ */
+int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
+ xfs_lookup_t dir, int *stat);
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+ xfs_inobt_rec_incore_t *rec, int *stat);
+
+/*
+ * Inode chunk initialisation routine
+ */
+int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct list_head *buffer_list,
+ xfs_agnumber_t agno, xfs_agblock_t agbno,
+ xfs_agblock_t length, unsigned int gen);
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 60c65683462..726f83a681a 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -17,2064 +17,406 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
-STATIC void xfs_inobt_log_block(xfs_trans_t *, xfs_buf_t *, int);
-STATIC void xfs_inobt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_inobt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_inobt_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC int xfs_inobt_lshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_inobt_newroot(xfs_btree_cur_t *, int *);
-STATIC int xfs_inobt_rshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_inobt_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
- xfs_inobt_key_t *, xfs_btree_cur_t **, int *);
-STATIC int xfs_inobt_updkey(xfs_btree_cur_t *, xfs_inobt_key_t *, int);
-/*
- * Single level of the xfs_inobt_delete record deletion routine.
- * Delete record pointed to by cur/level.
- * Remove the record from its block then rebalance the tree.
- * Return 0 for error, 1 for done, 2 to go on to the next level.
- */
-STATIC int /* error */
-xfs_inobt_delrec(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level removing record from */
- int *stat) /* fail/done/go-on */
+STATIC int
+xfs_inobt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
{
- xfs_buf_t *agbp; /* buffer for a.g. inode header */
- xfs_mount_t *mp; /* mount structure */
- xfs_agi_t *agi; /* allocation group inode header */
- xfs_inobt_block_t *block; /* btree block record/key lives in */
- xfs_agblock_t bno; /* btree block number */
- xfs_buf_t *bp; /* buffer for block */
- int error; /* error return value */
- int i; /* loop index */
- xfs_inobt_key_t key; /* kp points here if block is level 0 */
- xfs_inobt_key_t *kp = NULL; /* pointer to btree keys */
- xfs_agblock_t lbno; /* left block's block number */
- xfs_buf_t *lbp; /* left block's buffer pointer */
- xfs_inobt_block_t *left; /* left btree block */
- xfs_inobt_key_t *lkp; /* left block key pointer */
- xfs_inobt_ptr_t *lpp; /* left block address pointer */
- int lrecs = 0; /* number of records in left block */
- xfs_inobt_rec_t *lrp; /* left block record pointer */
- xfs_inobt_ptr_t *pp = NULL; /* pointer to btree addresses */
- int ptr; /* index in btree block for this rec */
- xfs_agblock_t rbno; /* right block's block number */
- xfs_buf_t *rbp; /* right block's buffer pointer */
- xfs_inobt_block_t *right; /* right btree block */
- xfs_inobt_key_t *rkp; /* right block key pointer */
- xfs_inobt_rec_t *rp; /* pointer to btree records */
- xfs_inobt_ptr_t *rpp; /* right block address pointer */
- int rrecs = 0; /* number of records in right block */
- int numrecs;
- xfs_inobt_rec_t *rrp; /* right block record pointer */
- xfs_btree_cur_t *tcur; /* temporary btree cursor */
-
- mp = cur->bc_mp;
+ return cur->bc_mp->m_inobt_mnr[level != 0];
+}
- /*
- * Get the index of the entry being deleted, check for nothing there.
- */
- ptr = cur->bc_ptrs[level];
- if (ptr == 0) {
- *stat = 0;
- return 0;
- }
+STATIC struct xfs_btree_cur *
+xfs_inobt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_private.a.agbp, cur->bc_private.a.agno,
+ cur->bc_btnum);
+}
- /*
- * Get the buffer & block containing the record or key/ptr.
- */
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
- return error;
-#endif
- /*
- * Fail if we're off the end of the block.
- */
+STATIC void
+xfs_inobt_set_root(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *nptr,
+ int inc) /* level change */
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
- numrecs = be16_to_cpu(block->bb_numrecs);
- if (ptr > numrecs) {
- *stat = 0;
- return 0;
- }
- /*
- * It's a nonleaf. Excise the key and ptr being deleted, by
- * sliding the entries past them down one.
- * Log the changed areas of the block.
- */
- if (level > 0) {
- kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
- pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
-#ifdef DEBUG
- for (i = ptr; i < numrecs; i++) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i]), level)))
- return error;
- }
-#endif
- if (ptr < numrecs) {
- memmove(&kp[ptr - 1], &kp[ptr],
- (numrecs - ptr) * sizeof(*kp));
- memmove(&pp[ptr - 1], &pp[ptr],
- (numrecs - ptr) * sizeof(*kp));
- xfs_inobt_log_keys(cur, bp, ptr, numrecs - 1);
- xfs_inobt_log_ptrs(cur, bp, ptr, numrecs - 1);
- }
- }
- /*
- * It's a leaf. Excise the record being deleted, by sliding the
- * entries past it down one. Log the changed areas of the block.
- */
- else {
- rp = XFS_INOBT_REC_ADDR(block, 1, cur);
- if (ptr < numrecs) {
- memmove(&rp[ptr - 1], &rp[ptr],
- (numrecs - ptr) * sizeof(*rp));
- xfs_inobt_log_recs(cur, bp, ptr, numrecs - 1);
- }
- /*
- * If it's the first record in the block, we'll need a key
- * structure to pass up to the next level (updkey).
- */
- if (ptr == 1) {
- key.ir_startino = rp->ir_startino;
- kp = &key;
- }
- }
- /*
- * Decrement and log the number of entries in the block.
- */
- numrecs--;
- block->bb_numrecs = cpu_to_be16(numrecs);
- xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
- /*
- * Is this the root level? If so, we're almost done.
- */
- if (level == cur->bc_nlevels - 1) {
- /*
- * If this is the root level,
- * and there's only one entry left,
- * and it's NOT the leaf level,
- * then we can get rid of this level.
- */
- if (numrecs == 1 && level > 0) {
- agbp = cur->bc_private.i.agbp;
- agi = XFS_BUF_TO_AGI(agbp);
- /*
- * pp is still set to the first pointer in the block.
- * Make it the new root of the btree.
- */
- bno = be32_to_cpu(agi->agi_root);
- agi->agi_root = *pp;
- be32_add(&agi->agi_level, -1);
- /*
- * Free the block.
- */
- if ((error = xfs_free_extent(cur->bc_tp,
- XFS_AGB_TO_FSB(mp, cur->bc_private.i.agno, bno), 1)))
- return error;
- xfs_trans_binval(cur->bc_tp, bp);
- xfs_ialloc_log_agi(cur->bc_tp, agbp,
- XFS_AGI_ROOT | XFS_AGI_LEVEL);
- /*
- * Update the cursor so there's one fewer level.
- */
- cur->bc_bufs[level] = NULL;
- cur->bc_nlevels--;
- } else if (level > 0 &&
- (error = xfs_inobt_decrement(cur, level, &i)))
- return error;
- *stat = 1;
- return 0;
- }
- /*
- * If we deleted the leftmost entry in the block, update the
- * key values above us in the tree.
- */
- if (ptr == 1 && (error = xfs_inobt_updkey(cur, kp, level + 1)))
- return error;
- /*
- * If the number of records remaining in the block is at least
- * the minimum, we're done.
- */
- if (numrecs >= XFS_INOBT_BLOCK_MINRECS(level, cur)) {
- if (level > 0 &&
- (error = xfs_inobt_decrement(cur, level, &i)))
- return error;
- *stat = 1;
- return 0;
- }
- /*
- * Otherwise, we have to move some records around to keep the
- * tree balanced. Look at the left and right sibling blocks to
- * see if we can re-balance by moving only one record.
- */
- rbno = be32_to_cpu(block->bb_rightsib);
- lbno = be32_to_cpu(block->bb_leftsib);
- bno = NULLAGBLOCK;
- ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
- /*
- * Duplicate the cursor so our btree manipulations here won't
- * disrupt the next level up.
- */
- if ((error = xfs_btree_dup_cursor(cur, &tcur)))
- return error;
- /*
- * If there's a right sibling, see if it's ok to shift an entry
- * out of it.
- */
- if (rbno != NULLAGBLOCK) {
- /*
- * Move the temp cursor to the last entry in the next block.
- * Actually any entry but the first would suffice.
- */
- i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_inobt_increment(tcur, level, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- /*
- * Grab a pointer to the block.
- */
- rbp = tcur->bc_bufs[level];
- right = XFS_BUF_TO_INOBT_BLOCK(rbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
- goto error0;
-#endif
- /*
- * Grab the current block number, for future use.
- */
- bno = be32_to_cpu(right->bb_leftsib);
- /*
- * If right block is full enough so that removing one entry
- * won't make it too empty, and left-shifting an entry out
- * of right to us works, we're done.
- */
- if (be16_to_cpu(right->bb_numrecs) - 1 >=
- XFS_INOBT_BLOCK_MINRECS(level, cur)) {
- if ((error = xfs_inobt_lshift(tcur, level, &i)))
- goto error0;
- if (i) {
- ASSERT(be16_to_cpu(block->bb_numrecs) >=
- XFS_INOBT_BLOCK_MINRECS(level, cur));
- xfs_btree_del_cursor(tcur,
- XFS_BTREE_NOERROR);
- if (level > 0 &&
- (error = xfs_inobt_decrement(cur, level,
- &i)))
- return error;
- *stat = 1;
- return 0;
- }
- }
- /*
- * Otherwise, grab the number of records in right for
- * future reference, and fix up the temp cursor to point
- * to our block again (last record).
- */
- rrecs = be16_to_cpu(right->bb_numrecs);
- if (lbno != NULLAGBLOCK) {
- xfs_btree_firstrec(tcur, level);
- if ((error = xfs_inobt_decrement(tcur, level, &i)))
- goto error0;
- }
- }
- /*
- * If there's a left sibling, see if it's ok to shift an entry
- * out of it.
- */
- if (lbno != NULLAGBLOCK) {
- /*
- * Move the temp cursor to the first entry in the
- * previous block.
- */
- xfs_btree_firstrec(tcur, level);
- if ((error = xfs_inobt_decrement(tcur, level, &i)))
- goto error0;
- xfs_btree_firstrec(tcur, level);
- /*
- * Grab a pointer to the block.
- */
- lbp = tcur->bc_bufs[level];
- left = XFS_BUF_TO_INOBT_BLOCK(lbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- goto error0;
-#endif
- /*
- * Grab the current block number, for future use.
- */
- bno = be32_to_cpu(left->bb_rightsib);
- /*
- * If left block is full enough so that removing one entry
- * won't make it too empty, and right-shifting an entry out
- * of left to us works, we're done.
- */
- if (be16_to_cpu(left->bb_numrecs) - 1 >=
- XFS_INOBT_BLOCK_MINRECS(level, cur)) {
- if ((error = xfs_inobt_rshift(tcur, level, &i)))
- goto error0;
- if (i) {
- ASSERT(be16_to_cpu(block->bb_numrecs) >=
- XFS_INOBT_BLOCK_MINRECS(level, cur));
- xfs_btree_del_cursor(tcur,
- XFS_BTREE_NOERROR);
- if (level == 0)
- cur->bc_ptrs[0]++;
- *stat = 1;
- return 0;
- }
- }
- /*
- * Otherwise, grab the number of records in right for
- * future reference.
- */
- lrecs = be16_to_cpu(left->bb_numrecs);
- }
- /*
- * Delete the temp cursor, we're done with it.
- */
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- /*
- * If here, we need to do a join to keep the tree balanced.
- */
- ASSERT(bno != NULLAGBLOCK);
- /*
- * See if we can join with the left neighbor block.
- */
- if (lbno != NULLAGBLOCK &&
- lrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
- /*
- * Set "right" to be the starting block,
- * "left" to be the left neighbor.
- */
- rbno = bno;
- right = block;
- rrecs = be16_to_cpu(right->bb_numrecs);
- rbp = bp;
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.i.agno, lbno, 0, &lbp,
- XFS_INO_BTREE_REF)))
- return error;
- left = XFS_BUF_TO_INOBT_BLOCK(lbp);
- lrecs = be16_to_cpu(left->bb_numrecs);
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- return error;
- }
- /*
- * If that won't work, see if we can join with the right neighbor block.
- */
- else if (rbno != NULLAGBLOCK &&
- rrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
- /*
- * Set "left" to be the starting block,
- * "right" to be the right neighbor.
- */
- lbno = bno;
- left = block;
- lrecs = be16_to_cpu(left->bb_numrecs);
- lbp = bp;
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.i.agno, rbno, 0, &rbp,
- XFS_INO_BTREE_REF)))
- return error;
- right = XFS_BUF_TO_INOBT_BLOCK(rbp);
- rrecs = be16_to_cpu(right->bb_numrecs);
- if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
- return error;
- }
- /*
- * Otherwise, we can't fix the imbalance.
- * Just return. This is probably a logic error, but it's not fatal.
- */
- else {
- if (level > 0 && (error = xfs_inobt_decrement(cur, level, &i)))
- return error;
- *stat = 1;
- return 0;
- }
- /*
- * We're now going to join "left" and "right" by moving all the stuff
- * in "right" to "left" and deleting "right".
- */
- if (level > 0) {
- /*
- * It's a non-leaf. Move keys and pointers.
- */
- lkp = XFS_INOBT_KEY_ADDR(left, lrecs + 1, cur);
- lpp = XFS_INOBT_PTR_ADDR(left, lrecs + 1, cur);
- rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
- rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
- for (i = 0; i < rrecs; i++) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
- return error;
- }
-#endif
- memcpy(lkp, rkp, rrecs * sizeof(*lkp));
- memcpy(lpp, rpp, rrecs * sizeof(*lpp));
- xfs_inobt_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
- xfs_inobt_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
- } else {
- /*
- * It's a leaf. Move records.
- */
- lrp = XFS_INOBT_REC_ADDR(left, lrecs + 1, cur);
- rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
- memcpy(lrp, rrp, rrecs * sizeof(*lrp));
- xfs_inobt_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
- }
- /*
- * If we joined with the left neighbor, set the buffer in the
- * cursor to the left block, and fix up the index.
- */
- if (bp != lbp) {
- xfs_btree_setbuf(cur, level, lbp);
- cur->bc_ptrs[level] += lrecs;
- }
- /*
- * If we joined with the right neighbor and there's a level above
- * us, increment the cursor at that level.
- */
- else if (level + 1 < cur->bc_nlevels &&
- (error = xfs_alloc_increment(cur, level + 1, &i)))
- return error;
- /*
- * Fix up the number of records in the surviving block.
- */
- lrecs += rrecs;
- left->bb_numrecs = cpu_to_be16(lrecs);
- /*
- * Fix up the right block pointer in the surviving block, and log it.
- */
- left->bb_rightsib = right->bb_rightsib;
- xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
- /*
- * If there is a right sibling now, make it point to the
- * remaining block.
- */
- if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
- xfs_inobt_block_t *rrblock;
- xfs_buf_t *rrbp;
+ agi->agi_root = nptr->s;
+ be32_add_cpu(&agi->agi_level, inc);
+ xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
+}
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib), 0,
- &rrbp, XFS_INO_BTREE_REF)))
- return error;
- rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
- if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
- return error;
- rrblock->bb_leftsib = cpu_to_be32(lbno);
- xfs_inobt_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
- }
- /*
- * Free the deleting block.
- */
- if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp,
- cur->bc_private.i.agno, rbno), 1)))
- return error;
- xfs_trans_binval(cur->bc_tp, rbp);
- /*
- * Readjust the ptr at this level if it's not a leaf, since it's
- * still pointing at the deletion point, which makes the cursor
- * inconsistent. If this makes the ptr 0, the caller fixes it up.
- * We can't use decrement because it would change the next level up.
- */
- if (level > 0)
- cur->bc_ptrs[level]--;
- /*
- * Return value means the next level up has something to do.
- */
- *stat = 2;
- return 0;
+STATIC void
+xfs_finobt_set_root(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *nptr,
+ int inc) /* level change */
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
-error0:
- xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
- return error;
+ agi->agi_free_root = nptr->s;
+ be32_add_cpu(&agi->agi_free_level, inc);
+ xfs_ialloc_log_agi(cur->bc_tp, agbp,
+ XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL);
}
-/*
- * Insert one record/level. Return information to the caller
- * allowing the next level up to proceed if necessary.
- */
-STATIC int /* error */
-xfs_inobt_insrec(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level to insert record at */
- xfs_agblock_t *bnop, /* i/o: block number inserted */
- xfs_inobt_rec_t *recp, /* i/o: record data inserted */
- xfs_btree_cur_t **curp, /* output: new cursor replacing cur */
- int *stat) /* success/failure */
+STATIC int
+xfs_inobt_alloc_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
{
- xfs_inobt_block_t *block; /* btree block record/key lives in */
- xfs_buf_t *bp; /* buffer for block */
- int error; /* error return value */
- int i; /* loop index */
- xfs_inobt_key_t key; /* key value being inserted */
- xfs_inobt_key_t *kp=NULL; /* pointer to btree keys */
- xfs_agblock_t nbno; /* block number of allocated block */
- xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */
- xfs_inobt_key_t nkey; /* new key value, from split */
- xfs_inobt_rec_t nrec; /* new record value, for caller */
- int numrecs;
- int optr; /* old ptr value */
- xfs_inobt_ptr_t *pp; /* pointer to btree addresses */
- int ptr; /* index in btree block for this rec */
- xfs_inobt_rec_t *rp=NULL; /* pointer to btree records */
+ xfs_alloc_arg_t args; /* block allocation args */
+ int error; /* error return value */
+ xfs_agblock_t sbno = be32_to_cpu(start->s);
- /*
- * GCC doesn't understand the (arguably complex) control flow in
- * this function and complains about uninitialized structure fields
- * without this.
- */
- memset(&nrec, 0, sizeof(nrec));
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- /*
- * If we made it to the root level, allocate a new root block
- * and we're done.
- */
- if (level >= cur->bc_nlevels) {
- error = xfs_inobt_newroot(cur, &i);
- *bnop = NULLAGBLOCK;
- *stat = i;
+ memset(&args, 0, sizeof(args));
+ args.tp = cur->bc_tp;
+ args.mp = cur->bc_mp;
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
+ args.minlen = 1;
+ args.maxlen = 1;
+ args.prod = 1;
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+
+ error = xfs_alloc_vextent(&args);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
}
- /*
- * Make a key out of the record data to be inserted, and save it.
- */
- key.ir_startino = recp->ir_startino; /* INT_: direct copy */
- optr = ptr = cur->bc_ptrs[level];
- /*
- * If we're off the left edge, return failure.
- */
- if (ptr == 0) {
+ if (args.fsbno == NULLFSBLOCK) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
}
- /*
- * Get pointers to the btree buffer and block.
- */
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
- numrecs = be16_to_cpu(block->bb_numrecs);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
- return error;
- /*
- * Check that the new entry is being inserted in the right place.
- */
- if (ptr <= numrecs) {
- if (level == 0) {
- rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
- xfs_btree_check_rec(cur->bc_btnum, recp, rp);
- } else {
- kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
- xfs_btree_check_key(cur->bc_btnum, &key, kp);
- }
- }
-#endif
- nbno = NULLAGBLOCK;
- ncur = (xfs_btree_cur_t *)0;
- /*
- * If the block is full, we can't insert the new entry until we
- * make the block un-full.
- */
- if (numrecs == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
- /*
- * First, try shifting an entry to the right neighbor.
- */
- if ((error = xfs_inobt_rshift(cur, level, &i)))
- return error;
- if (i) {
- /* nothing */
- }
- /*
- * Next, try shifting an entry to the left neighbor.
- */
- else {
- if ((error = xfs_inobt_lshift(cur, level, &i)))
- return error;
- if (i) {
- optr = ptr = cur->bc_ptrs[level];
- } else {
- /*
- * Next, try splitting the current block
- * in half. If this works we have to
- * re-set our variables because
- * we could be in a different block now.
- */
- if ((error = xfs_inobt_split(cur, level, &nbno,
- &nkey, &ncur, &i)))
- return error;
- if (i) {
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur,
- block, level, bp)))
- return error;
-#endif
- ptr = cur->bc_ptrs[level];
- nrec.ir_startino = nkey.ir_startino; /* INT_: direct copy */
- } else {
- /*
- * Otherwise the insert fails.
- */
- *stat = 0;
- return 0;
- }
- }
- }
- }
- /*
- * At this point we know there's room for our new entry in the block
- * we're pointing at.
- */
- numrecs = be16_to_cpu(block->bb_numrecs);
- if (level > 0) {
- /*
- * It's a non-leaf entry. Make a hole for the new data
- * in the key and ptr regions of the block.
- */
- kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
- pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
-#ifdef DEBUG
- for (i = numrecs; i >= ptr; i--) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
- return error;
- }
-#endif
- memmove(&kp[ptr], &kp[ptr - 1],
- (numrecs - ptr + 1) * sizeof(*kp));
- memmove(&pp[ptr], &pp[ptr - 1],
- (numrecs - ptr + 1) * sizeof(*pp));
- /*
- * Now stuff the new data in, bump numrecs and log the new data.
- */
-#ifdef DEBUG
- if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
- return error;
-#endif
- kp[ptr - 1] = key; /* INT_: struct copy */
- pp[ptr - 1] = cpu_to_be32(*bnop);
- numrecs++;
- block->bb_numrecs = cpu_to_be16(numrecs);
- xfs_inobt_log_keys(cur, bp, ptr, numrecs);
- xfs_inobt_log_ptrs(cur, bp, ptr, numrecs);
- } else {
- /*
- * It's a leaf entry. Make a hole for the new record.
- */
- rp = XFS_INOBT_REC_ADDR(block, 1, cur);
- memmove(&rp[ptr], &rp[ptr - 1],
- (numrecs - ptr + 1) * sizeof(*rp));
- /*
- * Now stuff the new record in, bump numrecs
- * and log the new data.
- */
- rp[ptr - 1] = *recp; /* INT_: struct copy */
- numrecs++;
- block->bb_numrecs = cpu_to_be16(numrecs);
- xfs_inobt_log_recs(cur, bp, ptr, numrecs);
- }
- /*
- * Log the new number of records in the btree header.
- */
- xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
-#ifdef DEBUG
- /*
- * Check that the key/record is in the right place, now.
- */
- if (ptr < numrecs) {
- if (level == 0)
- xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
- rp + ptr);
- else
- xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
- kp + ptr);
- }
-#endif
- /*
- * If we inserted at the start of a block, update the parents' keys.
- */
- if (optr == 1 && (error = xfs_inobt_updkey(cur, &key, level + 1)))
- return error;
- /*
- * Return the new block number, if any.
- * If there is one, give back a record value and a cursor too.
- */
- *bnop = nbno;
- if (nbno != NULLAGBLOCK) {
- *recp = nrec; /* INT_: struct copy */
- *curp = ncur;
- }
+ ASSERT(args.len == 1);
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+
+ new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
*stat = 1;
return 0;
}
-/*
- * Log header fields from a btree block.
- */
-STATIC void
-xfs_inobt_log_block(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_buf_t *bp, /* buffer containing btree block */
- int fields) /* mask of fields: XFS_BB_... */
+STATIC int
+xfs_inobt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
{
- int first; /* first byte offset logged */
- int last; /* last byte offset logged */
- static const short offsets[] = { /* table of offsets */
- offsetof(xfs_inobt_block_t, bb_magic),
- offsetof(xfs_inobt_block_t, bb_level),
- offsetof(xfs_inobt_block_t, bb_numrecs),
- offsetof(xfs_inobt_block_t, bb_leftsib),
- offsetof(xfs_inobt_block_t, bb_rightsib),
- sizeof(xfs_inobt_block_t)
- };
+ xfs_fsblock_t fsbno;
+ int error;
+
+ fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
+ error = xfs_free_extent(cur->bc_tp, fsbno, 1);
+ if (error)
+ return error;
- xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
- xfs_trans_log_buf(tp, bp, first, last);
+ xfs_trans_binval(cur->bc_tp, bp);
+ return error;
}
-/*
- * Log keys from a btree block (nonleaf).
- */
-STATIC void
-xfs_inobt_log_keys(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_buf_t *bp, /* buffer containing btree block */
- int kfirst, /* index of first key to log */
- int klast) /* index of last key to log */
+STATIC int
+xfs_inobt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
{
- xfs_inobt_block_t *block; /* btree block to log from */
- int first; /* first byte offset logged */
- xfs_inobt_key_t *kp; /* key pointer in btree block */
- int last; /* last byte offset logged */
-
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
- kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
- first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
- last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
- xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+ return cur->bc_mp->m_inobt_mxr[level != 0];
}
-/*
- * Log block pointer fields from a btree block (nonleaf).
- */
STATIC void
-xfs_inobt_log_ptrs(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_buf_t *bp, /* buffer containing btree block */
- int pfirst, /* index of first pointer to log */
- int plast) /* index of last pointer to log */
+xfs_inobt_init_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
{
- xfs_inobt_block_t *block; /* btree block to log from */
- int first; /* first byte offset logged */
- int last; /* last byte offset logged */
- xfs_inobt_ptr_t *pp; /* block-pointer pointer in btree blk */
-
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
- pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
- first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
- last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
- xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+ key->inobt.ir_startino = rec->inobt.ir_startino;
}
-/*
- * Log records from a btree block (leaf).
- */
STATIC void
-xfs_inobt_log_recs(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_buf_t *bp, /* buffer containing btree block */
- int rfirst, /* index of first record to log */
- int rlast) /* index of last record to log */
+xfs_inobt_init_rec_from_key(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
{
- xfs_inobt_block_t *block; /* btree block to log from */
- int first; /* first byte offset logged */
- int last; /* last byte offset logged */
- xfs_inobt_rec_t *rp; /* record pointer for btree block */
-
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
- rp = XFS_INOBT_REC_ADDR(block, 1, cur);
- first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
- last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
- xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+ rec->inobt.ir_startino = key->inobt.ir_startino;
}
-/*
- * Lookup the record. The cursor is made to point to it, based on dir.
- * Return 0 if can't find any such record, 1 for success.
- */
-STATIC int /* error */
-xfs_inobt_lookup(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_lookup_t dir, /* <=, ==, or >= */
- int *stat) /* success/failure */
+STATIC void
+xfs_inobt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
{
- xfs_agblock_t agbno; /* a.g. relative btree block number */
- xfs_agnumber_t agno; /* allocation group number */
- xfs_inobt_block_t *block=NULL; /* current btree block */
- __int64_t diff; /* difference for the current key */
- int error; /* error return value */
- int keyno=0; /* current key number */
- int level; /* level in the btree */
- xfs_mount_t *mp; /* file system mount point */
-
- /*
- * Get the allocation group header, and the root block number.
- */
- mp = cur->bc_mp;
- {
- xfs_agi_t *agi; /* a.g. inode header */
-
- agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp);
- agno = be32_to_cpu(agi->agi_seqno);
- agbno = be32_to_cpu(agi->agi_root);
- }
- /*
- * Iterate over each level in the btree, starting at the root.
- * For each level above the leaves, find the key we need, based
- * on the lookup record, then follow the corresponding block
- * pointer down to the next level.
- */
- for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
- xfs_buf_t *bp; /* buffer pointer for btree block */
- xfs_daddr_t d; /* disk address of btree block */
-
- /*
- * Get the disk address we're looking for.
- */
- d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- /*
- * If the old buffer at this level is for a different block,
- * throw it away, otherwise just use it.
- */
- bp = cur->bc_bufs[level];
- if (bp && XFS_BUF_ADDR(bp) != d)
- bp = (xfs_buf_t *)0;
- if (!bp) {
- /*
- * Need to get a new buffer. Read it, then
- * set it in the cursor, releasing the old one.
- */
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- agno, agbno, 0, &bp, XFS_INO_BTREE_REF)))
- return error;
- xfs_btree_setbuf(cur, level, bp);
- /*
- * Point to the btree block, now that we have the buffer
- */
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
- if ((error = xfs_btree_check_sblock(cur, block, level,
- bp)))
- return error;
- } else
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
- /*
- * If we already had a key match at a higher level, we know
- * we need to use the first entry in this block.
- */
- if (diff == 0)
- keyno = 1;
- /*
- * Otherwise we need to search this block. Do a binary search.
- */
- else {
- int high; /* high entry number */
- xfs_inobt_key_t *kkbase=NULL;/* base of keys in block */
- xfs_inobt_rec_t *krbase=NULL;/* base of records in block */
- int low; /* low entry number */
-
- /*
- * Get a pointer to keys or records.
- */
- if (level > 0)
- kkbase = XFS_INOBT_KEY_ADDR(block, 1, cur);
- else
- krbase = XFS_INOBT_REC_ADDR(block, 1, cur);
- /*
- * Set low and high entry numbers, 1-based.
- */
- low = 1;
- if (!(high = be16_to_cpu(block->bb_numrecs))) {
- /*
- * If the block is empty, the tree must
- * be an empty leaf.
- */
- ASSERT(level == 0 && cur->bc_nlevels == 1);
- cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
- *stat = 0;
- return 0;
- }
- /*
- * Binary search the block.
- */
- while (low <= high) {
- xfs_agino_t startino; /* key value */
-
- /*
- * keyno is average of low and high.
- */
- keyno = (low + high) >> 1;
- /*
- * Get startino.
- */
- if (level > 0) {
- xfs_inobt_key_t *kkp;
-
- kkp = kkbase + keyno - 1;
- startino = INT_GET(kkp->ir_startino, ARCH_CONVERT);
- } else {
- xfs_inobt_rec_t *krp;
-
- krp = krbase + keyno - 1;
- startino = INT_GET(krp->ir_startino, ARCH_CONVERT);
- }
- /*
- * Compute difference to get next direction.
- */
- diff = (__int64_t)
- startino - cur->bc_rec.i.ir_startino;
- /*
- * Less than, move right.
- */
- if (diff < 0)
- low = keyno + 1;
- /*
- * Greater than, move left.
- */
- else if (diff > 0)
- high = keyno - 1;
- /*
- * Equal, we're done.
- */
- else
- break;
- }
- }
- /*
- * If there are more levels, set up for the next level
- * by getting the block number and filling in the cursor.
- */
- if (level > 0) {
- /*
- * If we moved left, need the previous key number,
- * unless there isn't one.
- */
- if (diff > 0 && --keyno < 1)
- keyno = 1;
- agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, keyno, cur));
-#ifdef DEBUG
- if ((error = xfs_btree_check_sptr(cur, agbno, level)))
- return error;
-#endif
- cur->bc_ptrs[level] = keyno;
- }
- }
- /*
- * Done with the search.
- * See if we need to adjust the results.
- */
- if (dir != XFS_LOOKUP_LE && diff < 0) {
- keyno++;
- /*
- * If ge search and we went off the end of the block, but it's
- * not the last block, we're in the wrong block.
- */
- if (dir == XFS_LOOKUP_GE &&
- keyno > be16_to_cpu(block->bb_numrecs) &&
- be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
- int i;
-
- cur->bc_ptrs[0] = keyno;
- if ((error = xfs_inobt_increment(cur, 0, &i)))
- return error;
- ASSERT(i == 1);
- *stat = 1;
- return 0;
- }
- }
- else if (dir == XFS_LOOKUP_LE && diff > 0)
- keyno--;
- cur->bc_ptrs[0] = keyno;
- /*
- * Return if we succeeded or not.
- */
- if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs))
- *stat = 0;
- else
- *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
- return 0;
+ rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
+ rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
+ rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
}
/*
- * Move 1 record left from cur/level if possible.
- * Update cur to reflect the new path.
+ * initial value of ptr for lookup
*/
-STATIC int /* error */
-xfs_inobt_lshift(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level to shift record on */
- int *stat) /* success/failure */
+STATIC void
+xfs_inobt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
{
- int error; /* error return value */
-#ifdef DEBUG
- int i; /* loop index */
-#endif
- xfs_inobt_key_t key; /* key value for leaf level upward */
- xfs_buf_t *lbp; /* buffer for left neighbor block */
- xfs_inobt_block_t *left; /* left neighbor btree block */
- xfs_inobt_key_t *lkp=NULL; /* key pointer for left block */
- xfs_inobt_ptr_t *lpp; /* address pointer for left block */
- xfs_inobt_rec_t *lrp=NULL; /* record pointer for left block */
- int nrec; /* new number of left block entries */
- xfs_buf_t *rbp; /* buffer for right (current) block */
- xfs_inobt_block_t *right; /* right (current) btree block */
- xfs_inobt_key_t *rkp=NULL; /* key pointer for right block */
- xfs_inobt_ptr_t *rpp=NULL; /* address pointer for right block */
- xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
- /*
- * Set up variables for this block as "right".
- */
- rbp = cur->bc_bufs[level];
- right = XFS_BUF_TO_INOBT_BLOCK(rbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
- return error;
-#endif
- /*
- * If we've got no left sibling then we can't shift an entry left.
- */
- if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- /*
- * If the cursor entry is the one that would be moved, don't
- * do it... it's too complicated.
- */
- if (cur->bc_ptrs[level] <= 1) {
- *stat = 0;
- return 0;
- }
- /*
- * Set up the left neighbor as "left".
- */
- if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.i.agno, be32_to_cpu(right->bb_leftsib),
- 0, &lbp, XFS_INO_BTREE_REF)))
- return error;
- left = XFS_BUF_TO_INOBT_BLOCK(lbp);
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- return error;
- /*
- * If it's full, it can't take another entry.
- */
- if (be16_to_cpu(left->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
- *stat = 0;
- return 0;
- }
- nrec = be16_to_cpu(left->bb_numrecs) + 1;
- /*
- * If non-leaf, copy a key and a ptr to the left block.
- */
- if (level > 0) {
- lkp = XFS_INOBT_KEY_ADDR(left, nrec, cur);
- rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
- *lkp = *rkp;
- xfs_inobt_log_keys(cur, lbp, nrec, nrec);
- lpp = XFS_INOBT_PTR_ADDR(left, nrec, cur);
- rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
- return error;
-#endif
- *lpp = *rpp; /* INT_: no-change copy */
- xfs_inobt_log_ptrs(cur, lbp, nrec, nrec);
- }
- /*
- * If leaf, copy a record to the left block.
- */
- else {
- lrp = XFS_INOBT_REC_ADDR(left, nrec, cur);
- rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
- *lrp = *rrp;
- xfs_inobt_log_recs(cur, lbp, nrec, nrec);
- }
- /*
- * Bump and log left's numrecs, decrement and log right's numrecs.
- */
- be16_add(&left->bb_numrecs, 1);
- xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
-#ifdef DEBUG
- if (level > 0)
- xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
- else
- xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
-#endif
- be16_add(&right->bb_numrecs, -1);
- xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
- /*
- * Slide the contents of right down one entry.
- */
- if (level > 0) {
-#ifdef DEBUG
- for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]),
- level)))
- return error;
- }
-#endif
- memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
- memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
- xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- } else {
- memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
- xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- key.ir_startino = rrp->ir_startino; /* INT_: direct copy */
- rkp = &key;
- }
- /*
- * Update the parent key values of right.
- */
- if ((error = xfs_inobt_updkey(cur, rkp, level + 1)))
- return error;
- /*
- * Slide the cursor value left one.
- */
- cur->bc_ptrs[level]--;
- *stat = 1;
- return 0;
+ ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+
+ ptr->s = agi->agi_root;
}
-/*
- * Allocate a new root block, fill it in.
- */
-STATIC int /* error */
-xfs_inobt_newroot(
- xfs_btree_cur_t *cur, /* btree cursor */
- int *stat) /* success/failure */
+STATIC void
+xfs_finobt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
{
- xfs_agi_t *agi; /* a.g. inode header */
- xfs_alloc_arg_t args; /* allocation argument structure */
- xfs_inobt_block_t *block; /* one half of the old root block */
- xfs_buf_t *bp; /* buffer containing block */
- int error; /* error return value */
- xfs_inobt_key_t *kp; /* btree key pointer */
- xfs_agblock_t lbno; /* left block number */
- xfs_buf_t *lbp; /* left buffer pointer */
- xfs_inobt_block_t *left; /* left btree block */
- xfs_buf_t *nbp; /* new (root) buffer */
- xfs_inobt_block_t *new; /* new (root) btree block */
- int nptr; /* new value for key index, 1 or 2 */
- xfs_inobt_ptr_t *pp; /* btree address pointer */
- xfs_agblock_t rbno; /* right block number */
- xfs_buf_t *rbp; /* right buffer pointer */
- xfs_inobt_block_t *right; /* right btree block */
- xfs_inobt_rec_t *rp; /* btree record pointer */
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
- ASSERT(cur->bc_nlevels < XFS_IN_MAXLEVELS(cur->bc_mp));
-
- /*
- * Get a block & a buffer.
- */
- agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp);
- args.tp = cur->bc_tp;
- args.mp = cur->bc_mp;
- args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno,
- be32_to_cpu(agi->agi_root));
- args.mod = args.minleft = args.alignment = args.total = args.wasdel =
- args.isfl = args.userdata = args.minalignslop = 0;
- args.minlen = args.maxlen = args.prod = 1;
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- if ((error = xfs_alloc_vextent(&args)))
- return error;
- /*
- * None available, we fail.
- */
- if (args.fsbno == NULLFSBLOCK) {
- *stat = 0;
- return 0;
- }
- ASSERT(args.len == 1);
- nbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
- new = XFS_BUF_TO_INOBT_BLOCK(nbp);
- /*
- * Set the root data in the a.g. inode structure.
- */
- agi->agi_root = cpu_to_be32(args.agbno);
- be32_add(&agi->agi_level, 1);
- xfs_ialloc_log_agi(args.tp, cur->bc_private.i.agbp,
- XFS_AGI_ROOT | XFS_AGI_LEVEL);
- /*
- * At the previous root level there are now two blocks: the old
- * root, and the new block generated when it was split.
- * We don't know which one the cursor is pointing at, so we
- * set up variables "left" and "right" for each case.
- */
- bp = cur->bc_bufs[cur->bc_nlevels - 1];
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, cur->bc_nlevels - 1, bp)))
- return error;
-#endif
- if (be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
- /*
- * Our block is left, pick up the right block.
- */
- lbp = bp;
- lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
- left = block;
- rbno = be32_to_cpu(left->bb_rightsib);
- if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
- rbno, 0, &rbp, XFS_INO_BTREE_REF)))
- return error;
- bp = rbp;
- right = XFS_BUF_TO_INOBT_BLOCK(rbp);
- if ((error = xfs_btree_check_sblock(cur, right,
- cur->bc_nlevels - 1, rbp)))
- return error;
- nptr = 1;
- } else {
- /*
- * Our block is right, pick up the left block.
- */
- rbp = bp;
- rbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(rbp));
- right = block;
- lbno = be32_to_cpu(right->bb_leftsib);
- if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
- lbno, 0, &lbp, XFS_INO_BTREE_REF)))
- return error;
- bp = lbp;
- left = XFS_BUF_TO_INOBT_BLOCK(lbp);
- if ((error = xfs_btree_check_sblock(cur, left,
- cur->bc_nlevels - 1, lbp)))
- return error;
- nptr = 2;
- }
- /*
- * Fill in the new block's btree header and log it.
- */
- new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
- new->bb_level = cpu_to_be16(cur->bc_nlevels);
- new->bb_numrecs = cpu_to_be16(2);
- new->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
- new->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
- xfs_inobt_log_block(args.tp, nbp, XFS_BB_ALL_BITS);
- ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
- /*
- * Fill in the key data in the new root.
- */
- kp = XFS_INOBT_KEY_ADDR(new, 1, cur);
- if (be16_to_cpu(left->bb_level) > 0) {
- kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur); /* INT_: struct copy */
- kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur); /* INT_: struct copy */
- } else {
- rp = XFS_INOBT_REC_ADDR(left, 1, cur);
- INT_COPY(kp[0].ir_startino, rp->ir_startino, ARCH_CONVERT);
- rp = XFS_INOBT_REC_ADDR(right, 1, cur);
- INT_COPY(kp[1].ir_startino, rp->ir_startino, ARCH_CONVERT);
- }
- xfs_inobt_log_keys(cur, nbp, 1, 2);
- /*
- * Fill in the pointer data in the new root.
- */
- pp = XFS_INOBT_PTR_ADDR(new, 1, cur);
- pp[0] = cpu_to_be32(lbno);
- pp[1] = cpu_to_be32(rbno);
- xfs_inobt_log_ptrs(cur, nbp, 1, 2);
- /*
- * Fix up the cursor.
- */
- xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
- cur->bc_ptrs[cur->bc_nlevels] = nptr;
- cur->bc_nlevels++;
- *stat = 1;
- return 0;
+ ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+ ptr->s = agi->agi_free_root;
}
-/*
- * Move 1 record right from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int /* error */
-xfs_inobt_rshift(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level to shift record on */
- int *stat) /* success/failure */
+STATIC __int64_t
+xfs_inobt_key_diff(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
{
- int error; /* error return value */
- int i; /* loop index */
- xfs_inobt_key_t key; /* key value for leaf level upward */
- xfs_buf_t *lbp; /* buffer for left (current) block */
- xfs_inobt_block_t *left; /* left (current) btree block */
- xfs_inobt_key_t *lkp; /* key pointer for left block */
- xfs_inobt_ptr_t *lpp; /* address pointer for left block */
- xfs_inobt_rec_t *lrp; /* record pointer for left block */
- xfs_buf_t *rbp; /* buffer for right neighbor block */
- xfs_inobt_block_t *right; /* right neighbor btree block */
- xfs_inobt_key_t *rkp; /* key pointer for right block */
- xfs_inobt_ptr_t *rpp; /* address pointer for right block */
- xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */
- xfs_btree_cur_t *tcur; /* temporary cursor */
-
- /*
- * Set up variables for this block as "left".
- */
- lbp = cur->bc_bufs[level];
- left = XFS_BUF_TO_INOBT_BLOCK(lbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- return error;
-#endif
- /*
- * If we've got no right sibling then we can't shift an entry right.
- */
- if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- /*
- * If the cursor entry is the one that would be moved, don't
- * do it... it's too complicated.
- */
- if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
- *stat = 0;
- return 0;
- }
- /*
- * Set up the right neighbor as "right".
- */
- if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib),
- 0, &rbp, XFS_INO_BTREE_REF)))
- return error;
- right = XFS_BUF_TO_INOBT_BLOCK(rbp);
- if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
- return error;
- /*
- * If it's full, it can't take another entry.
- */
- if (be16_to_cpu(right->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
- *stat = 0;
- return 0;
- }
- /*
- * Make a hole at the start of the right neighbor block, then
- * copy the last left block entry to the hole.
- */
- if (level > 0) {
- lkp = XFS_INOBT_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
- lpp = XFS_INOBT_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
- rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
- rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
- for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
- return error;
- }
-#endif
- memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
- memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-#ifdef DEBUG
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
- return error;
-#endif
- *rkp = *lkp; /* INT_: no change copy */
- *rpp = *lpp; /* INT_: no change copy */
- xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
- xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
- } else {
- lrp = XFS_INOBT_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
- rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
- memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
- *rrp = *lrp;
- xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
- key.ir_startino = rrp->ir_startino; /* INT_: direct copy */
- rkp = &key;
- }
- /*
- * Decrement and log left's numrecs, bump and log right's numrecs.
- */
- be16_add(&left->bb_numrecs, -1);
- xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
- be16_add(&right->bb_numrecs, 1);
-#ifdef DEBUG
- if (level > 0)
- xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
- else
- xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
-#endif
- xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
- /*
- * Using a temporary cursor, update the parent key values of the
- * block on the right.
- */
- if ((error = xfs_btree_dup_cursor(cur, &tcur)))
- return error;
- xfs_btree_lastrec(tcur, level);
- if ((error = xfs_inobt_increment(tcur, level, &i)) ||
- (error = xfs_inobt_updkey(tcur, rkp, level + 1))) {
- xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
- return error;
- }
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- *stat = 1;
- return 0;
+ return (__int64_t)be32_to_cpu(key->inobt.ir_startino) -
+ cur->bc_rec.i.ir_startino;
}
-/*
- * Split cur/level block in half.
- * Return new block number and its first record (to be inserted into parent).
- */
-STATIC int /* error */
-xfs_inobt_split(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level to split */
- xfs_agblock_t *bnop, /* output: block number allocated */
- xfs_inobt_key_t *keyp, /* output: first key of new block */
- xfs_btree_cur_t **curp, /* output: new cursor */
- int *stat) /* success/failure */
+static int
+xfs_inobt_verify(
+ struct xfs_buf *bp)
{
- xfs_alloc_arg_t args; /* allocation argument structure */
- int error; /* error return value */
- int i; /* loop index/record number */
- xfs_agblock_t lbno; /* left (current) block number */
- xfs_buf_t *lbp; /* buffer for left block */
- xfs_inobt_block_t *left; /* left (current) btree block */
- xfs_inobt_key_t *lkp; /* left btree key pointer */
- xfs_inobt_ptr_t *lpp; /* left btree address pointer */
- xfs_inobt_rec_t *lrp; /* left btree record pointer */
- xfs_buf_t *rbp; /* buffer for right block */
- xfs_inobt_block_t *right; /* right (new) btree block */
- xfs_inobt_key_t *rkp; /* right btree key pointer */
- xfs_inobt_ptr_t *rpp; /* right btree address pointer */
- xfs_inobt_rec_t *rrp; /* right btree record pointer */
-
- /*
- * Set up left block (current one).
- */
- lbp = cur->bc_bufs[level];
- args.tp = cur->bc_tp;
- args.mp = cur->bc_mp;
- lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
- /*
- * Allocate the new block.
- * If we can't do it, we're toast. Give up.
- */
- args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, lbno);
- args.mod = args.minleft = args.alignment = args.total = args.wasdel =
- args.isfl = args.userdata = args.minalignslop = 0;
- args.minlen = args.maxlen = args.prod = 1;
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- if ((error = xfs_alloc_vextent(&args)))
- return error;
- if (args.fsbno == NULLFSBLOCK) {
- *stat = 0;
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+ unsigned int level;
+
+ /*
+ * During growfs operations, we can't verify the exact owner as the
+ * perag is not fully initialised and hence not attached to the buffer.
+ *
+ * Similarly, during log recovery we will have a perag structure
+ * attached, but the agi information will not yet have been initialised
+ * from the on disk AGI. We don't currently use any of this information,
+ * but beware of the landmine (i.e. need to check pag->pagi_init) if we
+ * ever do.
+ */
+ switch (block->bb_magic) {
+ case cpu_to_be32(XFS_IBT_CRC_MAGIC):
+ case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+ return false;
+ if (pag &&
+ be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ return false;
+ /* fall through */
+ case cpu_to_be32(XFS_IBT_MAGIC):
+ case cpu_to_be32(XFS_FIBT_MAGIC):
+ break;
+ default:
return 0;
}
- ASSERT(args.len == 1);
- rbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
- /*
- * Set up the new block as "right".
- */
- right = XFS_BUF_TO_INOBT_BLOCK(rbp);
- /*
- * "Left" is the current (according to the cursor) block.
- */
- left = XFS_BUF_TO_INOBT_BLOCK(lbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- return error;
-#endif
- /*
- * Fill in the btree header for the new block.
- */
- right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
- right->bb_level = left->bb_level;
- right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
- /*
- * Make sure that if there's an odd number of entries now, that
- * each new block will have the same number of entries.
- */
- if ((be16_to_cpu(left->bb_numrecs) & 1) &&
- cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
- be16_add(&right->bb_numrecs, 1);
- i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
- /*
- * For non-leaf blocks, copy keys and addresses over to the new block.
- */
- if (level > 0) {
- lkp = XFS_INOBT_KEY_ADDR(left, i, cur);
- lpp = XFS_INOBT_PTR_ADDR(left, i, cur);
- rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
- rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
- for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
- return error;
- }
-#endif
- memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
- memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
- xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- *keyp = *rkp;
- }
- /*
- * For leaf blocks, copy records over to the new block.
- */
- else {
- lrp = XFS_INOBT_REC_ADDR(left, i, cur);
- rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
- memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
- xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- keyp->ir_startino = rrp->ir_startino; /* INT_: direct copy */
- }
- /*
- * Find the left block number by looking in the buffer.
- * Adjust numrecs, sibling pointers.
- */
- be16_add(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
- right->bb_rightsib = left->bb_rightsib;
- left->bb_rightsib = cpu_to_be32(args.agbno);
- right->bb_leftsib = cpu_to_be32(lbno);
- xfs_inobt_log_block(args.tp, rbp, XFS_BB_ALL_BITS);
- xfs_inobt_log_block(args.tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
- /*
- * If there's a block to the new block's right, make that block
- * point back to right instead of to left.
- */
- if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) {
- xfs_inobt_block_t *rrblock; /* rr btree block */
- xfs_buf_t *rrbp; /* buffer for rrblock */
- if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
- be32_to_cpu(right->bb_rightsib), 0, &rrbp,
- XFS_INO_BTREE_REF)))
- return error;
- rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
- if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
- return error;
- rrblock->bb_leftsib = cpu_to_be32(args.agbno);
- xfs_inobt_log_block(args.tp, rrbp, XFS_BB_LEFTSIB);
- }
- /*
- * If the cursor is really in the right block, move it there.
- * If it's just pointing past the last entry in left, then we'll
- * insert there, so don't change anything in that case.
- */
- if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
- xfs_btree_setbuf(cur, level, rbp);
- cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
- }
- /*
- * If there are more levels, we'll need another cursor which refers
- * the right block, no matter where this cursor was.
- */
- if (level + 1 < cur->bc_nlevels) {
- if ((error = xfs_btree_dup_cursor(cur, curp)))
- return error;
- (*curp)->bc_ptrs[level + 1]++;
- }
- *bnop = args.agbno;
- *stat = 1;
- return 0;
+ /* numrecs and level verification */
+ level = be16_to_cpu(block->bb_level);
+ if (level >= mp->m_in_maxlevels)
+ return false;
+ if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0])
+ return false;
+
+ /* sibling pointer verification */
+ if (!block->bb_u.s.bb_leftsib ||
+ (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+ block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+ return false;
+ if (!block->bb_u.s.bb_rightsib ||
+ (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+ block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+ return false;
+
+ return true;
}
-/*
- * Update keys at all levels from here to the root along the cursor's path.
- */
-STATIC int /* error */
-xfs_inobt_updkey(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_inobt_key_t *keyp, /* new key value to update to */
- int level) /* starting level for update */
+static void
+xfs_inobt_read_verify(
+ struct xfs_buf *bp)
{
- int ptr; /* index of key in block */
-
- /*
- * Go up the tree from this level toward the root.
- * At each level, update the key value to the value input.
- * Stop when we reach a level where the cursor isn't pointing
- * at the first entry in the block.
- */
- for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
- xfs_buf_t *bp; /* buffer for block */
- xfs_inobt_block_t *block; /* btree block */
-#ifdef DEBUG
- int error; /* error return value */
-#endif
- xfs_inobt_key_t *kp; /* ptr to btree block keys */
+ if (!xfs_btree_sblock_verify_crc(bp))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_inobt_verify(bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
- return error;
-#endif
- ptr = cur->bc_ptrs[level];
- kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
- *kp = *keyp;
- xfs_inobt_log_keys(cur, bp, ptr, ptr);
+ if (bp->b_error) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp);
}
- return 0;
}
-/*
- * Externally visible routines.
- */
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int /* error */
-xfs_inobt_decrement(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level in btree, 0 is leaf */
- int *stat) /* success/failure */
+static void
+xfs_inobt_write_verify(
+ struct xfs_buf *bp)
{
- xfs_inobt_block_t *block; /* btree block */
- int error;
- int lev; /* btree level */
-
- ASSERT(level < cur->bc_nlevels);
- /*
- * Read-ahead to the left at this level.
- */
- xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
- /*
- * Decrement the ptr at this level. If we're still in the block
- * then we're done.
- */
- if (--cur->bc_ptrs[level] > 0) {
- *stat = 1;
- return 0;
- }
- /*
- * Get a pointer to the btree block.
- */
- block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[level]);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level,
- cur->bc_bufs[level])))
- return error;
-#endif
- /*
- * If we just went off the left edge of the tree, return failure.
- */
- if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- /*
- * March up the tree decrementing pointers.
- * Stop when we don't go off the left edge of a block.
- */
- for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
- if (--cur->bc_ptrs[lev] > 0)
- break;
- /*
- * Read-ahead the left block, we're going to read it
- * in the next loop.
- */
- xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+ if (!xfs_inobt_verify(bp)) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
}
- /*
- * If we went off the root then we are seriously confused.
- */
- ASSERT(lev < cur->bc_nlevels);
- /*
- * Now walk back down the tree, fixing up the cursor's buffer
- * pointers and key numbers.
- */
- for (block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
- xfs_agblock_t agbno; /* block number of btree block */
- xfs_buf_t *bp; /* buffer containing btree block */
+ xfs_btree_sblock_calc_crc(bp);
- agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
- if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.i.agno, agbno, 0, &bp,
- XFS_INO_BTREE_REF)))
- return error;
- lev--;
- xfs_btree_setbuf(cur, lev, bp);
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
- if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
- return error;
- cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
- }
- *stat = 1;
- return 0;
}
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-int /* error */
-xfs_inobt_delete(
- xfs_btree_cur_t *cur, /* btree cursor */
- int *stat) /* success/failure */
+const struct xfs_buf_ops xfs_inobt_buf_ops = {
+ .verify_read = xfs_inobt_read_verify,
+ .verify_write = xfs_inobt_write_verify,
+};
+
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int
+xfs_inobt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
{
- int error;
- int i; /* result code */
- int level; /* btree level */
-
- /*
- * Go up the tree, starting at leaf level.
- * If 2 is returned then a join was done; go to the next level.
- * Otherwise we are done.
- */
- for (level = 0, i = 2; i == 2; level++) {
- if ((error = xfs_inobt_delrec(cur, level, &i)))
- return error;
- }
- if (i == 0) {
- for (level = 1; level < cur->bc_nlevels; level++) {
- if (cur->bc_ptrs[level] == 0) {
- if ((error = xfs_inobt_decrement(cur, level, &i)))
- return error;
- break;
- }
- }
- }
- *stat = i;
- return 0;
+ return be32_to_cpu(k1->inobt.ir_startino) <
+ be32_to_cpu(k2->inobt.ir_startino);
}
-
-/*
- * Get the data from the pointed-to record.
- */
-int /* error */
-xfs_inobt_get_rec(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agino_t *ino, /* output: starting inode of chunk */
- __int32_t *fcnt, /* output: number of free inodes */
- xfs_inofree_t *free, /* output: free inode mask */
- int *stat) /* output: success/failure */
+STATIC int
+xfs_inobt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *r1,
+ union xfs_btree_rec *r2)
{
- xfs_inobt_block_t *block; /* btree block */
- xfs_buf_t *bp; /* buffer containing btree block */
-#ifdef DEBUG
- int error; /* error return value */
+ return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
+ be32_to_cpu(r2->inobt.ir_startino);
+}
+#endif /* DEBUG */
+
+static const struct xfs_btree_ops xfs_inobt_ops = {
+ .rec_len = sizeof(xfs_inobt_rec_t),
+ .key_len = sizeof(xfs_inobt_key_t),
+
+ .dup_cursor = xfs_inobt_dup_cursor,
+ .set_root = xfs_inobt_set_root,
+ .alloc_block = xfs_inobt_alloc_block,
+ .free_block = xfs_inobt_free_block,
+ .get_minrecs = xfs_inobt_get_minrecs,
+ .get_maxrecs = xfs_inobt_get_maxrecs,
+ .init_key_from_rec = xfs_inobt_init_key_from_rec,
+ .init_rec_from_key = xfs_inobt_init_rec_from_key,
+ .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
+ .key_diff = xfs_inobt_key_diff,
+ .buf_ops = &xfs_inobt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+ .keys_inorder = xfs_inobt_keys_inorder,
+ .recs_inorder = xfs_inobt_recs_inorder,
#endif
- int ptr; /* record number */
- xfs_inobt_rec_t *rec; /* record data */
-
- bp = cur->bc_bufs[0];
- ptr = cur->bc_ptrs[0];
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
- return error;
+};
+
+static const struct xfs_btree_ops xfs_finobt_ops = {
+ .rec_len = sizeof(xfs_inobt_rec_t),
+ .key_len = sizeof(xfs_inobt_key_t),
+
+ .dup_cursor = xfs_inobt_dup_cursor,
+ .set_root = xfs_finobt_set_root,
+ .alloc_block = xfs_inobt_alloc_block,
+ .free_block = xfs_inobt_free_block,
+ .get_minrecs = xfs_inobt_get_minrecs,
+ .get_maxrecs = xfs_inobt_get_maxrecs,
+ .init_key_from_rec = xfs_inobt_init_key_from_rec,
+ .init_rec_from_key = xfs_inobt_init_rec_from_key,
+ .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
+ .key_diff = xfs_inobt_key_diff,
+ .buf_ops = &xfs_inobt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+ .keys_inorder = xfs_inobt_keys_inorder,
+ .recs_inorder = xfs_inobt_recs_inorder,
#endif
- /*
- * Off the right end or left end, return failure.
- */
- if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) {
- *stat = 0;
- return 0;
- }
- /*
- * Point to the record and extract its data.
- */
- rec = XFS_INOBT_REC_ADDR(block, ptr, cur);
- *ino = INT_GET(rec->ir_startino, ARCH_CONVERT);
- *fcnt = INT_GET(rec->ir_freecount, ARCH_CONVERT);
- *free = INT_GET(rec->ir_free, ARCH_CONVERT);
- *stat = 1;
- return 0;
-}
+};
/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
+ * Allocate a new inode btree cursor.
*/
-int /* error */
-xfs_inobt_increment(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level in btree, 0 is leaf */
- int *stat) /* success/failure */
+struct xfs_btree_cur * /* new inode btree cursor */
+xfs_inobt_init_cursor(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_buf *agbp, /* buffer for agi structure */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_btnum_t btnum) /* ialloc or free ino btree */
{
- xfs_inobt_block_t *block; /* btree block */
- xfs_buf_t *bp; /* buffer containing btree block */
- int error; /* error return value */
- int lev; /* btree level */
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_btree_cur *cur;
- ASSERT(level < cur->bc_nlevels);
- /*
- * Read-ahead to the right at this level.
- */
- xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
- /*
- * Get a pointer to the btree block.
- */
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
- return error;
-#endif
- /*
- * Increment the ptr at this level. If we're still in the block
- * then we're done.
- */
- if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
- *stat = 1;
- return 0;
- }
- /*
- * If we just went off the right edge of the tree, return failure.
- */
- if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- /*
- * March up the tree incrementing pointers.
- * Stop when we don't go off the right edge of a block.
- */
- for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
- bp = cur->bc_bufs[lev];
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
- return error;
-#endif
- if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
- break;
- /*
- * Read-ahead the right block, we're going to read it
- * in the next loop.
- */
- xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
- }
- /*
- * If we went off the root then we are seriously confused.
- */
- ASSERT(lev < cur->bc_nlevels);
- /*
- * Now walk back down the tree, fixing up the cursor's buffer
- * pointers and key numbers.
- */
- for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_INOBT_BLOCK(bp);
- lev > level; ) {
- xfs_agblock_t agbno; /* block number of btree block */
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
- agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
- if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.i.agno, agbno, 0, &bp,
- XFS_INO_BTREE_REF)))
- return error;
- lev--;
- xfs_btree_setbuf(cur, lev, bp);
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
- if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
- return error;
- cur->bc_ptrs[lev] = 1;
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_btnum = btnum;
+ if (btnum == XFS_BTNUM_INO) {
+ cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+ cur->bc_ops = &xfs_inobt_ops;
+ } else {
+ cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+ cur->bc_ops = &xfs_finobt_ops;
}
- *stat = 1;
- return 0;
-}
-
-/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
- */
-int /* error */
-xfs_inobt_insert(
- xfs_btree_cur_t *cur, /* btree cursor */
- int *stat) /* success/failure */
-{
- int error; /* error return value */
- int i; /* result value, 0 for failure */
- int level; /* current level number in btree */
- xfs_agblock_t nbno; /* new block number (split result) */
- xfs_btree_cur_t *ncur; /* new cursor (split result) */
- xfs_inobt_rec_t nrec; /* record being inserted this level */
- xfs_btree_cur_t *pcur; /* previous level's cursor */
- level = 0;
- nbno = NULLAGBLOCK;
- INT_SET(nrec.ir_startino, ARCH_CONVERT, cur->bc_rec.i.ir_startino);
- INT_SET(nrec.ir_freecount, ARCH_CONVERT, cur->bc_rec.i.ir_freecount);
- INT_SET(nrec.ir_free, ARCH_CONVERT, cur->bc_rec.i.ir_free);
- ncur = (xfs_btree_cur_t *)0;
- pcur = cur;
- /*
- * Loop going up the tree, starting at the leaf level.
- * Stop when we don't get a split block, that must mean that
- * the insert is finished with this level.
- */
- do {
- /*
- * Insert nrec/nbno into this level of the tree.
- * Note if we fail, nbno will be null.
- */
- if ((error = xfs_inobt_insrec(pcur, level++, &nbno, &nrec, &ncur,
- &i))) {
- if (pcur != cur)
- xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
- return error;
- }
- /*
- * See if the cursor we just used is trash.
- * Can't trash the caller's cursor, but otherwise we should
- * if ncur is a new cursor or we're about to be done.
- */
- if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
- cur->bc_nlevels = pcur->bc_nlevels;
- xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
- }
- /*
- * If we got a new cursor, switch to it.
- */
- if (ncur) {
- pcur = ncur;
- ncur = (xfs_btree_cur_t *)0;
- }
- } while (nbno != NULLAGBLOCK);
- *stat = i;
- return 0;
-}
+ cur->bc_blocklog = mp->m_sb.sb_blocklog;
-/*
- * Lookup the record equal to ino in the btree given by cur.
- */
-int /* error */
-xfs_inobt_lookup_eq(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free, /* free inode mask */
- int *stat) /* success/failure */
-{
- cur->bc_rec.i.ir_startino = ino;
- cur->bc_rec.i.ir_freecount = fcnt;
- cur->bc_rec.i.ir_free = free;
- return xfs_inobt_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
- */
-int /* error */
-xfs_inobt_lookup_ge(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free, /* free inode mask */
- int *stat) /* success/failure */
-{
- cur->bc_rec.i.ir_startino = ino;
- cur->bc_rec.i.ir_freecount = fcnt;
- cur->bc_rec.i.ir_free = free;
- return xfs_inobt_lookup(cur, XFS_LOOKUP_GE, stat);
-}
+ cur->bc_private.a.agbp = agbp;
+ cur->bc_private.a.agno = agno;
-/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
- */
-int /* error */
-xfs_inobt_lookup_le(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free, /* free inode mask */
- int *stat) /* success/failure */
-{
- cur->bc_rec.i.ir_startino = ino;
- cur->bc_rec.i.ir_freecount = fcnt;
- cur->bc_rec.i.ir_free = free;
- return xfs_inobt_lookup(cur, XFS_LOOKUP_LE, stat);
+ return cur;
}
/*
- * Update the record referred to by cur, to the value given
- * by [ino, fcnt, free].
- * This either works (return 0) or gets an EFSCORRUPTED error.
+ * Calculate number of records in an inobt btree block.
*/
-int /* error */
-xfs_inobt_update(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free) /* free inode mask */
+int
+xfs_inobt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
{
- xfs_inobt_block_t *block; /* btree block to update */
- xfs_buf_t *bp; /* buffer containing btree block */
- int error; /* error return value */
- int ptr; /* current record number (updating) */
- xfs_inobt_rec_t *rp; /* pointer to updated record */
+ blocklen -= XFS_INOBT_BLOCK_LEN(mp);
- /*
- * Pick up the current block.
- */
- bp = cur->bc_bufs[0];
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
- return error;
-#endif
- /*
- * Get the address of the rec to be updated.
- */
- ptr = cur->bc_ptrs[0];
- rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
- /*
- * Fill in the new contents and log them.
- */
- INT_SET(rp->ir_startino, ARCH_CONVERT, ino);
- INT_SET(rp->ir_freecount, ARCH_CONVERT, fcnt);
- INT_SET(rp->ir_free, ARCH_CONVERT, free);
- xfs_inobt_log_recs(cur, bp, ptr, ptr);
- /*
- * Updating first record in leaf. Pass new key value up to our parent.
- */
- if (ptr == 1) {
- xfs_inobt_key_t key; /* key containing [ino] */
-
- INT_SET(key.ir_startino, ARCH_CONVERT, ino);
- if ((error = xfs_inobt_updkey(cur, &key, 1)))
- return error;
- }
- return 0;
+ if (leaf)
+ return blocklen / sizeof(xfs_inobt_rec_t);
+ return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
}
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index ae3904cb1ee..d7ebea72c2d 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -24,154 +24,42 @@
struct xfs_buf;
struct xfs_btree_cur;
-struct xfs_btree_sblock;
struct xfs_mount;
/*
- * There is a btree for the inode map per allocation group.
+ * Btree block header size depends on a superblock flag.
*/
-#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
-
-typedef __uint64_t xfs_inofree_t;
-#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
-#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
-#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
-
-#define XFS_INOBT_MASKN(i,n) xfs_inobt_maskn(i,n)
-static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
-{
- return (((n) >= XFS_INODES_PER_CHUNK ? \
- (xfs_inofree_t)0 : ((xfs_inofree_t)1 << (n))) - 1) << (i);
-}
-
-/*
- * Data record structure
- */
-typedef struct xfs_inobt_rec
-{
- xfs_agino_t ir_startino; /* starting inode number */
- __int32_t ir_freecount; /* count of free inodes (set bits) */
- xfs_inofree_t ir_free; /* free inode mask */
-} xfs_inobt_rec_t;
-
-/*
- * Key structure
- */
-typedef struct xfs_inobt_key
-{
- xfs_agino_t ir_startino; /* starting inode number */
-} xfs_inobt_key_t;
-
-/* btree pointer type */
-typedef __be32 xfs_inobt_ptr_t;
-
-/* btree block header type */
-typedef struct xfs_btree_sblock xfs_inobt_block_t;
-
-#define XFS_BUF_TO_INOBT_BLOCK(bp) ((xfs_inobt_block_t *)XFS_BUF_PTR(bp))
-
-/*
- * Bit manipulations for ir_free.
- */
-#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
-#define XFS_INOBT_IS_FREE(rp,i) \
- (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0)
-#define XFS_INOBT_IS_FREE_DISK(rp,i) \
- ((INT_GET((rp)->ir_free,ARCH_CONVERT) & XFS_INOBT_MASK(i)) != 0)
-#define XFS_INOBT_SET_FREE(rp,i) ((rp)->ir_free |= XFS_INOBT_MASK(i))
-#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i))
-
-/*
- * Real block structures have a size equal to the disk block size.
- */
-#define XFS_INOBT_BLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog)
-#define XFS_INOBT_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_inobt_mxr[lev != 0])
-#define XFS_INOBT_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_inobt_mnr[lev != 0])
-#define XFS_INOBT_IS_LAST_REC(cur) \
- ((cur)->bc_ptrs[0] == be16_to_cpu(XFS_BUF_TO_INOBT_BLOCK((cur)->bc_bufs[0])->bb_numrecs))
-
-/*
- * Maximum number of inode btree levels.
- */
-#define XFS_IN_MAXLEVELS(mp) ((mp)->m_in_maxlevels)
-
-/*
- * block numbers in the AG.
- */
-#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
-#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
+#define XFS_INOBT_BLOCK_LEN(mp) \
+ (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
/*
* Record, key, and pointer address macros for btree blocks.
- */
-#define XFS_INOBT_REC_ADDR(bb,i,cur) \
- (XFS_BTREE_REC_ADDR(XFS_INOBT_BLOCK_SIZE(0,cur), xfs_inobt, bb, \
- i, XFS_INOBT_BLOCK_MAXRECS(0, cur)))
-#define XFS_INOBT_KEY_ADDR(bb,i,cur) \
- (XFS_BTREE_KEY_ADDR(XFS_INOBT_BLOCK_SIZE(1,cur), xfs_inobt, bb, \
- i, XFS_INOBT_BLOCK_MAXRECS(1, cur)))
-
-#define XFS_INOBT_PTR_ADDR(bb,i,cur) \
- (XFS_BTREE_PTR_ADDR(XFS_INOBT_BLOCK_SIZE(1,cur), xfs_inobt, bb, \
- i, XFS_INOBT_BLOCK_MAXRECS(1, cur)))
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_inobt_decrement(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-extern int xfs_inobt_delete(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Get the data from the pointed-to record.
- */
-extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
- __int32_t *fcnt, xfs_inofree_t *free, int *stat);
-
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_inobt_increment(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
- */
-extern int xfs_inobt_insert(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Lookup the record equal to ino in the btree given by cur.
- */
-extern int xfs_inobt_lookup_eq(struct xfs_btree_cur *cur, xfs_agino_t ino,
- __int32_t fcnt, xfs_inofree_t free, int *stat);
-
-/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
- */
-extern int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
- __int32_t fcnt, xfs_inofree_t free, int *stat);
-
-/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
- */
-extern int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
- __int32_t fcnt, xfs_inofree_t free, int *stat);
-
-/*
- * Update the record referred to by cur, to the value given
- * by [ino, fcnt, free].
- * This either works (return 0) or gets an EFSCORRUPTED error.
- */
-extern int xfs_inobt_update(struct xfs_btree_cur *cur, xfs_agino_t ino,
- __int32_t fcnt, xfs_inofree_t free);
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_INOBT_REC_ADDR(mp, block, index) \
+ ((xfs_inobt_rec_t *) \
+ ((char *)(block) + \
+ XFS_INOBT_BLOCK_LEN(mp) + \
+ (((index) - 1) * sizeof(xfs_inobt_rec_t))))
+
+#define XFS_INOBT_KEY_ADDR(mp, block, index) \
+ ((xfs_inobt_key_t *) \
+ ((char *)(block) + \
+ XFS_INOBT_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_inobt_key_t)))
+
+#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \
+ ((xfs_inobt_ptr_t *) \
+ ((char *)(block) + \
+ XFS_INOBT_BLOCK_LEN(mp) + \
+ (maxrecs) * sizeof(xfs_inobt_key_t) + \
+ ((index) - 1) * sizeof(xfs_inobt_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
+ struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t,
+ xfs_btnum_t);
+extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
new file mode 100644
index 00000000000..c48df5f25b9
--- /dev/null
+++ b/fs/xfs/xfs_icache.c
@@ -0,0 +1,1327 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_inum.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_inode_item.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
+ struct xfs_perag *pag, struct xfs_inode *ip);
+
+/*
+ * Allocate and initialise an xfs_inode.
+ */
+struct xfs_inode *
+xfs_inode_alloc(
+ struct xfs_mount *mp,
+ xfs_ino_t ino)
+{
+ struct xfs_inode *ip;
+
+ /*
+ * if this didn't occur in transactions, we could use
+ * KM_MAYFAIL and return NULL here on ENOMEM. Set the
+ * code up to do this anyway.
+ */
+ ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+ if (!ip)
+ return NULL;
+ if (inode_init_always(mp->m_super, VFS_I(ip))) {
+ kmem_zone_free(xfs_inode_zone, ip);
+ return NULL;
+ }
+
+ ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(!spin_is_locked(&ip->i_flags_lock));
+ ASSERT(!xfs_isiflocked(ip));
+ ASSERT(ip->i_ino == 0);
+
+ mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
+ /* initialise the xfs inode */
+ ip->i_ino = ino;
+ ip->i_mount = mp;
+ memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
+ ip->i_afp = NULL;
+ memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
+ ip->i_flags = 0;
+ ip->i_delayed_blks = 0;
+ memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+
+ return ip;
+}
+
+STATIC void
+xfs_inode_free_callback(
+ struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ struct xfs_inode *ip = XFS_I(inode);
+
+ kmem_zone_free(xfs_inode_zone, ip);
+}
+
+void
+xfs_inode_free(
+ struct xfs_inode *ip)
+{
+ switch (ip->i_d.di_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFLNK:
+ xfs_idestroy_fork(ip, XFS_DATA_FORK);
+ break;
+ }
+
+ if (ip->i_afp)
+ xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+ if (ip->i_itemp) {
+ ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
+ xfs_inode_item_destroy(ip);
+ ip->i_itemp = NULL;
+ }
+
+ /*
+ * Because we use RCU freeing we need to ensure the inode always
+ * appears to be reclaimed with an invalid inode number when in the
+ * free state. The ip->i_flags_lock provides the barrier against lookup
+ * races.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags = XFS_IRECLAIM;
+ ip->i_ino = 0;
+ spin_unlock(&ip->i_flags_lock);
+
+ /* asserts to verify all state is correct here */
+ ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(!xfs_isiflocked(ip));
+
+ call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
+/*
+ * Check the validity of the inode we just found it the cache
+ */
+static int
+xfs_iget_cache_hit(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip,
+ xfs_ino_t ino,
+ int flags,
+ int lock_flags) __releases(RCU)
+{
+ struct inode *inode = VFS_I(ip);
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ /*
+ * check for re-use of an inode within an RCU grace period due to the
+ * radix tree nodes not being updated yet. We monitor for this by
+ * setting the inode number to zero before freeing the inode structure.
+ * If the inode has been reallocated and set up, then the inode number
+ * will not match, so check for that, too.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != ino) {
+ trace_xfs_iget_skip(ip);
+ XFS_STATS_INC(xs_ig_frecycle);
+ error = EAGAIN;
+ goto out_error;
+ }
+
+
+ /*
+ * If we are racing with another cache hit that is currently
+ * instantiating this inode or currently recycling it out of
+ * reclaimabe state, wait for the initialisation to complete
+ * before continuing.
+ *
+ * XXX(hch): eventually we should do something equivalent to
+ * wait_on_inode to wait for these flags to be cleared
+ * instead of polling for it.
+ */
+ if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
+ trace_xfs_iget_skip(ip);
+ XFS_STATS_INC(xs_ig_frecycle);
+ error = EAGAIN;
+ goto out_error;
+ }
+
+ /*
+ * If lookup is racing with unlink return an error immediately.
+ */
+ if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+ error = ENOENT;
+ goto out_error;
+ }
+
+ /*
+ * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+ * Need to carefully get it back into useable state.
+ */
+ if (ip->i_flags & XFS_IRECLAIMABLE) {
+ trace_xfs_iget_reclaim(ip);
+
+ /*
+ * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
+ * from stomping over us while we recycle the inode. We can't
+ * clear the radix tree reclaimable tag yet as it requires
+ * pag_ici_lock to be held exclusive.
+ */
+ ip->i_flags |= XFS_IRECLAIM;
+
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+
+ error = -inode_init_always(mp->m_super, inode);
+ if (error) {
+ /*
+ * Re-initializing the inode failed, and we are in deep
+ * trouble. Try to re-add it to the reclaim list.
+ */
+ rcu_read_lock();
+ spin_lock(&ip->i_flags_lock);
+
+ ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+ ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
+ trace_xfs_iget_reclaim_fail(ip);
+ goto out_error;
+ }
+
+ spin_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ /*
+ * Clear the per-lifetime state in the inode as we are now
+ * effectively a new inode and need to return to the initial
+ * state before reuse occurs.
+ */
+ ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
+ ip->i_flags |= XFS_INEW;
+ __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+ inode->i_state = I_NEW;
+
+ ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+ mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
+ spin_unlock(&ip->i_flags_lock);
+ spin_unlock(&pag->pag_ici_lock);
+ } else {
+ /* If the VFS inode is being torn down, pause and try again. */
+ if (!igrab(inode)) {
+ trace_xfs_iget_skip(ip);
+ error = EAGAIN;
+ goto out_error;
+ }
+
+ /* We've got a live one. */
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ trace_xfs_iget_hit(ip);
+ }
+
+ if (lock_flags != 0)
+ xfs_ilock(ip, lock_flags);
+
+ xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
+ XFS_STATS_INC(xs_ig_found);
+
+ return 0;
+
+out_error:
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ return error;
+}
+
+
+static int
+xfs_iget_cache_miss(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ xfs_trans_t *tp,
+ xfs_ino_t ino,
+ struct xfs_inode **ipp,
+ int flags,
+ int lock_flags)
+{
+ struct xfs_inode *ip;
+ int error;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
+ int iflags;
+
+ ip = xfs_inode_alloc(mp, ino);
+ if (!ip)
+ return ENOMEM;
+
+ error = xfs_iread(mp, tp, ip, flags);
+ if (error)
+ goto out_destroy;
+
+ trace_xfs_iget_miss(ip);
+
+ if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+ error = ENOENT;
+ goto out_destroy;
+ }
+
+ /*
+ * Preload the radix tree so we can insert safely under the
+ * write spinlock. Note that we cannot sleep inside the preload
+ * region. Since we can be called from transaction context, don't
+ * recurse into the file system.
+ */
+ if (radix_tree_preload(GFP_NOFS)) {
+ error = EAGAIN;
+ goto out_destroy;
+ }
+
+ /*
+ * Because the inode hasn't been added to the radix-tree yet it can't
+ * be found by another thread, so we can do the non-sleeping lock here.
+ */
+ if (lock_flags) {
+ if (!xfs_ilock_nowait(ip, lock_flags))
+ BUG();
+ }
+
+ /*
+ * These values must be set before inserting the inode into the radix
+ * tree as the moment it is inserted a concurrent lookup (allowed by the
+ * RCU locking mechanism) can find it and that lookup must see that this
+ * is an inode currently under construction (i.e. that XFS_INEW is set).
+ * The ip->i_flags_lock that protects the XFS_INEW flag forms the
+ * memory barrier that ensures this detection works correctly at lookup
+ * time.
+ */
+ iflags = XFS_INEW;
+ if (flags & XFS_IGET_DONTCACHE)
+ iflags |= XFS_IDONTCACHE;
+ ip->i_udquot = NULL;
+ ip->i_gdquot = NULL;
+ ip->i_pdquot = NULL;
+ xfs_iflags_set(ip, iflags);
+
+ /* insert the new inode */
+ spin_lock(&pag->pag_ici_lock);
+ error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
+ if (unlikely(error)) {
+ WARN_ON(error != -EEXIST);
+ XFS_STATS_INC(xs_ig_dup);
+ error = EAGAIN;
+ goto out_preload_end;
+ }
+ spin_unlock(&pag->pag_ici_lock);
+ radix_tree_preload_end();
+
+ *ipp = ip;
+ return 0;
+
+out_preload_end:
+ spin_unlock(&pag->pag_ici_lock);
+ radix_tree_preload_end();
+ if (lock_flags)
+ xfs_iunlock(ip, lock_flags);
+out_destroy:
+ __destroy_inode(VFS_I(ip));
+ xfs_inode_free(ip);
+ return error;
+}
+
+/*
+ * Look up an inode by number in the given file system.
+ * The inode is looked up in the cache held in each AG.
+ * If the inode is found in the cache, initialise the vfs inode
+ * if necessary.
+ *
+ * If it is not in core, read it in from the file system's device,
+ * add it to the cache and initialise the vfs inode.
+ *
+ * The inode is locked according to the value of the lock_flags parameter.
+ * This flag parameter indicates how and if the inode's IO lock and inode lock
+ * should be taken.
+ *
+ * mp -- the mount point structure for the current file system. It points
+ * to the inode hash table.
+ * tp -- a pointer to the current transaction if there is one. This is
+ * simply passed through to the xfs_iread() call.
+ * ino -- the number of the inode desired. This is the unique identifier
+ * within the file system for the inode being requested.
+ * lock_flags -- flags indicating how to lock the inode. See the comment
+ * for xfs_ilock() for a list of valid values.
+ */
+int
+xfs_iget(
+ xfs_mount_t *mp,
+ xfs_trans_t *tp,
+ xfs_ino_t ino,
+ uint flags,
+ uint lock_flags,
+ xfs_inode_t **ipp)
+{
+ xfs_inode_t *ip;
+ int error;
+ xfs_perag_t *pag;
+ xfs_agino_t agino;
+
+ /*
+ * xfs_reclaim_inode() uses the ILOCK to ensure an inode
+ * doesn't get freed while it's being referenced during a
+ * radix tree traversal here. It assumes this function
+ * aqcuires only the ILOCK (and therefore it has no need to
+ * involve the IOLOCK in this synchronization).
+ */
+ ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
+
+ /* reject inode numbers outside existing AGs */
+ if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+ return EINVAL;
+
+ /* get the perag structure and ensure that it's inode capable */
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
+ agino = XFS_INO_TO_AGINO(mp, ino);
+
+again:
+ error = 0;
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+
+ if (ip) {
+ error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
+ if (error)
+ goto out_error_or_again;
+ } else {
+ rcu_read_unlock();
+ XFS_STATS_INC(xs_ig_missed);
+
+ error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
+ flags, lock_flags);
+ if (error)
+ goto out_error_or_again;
+ }
+ xfs_perag_put(pag);
+
+ *ipp = ip;
+
+ /*
+ * If we have a real type for an on-disk inode, we can set ops(&unlock)
+ * now. If it's a new inode being created, xfs_ialloc will handle it.
+ */
+ if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
+ xfs_setup_inode(ip);
+ return 0;
+
+out_error_or_again:
+ if (error == EAGAIN) {
+ delay(1);
+ goto again;
+ }
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * The inode lookup is done in batches to keep the amount of lock traffic and
+ * radix tree lookups to a minimum. The batch size is a trade off between
+ * lookup reduction and stack usage. This is in the reclaim path, so we can't
+ * be too greedy.
+ */
+#define XFS_LOOKUP_BATCH 32
+
+STATIC int
+xfs_inode_ag_walk_grab(
+ struct xfs_inode *ip)
+{
+ struct inode *inode = VFS_I(ip);
+
+ ASSERT(rcu_read_lock_held());
+
+ /*
+ * check for stale RCU freed inode
+ *
+ * If the inode has been reallocated, it doesn't matter if it's not in
+ * the AG we are walking - we are walking for writeback, so if it
+ * passes all the "valid inode" checks and is dirty, then we'll write
+ * it back anyway. If it has been reallocated and still being
+ * initialised, the XFS_INEW check below will catch it.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (!ip->i_ino)
+ goto out_unlock_noent;
+
+ /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+ if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+ goto out_unlock_noent;
+ spin_unlock(&ip->i_flags_lock);
+
+ /* nothing to sync during shutdown */
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return EFSCORRUPTED;
+
+ /* If we can't grab the inode, it must on it's way to reclaim. */
+ if (!igrab(inode))
+ return ENOENT;
+
+ /* inode is valid */
+ return 0;
+
+out_unlock_noent:
+ spin_unlock(&ip->i_flags_lock);
+ return ENOENT;
+}
+
+STATIC int
+xfs_inode_ag_walk(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ int (*execute)(struct xfs_inode *ip, int flags,
+ void *args),
+ int flags,
+ void *args,
+ int tag)
+{
+ uint32_t first_index;
+ int last_error = 0;
+ int skipped;
+ int done;
+ int nr_found;
+
+restart:
+ done = 0;
+ skipped = 0;
+ first_index = 0;
+ nr_found = 0;
+ do {
+ struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+ int error = 0;
+ int i;
+
+ rcu_read_lock();
+
+ if (tag == -1)
+ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+ (void **)batch, first_index,
+ XFS_LOOKUP_BATCH);
+ else
+ nr_found = radix_tree_gang_lookup_tag(
+ &pag->pag_ici_root,
+ (void **) batch, first_index,
+ XFS_LOOKUP_BATCH, tag);
+
+ if (!nr_found) {
+ rcu_read_unlock();
+ break;
+ }
+
+ /*
+ * Grab the inodes before we drop the lock. if we found
+ * nothing, nr == 0 and the loop will be skipped.
+ */
+ for (i = 0; i < nr_found; i++) {
+ struct xfs_inode *ip = batch[i];
+
+ if (done || xfs_inode_ag_walk_grab(ip))
+ batch[i] = NULL;
+
+ /*
+ * Update the index for the next lookup. Catch
+ * overflows into the next AG range which can occur if
+ * we have inodes in the last block of the AG and we
+ * are currently pointing to the last inode.
+ *
+ * Because we may see inodes that are from the wrong AG
+ * due to RCU freeing and reallocation, only update the
+ * index if it lies in this AG. It was a race that lead
+ * us to see this inode, so another lookup from the
+ * same index will not find it again.
+ */
+ if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+ continue;
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+ if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+ done = 1;
+ }
+
+ /* unlock now we've grabbed the inodes. */
+ rcu_read_unlock();
+
+ for (i = 0; i < nr_found; i++) {
+ if (!batch[i])
+ continue;
+ error = execute(batch[i], flags, args);
+ IRELE(batch[i]);
+ if (error == EAGAIN) {
+ skipped++;
+ continue;
+ }
+ if (error && last_error != EFSCORRUPTED)
+ last_error = error;
+ }
+
+ /* bail out if the filesystem is corrupted. */
+ if (error == EFSCORRUPTED)
+ break;
+
+ cond_resched();
+
+ } while (nr_found && !done);
+
+ if (skipped) {
+ delay(1);
+ goto restart;
+ }
+ return last_error;
+}
+
+/*
+ * Background scanning to trim post-EOF preallocated space. This is queued
+ * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
+ */
+STATIC void
+xfs_queue_eofblocks(
+ struct xfs_mount *mp)
+{
+ rcu_read_lock();
+ if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
+ queue_delayed_work(mp->m_eofblocks_workqueue,
+ &mp->m_eofblocks_work,
+ msecs_to_jiffies(xfs_eofb_secs * 1000));
+ rcu_read_unlock();
+}
+
+void
+xfs_eofblocks_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(to_delayed_work(work),
+ struct xfs_mount, m_eofblocks_work);
+ xfs_icache_free_eofblocks(mp, NULL);
+ xfs_queue_eofblocks(mp);
+}
+
+int
+xfs_inode_ag_iterator(
+ struct xfs_mount *mp,
+ int (*execute)(struct xfs_inode *ip, int flags,
+ void *args),
+ int flags,
+ void *args)
+{
+ struct xfs_perag *pag;
+ int error = 0;
+ int last_error = 0;
+ xfs_agnumber_t ag;
+
+ ag = 0;
+ while ((pag = xfs_perag_get(mp, ag))) {
+ ag = pag->pag_agno + 1;
+ error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
+ xfs_perag_put(pag);
+ if (error) {
+ last_error = error;
+ if (error == EFSCORRUPTED)
+ break;
+ }
+ }
+ return XFS_ERROR(last_error);
+}
+
+int
+xfs_inode_ag_iterator_tag(
+ struct xfs_mount *mp,
+ int (*execute)(struct xfs_inode *ip, int flags,
+ void *args),
+ int flags,
+ void *args,
+ int tag)
+{
+ struct xfs_perag *pag;
+ int error = 0;
+ int last_error = 0;
+ xfs_agnumber_t ag;
+
+ ag = 0;
+ while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
+ ag = pag->pag_agno + 1;
+ error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
+ xfs_perag_put(pag);
+ if (error) {
+ last_error = error;
+ if (error == EFSCORRUPTED)
+ break;
+ }
+ }
+ return XFS_ERROR(last_error);
+}
+
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs periodic sync default of 30s. Perhaps this should have it's own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_reclaim_work_queue(
+ struct xfs_mount *mp)
+{
+
+ rcu_read_lock();
+ if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+ queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
+ msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+void
+xfs_reclaim_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(to_delayed_work(work),
+ struct xfs_mount, m_reclaim_work);
+
+ xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+ xfs_reclaim_work_queue(mp);
+}
+
+static void
+__xfs_inode_set_reclaim_tag(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
+{
+ radix_tree_tag_set(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+
+ if (!pag->pag_ici_reclaimable) {
+ /* propagate the reclaim tag up into the perag radix tree */
+ spin_lock(&ip->i_mount->m_perag_lock);
+ radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+ spin_unlock(&ip->i_mount->m_perag_lock);
+
+ /* schedule periodic background inode reclaim */
+ xfs_reclaim_work_queue(ip->i_mount);
+
+ trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
+ -1, _RET_IP_);
+ }
+ pag->pag_ici_reclaimable++;
+}
+
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ */
+void
+xfs_inode_set_reclaim_tag(
+ xfs_inode_t *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ spin_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+ __xfs_inode_set_reclaim_tag(pag, ip);
+ __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+ spin_unlock(&ip->i_flags_lock);
+ spin_unlock(&pag->pag_ici_lock);
+ xfs_perag_put(pag);
+}
+
+STATIC void
+__xfs_inode_clear_reclaim(
+ xfs_perag_t *pag,
+ xfs_inode_t *ip)
+{
+ pag->pag_ici_reclaimable--;
+ if (!pag->pag_ici_reclaimable) {
+ /* clear the reclaim tag from the perag radix tree */
+ spin_lock(&ip->i_mount->m_perag_lock);
+ radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+ spin_unlock(&ip->i_mount->m_perag_lock);
+ trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
+ -1, _RET_IP_);
+ }
+}
+
+STATIC void
+__xfs_inode_clear_reclaim_tag(
+ xfs_mount_t *mp,
+ xfs_perag_t *pag,
+ xfs_inode_t *ip)
+{
+ radix_tree_tag_clear(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+ __xfs_inode_clear_reclaim(pag, ip);
+}
+
+/*
+ * Grab the inode for reclaim exclusively.
+ * Return 0 if we grabbed it, non-zero otherwise.
+ */
+STATIC int
+xfs_reclaim_inode_grab(
+ struct xfs_inode *ip,
+ int flags)
+{
+ ASSERT(rcu_read_lock_held());
+
+ /* quick check for stale RCU freed inode */
+ if (!ip->i_ino)
+ return 1;
+
+ /*
+ * If we are asked for non-blocking operation, do unlocked checks to
+ * see if the inode already is being flushed or in reclaim to avoid
+ * lock traffic.
+ */
+ if ((flags & SYNC_TRYLOCK) &&
+ __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
+ return 1;
+
+ /*
+ * The radix tree lock here protects a thread in xfs_iget from racing
+ * with us starting reclaim on the inode. Once we have the
+ * XFS_IRECLAIM flag set it will not touch us.
+ *
+ * Due to RCU lookup, we may find inodes that have been freed and only
+ * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
+ * aren't candidates for reclaim at all, so we must check the
+ * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+ __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+ /* not a reclaim candidate. */
+ spin_unlock(&ip->i_flags_lock);
+ return 1;
+ }
+ __xfs_iflags_set(ip, XFS_IRECLAIM);
+ spin_unlock(&ip->i_flags_lock);
+ return 0;
+}
+
+/*
+ * Inodes in different states need to be treated differently. The following
+ * table lists the inode states and the reclaim actions necessary:
+ *
+ * inode state iflush ret required action
+ * --------------- ---------- ---------------
+ * bad - reclaim
+ * shutdown EIO unpin and reclaim
+ * clean, unpinned 0 reclaim
+ * stale, unpinned 0 reclaim
+ * clean, pinned(*) 0 requeue
+ * stale, pinned EAGAIN requeue
+ * dirty, async - requeue
+ * dirty, sync 0 reclaim
+ *
+ * (*) dgc: I don't think the clean, pinned state is possible but it gets
+ * handled anyway given the order of checks implemented.
+ *
+ * Also, because we get the flush lock first, we know that any inode that has
+ * been flushed delwri has had the flush completed by the time we check that
+ * the inode is clean.
+ *
+ * Note that because the inode is flushed delayed write by AIL pushing, the
+ * flush lock may already be held here and waiting on it can result in very
+ * long latencies. Hence for sync reclaims, where we wait on the flush lock,
+ * the caller should push the AIL first before trying to reclaim inodes to
+ * minimise the amount of time spent waiting. For background relaim, we only
+ * bother to reclaim clean inodes anyway.
+ *
+ * Hence the order of actions after gaining the locks should be:
+ * bad => reclaim
+ * shutdown => unpin and reclaim
+ * pinned, async => requeue
+ * pinned, sync => unpin
+ * stale => reclaim
+ * clean => reclaim
+ * dirty, async => requeue
+ * dirty, sync => flush, wait and reclaim
+ */
+STATIC int
+xfs_reclaim_inode(
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ int sync_mode)
+{
+ struct xfs_buf *bp = NULL;
+ int error;
+
+restart:
+ error = 0;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (!xfs_iflock_nowait(ip)) {
+ if (!(sync_mode & SYNC_WAIT))
+ goto out;
+ xfs_iflock(ip);
+ }
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ xfs_iunpin_wait(ip);
+ xfs_iflush_abort(ip, false);
+ goto reclaim;
+ }
+ if (xfs_ipincount(ip)) {
+ if (!(sync_mode & SYNC_WAIT))
+ goto out_ifunlock;
+ xfs_iunpin_wait(ip);
+ }
+ if (xfs_iflags_test(ip, XFS_ISTALE))
+ goto reclaim;
+ if (xfs_inode_clean(ip))
+ goto reclaim;
+
+ /*
+ * Never flush out dirty data during non-blocking reclaim, as it would
+ * just contend with AIL pushing trying to do the same job.
+ */
+ if (!(sync_mode & SYNC_WAIT))
+ goto out_ifunlock;
+
+ /*
+ * Now we have an inode that needs flushing.
+ *
+ * Note that xfs_iflush will never block on the inode buffer lock, as
+ * xfs_ifree_cluster() can lock the inode buffer before it locks the
+ * ip->i_lock, and we are doing the exact opposite here. As a result,
+ * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
+ * result in an ABBA deadlock with xfs_ifree_cluster().
+ *
+ * As xfs_ifree_cluser() must gather all inodes that are active in the
+ * cache to mark them stale, if we hit this case we don't actually want
+ * to do IO here - we want the inode marked stale so we can simply
+ * reclaim it. Hence if we get an EAGAIN error here, just unlock the
+ * inode, back off and try again. Hopefully the next pass through will
+ * see the stale flag set on the inode.
+ */
+ error = xfs_iflush(ip, &bp);
+ if (error == EAGAIN) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ /* backoff longer than in xfs_ifree_cluster */
+ delay(2);
+ goto restart;
+ }
+
+ if (!error) {
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ }
+
+ xfs_iflock(ip);
+reclaim:
+ xfs_ifunlock(ip);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ XFS_STATS_INC(xs_ig_reclaims);
+ /*
+ * Remove the inode from the per-AG radix tree.
+ *
+ * Because radix_tree_delete won't complain even if the item was never
+ * added to the tree assert that it's been there before to catch
+ * problems with the inode life time early on.
+ */
+ spin_lock(&pag->pag_ici_lock);
+ if (!radix_tree_delete(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+ ASSERT(0);
+ __xfs_inode_clear_reclaim(pag, ip);
+ spin_unlock(&pag->pag_ici_lock);
+
+ /*
+ * Here we do an (almost) spurious inode lock in order to coordinate
+ * with inode cache radix tree lookups. This is because the lookup
+ * can reference the inodes in the cache without taking references.
+ *
+ * We make that OK here by ensuring that we wait until the inode is
+ * unlocked after the lookup before we go ahead and free it.
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_qm_dqdetach(ip);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ xfs_inode_free(ip);
+ return error;
+
+out_ifunlock:
+ xfs_ifunlock(ip);
+out:
+ xfs_iflags_clear(ip, XFS_IRECLAIM);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ /*
+ * We could return EAGAIN here to make reclaim rescan the inode tree in
+ * a short while. However, this just burns CPU time scanning the tree
+ * waiting for IO to complete and the reclaim work never goes back to
+ * the idle state. Instead, return 0 to let the next scheduled
+ * background reclaim attempt to reclaim the inode again.
+ */
+ return 0;
+}
+
+/*
+ * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
+ * corrupted, we still want to try to reclaim all the inodes. If we don't,
+ * then a shut down during filesystem unmount reclaim walk leak all the
+ * unreclaimed inodes.
+ */
+STATIC int
+xfs_reclaim_inodes_ag(
+ struct xfs_mount *mp,
+ int flags,
+ int *nr_to_scan)
+{
+ struct xfs_perag *pag;
+ int error = 0;
+ int last_error = 0;
+ xfs_agnumber_t ag;
+ int trylock = flags & SYNC_TRYLOCK;
+ int skipped;
+
+restart:
+ ag = 0;
+ skipped = 0;
+ while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+ unsigned long first_index = 0;
+ int done = 0;
+ int nr_found = 0;
+
+ ag = pag->pag_agno + 1;
+
+ if (trylock) {
+ if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
+ skipped++;
+ xfs_perag_put(pag);
+ continue;
+ }
+ first_index = pag->pag_ici_reclaim_cursor;
+ } else
+ mutex_lock(&pag->pag_ici_reclaim_lock);
+
+ do {
+ struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+ int i;
+
+ rcu_read_lock();
+ nr_found = radix_tree_gang_lookup_tag(
+ &pag->pag_ici_root,
+ (void **)batch, first_index,
+ XFS_LOOKUP_BATCH,
+ XFS_ICI_RECLAIM_TAG);
+ if (!nr_found) {
+ done = 1;
+ rcu_read_unlock();
+ break;
+ }
+
+ /*
+ * Grab the inodes before we drop the lock. if we found
+ * nothing, nr == 0 and the loop will be skipped.
+ */
+ for (i = 0; i < nr_found; i++) {
+ struct xfs_inode *ip = batch[i];
+
+ if (done || xfs_reclaim_inode_grab(ip, flags))
+ batch[i] = NULL;
+
+ /*
+ * Update the index for the next lookup. Catch
+ * overflows into the next AG range which can
+ * occur if we have inodes in the last block of
+ * the AG and we are currently pointing to the
+ * last inode.
+ *
+ * Because we may see inodes that are from the
+ * wrong AG due to RCU freeing and
+ * reallocation, only update the index if it
+ * lies in this AG. It was a race that lead us
+ * to see this inode, so another lookup from
+ * the same index will not find it again.
+ */
+ if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+ pag->pag_agno)
+ continue;
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+ if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+ done = 1;
+ }
+
+ /* unlock now we've grabbed the inodes. */
+ rcu_read_unlock();
+
+ for (i = 0; i < nr_found; i++) {
+ if (!batch[i])
+ continue;
+ error = xfs_reclaim_inode(batch[i], pag, flags);
+ if (error && last_error != EFSCORRUPTED)
+ last_error = error;
+ }
+
+ *nr_to_scan -= XFS_LOOKUP_BATCH;
+
+ cond_resched();
+
+ } while (nr_found && !done && *nr_to_scan > 0);
+
+ if (trylock && !done)
+ pag->pag_ici_reclaim_cursor = first_index;
+ else
+ pag->pag_ici_reclaim_cursor = 0;
+ mutex_unlock(&pag->pag_ici_reclaim_lock);
+ xfs_perag_put(pag);
+ }
+
+ /*
+ * if we skipped any AG, and we still have scan count remaining, do
+ * another pass this time using blocking reclaim semantics (i.e
+ * waiting on the reclaim locks and ignoring the reclaim cursors). This
+ * ensure that when we get more reclaimers than AGs we block rather
+ * than spin trying to execute reclaim.
+ */
+ if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
+ trylock = 0;
+ goto restart;
+ }
+ return XFS_ERROR(last_error);
+}
+
+int
+xfs_reclaim_inodes(
+ xfs_mount_t *mp,
+ int mode)
+{
+ int nr_to_scan = INT_MAX;
+
+ return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
+}
+
+/*
+ * Scan a certain number of inodes for reclaim.
+ *
+ * When called we make sure that there is a background (fast) inode reclaim in
+ * progress, while we will throttle the speed of reclaim via doing synchronous
+ * reclaim of inodes. That means if we come across dirty inodes, we wait for
+ * them to be cleaned, which we hope will not be very long due to the
+ * background walker having already kicked the IO off on those dirty inodes.
+ */
+long
+xfs_reclaim_inodes_nr(
+ struct xfs_mount *mp,
+ int nr_to_scan)
+{
+ /* kick background reclaimer and push the AIL */
+ xfs_reclaim_work_queue(mp);
+ xfs_ail_push_all(mp->m_ail);
+
+ return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+}
+
+/*
+ * Return the number of reclaimable inodes in the filesystem for
+ * the shrinker to determine how much to reclaim.
+ */
+int
+xfs_reclaim_inodes_count(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t ag = 0;
+ int reclaimable = 0;
+
+ while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+ ag = pag->pag_agno + 1;
+ reclaimable += pag->pag_ici_reclaimable;
+ xfs_perag_put(pag);
+ }
+ return reclaimable;
+}
+
+STATIC int
+xfs_inode_match_id(
+ struct xfs_inode *ip,
+ struct xfs_eofblocks *eofb)
+{
+ if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
+ !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
+ return 0;
+
+ if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
+ !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
+ return 0;
+
+ if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
+ xfs_get_projid(ip) != eofb->eof_prid)
+ return 0;
+
+ return 1;
+}
+
+STATIC int
+xfs_inode_free_eofblocks(
+ struct xfs_inode *ip,
+ int flags,
+ void *args)
+{
+ int ret;
+ struct xfs_eofblocks *eofb = args;
+
+ if (!xfs_can_free_eofblocks(ip, false)) {
+ /* inode could be preallocated or append-only */
+ trace_xfs_inode_free_eofblocks_invalid(ip);
+ xfs_inode_clear_eofblocks_tag(ip);
+ return 0;
+ }
+
+ /*
+ * If the mapping is dirty the operation can block and wait for some
+ * time. Unless we are waiting, skip it.
+ */
+ if (!(flags & SYNC_WAIT) &&
+ mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
+ return 0;
+
+ if (eofb) {
+ if (!xfs_inode_match_id(ip, eofb))
+ return 0;
+
+ /* skip the inode if the file size is too small */
+ if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
+ XFS_ISIZE(ip) < eofb->eof_min_file_size)
+ return 0;
+ }
+
+ ret = xfs_free_eofblocks(ip->i_mount, ip, true);
+
+ /* don't revisit the inode if we're not waiting */
+ if (ret == EAGAIN && !(flags & SYNC_WAIT))
+ ret = 0;
+
+ return ret;
+}
+
+int
+xfs_icache_free_eofblocks(
+ struct xfs_mount *mp,
+ struct xfs_eofblocks *eofb)
+{
+ int flags = SYNC_TRYLOCK;
+
+ if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
+ flags = SYNC_WAIT;
+
+ return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
+ eofb, XFS_ICI_EOFBLOCKS_TAG);
+}
+
+void
+xfs_inode_set_eofblocks_tag(
+ xfs_inode_t *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+ int tagged;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ spin_lock(&pag->pag_ici_lock);
+ trace_xfs_inode_set_eofblocks_tag(ip);
+
+ tagged = radix_tree_tagged(&pag->pag_ici_root,
+ XFS_ICI_EOFBLOCKS_TAG);
+ radix_tree_tag_set(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_EOFBLOCKS_TAG);
+ if (!tagged) {
+ /* propagate the eofblocks tag up into the perag radix tree */
+ spin_lock(&ip->i_mount->m_perag_lock);
+ radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_ICI_EOFBLOCKS_TAG);
+ spin_unlock(&ip->i_mount->m_perag_lock);
+
+ /* kick off background trimming */
+ xfs_queue_eofblocks(ip->i_mount);
+
+ trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
+ -1, _RET_IP_);
+ }
+
+ spin_unlock(&pag->pag_ici_lock);
+ xfs_perag_put(pag);
+}
+
+void
+xfs_inode_clear_eofblocks_tag(
+ xfs_inode_t *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ spin_lock(&pag->pag_ici_lock);
+ trace_xfs_inode_clear_eofblocks_tag(ip);
+
+ radix_tree_tag_clear(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_EOFBLOCKS_TAG);
+ if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
+ /* clear the eofblocks tag from the perag radix tree */
+ spin_lock(&ip->i_mount->m_perag_lock);
+ radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_ICI_EOFBLOCKS_TAG);
+ spin_unlock(&ip->i_mount->m_perag_lock);
+ trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
+ -1, _RET_IP_);
+ }
+
+ spin_unlock(&pag->pag_ici_lock);
+ xfs_perag_put(pag);
+}
+
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
new file mode 100644
index 00000000000..9cf017b899b
--- /dev/null
+++ b/fs/xfs/xfs_icache.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef XFS_SYNC_H
+#define XFS_SYNC_H 1
+
+struct xfs_mount;
+struct xfs_perag;
+
+struct xfs_eofblocks {
+ __u32 eof_flags;
+ kuid_t eof_uid;
+ kgid_t eof_gid;
+ prid_t eof_prid;
+ __u64 eof_min_file_size;
+};
+
+#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
+#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
+
+/*
+ * Flags for xfs_iget()
+ */
+#define XFS_IGET_CREATE 0x1
+#define XFS_IGET_UNTRUSTED 0x2
+#define XFS_IGET_DONTCACHE 0x4
+
+int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
+ uint flags, uint lock_flags, xfs_inode_t **ipp);
+
+/* recovery needs direct inode allocation capability */
+struct xfs_inode * xfs_inode_alloc(struct xfs_mount *mp, xfs_ino_t ino);
+void xfs_inode_free(struct xfs_inode *ip);
+
+void xfs_reclaim_worker(struct work_struct *work);
+
+int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
+int xfs_reclaim_inodes_count(struct xfs_mount *mp);
+long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
+
+void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+
+void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
+void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
+int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
+void xfs_eofblocks_worker(struct work_struct *);
+
+int xfs_inode_ag_iterator(struct xfs_mount *mp,
+ int (*execute)(struct xfs_inode *ip, int flags, void *args),
+ int flags, void *args);
+int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
+ int (*execute)(struct xfs_inode *ip, int flags, void *args),
+ int flags, void *args, int tag);
+
+static inline int
+xfs_fs_eofblocks_from_user(
+ struct xfs_fs_eofblocks *src,
+ struct xfs_eofblocks *dst)
+{
+ if (src->eof_version != XFS_EOFBLOCKS_VERSION)
+ return EINVAL;
+
+ if (src->eof_flags & ~XFS_EOF_FLAGS_VALID)
+ return EINVAL;
+
+ if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) ||
+ memchr_inv(src->pad64, 0, sizeof(src->pad64)))
+ return EINVAL;
+
+ dst->eof_flags = src->eof_flags;
+ dst->eof_prid = src->eof_prid;
+ dst->eof_min_file_size = src->eof_min_file_size;
+
+ dst->eof_uid = INVALID_UID;
+ if (src->eof_flags & XFS_EOF_FLAGS_UID) {
+ dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
+ if (!uid_valid(dst->eof_uid))
+ return EINVAL;
+ }
+
+ dst->eof_gid = INVALID_GID;
+ if (src->eof_flags & XFS_EOF_FLAGS_GID) {
+ dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
+ if (!gid_valid(dst->eof_gid))
+ return EINVAL;
+ }
+ return 0;
+}
+
+#endif
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
new file mode 100644
index 00000000000..7e454923325
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2008-2010, 2013 Dave Chinner
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_error.h"
+#include "xfs_icreate_item.h"
+#include "xfs_log.h"
+
+kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+
+static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_icreate_item, ic_item);
+}
+
+/*
+ * This returns the number of iovecs needed to log the given inode item.
+ *
+ * We only need one iovec for the icreate log structure.
+ */
+STATIC void
+xfs_icreate_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_icreate_log);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given inode create log item.
+ */
+STATIC void
+xfs_icreate_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_icreate_item *icp = ICR_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE,
+ &icp->ic_format,
+ sizeof(struct xfs_icreate_log));
+}
+
+
+/* Pinning has no meaning for the create item, so just return. */
+STATIC void
+xfs_icreate_item_pin(
+ struct xfs_log_item *lip)
+{
+}
+
+
+/* pinning has no meaning for the create item, so just return. */
+STATIC void
+xfs_icreate_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+}
+
+STATIC void
+xfs_icreate_item_unlock(
+ struct xfs_log_item *lip)
+{
+ struct xfs_icreate_item *icp = ICR_ITEM(lip);
+
+ if (icp->ic_item.li_flags & XFS_LI_ABORTED)
+ kmem_zone_free(xfs_icreate_zone, icp);
+ return;
+}
+
+/*
+ * Because we have ordered buffers being tracked in the AIL for the inode
+ * creation, we don't need the create item after this. Hence we can free
+ * the log item and return -1 to tell the caller we're done with the item.
+ */
+STATIC xfs_lsn_t
+xfs_icreate_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ struct xfs_icreate_item *icp = ICR_ITEM(lip);
+
+ kmem_zone_free(xfs_icreate_zone, icp);
+ return (xfs_lsn_t)-1;
+}
+
+/* item can never get into the AIL */
+STATIC uint
+xfs_icreate_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+{
+ ASSERT(0);
+ return XFS_ITEM_SUCCESS;
+}
+
+/* Ordered buffers do the dependency tracking here, so this does nothing. */
+STATIC void
+xfs_icreate_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all buf log items.
+ */
+static struct xfs_item_ops xfs_icreate_item_ops = {
+ .iop_size = xfs_icreate_item_size,
+ .iop_format = xfs_icreate_item_format,
+ .iop_pin = xfs_icreate_item_pin,
+ .iop_unpin = xfs_icreate_item_unpin,
+ .iop_push = xfs_icreate_item_push,
+ .iop_unlock = xfs_icreate_item_unlock,
+ .iop_committed = xfs_icreate_item_committed,
+ .iop_committing = xfs_icreate_item_committing,
+};
+
+
+/*
+ * Initialize the inode log item for a newly allocated (in-core) inode.
+ *
+ * Inode extents can only reside within an AG. Hence specify the starting
+ * block for the inode chunk by offset within an AG as well as the
+ * length of the allocated extent.
+ *
+ * This joins the item to the transaction and marks it dirty so
+ * that we don't need a separate call to do this, nor does the
+ * caller need to know anything about the icreate item.
+ */
+void
+xfs_icreate_log(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ unsigned int count,
+ unsigned int inode_size,
+ xfs_agblock_t length,
+ unsigned int generation)
+{
+ struct xfs_icreate_item *icp;
+
+ icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP);
+
+ xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
+ &xfs_icreate_item_ops);
+
+ icp->ic_format.icl_type = XFS_LI_ICREATE;
+ icp->ic_format.icl_size = 1; /* single vector */
+ icp->ic_format.icl_ag = cpu_to_be32(agno);
+ icp->ic_format.icl_agbno = cpu_to_be32(agbno);
+ icp->ic_format.icl_count = cpu_to_be32(count);
+ icp->ic_format.icl_isize = cpu_to_be32(inode_size);
+ icp->ic_format.icl_length = cpu_to_be32(length);
+ icp->ic_format.icl_gen = cpu_to_be32(generation);
+
+ xfs_trans_add_item(tp, &icp->ic_item);
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+}
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/xfs_icreate_item.h
index 011c273bec5..59e89f87c09 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/xfs_icreate_item.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2004-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2008-2010, Dave Chinner
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -15,10 +15,20 @@
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#ifndef __XFS_IOCTL32_H__
-#define __XFS_IOCTL32_H__
+#ifndef XFS_ICREATE_ITEM_H
+#define XFS_ICREATE_ITEM_H 1
-extern long linvfs_compat_ioctl(struct file *, unsigned, unsigned long);
-extern long linvfs_compat_invis_ioctl(struct file *f, unsigned, unsigned long);
+/* in memory log item structure */
+struct xfs_icreate_item {
+ struct xfs_log_item ic_item;
+ struct xfs_icreate_log ic_format;
+};
-#endif /* __XFS_IOCTL32_H__ */
+extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+
+void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, unsigned int count,
+ unsigned int inode_size, xfs_agblock_t length,
+ unsigned int generation);
+
+#endif /* XFS_ICREATE_ITEM_H */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
deleted file mode 100644
index 8e380a1fb79..00000000000
--- a/fs/xfs/xfs_iget.c
+++ /dev/null
@@ -1,1043 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_quota.h"
-#include "xfs_utils.h"
-
-/*
- * Initialize the inode hash table for the newly mounted file system.
- * Choose an initial table size based on user specified value, else
- * use a simple algorithm using the maximum number of inodes as an
- * indicator for table size, and clamp it between one and some large
- * number of pages.
- */
-void
-xfs_ihash_init(xfs_mount_t *mp)
-{
- __uint64_t icount;
- uint i, flags = KM_SLEEP | KM_MAYFAIL;
-
- if (!mp->m_ihsize) {
- icount = mp->m_maxicount ? mp->m_maxicount :
- (mp->m_sb.sb_dblocks << mp->m_sb.sb_inopblog);
- mp->m_ihsize = 1 << max_t(uint, 8,
- (xfs_highbit64(icount) + 1) / 2);
- mp->m_ihsize = min_t(uint, mp->m_ihsize,
- (64 * NBPP) / sizeof(xfs_ihash_t));
- }
-
- while (!(mp->m_ihash = (xfs_ihash_t *)kmem_zalloc(mp->m_ihsize *
- sizeof(xfs_ihash_t), flags))) {
- if ((mp->m_ihsize >>= 1) <= NBPP)
- flags = KM_SLEEP;
- }
- for (i = 0; i < mp->m_ihsize; i++) {
- rwlock_init(&(mp->m_ihash[i].ih_lock));
- }
-}
-
-/*
- * Free up structures allocated by xfs_ihash_init, at unmount time.
- */
-void
-xfs_ihash_free(xfs_mount_t *mp)
-{
- kmem_free(mp->m_ihash, mp->m_ihsize*sizeof(xfs_ihash_t));
- mp->m_ihash = NULL;
-}
-
-/*
- * Initialize the inode cluster hash table for the newly mounted file system.
- * Its size is derived from the ihash table size.
- */
-void
-xfs_chash_init(xfs_mount_t *mp)
-{
- uint i;
-
- mp->m_chsize = max_t(uint, 1, mp->m_ihsize /
- (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog));
- mp->m_chsize = min_t(uint, mp->m_chsize, mp->m_ihsize);
- mp->m_chash = (xfs_chash_t *)kmem_zalloc(mp->m_chsize
- * sizeof(xfs_chash_t),
- KM_SLEEP);
- for (i = 0; i < mp->m_chsize; i++) {
- spinlock_init(&mp->m_chash[i].ch_lock,"xfshash");
- }
-}
-
-/*
- * Free up structures allocated by xfs_chash_init, at unmount time.
- */
-void
-xfs_chash_free(xfs_mount_t *mp)
-{
- int i;
-
- for (i = 0; i < mp->m_chsize; i++) {
- spinlock_destroy(&mp->m_chash[i].ch_lock);
- }
-
- kmem_free(mp->m_chash, mp->m_chsize*sizeof(xfs_chash_t));
- mp->m_chash = NULL;
-}
-
-/*
- * Try to move an inode to the front of its hash list if possible
- * (and if its not there already). Called right after obtaining
- * the list version number and then dropping the read_lock on the
- * hash list in question (which is done right after looking up the
- * inode in question...).
- */
-STATIC void
-xfs_ihash_promote(
- xfs_ihash_t *ih,
- xfs_inode_t *ip,
- ulong version)
-{
- xfs_inode_t *iq;
-
- if ((ip->i_prevp != &ih->ih_next) && write_trylock(&ih->ih_lock)) {
- if (likely(version == ih->ih_version)) {
- /* remove from list */
- if ((iq = ip->i_next)) {
- iq->i_prevp = ip->i_prevp;
- }
- *ip->i_prevp = iq;
-
- /* insert at list head */
- iq = ih->ih_next;
- iq->i_prevp = &ip->i_next;
- ip->i_next = iq;
- ip->i_prevp = &ih->ih_next;
- ih->ih_next = ip;
- }
- write_unlock(&ih->ih_lock);
- }
-}
-
-/*
- * Look up an inode by number in the given file system.
- * The inode is looked up in the hash table for the file system
- * represented by the mount point parameter mp. Each bucket of
- * the hash table is guarded by an individual semaphore.
- *
- * If the inode is found in the hash table, its corresponding vnode
- * is obtained with a call to vn_get(). This call takes care of
- * coordination with the reclamation of the inode and vnode. Note
- * that the vmap structure is filled in while holding the hash lock.
- * This gives us the state of the inode/vnode when we found it and
- * is used for coordination in vn_get().
- *
- * If it is not in core, read it in from the file system's device and
- * add the inode into the hash table.
- *
- * The inode is locked according to the value of the lock_flags parameter.
- * This flag parameter indicates how and if the inode's IO lock and inode lock
- * should be taken.
- *
- * mp -- the mount point structure for the current file system. It points
- * to the inode hash table.
- * tp -- a pointer to the current transaction if there is one. This is
- * simply passed through to the xfs_iread() call.
- * ino -- the number of the inode desired. This is the unique identifier
- * within the file system for the inode being requested.
- * lock_flags -- flags indicating how to lock the inode. See the comment
- * for xfs_ilock() for a list of valid values.
- * bno -- the block number starting the buffer containing the inode,
- * if known (as by bulkstat), else 0.
- */
-STATIC int
-xfs_iget_core(
- vnode_t *vp,
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_ino_t ino,
- uint flags,
- uint lock_flags,
- xfs_inode_t **ipp,
- xfs_daddr_t bno)
-{
- xfs_ihash_t *ih;
- xfs_inode_t *ip;
- xfs_inode_t *iq;
- vnode_t *inode_vp;
- ulong version;
- int error;
- /* REFERENCED */
- xfs_chash_t *ch;
- xfs_chashlist_t *chl, *chlnew;
- SPLDECL(s);
-
-
- ih = XFS_IHASH(mp, ino);
-
-again:
- read_lock(&ih->ih_lock);
-
- for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
- if (ip->i_ino == ino) {
- /*
- * If INEW is set this inode is being set up
- * we need to pause and try again.
- */
- if (ip->i_flags & XFS_INEW) {
- read_unlock(&ih->ih_lock);
- delay(1);
- XFS_STATS_INC(xs_ig_frecycle);
-
- goto again;
- }
-
- inode_vp = XFS_ITOV_NULL(ip);
- if (inode_vp == NULL) {
- /*
- * If IRECLAIM is set this inode is
- * on its way out of the system,
- * we need to pause and try again.
- */
- if (ip->i_flags & XFS_IRECLAIM) {
- read_unlock(&ih->ih_lock);
- delay(1);
- XFS_STATS_INC(xs_ig_frecycle);
-
- goto again;
- }
-
- vn_trace_exit(vp, "xfs_iget.alloc",
- (inst_t *)__return_address);
-
- XFS_STATS_INC(xs_ig_found);
-
- ip->i_flags &= ~XFS_IRECLAIMABLE;
- version = ih->ih_version;
- read_unlock(&ih->ih_lock);
- xfs_ihash_promote(ih, ip, version);
-
- XFS_MOUNT_ILOCK(mp);
- list_del_init(&ip->i_reclaim);
- XFS_MOUNT_IUNLOCK(mp);
-
- goto finish_inode;
-
- } else if (vp != inode_vp) {
- struct inode *inode = LINVFS_GET_IP(inode_vp);
-
- /* The inode is being torn down, pause and
- * try again.
- */
- if (inode->i_state & (I_FREEING | I_CLEAR)) {
- read_unlock(&ih->ih_lock);
- delay(1);
- XFS_STATS_INC(xs_ig_frecycle);
-
- goto again;
- }
-/* Chances are the other vnode (the one in the inode) is being torn
- * down right now, and we landed on top of it. Question is, what do
- * we do? Unhook the old inode and hook up the new one?
- */
- cmn_err(CE_PANIC,
- "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p",
- inode_vp, vp);
- }
-
- /*
- * Inode cache hit: if ip is not at the front of
- * its hash chain, move it there now.
- * Do this with the lock held for update, but
- * do statistics after releasing the lock.
- */
- version = ih->ih_version;
- read_unlock(&ih->ih_lock);
- xfs_ihash_promote(ih, ip, version);
- XFS_STATS_INC(xs_ig_found);
-
-finish_inode:
- if (ip->i_d.di_mode == 0) {
- if (!(flags & IGET_CREATE))
- return ENOENT;
- xfs_iocore_inode_reinit(ip);
- }
-
- if (lock_flags != 0)
- xfs_ilock(ip, lock_flags);
-
- ip->i_flags &= ~XFS_ISTALE;
-
- vn_trace_exit(vp, "xfs_iget.found",
- (inst_t *)__return_address);
- goto return_ip;
- }
- }
-
- /*
- * Inode cache miss: save the hash chain version stamp and unlock
- * the chain, so we don't deadlock in vn_alloc.
- */
- XFS_STATS_INC(xs_ig_missed);
-
- version = ih->ih_version;
-
- read_unlock(&ih->ih_lock);
-
- /*
- * Read the disk inode attributes into a new inode structure and get
- * a new vnode for it. This should also initialize i_ino and i_mount.
- */
- error = xfs_iread(mp, tp, ino, &ip, bno);
- if (error) {
- return error;
- }
-
- vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address);
-
- xfs_inode_lock_init(ip, vp);
- xfs_iocore_inode_init(ip);
-
- if (lock_flags != 0) {
- xfs_ilock(ip, lock_flags);
- }
-
- if ((ip->i_d.di_mode == 0) && !(flags & IGET_CREATE)) {
- xfs_idestroy(ip);
- return ENOENT;
- }
-
- /*
- * Put ip on its hash chain, unless someone else hashed a duplicate
- * after we released the hash lock.
- */
- write_lock(&ih->ih_lock);
-
- if (ih->ih_version != version) {
- for (iq = ih->ih_next; iq != NULL; iq = iq->i_next) {
- if (iq->i_ino == ino) {
- write_unlock(&ih->ih_lock);
- xfs_idestroy(ip);
-
- XFS_STATS_INC(xs_ig_dup);
- goto again;
- }
- }
- }
-
- /*
- * These values _must_ be set before releasing ihlock!
- */
- ip->i_hash = ih;
- if ((iq = ih->ih_next)) {
- iq->i_prevp = &ip->i_next;
- }
- ip->i_next = iq;
- ip->i_prevp = &ih->ih_next;
- ih->ih_next = ip;
- ip->i_udquot = ip->i_gdquot = NULL;
- ih->ih_version++;
- ip->i_flags |= XFS_INEW;
-
- write_unlock(&ih->ih_lock);
-
- /*
- * put ip on its cluster's hash chain
- */
- ASSERT(ip->i_chash == NULL && ip->i_cprev == NULL &&
- ip->i_cnext == NULL);
-
- chlnew = NULL;
- ch = XFS_CHASH(mp, ip->i_blkno);
- chlredo:
- s = mutex_spinlock(&ch->ch_lock);
- for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) {
- if (chl->chl_blkno == ip->i_blkno) {
-
- /* insert this inode into the doubly-linked list
- * where chl points */
- if ((iq = chl->chl_ip)) {
- ip->i_cprev = iq->i_cprev;
- iq->i_cprev->i_cnext = ip;
- iq->i_cprev = ip;
- ip->i_cnext = iq;
- } else {
- ip->i_cnext = ip;
- ip->i_cprev = ip;
- }
- chl->chl_ip = ip;
- ip->i_chash = chl;
- break;
- }
- }
-
- /* no hash list found for this block; add a new hash list */
- if (chl == NULL) {
- if (chlnew == NULL) {
- mutex_spinunlock(&ch->ch_lock, s);
- ASSERT(xfs_chashlist_zone != NULL);
- chlnew = (xfs_chashlist_t *)
- kmem_zone_alloc(xfs_chashlist_zone,
- KM_SLEEP);
- ASSERT(chlnew != NULL);
- goto chlredo;
- } else {
- ip->i_cnext = ip;
- ip->i_cprev = ip;
- ip->i_chash = chlnew;
- chlnew->chl_ip = ip;
- chlnew->chl_blkno = ip->i_blkno;
- chlnew->chl_next = ch->ch_list;
- ch->ch_list = chlnew;
- chlnew = NULL;
- }
- } else {
- if (chlnew != NULL) {
- kmem_zone_free(xfs_chashlist_zone, chlnew);
- }
- }
-
- mutex_spinunlock(&ch->ch_lock, s);
-
-
- /*
- * Link ip to its mount and thread it on the mount's inode list.
- */
- XFS_MOUNT_ILOCK(mp);
- if ((iq = mp->m_inodes)) {
- ASSERT(iq->i_mprev->i_mnext == iq);
- ip->i_mprev = iq->i_mprev;
- iq->i_mprev->i_mnext = ip;
- iq->i_mprev = ip;
- ip->i_mnext = iq;
- } else {
- ip->i_mnext = ip;
- ip->i_mprev = ip;
- }
- mp->m_inodes = ip;
-
- XFS_MOUNT_IUNLOCK(mp);
-
- return_ip:
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
-
- ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
- ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
-
- *ipp = ip;
-
- /*
- * If we have a real type for an on-disk inode, we can set ops(&unlock)
- * now. If it's a new inode being created, xfs_ialloc will handle it.
- */
- VFS_INIT_VNODE(XFS_MTOVFS(mp), vp, XFS_ITOBHV(ip), 1);
-
- return 0;
-}
-
-
-/*
- * The 'normal' internal xfs_iget, if needed it will
- * 'allocate', or 'get', the vnode.
- */
-int
-xfs_iget(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_ino_t ino,
- uint flags,
- uint lock_flags,
- xfs_inode_t **ipp,
- xfs_daddr_t bno)
-{
- struct inode *inode;
- vnode_t *vp = NULL;
- int error;
-
- XFS_STATS_INC(xs_ig_attempts);
-
-retry:
- if ((inode = iget_locked(XFS_MTOVFS(mp)->vfs_super, ino))) {
- xfs_inode_t *ip;
-
- vp = LINVFS_GET_VP(inode);
- if (inode->i_state & I_NEW) {
- vn_initialize(inode);
- error = xfs_iget_core(vp, mp, tp, ino, flags,
- lock_flags, ipp, bno);
- if (error) {
- vn_mark_bad(vp);
- if (inode->i_state & I_NEW)
- unlock_new_inode(inode);
- iput(inode);
- }
- } else {
- /*
- * If the inode is not fully constructed due to
- * filehandle mistmatches wait for the inode to go
- * away and try again.
- *
- * iget_locked will call __wait_on_freeing_inode
- * to wait for the inode to go away.
- */
- if (is_bad_inode(inode) ||
- ((ip = xfs_vtoi(vp)) == NULL)) {
- iput(inode);
- delay(1);
- goto retry;
- }
-
- if (lock_flags != 0)
- xfs_ilock(ip, lock_flags);
- XFS_STATS_INC(xs_ig_found);
- *ipp = ip;
- error = 0;
- }
- } else
- error = ENOMEM; /* If we got no inode we are out of memory */
-
- return error;
-}
-
-/*
- * Do the setup for the various locks within the incore inode.
- */
-void
-xfs_inode_lock_init(
- xfs_inode_t *ip,
- vnode_t *vp)
-{
- mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
- "xfsino", (long)vp->v_number);
- mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", vp->v_number);
- init_waitqueue_head(&ip->i_ipin_wait);
- atomic_set(&ip->i_pincount, 0);
- init_sema(&ip->i_flock, 1, "xfsfino", vp->v_number);
-}
-
-/*
- * Look for the inode corresponding to the given ino in the hash table.
- * If it is there and its i_transp pointer matches tp, return it.
- * Otherwise, return NULL.
- */
-xfs_inode_t *
-xfs_inode_incore(xfs_mount_t *mp,
- xfs_ino_t ino,
- xfs_trans_t *tp)
-{
- xfs_ihash_t *ih;
- xfs_inode_t *ip;
- ulong version;
-
- ih = XFS_IHASH(mp, ino);
- read_lock(&ih->ih_lock);
- for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
- if (ip->i_ino == ino) {
- /*
- * If we find it and tp matches, return it.
- * Also move it to the front of the hash list
- * if we find it and it is not already there.
- * Otherwise break from the loop and return
- * NULL.
- */
- if (ip->i_transp == tp) {
- version = ih->ih_version;
- read_unlock(&ih->ih_lock);
- xfs_ihash_promote(ih, ip, version);
- return (ip);
- }
- break;
- }
- }
- read_unlock(&ih->ih_lock);
- return (NULL);
-}
-
-/*
- * Decrement reference count of an inode structure and unlock it.
- *
- * ip -- the inode being released
- * lock_flags -- this parameter indicates the inode's locks to be
- * to be released. See the comment on xfs_iunlock() for a list
- * of valid values.
- */
-void
-xfs_iput(xfs_inode_t *ip,
- uint lock_flags)
-{
- vnode_t *vp = XFS_ITOV(ip);
-
- vn_trace_entry(vp, "xfs_iput", (inst_t *)__return_address);
-
- xfs_iunlock(ip, lock_flags);
-
- VN_RELE(vp);
-}
-
-/*
- * Special iput for brand-new inodes that are still locked
- */
-void
-xfs_iput_new(xfs_inode_t *ip,
- uint lock_flags)
-{
- vnode_t *vp = XFS_ITOV(ip);
- struct inode *inode = LINVFS_GET_IP(vp);
-
- vn_trace_entry(vp, "xfs_iput_new", (inst_t *)__return_address);
-
- if ((ip->i_d.di_mode == 0)) {
- ASSERT(!(ip->i_flags & XFS_IRECLAIMABLE));
- vn_mark_bad(vp);
- }
- if (inode->i_state & I_NEW)
- unlock_new_inode(inode);
- if (lock_flags)
- xfs_iunlock(ip, lock_flags);
- VN_RELE(vp);
-}
-
-
-/*
- * This routine embodies the part of the reclaim code that pulls
- * the inode from the inode hash table and the mount structure's
- * inode list.
- * This should only be called from xfs_reclaim().
- */
-void
-xfs_ireclaim(xfs_inode_t *ip)
-{
- vnode_t *vp;
-
- /*
- * Remove from old hash list and mount list.
- */
- XFS_STATS_INC(xs_ig_reclaims);
-
- xfs_iextract(ip);
-
- /*
- * Here we do a spurious inode lock in order to coordinate with
- * xfs_sync(). This is because xfs_sync() references the inodes
- * in the mount list without taking references on the corresponding
- * vnodes. We make that OK here by ensuring that we wait until
- * the inode is unlocked in xfs_sync() before we go ahead and
- * free it. We get both the regular lock and the io lock because
- * the xfs_sync() code may need to drop the regular one but will
- * still hold the io lock.
- */
- xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
- /*
- * Release dquots (and their references) if any. An inode may escape
- * xfs_inactive and get here via vn_alloc->vn_reclaim path.
- */
- XFS_QM_DQDETACH(ip->i_mount, ip);
-
- /*
- * Pull our behavior descriptor from the vnode chain.
- */
- vp = XFS_ITOV_NULL(ip);
- if (vp) {
- vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
- }
-
- /*
- * Free all memory associated with the inode.
- */
- xfs_idestroy(ip);
-}
-
-/*
- * This routine removes an about-to-be-destroyed inode from
- * all of the lists in which it is located with the exception
- * of the behavior chain.
- */
-void
-xfs_iextract(
- xfs_inode_t *ip)
-{
- xfs_ihash_t *ih;
- xfs_inode_t *iq;
- xfs_mount_t *mp;
- xfs_chash_t *ch;
- xfs_chashlist_t *chl, *chm;
- SPLDECL(s);
-
- ih = ip->i_hash;
- write_lock(&ih->ih_lock);
- if ((iq = ip->i_next)) {
- iq->i_prevp = ip->i_prevp;
- }
- *ip->i_prevp = iq;
- ih->ih_version++;
- write_unlock(&ih->ih_lock);
-
- /*
- * Remove from cluster hash list
- * 1) delete the chashlist if this is the last inode on the chashlist
- * 2) unchain from list of inodes
- * 3) point chashlist->chl_ip to 'chl_next' if to this inode.
- */
- mp = ip->i_mount;
- ch = XFS_CHASH(mp, ip->i_blkno);
- s = mutex_spinlock(&ch->ch_lock);
-
- if (ip->i_cnext == ip) {
- /* Last inode on chashlist */
- ASSERT(ip->i_cnext == ip && ip->i_cprev == ip);
- ASSERT(ip->i_chash != NULL);
- chm=NULL;
- for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) {
- if (chl->chl_blkno == ip->i_blkno) {
- if (chm == NULL) {
- /* first item on the list */
- ch->ch_list = chl->chl_next;
- } else {
- chm->chl_next = chl->chl_next;
- }
- kmem_zone_free(xfs_chashlist_zone, chl);
- break;
- } else {
- ASSERT(chl->chl_ip != ip);
- chm = chl;
- }
- }
- ASSERT_ALWAYS(chl != NULL);
- } else {
- /* delete one inode from a non-empty list */
- iq = ip->i_cnext;
- iq->i_cprev = ip->i_cprev;
- ip->i_cprev->i_cnext = iq;
- if (ip->i_chash->chl_ip == ip) {
- ip->i_chash->chl_ip = iq;
- }
- ip->i_chash = __return_address;
- ip->i_cprev = __return_address;
- ip->i_cnext = __return_address;
- }
- mutex_spinunlock(&ch->ch_lock, s);
-
- /*
- * Remove from mount's inode list.
- */
- XFS_MOUNT_ILOCK(mp);
- ASSERT((ip->i_mnext != NULL) && (ip->i_mprev != NULL));
- iq = ip->i_mnext;
- iq->i_mprev = ip->i_mprev;
- ip->i_mprev->i_mnext = iq;
-
- /*
- * Fix up the head pointer if it points to the inode being deleted.
- */
- if (mp->m_inodes == ip) {
- if (ip == iq) {
- mp->m_inodes = NULL;
- } else {
- mp->m_inodes = iq;
- }
- }
-
- /* Deal with the deleted inodes list */
- list_del_init(&ip->i_reclaim);
-
- mp->m_ireclaims++;
- XFS_MOUNT_IUNLOCK(mp);
-}
-
-/*
- * This is a wrapper routine around the xfs_ilock() routine
- * used to centralize some grungy code. It is used in places
- * that wish to lock the inode solely for reading the extents.
- * The reason these places can't just call xfs_ilock(SHARED)
- * is that the inode lock also guards to bringing in of the
- * extents from disk for a file in b-tree format. If the inode
- * is in b-tree format, then we need to lock the inode exclusively
- * until the extents are read in. Locking it exclusively all
- * the time would limit our parallelism unnecessarily, though.
- * What we do instead is check to see if the extents have been
- * read in yet, and only lock the inode exclusively if they
- * have not.
- *
- * The function returns a value which should be given to the
- * corresponding xfs_iunlock_map_shared(). This value is
- * the mode in which the lock was actually taken.
- */
-uint
-xfs_ilock_map_shared(
- xfs_inode_t *ip)
-{
- uint lock_mode;
-
- if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
- ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
- lock_mode = XFS_ILOCK_EXCL;
- } else {
- lock_mode = XFS_ILOCK_SHARED;
- }
-
- xfs_ilock(ip, lock_mode);
-
- return lock_mode;
-}
-
-/*
- * This is simply the unlock routine to go with xfs_ilock_map_shared().
- * All it does is call xfs_iunlock() with the given lock_mode.
- */
-void
-xfs_iunlock_map_shared(
- xfs_inode_t *ip,
- unsigned int lock_mode)
-{
- xfs_iunlock(ip, lock_mode);
-}
-
-/*
- * The xfs inode contains 2 locks: a multi-reader lock called the
- * i_iolock and a multi-reader lock called the i_lock. This routine
- * allows either or both of the locks to be obtained.
- *
- * The 2 locks should always be ordered so that the IO lock is
- * obtained first in order to prevent deadlock.
- *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks
- * to be locked. It can be:
- * XFS_IOLOCK_SHARED,
- * XFS_IOLOCK_EXCL,
- * XFS_ILOCK_SHARED,
- * XFS_ILOCK_EXCL,
- * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
- * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
- * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
- * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
- */
-void
-xfs_ilock(xfs_inode_t *ip,
- uint lock_flags)
-{
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0);
-
- if (lock_flags & XFS_IOLOCK_EXCL) {
- mrupdate(&ip->i_iolock);
- } else if (lock_flags & XFS_IOLOCK_SHARED) {
- mraccess(&ip->i_iolock);
- }
- if (lock_flags & XFS_ILOCK_EXCL) {
- mrupdate(&ip->i_lock);
- } else if (lock_flags & XFS_ILOCK_SHARED) {
- mraccess(&ip->i_lock);
- }
- xfs_ilock_trace(ip, 1, lock_flags, (inst_t *)__return_address);
-}
-
-/*
- * This is just like xfs_ilock(), except that the caller
- * is guaranteed not to sleep. It returns 1 if it gets
- * the requested locks and 0 otherwise. If the IO lock is
- * obtained but the inode lock cannot be, then the IO lock
- * is dropped before returning.
- *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks to be
- * to be locked. See the comment for xfs_ilock() for a list
- * of valid values.
- *
- */
-int
-xfs_ilock_nowait(xfs_inode_t *ip,
- uint lock_flags)
-{
- int iolocked;
- int ilocked;
-
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0);
-
- iolocked = 0;
- if (lock_flags & XFS_IOLOCK_EXCL) {
- iolocked = mrtryupdate(&ip->i_iolock);
- if (!iolocked) {
- return 0;
- }
- } else if (lock_flags & XFS_IOLOCK_SHARED) {
- iolocked = mrtryaccess(&ip->i_iolock);
- if (!iolocked) {
- return 0;
- }
- }
- if (lock_flags & XFS_ILOCK_EXCL) {
- ilocked = mrtryupdate(&ip->i_lock);
- if (!ilocked) {
- if (iolocked) {
- mrunlock(&ip->i_iolock);
- }
- return 0;
- }
- } else if (lock_flags & XFS_ILOCK_SHARED) {
- ilocked = mrtryaccess(&ip->i_lock);
- if (!ilocked) {
- if (iolocked) {
- mrunlock(&ip->i_iolock);
- }
- return 0;
- }
- }
- xfs_ilock_trace(ip, 2, lock_flags, (inst_t *)__return_address);
- return 1;
-}
-
-/*
- * xfs_iunlock() is used to drop the inode locks acquired with
- * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
- * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
- * that we know which locks to drop.
- *
- * ip -- the inode being unlocked
- * lock_flags -- this parameter indicates the inode's locks to be
- * to be unlocked. See the comment for xfs_ilock() for a list
- * of valid values for this parameter.
- *
- */
-void
-xfs_iunlock(xfs_inode_t *ip,
- uint lock_flags)
-{
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY)) == 0);
- ASSERT(lock_flags != 0);
-
- if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
- ASSERT(!(lock_flags & XFS_IOLOCK_SHARED) ||
- (ismrlocked(&ip->i_iolock, MR_ACCESS)));
- ASSERT(!(lock_flags & XFS_IOLOCK_EXCL) ||
- (ismrlocked(&ip->i_iolock, MR_UPDATE)));
- mrunlock(&ip->i_iolock);
- }
-
- if (lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) {
- ASSERT(!(lock_flags & XFS_ILOCK_SHARED) ||
- (ismrlocked(&ip->i_lock, MR_ACCESS)));
- ASSERT(!(lock_flags & XFS_ILOCK_EXCL) ||
- (ismrlocked(&ip->i_lock, MR_UPDATE)));
- mrunlock(&ip->i_lock);
-
- /*
- * Let the AIL know that this item has been unlocked in case
- * it is in the AIL and anyone is waiting on it. Don't do
- * this if the caller has asked us not to.
- */
- if (!(lock_flags & XFS_IUNLOCK_NONOTIFY) &&
- ip->i_itemp != NULL) {
- xfs_trans_unlocked_item(ip->i_mount,
- (xfs_log_item_t*)(ip->i_itemp));
- }
- }
- xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address);
-}
-
-/*
- * give up write locks. the i/o lock cannot be held nested
- * if it is being demoted.
- */
-void
-xfs_ilock_demote(xfs_inode_t *ip,
- uint lock_flags)
-{
- ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
-
- if (lock_flags & XFS_ILOCK_EXCL) {
- ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
- mrdemote(&ip->i_lock);
- }
- if (lock_flags & XFS_IOLOCK_EXCL) {
- ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
- mrdemote(&ip->i_iolock);
- }
-}
-
-/*
- * The following three routines simply manage the i_flock
- * semaphore embedded in the inode. This semaphore synchronizes
- * processes attempting to flush the in-core inode back to disk.
- */
-void
-xfs_iflock(xfs_inode_t *ip)
-{
- psema(&(ip->i_flock), PINOD|PLTWAIT);
-}
-
-int
-xfs_iflock_nowait(xfs_inode_t *ip)
-{
- return (cpsema(&(ip->i_flock)));
-}
-
-void
-xfs_ifunlock(xfs_inode_t *ip)
-{
- ASSERT(valusema(&(ip->i_flock)) <= 0);
- vsema(&(ip->i_flock));
-}
diff --git a/fs/xfs/xfs_imap.h b/fs/xfs/xfs_imap.h
deleted file mode 100644
index d3645000398..00000000000
--- a/fs/xfs/xfs_imap.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_IMAP_H__
-#define __XFS_IMAP_H__
-
-/*
- * This is the structure passed to xfs_imap() to map
- * an inode number to its on disk location.
- */
-typedef struct xfs_imap {
- xfs_daddr_t im_blkno; /* starting BB of inode chunk */
- uint im_len; /* length in BBs of inode chunk */
- xfs_agblock_t im_agblkno; /* logical block of inode chunk in ag */
- ushort im_ioffset; /* inode offset in block in "inodes" */
- ushort im_boffset; /* inode offset in block in bytes */
-} xfs_imap_t;
-
-#ifdef __KERNEL__
-struct xfs_mount;
-struct xfs_trans;
-int xfs_imap(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
- xfs_imap_t *, uint);
-#endif
-
-#endif /* __XFS_IMAP_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1d7f5a7e063..a6115fe1ac9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -15,776 +15,498 @@
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include <linux/log2.h>
+
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_inum.h"
-#include "xfs_imap.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr.h"
+#include "xfs_trans_space.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
-#include "xfs_btree.h"
-#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
-#include "xfs_rw.h"
+#include "xfs_bmap_util.h"
#include "xfs_error.h"
-#include "xfs_utils.h"
-#include "xfs_dir2_trace.h"
#include "xfs_quota.h"
-#include "xfs_mac.h"
-#include "xfs_acl.h"
-
+#include "xfs_filestream.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_symlink.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_bmap_btree.h"
-kmem_zone_t *xfs_ifork_zone;
kmem_zone_t *xfs_inode_zone;
-kmem_zone_t *xfs_chashlist_zone;
/*
- * Used in xfs_itruncate(). This is the maximum number of extents
+ * Used in xfs_itruncate_extents(). This is the maximum number of extents
* freed from a file in a single transaction.
*/
#define XFS_ITRUNC_MAX_EXTENTS 2
STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
-STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
-STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
-STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
+STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *);
-#ifdef DEBUG
/*
- * Make sure that the extents in the given memory buffer
- * are valid.
+ * helper function to extract extent size hint from inode
*/
-STATIC void
-xfs_validate_extents(
- xfs_bmbt_rec_t *ep,
- int nrecs,
- int disk,
- xfs_exntfmt_t fmt)
+xfs_extlen_t
+xfs_get_extsz_hint(
+ struct xfs_inode *ip)
{
- xfs_bmbt_irec_t irec;
- xfs_bmbt_rec_t rec;
- int i;
-
- for (i = 0; i < nrecs; i++) {
- rec.l0 = get_unaligned((__uint64_t*)&ep->l0);
- rec.l1 = get_unaligned((__uint64_t*)&ep->l1);
- if (disk)
- xfs_bmbt_disk_get_all(&rec, &irec);
- else
- xfs_bmbt_get_all(&rec, &irec);
- if (fmt == XFS_EXTFMT_NOSTATE)
- ASSERT(irec.br_state == XFS_EXT_NORM);
- ep++;
- }
+ if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
+ return ip->i_d.di_extsize;
+ if (XFS_IS_REALTIME_INODE(ip))
+ return ip->i_mount->m_sb.sb_rextsize;
+ return 0;
}
-#else /* DEBUG */
-#define xfs_validate_extents(ep, nrecs, disk, fmt)
-#endif /* DEBUG */
/*
- * Check that none of the inode's in the buffer have a next
- * unlinked field of 0.
+ * These two are wrapper routines around the xfs_ilock() routine used to
+ * centralize some grungy code. They are used in places that wish to lock the
+ * inode solely for reading the extents. The reason these places can't just
+ * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards to
+ * bringing in of the extents from disk for a file in b-tree format. If the
+ * inode is in b-tree format, then we need to lock the inode exclusively until
+ * the extents are read in. Locking it exclusively all the time would limit
+ * our parallelism unnecessarily, though. What we do instead is check to see
+ * if the extents have been read in yet, and only lock the inode exclusively
+ * if they have not.
+ *
+ * The functions return a value which should be given to the corresponding
+ * xfs_iunlock() call.
*/
-#if defined(DEBUG)
-void
-xfs_inobp_check(
- xfs_mount_t *mp,
- xfs_buf_t *bp)
+uint
+xfs_ilock_data_map_shared(
+ struct xfs_inode *ip)
{
- int i;
- int j;
- xfs_dinode_t *dip;
+ uint lock_mode = XFS_ILOCK_SHARED;
+
+ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+ (ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
+ lock_mode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
+}
- j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
+uint
+xfs_ilock_attr_map_shared(
+ struct xfs_inode *ip)
+{
+ uint lock_mode = XFS_ILOCK_SHARED;
- for (i = 0; i < j; i++) {
- dip = (xfs_dinode_t *)xfs_buf_offset(bp,
- i * mp->m_sb.sb_inodesize);
- if (!dip->di_next_unlinked) {
- xfs_fs_cmn_err(CE_ALERT, mp,
- "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p. About to pop an ASSERT.",
- bp);
- ASSERT(dip->di_next_unlinked);
- }
- }
+ if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
+ (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
+ lock_mode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
}
-#endif
/*
- * This routine is called to map an inode number within a file
- * system to the buffer containing the on-disk version of the
- * inode. It returns a pointer to the buffer containing the
- * on-disk inode in the bpp parameter, and in the dip parameter
- * it returns a pointer to the on-disk inode within that buffer.
+ * The xfs inode contains 2 locks: a multi-reader lock called the
+ * i_iolock and a multi-reader lock called the i_lock. This routine
+ * allows either or both of the locks to be obtained.
*
- * If a non-zero error is returned, then the contents of bpp and
- * dipp are undefined.
+ * The 2 locks should always be ordered so that the IO lock is
+ * obtained first in order to prevent deadlock.
*
- * Use xfs_imap() to determine the size and location of the
- * buffer to read from disk.
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks
+ * to be locked. It can be:
+ * XFS_IOLOCK_SHARED,
+ * XFS_IOLOCK_EXCL,
+ * XFS_ILOCK_SHARED,
+ * XFS_ILOCK_EXCL,
+ * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
+ * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
+ * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
+ * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
*/
-STATIC int
-xfs_inotobp(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_ino_t ino,
- xfs_dinode_t **dipp,
- xfs_buf_t **bpp,
- int *offset)
+void
+xfs_ilock(
+ xfs_inode_t *ip,
+ uint lock_flags)
{
- int di_ok;
- xfs_imap_t imap;
- xfs_buf_t *bp;
- int error;
- xfs_dinode_t *dip;
+ trace_xfs_ilock(ip, lock_flags, _RET_IP_);
/*
- * Call the space managment code to find the location of the
- * inode on disk.
+ * You can't set both SHARED and EXCL for the same lock,
+ * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+ * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
*/
- imap.im_blkno = 0;
- error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
- if (error != 0) {
- cmn_err(CE_WARN,
- "xfs_inotobp: xfs_imap() returned an "
- "error %d on %s. Returning error.", error, mp->m_fsname);
- return error;
- }
+ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+ (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
- /*
- * If the inode number maps to a block outside the bounds of the
- * file system then return NULL rather than calling read_buf
- * and panicing when we get an error from the driver.
- */
- if ((imap.im_blkno + imap.im_len) >
- XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
- cmn_err(CE_WARN,
- "xfs_inotobp: inode number (%llu + %d) maps to a block outside the bounds "
- "of the file system %s. Returning EINVAL.",
- (unsigned long long)imap.im_blkno,
- imap.im_len, mp->m_fsname);
- return XFS_ERROR(EINVAL);
- }
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+ else if (lock_flags & XFS_IOLOCK_SHARED)
+ mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+
+ if (lock_flags & XFS_ILOCK_EXCL)
+ mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+ else if (lock_flags & XFS_ILOCK_SHARED)
+ mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+}
+/*
+ * This is just like xfs_ilock(), except that the caller
+ * is guaranteed not to sleep. It returns 1 if it gets
+ * the requested locks and 0 otherwise. If the IO lock is
+ * obtained but the inode lock cannot be, then the IO lock
+ * is dropped before returning.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ * to be locked. See the comment for xfs_ilock() for a list
+ * of valid values.
+ */
+int
+xfs_ilock_nowait(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
+
+ /*
+ * You can't set both SHARED and EXCL for the same lock,
+ * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+ * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+ */
+ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+ (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+ if (lock_flags & XFS_IOLOCK_EXCL) {
+ if (!mrtryupdate(&ip->i_iolock))
+ goto out;
+ } else if (lock_flags & XFS_IOLOCK_SHARED) {
+ if (!mrtryaccess(&ip->i_iolock))
+ goto out;
+ }
+ if (lock_flags & XFS_ILOCK_EXCL) {
+ if (!mrtryupdate(&ip->i_lock))
+ goto out_undo_iolock;
+ } else if (lock_flags & XFS_ILOCK_SHARED) {
+ if (!mrtryaccess(&ip->i_lock))
+ goto out_undo_iolock;
+ }
+ return 1;
+
+ out_undo_iolock:
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ mrunlock_excl(&ip->i_iolock);
+ else if (lock_flags & XFS_IOLOCK_SHARED)
+ mrunlock_shared(&ip->i_iolock);
+ out:
+ return 0;
+}
+
+/*
+ * xfs_iunlock() is used to drop the inode locks acquired with
+ * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
+ * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
+ * that we know which locks to drop.
+ *
+ * ip -- the inode being unlocked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ * to be unlocked. See the comment for xfs_ilock() for a list
+ * of valid values for this parameter.
+ *
+ */
+void
+xfs_iunlock(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
/*
- * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will
- * default to just a read_buf() call.
+ * You can't set both SHARED and EXCL for the same lock,
+ * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+ * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
*/
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
- (int)imap.im_len, XFS_BUF_LOCK, &bp);
+ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+ (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+ ASSERT(lock_flags != 0);
- if (error) {
- cmn_err(CE_WARN,
- "xfs_inotobp: xfs_trans_read_buf() returned an "
- "error %d on %s. Returning error.", error, mp->m_fsname);
- return error;
- }
- dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0);
- di_ok =
- INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC &&
- XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT));
- if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
- XFS_RANDOM_ITOBP_INOTOBP))) {
- XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip);
- xfs_trans_brelse(tp, bp);
- cmn_err(CE_WARN,
- "xfs_inotobp: XFS_TEST_ERROR() returned an "
- "error on %s. Returning EFSCORRUPTED.", mp->m_fsname);
- return XFS_ERROR(EFSCORRUPTED);
- }
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ mrunlock_excl(&ip->i_iolock);
+ else if (lock_flags & XFS_IOLOCK_SHARED)
+ mrunlock_shared(&ip->i_iolock);
- xfs_inobp_check(mp, bp);
+ if (lock_flags & XFS_ILOCK_EXCL)
+ mrunlock_excl(&ip->i_lock);
+ else if (lock_flags & XFS_ILOCK_SHARED)
+ mrunlock_shared(&ip->i_lock);
- /*
- * Set *dipp to point to the on-disk inode in the buffer.
- */
- *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
- *bpp = bp;
- *offset = imap.im_boffset;
- return 0;
+ trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}
-
/*
- * This routine is called to map an inode to the buffer containing
- * the on-disk version of the inode. It returns a pointer to the
- * buffer containing the on-disk inode in the bpp parameter, and in
- * the dip parameter it returns a pointer to the on-disk inode within
- * that buffer.
- *
- * If a non-zero error is returned, then the contents of bpp and
- * dipp are undefined.
- *
- * If the inode is new and has not yet been initialized, use xfs_imap()
- * to determine the size and location of the buffer to read from disk.
- * If the inode has already been mapped to its buffer and read in once,
- * then use the mapping information stored in the inode rather than
- * calling xfs_imap(). This allows us to avoid the overhead of looking
- * at the inode btree for small block file systems (see xfs_dilocate()).
- * We can tell whether the inode has been mapped in before by comparing
- * its disk block address to 0. Only uninitialized inodes will have
- * 0 for the disk block address.
+ * give up write locks. the i/o lock cannot be held nested
+ * if it is being demoted.
*/
-int
-xfs_itobp(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- xfs_dinode_t **dipp,
- xfs_buf_t **bpp,
- xfs_daddr_t bno)
+void
+xfs_ilock_demote(
+ xfs_inode_t *ip,
+ uint lock_flags)
{
- xfs_buf_t *bp;
- int error;
- xfs_imap_t imap;
-#ifdef __KERNEL__
- int i;
- int ni;
-#endif
+ ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
- if (ip->i_blkno == (xfs_daddr_t)0) {
- /*
- * Call the space management code to find the location of the
- * inode on disk.
- */
- imap.im_blkno = bno;
- error = xfs_imap(mp, tp, ip->i_ino, &imap, XFS_IMAP_LOOKUP);
- if (error != 0) {
- return error;
- }
+ if (lock_flags & XFS_ILOCK_EXCL)
+ mrdemote(&ip->i_lock);
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ mrdemote(&ip->i_iolock);
- /*
- * If the inode number maps to a block outside the bounds
- * of the file system then return NULL rather than calling
- * read_buf and panicing when we get an error from the
- * driver.
- */
- if ((imap.im_blkno + imap.im_len) >
- XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
-#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
- "(imap.im_blkno (0x%llx) "
- "+ imap.im_len (0x%llx)) > "
- " XFS_FSB_TO_BB(mp, "
- "mp->m_sb.sb_dblocks) (0x%llx)",
- (unsigned long long) imap.im_blkno,
- (unsigned long long) imap.im_len,
- XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
-#endif /* DEBUG */
- return XFS_ERROR(EINVAL);
- }
+ trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
+}
- /*
- * Fill in the fields in the inode that will be used to
- * map the inode to its buffer from now on.
- */
- ip->i_blkno = imap.im_blkno;
- ip->i_len = imap.im_len;
- ip->i_boffset = imap.im_boffset;
- } else {
- /*
- * We've already mapped the inode once, so just use the
- * mapping that we saved the first time.
- */
- imap.im_blkno = ip->i_blkno;
- imap.im_len = ip->i_len;
- imap.im_boffset = ip->i_boffset;
+#if defined(DEBUG) || defined(XFS_WARN)
+int
+xfs_isilocked(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
+ if (!(lock_flags & XFS_ILOCK_SHARED))
+ return !!ip->i_lock.mr_writer;
+ return rwsem_is_locked(&ip->i_lock.mr_lock);
}
- ASSERT(bno == 0 || bno == imap.im_blkno);
- /*
- * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will
- * default to just a read_buf() call.
- */
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
- (int)imap.im_len, XFS_BUF_LOCK, &bp);
-
- if (error) {
-#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
- "xfs_trans_read_buf() returned error %d, "
- "imap.im_blkno 0x%llx, imap.im_len 0x%llx",
- error, (unsigned long long) imap.im_blkno,
- (unsigned long long) imap.im_len);
-#endif /* DEBUG */
- return error;
+ if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
+ if (!(lock_flags & XFS_IOLOCK_SHARED))
+ return !!ip->i_iolock.mr_writer;
+ return rwsem_is_locked(&ip->i_iolock.mr_lock);
}
-#ifdef __KERNEL__
- /*
- * Validate the magic number and version of every inode in the buffer
- * (if DEBUG kernel) or the first inode in the buffer, otherwise.
- */
-#ifdef DEBUG
- ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog;
-#else
- ni = 1;
+
+ ASSERT(0);
+ return 0;
+}
#endif
- for (i = 0; i < ni; i++) {
- int di_ok;
- xfs_dinode_t *dip;
-
- dip = (xfs_dinode_t *)xfs_buf_offset(bp,
- (i << mp->m_sb.sb_inodelog));
- di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC &&
- XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT));
- if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
- XFS_RANDOM_ITOBP_INOTOBP))) {
+
#ifdef DEBUG
- prdev("bad inode magic/vsn daddr %lld #%d (magic=%x)",
- mp->m_ddev_targp,
- (unsigned long long)imap.im_blkno, i,
- INT_GET(dip->di_core.di_magic, ARCH_CONVERT));
+int xfs_locked_n;
+int xfs_small_retries;
+int xfs_middle_retries;
+int xfs_lots_retries;
+int xfs_lock_delays;
#endif
- XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH,
- mp, dip);
- xfs_trans_brelse(tp, bp);
- return XFS_ERROR(EFSCORRUPTED);
- }
- }
-#endif /* __KERNEL__ */
-
- xfs_inobp_check(mp, bp);
- /*
- * Mark the buffer as an inode buffer now that it looks good
- */
- XFS_BUF_SET_VTYPE(bp, B_FS_INO);
+/*
+ * Bump the subclass so xfs_lock_inodes() acquires each lock with
+ * a different value
+ */
+static inline int
+xfs_lock_inumorder(int lock_mode, int subclass)
+{
+ if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+ lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
+ if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
+ lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
- /*
- * Set *dipp to point to the on-disk inode in the buffer.
- */
- *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
- *bpp = bp;
- return 0;
+ return lock_mode;
}
/*
- * Move inode type and inode format specific information from the
- * on-disk inode to the in-core inode. For fifos, devs, and sockets
- * this means set if_rdev to the proper value. For files, directories,
- * and symlinks this means to bring in the in-line data or extent
- * pointers. For a file in B-tree format, only the root is immediately
- * brought in-core. The rest will be in-lined in if_extents when it
- * is first referenced (see xfs_iread_extents()).
+ * The following routine will lock n inodes in exclusive mode.
+ * We assume the caller calls us with the inodes in i_ino order.
+ *
+ * We need to detect deadlock where an inode that we lock
+ * is in the AIL and we start waiting for another inode that is locked
+ * by a thread in a long running transaction (such as truncate). This can
+ * result in deadlock since the long running trans might need to wait
+ * for the inode we just locked in order to push the tail and free space
+ * in the log.
*/
-STATIC int
-xfs_iformat(
- xfs_inode_t *ip,
- xfs_dinode_t *dip)
+void
+xfs_lock_inodes(
+ xfs_inode_t **ips,
+ int inodes,
+ uint lock_mode)
{
- xfs_attr_shortform_t *atp;
- int size;
- int error;
- xfs_fsize_t di_size;
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
- error = 0;
+ int attempts = 0, i, j, try_lock;
+ xfs_log_item_t *lp;
- if (unlikely(
- INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) +
- INT_GET(dip->di_core.di_anextents, ARCH_CONVERT) >
- INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT))) {
- xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
- "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
- (unsigned long long)ip->i_ino,
- (int)(INT_GET(dip->di_core.di_nextents, ARCH_CONVERT)
- + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT)),
- (unsigned long long)
- INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT));
- XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- if (unlikely(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT) > ip->i_mount->m_sb.sb_inodesize)) {
- xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
- "corrupt dinode %Lu, forkoff = 0x%x.",
- (unsigned long long)ip->i_ino,
- (int)(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT)));
- XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- switch (ip->i_d.di_mode & S_IFMT) {
- case S_IFIFO:
- case S_IFCHR:
- case S_IFBLK:
- case S_IFSOCK:
- if (unlikely(INT_GET(dip->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_DEV)) {
- XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
+ ASSERT(ips && (inodes >= 2)); /* we need at least two */
+
+ try_lock = 0;
+ i = 0;
+
+again:
+ for (; i < inodes; i++) {
+ ASSERT(ips[i]);
+
+ if (i && (ips[i] == ips[i-1])) /* Already locked */
+ continue;
+
+ /*
+ * If try_lock is not set yet, make sure all locked inodes
+ * are not in the AIL.
+ * If any are, set try_lock to be used later.
+ */
+
+ if (!try_lock) {
+ for (j = (i - 1); j >= 0 && !try_lock; j--) {
+ lp = (xfs_log_item_t *)ips[j]->i_itemp;
+ if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+ try_lock++;
+ }
+ }
}
- ip->i_d.di_size = 0;
- ip->i_df.if_u2.if_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT);
- break;
- case S_IFREG:
- case S_IFLNK:
- case S_IFDIR:
- switch (INT_GET(dip->di_core.di_format, ARCH_CONVERT)) {
- case XFS_DINODE_FMT_LOCAL:
+ /*
+ * If any of the previous locks we have locked is in the AIL,
+ * we must TRY to get the second and subsequent locks. If
+ * we can't get any, we must release all we have
+ * and try again.
+ */
+
+ if (try_lock) {
+ /* try_lock must be 0 if i is 0. */
/*
- * no local regular files yet
+ * try_lock means we have an inode locked
+ * that is in the AIL.
*/
- if (unlikely((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & S_IFMT) == S_IFREG)) {
- xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
- "corrupt inode %Lu "
- "(local format for regular file).",
- (unsigned long long) ip->i_ino);
- XFS_CORRUPTION_ERROR("xfs_iformat(4)",
- XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
- }
+ ASSERT(i != 0);
+ if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
+ attempts++;
- di_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT);
- if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
- xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
- "corrupt inode %Lu "
- "(bad size %Ld for local inode).",
- (unsigned long long) ip->i_ino,
- (long long) di_size);
- XFS_CORRUPTION_ERROR("xfs_iformat(5)",
- XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
- }
+ /*
+ * Unlock all previous guys and try again.
+ * xfs_iunlock will try to push the tail
+ * if the inode is in the AIL.
+ */
- size = (int)di_size;
- error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
- break;
- case XFS_DINODE_FMT_EXTENTS:
- error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
- break;
- case XFS_DINODE_FMT_BTREE:
- error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
- break;
- default:
- XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
- ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
- }
- break;
+ for(j = i - 1; j >= 0; j--) {
- default:
- XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
- }
- if (error) {
- return error;
- }
- if (!XFS_DFORK_Q(dip))
- return 0;
- ASSERT(ip->i_afp == NULL);
- ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
- ip->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
- switch (INT_GET(dip->di_core.di_aformat, ARCH_CONVERT)) {
- case XFS_DINODE_FMT_LOCAL:
- atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
- size = (int)INT_GET(atp->hdr.totsize, ARCH_CONVERT);
- error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
- break;
- case XFS_DINODE_FMT_EXTENTS:
- error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
- break;
- case XFS_DINODE_FMT_BTREE:
- error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
- break;
- default:
- error = XFS_ERROR(EFSCORRUPTED);
- break;
+ /*
+ * Check to see if we've already
+ * unlocked this one.
+ * Not the first one going back,
+ * and the inode ptr is the same.
+ */
+ if ((j != (i - 1)) && ips[j] ==
+ ips[j+1])
+ continue;
+
+ xfs_iunlock(ips[j], lock_mode);
+ }
+
+ if ((attempts % 5) == 0) {
+ delay(1); /* Don't just spin the CPU */
+#ifdef DEBUG
+ xfs_lock_delays++;
+#endif
+ }
+ i = 0;
+ try_lock = 0;
+ goto again;
+ }
+ } else {
+ xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
+ }
}
- if (error) {
- kmem_zone_free(xfs_ifork_zone, ip->i_afp);
- ip->i_afp = NULL;
- xfs_idestroy_fork(ip, XFS_DATA_FORK);
+
+#ifdef DEBUG
+ if (attempts) {
+ if (attempts < 5) xfs_small_retries++;
+ else if (attempts < 100) xfs_middle_retries++;
+ else xfs_lots_retries++;
+ } else {
+ xfs_locked_n++;
}
- return error;
+#endif
}
/*
- * The file is in-lined in the on-disk inode.
- * If it fits into if_inline_data, then copy
- * it there, otherwise allocate a buffer for it
- * and copy the data there. Either way, set
- * if_data to point at the data.
- * If we allocate a buffer for the data, make
- * sure that its size is a multiple of 4 and
- * record the real size in i_real_bytes.
+ * xfs_lock_two_inodes() can only be used to lock one type of lock
+ * at a time - the iolock or the ilock, but not both at once. If
+ * we lock both at once, lockdep will report false positives saying
+ * we have violated locking orders.
*/
-STATIC int
-xfs_iformat_local(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
- int whichfork,
- int size)
+void
+xfs_lock_two_inodes(
+ xfs_inode_t *ip0,
+ xfs_inode_t *ip1,
+ uint lock_mode)
{
- xfs_ifork_t *ifp;
- int real_size;
-
- /*
- * If the size is unreasonable, then something
- * is wrong and we just bail out rather than crash in
- * kmem_alloc() or memcpy() below.
- */
- if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
- xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
- "corrupt inode %Lu "
- "(bad size %d for local fork, size = %d).",
- (unsigned long long) ip->i_ino, size,
- XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
- XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
- }
- ifp = XFS_IFORK_PTR(ip, whichfork);
- real_size = 0;
- if (size == 0)
- ifp->if_u1.if_data = NULL;
- else if (size <= sizeof(ifp->if_u2.if_inline_data))
- ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
- else {
- real_size = roundup(size, 4);
- ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
- }
- ifp->if_bytes = size;
- ifp->if_real_bytes = real_size;
- if (size)
- memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
- ifp->if_flags &= ~XFS_IFEXTENTS;
- ifp->if_flags |= XFS_IFINLINE;
- return 0;
-}
+ xfs_inode_t *temp;
+ int attempts = 0;
+ xfs_log_item_t *lp;
-/*
- * The file consists of a set of extents all
- * of which fit into the on-disk inode.
- * If there are few enough extents to fit into
- * the if_inline_ext, then copy them there.
- * Otherwise allocate a buffer for them and copy
- * them into it. Either way, set if_extents
- * to point at the extents.
- */
-STATIC int
-xfs_iformat_extents(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
- int whichfork)
-{
- xfs_bmbt_rec_t *ep, *dp;
- xfs_ifork_t *ifp;
- int nex;
- int real_size;
- int size;
- int i;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- nex = XFS_DFORK_NEXTENTS(dip, whichfork);
- size = nex * (uint)sizeof(xfs_bmbt_rec_t);
-
- /*
- * If the number of extents is unreasonable, then something
- * is wrong and we just bail out rather than crash in
- * kmem_alloc() or memcpy() below.
- */
- if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
- xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
- "corrupt inode %Lu ((a)extents = %d).",
- (unsigned long long) ip->i_ino, nex);
- XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- real_size = 0;
- if (nex == 0)
- ifp->if_u1.if_extents = NULL;
- else if (nex <= XFS_INLINE_EXTS)
- ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
- else {
- ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP);
- ASSERT(ifp->if_u1.if_extents != NULL);
- real_size = size;
- }
- ifp->if_bytes = size;
- ifp->if_real_bytes = real_size;
- if (size) {
- dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
- xfs_validate_extents(dp, nex, 1, XFS_EXTFMT_INODE(ip));
- ep = ifp->if_u1.if_extents;
- for (i = 0; i < nex; i++, ep++, dp++) {
- ep->l0 = INT_GET(get_unaligned((__uint64_t*)&dp->l0),
- ARCH_CONVERT);
- ep->l1 = INT_GET(get_unaligned((__uint64_t*)&dp->l1),
- ARCH_CONVERT);
- }
- xfs_bmap_trace_exlist("xfs_iformat_extents", ip, nex,
- whichfork);
- if (whichfork != XFS_DATA_FORK ||
- XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
- if (unlikely(xfs_check_nostate_extents(
- ifp->if_u1.if_extents, nex))) {
- XFS_ERROR_REPORT("xfs_iformat_extents(2)",
- XFS_ERRLEVEL_LOW,
- ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
- }
+ if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+ ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
+ ASSERT(ip0->i_ino != ip1->i_ino);
+
+ if (ip0->i_ino > ip1->i_ino) {
+ temp = ip0;
+ ip0 = ip1;
+ ip1 = temp;
}
- ifp->if_flags |= XFS_IFEXTENTS;
- return 0;
-}
-/*
- * The file has too many extents to fit into
- * the inode, so they are in B-tree format.
- * Allocate a buffer for the root of the B-tree
- * and copy the root into it. The i_extents
- * field will remain NULL until all of the
- * extents are read in (when they are needed).
- */
-STATIC int
-xfs_iformat_btree(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
- int whichfork)
-{
- xfs_bmdr_block_t *dfp;
- xfs_ifork_t *ifp;
- /* REFERENCED */
- int nrecs;
- int size;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
- size = XFS_BMAP_BROOT_SPACE(dfp);
- nrecs = XFS_BMAP_BROOT_NUMRECS(dfp);
-
- /*
- * blow out if -- fork has less extents than can fit in
- * fork (fork shouldn't be a btree format), root btree
- * block has more records than can fit into the fork,
- * or the number of extents is greater than the number of
- * blocks.
- */
- if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
- || XFS_BMDR_SPACE_CALC(nrecs) >
- XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
- || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
- xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
- "corrupt inode %Lu (btree).",
- (unsigned long long) ip->i_ino);
- XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
- ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- ifp->if_broot_bytes = size;
- ifp->if_broot = kmem_alloc(size, KM_SLEEP);
- ASSERT(ifp->if_broot != NULL);
- /*
- * Copy and convert from the on-disk structure
- * to the in-memory structure.
- */
- xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
- ifp->if_broot, size);
- ifp->if_flags &= ~XFS_IFEXTENTS;
- ifp->if_flags |= XFS_IFBROOT;
+ again:
+ xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
- return 0;
+ /*
+ * If the first lock we have locked is in the AIL, we must TRY to get
+ * the second lock. If we can't get it, we must release the first one
+ * and try again.
+ */
+ lp = (xfs_log_item_t *)ip0->i_itemp;
+ if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+ if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
+ xfs_iunlock(ip0, lock_mode);
+ if ((++attempts % 5) == 0)
+ delay(1); /* Don't just spin the CPU */
+ goto again;
+ }
+ } else {
+ xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
+ }
}
-/*
- * xfs_xlate_dinode_core - translate an xfs_inode_core_t between ondisk
- * and native format
- *
- * buf = on-disk representation
- * dip = native representation
- * dir = direction - +ve -> disk to native
- * -ve -> native to disk
- */
+
void
-xfs_xlate_dinode_core(
- xfs_caddr_t buf,
- xfs_dinode_core_t *dip,
- int dir)
+__xfs_iflock(
+ struct xfs_inode *ip)
{
- xfs_dinode_core_t *buf_core = (xfs_dinode_core_t *)buf;
- xfs_dinode_core_t *mem_core = (xfs_dinode_core_t *)dip;
- xfs_arch_t arch = ARCH_CONVERT;
-
- ASSERT(dir);
-
- INT_XLATE(buf_core->di_magic, mem_core->di_magic, dir, arch);
- INT_XLATE(buf_core->di_mode, mem_core->di_mode, dir, arch);
- INT_XLATE(buf_core->di_version, mem_core->di_version, dir, arch);
- INT_XLATE(buf_core->di_format, mem_core->di_format, dir, arch);
- INT_XLATE(buf_core->di_onlink, mem_core->di_onlink, dir, arch);
- INT_XLATE(buf_core->di_uid, mem_core->di_uid, dir, arch);
- INT_XLATE(buf_core->di_gid, mem_core->di_gid, dir, arch);
- INT_XLATE(buf_core->di_nlink, mem_core->di_nlink, dir, arch);
- INT_XLATE(buf_core->di_projid, mem_core->di_projid, dir, arch);
-
- if (dir > 0) {
- memcpy(mem_core->di_pad, buf_core->di_pad,
- sizeof(buf_core->di_pad));
- } else {
- memcpy(buf_core->di_pad, mem_core->di_pad,
- sizeof(buf_core->di_pad));
- }
-
- INT_XLATE(buf_core->di_flushiter, mem_core->di_flushiter, dir, arch);
-
- INT_XLATE(buf_core->di_atime.t_sec, mem_core->di_atime.t_sec,
- dir, arch);
- INT_XLATE(buf_core->di_atime.t_nsec, mem_core->di_atime.t_nsec,
- dir, arch);
- INT_XLATE(buf_core->di_mtime.t_sec, mem_core->di_mtime.t_sec,
- dir, arch);
- INT_XLATE(buf_core->di_mtime.t_nsec, mem_core->di_mtime.t_nsec,
- dir, arch);
- INT_XLATE(buf_core->di_ctime.t_sec, mem_core->di_ctime.t_sec,
- dir, arch);
- INT_XLATE(buf_core->di_ctime.t_nsec, mem_core->di_ctime.t_nsec,
- dir, arch);
- INT_XLATE(buf_core->di_size, mem_core->di_size, dir, arch);
- INT_XLATE(buf_core->di_nblocks, mem_core->di_nblocks, dir, arch);
- INT_XLATE(buf_core->di_extsize, mem_core->di_extsize, dir, arch);
- INT_XLATE(buf_core->di_nextents, mem_core->di_nextents, dir, arch);
- INT_XLATE(buf_core->di_anextents, mem_core->di_anextents, dir, arch);
- INT_XLATE(buf_core->di_forkoff, mem_core->di_forkoff, dir, arch);
- INT_XLATE(buf_core->di_aformat, mem_core->di_aformat, dir, arch);
- INT_XLATE(buf_core->di_dmevmask, mem_core->di_dmevmask, dir, arch);
- INT_XLATE(buf_core->di_dmstate, mem_core->di_dmstate, dir, arch);
- INT_XLATE(buf_core->di_flags, mem_core->di_flags, dir, arch);
- INT_XLATE(buf_core->di_gen, mem_core->di_gen, dir, arch);
+ wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+ DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+ do {
+ prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ if (xfs_isiflocked(ip))
+ io_schedule();
+ } while (!xfs_iflock_nowait(ip));
+
+ finish_wait(wq, &wait.wait);
}
STATIC uint
_xfs_dic2xflags(
- xfs_dinode_core_t *dic,
__uint16_t di_flags)
{
uint flags = 0;
@@ -814,6 +536,10 @@ _xfs_dic2xflags(
flags |= XFS_XFLAG_EXTSIZE;
if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
flags |= XFS_XFLAG_EXTSZINHERIT;
+ if (di_flags & XFS_DIFLAG_NODEFRAG)
+ flags |= XFS_XFLAG_NODEFRAG;
+ if (di_flags & XFS_DIFLAG_FILESTREAM)
+ flags |= XFS_XFLAG_FILESTREAM;
}
return flags;
@@ -823,226 +549,61 @@ uint
xfs_ip2xflags(
xfs_inode_t *ip)
{
- xfs_dinode_core_t *dic = &ip->i_d;
+ xfs_icdinode_t *dic = &ip->i_d;
- return _xfs_dic2xflags(dic, dic->di_flags) |
- (XFS_CFORK_Q(dic) ? XFS_XFLAG_HASATTR : 0);
+ return _xfs_dic2xflags(dic->di_flags) |
+ (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
}
uint
xfs_dic2xflags(
- xfs_dinode_core_t *dic)
+ xfs_dinode_t *dip)
{
- return _xfs_dic2xflags(dic, INT_GET(dic->di_flags, ARCH_CONVERT)) |
- (XFS_CFORK_Q_DISK(dic) ? XFS_XFLAG_HASATTR : 0);
+ return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
+ (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
}
/*
- * Given a mount structure and an inode number, return a pointer
- * to a newly allocated in-core inode coresponding to the given
- * inode number.
- *
- * Initialize the inode's attributes and extent pointers if it
- * already has them (it will not if the inode has no links).
+ * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
+ * is allowed, otherwise it has to be an exact match. If a CI match is found,
+ * ci_name->name will point to a the actual name (caller must free) or
+ * will be set to NULL if an exact match is found.
*/
int
-xfs_iread(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_ino_t ino,
- xfs_inode_t **ipp,
- xfs_daddr_t bno)
+xfs_lookup(
+ xfs_inode_t *dp,
+ struct xfs_name *name,
+ xfs_inode_t **ipp,
+ struct xfs_name *ci_name)
{
- xfs_buf_t *bp;
- xfs_dinode_t *dip;
- xfs_inode_t *ip;
- int error;
-
- ASSERT(xfs_inode_zone != NULL);
-
- ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
- ip->i_ino = ino;
- ip->i_mount = mp;
-
- /*
- * Get pointer's to the on-disk inode and the buffer containing it.
- * If the inode number refers to a block outside the file system
- * then xfs_itobp() will return NULL. In this case we should
- * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will
- * know that this is a new incore inode.
- */
- error = xfs_itobp(mp, tp, ip, &dip, &bp, bno);
-
- if (error != 0) {
- kmem_zone_free(xfs_inode_zone, ip);
- return error;
- }
-
- /*
- * Initialize inode's trace buffers.
- * Do this before xfs_iformat in case it adds entries.
- */
-#ifdef XFS_BMAP_TRACE
- ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP);
-#endif
-#ifdef XFS_BMBT_TRACE
- ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP);
-#endif
-#ifdef XFS_RW_TRACE
- ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP);
-#endif
-#ifdef XFS_ILOCK_TRACE
- ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP);
-#endif
-#ifdef XFS_DIR2_TRACE
- ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP);
-#endif
+ xfs_ino_t inum;
+ int error;
+ uint lock_mode;
- /*
- * If we got something that isn't an inode it means someone
- * (nfs or dmi) has a stale handle.
- */
- if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) {
- kmem_zone_free(xfs_inode_zone, ip);
- xfs_trans_brelse(tp, bp);
-#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
- "dip->di_core.di_magic (0x%x) != "
- "XFS_DINODE_MAGIC (0x%x)",
- INT_GET(dip->di_core.di_magic, ARCH_CONVERT),
- XFS_DINODE_MAGIC);
-#endif /* DEBUG */
- return XFS_ERROR(EINVAL);
- }
-
- /*
- * If the on-disk inode is already linked to a directory
- * entry, copy all of the inode into the in-core inode.
- * xfs_iformat() handles copying in the inode format
- * specific information.
- * Otherwise, just get the truly permanent information.
- */
- if (dip->di_core.di_mode) {
- xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core,
- &(ip->i_d), 1);
- error = xfs_iformat(ip, dip);
- if (error) {
- kmem_zone_free(xfs_inode_zone, ip);
- xfs_trans_brelse(tp, bp);
-#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
- "xfs_iformat() returned error %d",
- error);
-#endif /* DEBUG */
- return error;
- }
- } else {
- ip->i_d.di_magic = INT_GET(dip->di_core.di_magic, ARCH_CONVERT);
- ip->i_d.di_version = INT_GET(dip->di_core.di_version, ARCH_CONVERT);
- ip->i_d.di_gen = INT_GET(dip->di_core.di_gen, ARCH_CONVERT);
- ip->i_d.di_flushiter = INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT);
- /*
- * Make sure to pull in the mode here as well in
- * case the inode is released without being used.
- * This ensures that xfs_inactive() will see that
- * the inode is already free and not try to mess
- * with the uninitialized part of it.
- */
- ip->i_d.di_mode = 0;
- /*
- * Initialize the per-fork minima and maxima for a new
- * inode here. xfs_iformat will do it for old inodes.
- */
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
- }
+ trace_xfs_lookup(dp, name);
- INIT_LIST_HEAD(&ip->i_reclaim);
+ if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ return XFS_ERROR(EIO);
- /*
- * The inode format changed when we moved the link count and
- * made it 32 bits long. If this is an old format inode,
- * convert it in memory to look like a new one. If it gets
- * flushed to disk we will convert back before flushing or
- * logging it. We zero out the new projid field and the old link
- * count field. We'll handle clearing the pad field (the remains
- * of the old uuid field) when we actually convert the inode to
- * the new format. We don't change the version number so that we
- * can distinguish this from a real new format inode.
- */
- if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
- ip->i_d.di_nlink = ip->i_d.di_onlink;
- ip->i_d.di_onlink = 0;
- ip->i_d.di_projid = 0;
- }
+ lock_mode = xfs_ilock_data_map_shared(dp);
+ error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
+ xfs_iunlock(dp, lock_mode);
- ip->i_delayed_blks = 0;
+ if (error)
+ goto out;
- /*
- * Mark the buffer containing the inode as something to keep
- * around for a while. This helps to keep recently accessed
- * meta-data in-core longer.
- */
- XFS_BUF_SET_REF(bp, XFS_INO_REF);
+ error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
+ if (error)
+ goto out_free_name;
- /*
- * Use xfs_trans_brelse() to release the buffer containing the
- * on-disk inode, because it was acquired with xfs_trans_read_buf()
- * in xfs_itobp() above. If tp is NULL, this is just a normal
- * brelse(). If we're within a transaction, then xfs_trans_brelse()
- * will only release the buffer if it is not dirty within the
- * transaction. It will be OK to release the buffer in this case,
- * because inodes on disk are never destroyed and we will be
- * locking the new in-core inode before putting it in the hash
- * table where other processes can find it. Thus we don't have
- * to worry about the inode being changed just because we released
- * the buffer.
- */
- xfs_trans_brelse(tp, bp);
- *ipp = ip;
return 0;
-}
-
-/*
- * Read in extents from a btree-format inode.
- * Allocate and fill in if_extents. Real work is done in xfs_bmap.c.
- */
-int
-xfs_iread_extents(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- int whichfork)
-{
- int error;
- xfs_ifork_t *ifp;
- size_t size;
- if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
- XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
- ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
- }
- size = XFS_IFORK_NEXTENTS(ip, whichfork) * (uint)sizeof(xfs_bmbt_rec_t);
- ifp = XFS_IFORK_PTR(ip, whichfork);
- /*
- * We know that the size is valid (it's checked in iformat_btree)
- */
- ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP);
- ASSERT(ifp->if_u1.if_extents != NULL);
- ifp->if_lastex = NULLEXTNUM;
- ifp->if_bytes = ifp->if_real_bytes = (int)size;
- ifp->if_flags |= XFS_IFEXTENTS;
- error = xfs_bmap_read_extents(tp, ip, whichfork);
- if (error) {
- kmem_free(ifp->if_u1.if_extents, size);
- ifp->if_u1.if_extents = NULL;
- ifp->if_bytes = ifp->if_real_bytes = 0;
- ifp->if_flags &= ~XFS_IFEXTENTS;
- return error;
- }
- xfs_validate_extents((xfs_bmbt_rec_t *)ifp->if_u1.if_extents,
- XFS_IFORK_NEXTENTS(ip, whichfork), 0, XFS_EXTFMT_INODE(ip));
- return 0;
+out_free_name:
+ if (ci_name)
+ kmem_free(ci_name->name);
+out:
+ *ipp = NULL;
+ return error;
}
/*
@@ -1052,16 +613,16 @@ xfs_iread_extents(
* set according to the contents of the given cred structure.
*
* Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
- * has a free inode available, call xfs_iget()
- * to obtain the in-core version of the allocated inode. Finally,
- * fill in the inode and log its initial contents. In this case,
- * ialloc_context would be set to NULL and call_again set to false.
+ * has a free inode available, call xfs_iget() to obtain the in-core
+ * version of the allocated inode. Finally, fill in the inode and
+ * log its initial contents. In this case, ialloc_context would be
+ * set to NULL.
*
- * If xfs_dialloc() does not have an available inode,
- * it will replenish its supply by doing an allocation. Since we can
- * only do one allocation within a transaction without deadlocks, we
- * must commit the current transaction before returning the inode itself.
- * In this case, therefore, we will set call_again to true and return.
+ * If xfs_dialloc() does not have an available inode, it will replenish
+ * its supply by doing an allocation. Since we can only do one
+ * allocation within a transaction without deadlocks, we must commit
+ * the current transaction before returning the inode itself.
+ * In this case, therefore, we will set ialloc_context and return.
* The caller should then commit the current transaction, start a new
* transaction, and call xfs_ialloc() again to actually get the inode.
*
@@ -1070,37 +631,40 @@ xfs_iread_extents(
* also returns the [locked] bp pointing to the head of the freelist
* as ialloc_context. The caller should hold this buffer across
* the commit and pass it back into this routine on the second call.
+ *
+ * If we are allocating quota inodes, we do not have a parent inode
+ * to attach to or associate with (i.e. pip == NULL) because they
+ * are not linked into the directory structure - they are attached
+ * directly to the superblock - and so have no parent.
*/
int
xfs_ialloc(
xfs_trans_t *tp,
xfs_inode_t *pip,
- mode_t mode,
+ umode_t mode,
xfs_nlink_t nlink,
xfs_dev_t rdev,
- cred_t *cr,
- xfs_prid_t prid,
+ prid_t prid,
int okalloc,
xfs_buf_t **ialloc_context,
- boolean_t *call_again,
xfs_inode_t **ipp)
{
+ struct xfs_mount *mp = tp->t_mountp;
xfs_ino_t ino;
xfs_inode_t *ip;
- vnode_t *vp;
uint flags;
int error;
+ timespec_t tv;
/*
* Call the space management code to pick
* the on-disk inode to be allocated.
*/
- error = xfs_dialloc(tp, pip->i_ino, mode, okalloc,
- ialloc_context, call_again, &ino);
- if (error != 0) {
+ error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
+ ialloc_context, &ino);
+ if (error)
return error;
- }
- if (*call_again || ino == NULLFSINO) {
+ if (*ialloc_context || ino == NULLFSINO) {
*ipp = NULL;
return 0;
}
@@ -1111,47 +675,32 @@ xfs_ialloc(
* This is because we're setting fields here we need
* to prevent others from looking at until we're done.
*/
- error = xfs_trans_iget(tp->t_mountp, tp, ino,
- IGET_CREATE, XFS_ILOCK_EXCL, &ip);
- if (error != 0) {
+ error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE,
+ XFS_ILOCK_EXCL, &ip);
+ if (error)
return error;
- }
ASSERT(ip != NULL);
- vp = XFS_ITOV(ip);
- ip->i_d.di_mode = (__uint16_t)mode;
+ /*
+ * We always convert v1 inodes to v2 now - we only support filesystems
+ * with >= v2 inode capability, so there is no reason for ever leaving
+ * an inode in v1 format.
+ */
+ if (ip->i_d.di_version == 1)
+ ip->i_d.di_version = 2;
+
+ ip->i_d.di_mode = mode;
ip->i_d.di_onlink = 0;
ip->i_d.di_nlink = nlink;
ASSERT(ip->i_d.di_nlink == nlink);
- ip->i_d.di_uid = current_fsuid(cr);
- ip->i_d.di_gid = current_fsgid(cr);
- ip->i_d.di_projid = prid;
+ ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
+ ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
+ xfs_set_projid(ip, prid);
memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
- /*
- * If the superblock version is up to where we support new format
- * inodes and this is currently an old format inode, then change
- * the inode version number now. This way we only do the conversion
- * here rather than here and in the flush/logging code.
- */
- if (XFS_SB_VERSION_HASNLINK(&tp->t_mountp->m_sb) &&
- ip->i_d.di_version == XFS_DINODE_VERSION_1) {
- ip->i_d.di_version = XFS_DINODE_VERSION_2;
- /*
- * We've already zeroed the old link count, the projid field,
- * and the pad field.
- */
- }
-
- /*
- * Project ids won't be stored on disk if we are using a version 1 inode.
- */
- if ( (prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1))
- xfs_bump_ino_vers2(tp, ip);
-
- if (XFS_INHERIT_GID(pip, vp->v_vfsp)) {
+ if (pip && XFS_INHERIT_GID(pip)) {
ip->i_d.di_gid = pip->i_d.di_gid;
- if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) {
+ if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
ip->i_d.di_mode |= S_ISGID;
}
}
@@ -1163,14 +712,20 @@ xfs_ialloc(
*/
if ((irix_sgid_inherit) &&
(ip->i_d.di_mode & S_ISGID) &&
- (!in_group_p((gid_t)ip->i_d.di_gid))) {
+ (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) {
ip->i_d.di_mode &= ~S_ISGID;
}
ip->i_d.di_size = 0;
ip->i_d.di_nextents = 0;
ASSERT(ip->i_d.di_nblocks == 0);
- xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD);
+
+ nanotime(&tv);
+ ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
+ ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
+ ip->i_d.di_atime = ip->i_d.di_mtime;
+ ip->i_d.di_ctime = ip->i_d.di_mtime;
+
/*
* di_gen will have been taken care of in xfs_iread.
*/
@@ -1178,6 +733,19 @@ xfs_ialloc(
ip->i_d.di_dmevmask = 0;
ip->i_d.di_dmstate = 0;
ip->i_d.di_flags = 0;
+
+ if (ip->i_d.di_version == 3) {
+ ASSERT(ip->i_d.di_ino == ino);
+ ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid));
+ ip->i_d.di_crc = 0;
+ ip->i_d.di_changecount = 1;
+ ip->i_d.di_lsn = 0;
+ ip->i_d.di_flags2 = 0;
+ memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2));
+ ip->i_d.di_crtime = ip->i_d.di_mtime;
+ }
+
+
flags = XFS_ILOG_CORE;
switch (mode & S_IFMT) {
case S_IFIFO:
@@ -1191,21 +759,19 @@ xfs_ialloc(
break;
case S_IFREG:
case S_IFDIR:
- if (unlikely(pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
+ if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
uint di_flags = 0;
- if ((mode & S_IFMT) == S_IFDIR) {
+ if (S_ISDIR(mode)) {
if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
di_flags |= XFS_DIFLAG_RTINHERIT;
if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
di_flags |= XFS_DIFLAG_EXTSZINHERIT;
ip->i_d.di_extsize = pip->i_d.di_extsize;
}
- } else if ((mode & S_IFMT) == S_IFREG) {
- if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) {
+ } else if (S_ISREG(mode)) {
+ if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
di_flags |= XFS_DIFLAG_REALTIME;
- ip->i_iocore.io_flags |= XFS_IOCORE_RT;
- }
if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
di_flags |= XFS_DIFLAG_EXTSIZE;
ip->i_d.di_extsize = pip->i_d.di_extsize;
@@ -1225,6 +791,11 @@ xfs_ialloc(
di_flags |= XFS_DIFLAG_NOSYMLINKS;
if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
di_flags |= XFS_DIFLAG_PROJINHERIT;
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
+ xfs_inherit_nodefrag)
+ di_flags |= XFS_DIFLAG_NODEFRAG;
+ if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
+ di_flags |= XFS_DIFLAG_FILESTREAM;
ip->i_d.di_flags |= di_flags;
}
/* FALLTHROUGH */
@@ -1246,384 +817,708 @@ xfs_ialloc(
/*
* Log the new values stuffed into the inode.
*/
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, flags);
- /* now that we have an i_mode we can set Linux inode ops (& unlock) */
- VFS_INIT_VNODE(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1);
+ /* now that we have an i_mode we can setup inode ops and unlock */
+ xfs_setup_inode(ip);
*ipp = ip;
return 0;
}
/*
- * Check to make sure that there are no blocks allocated to the
- * file beyond the size of the file. We don't check this for
- * files with fixed size extents or real time extents, but we
- * at least do it for regular files.
+ * Allocates a new inode from disk and return a pointer to the
+ * incore copy. This routine will internally commit the current
+ * transaction and allocate a new one if the Space Manager needed
+ * to do an allocation to replenish the inode free-list.
+ *
+ * This routine is designed to be called from xfs_create and
+ * xfs_create_dir.
+ *
*/
-#ifdef DEBUG
-void
-xfs_isize_check(
- xfs_mount_t *mp,
- xfs_inode_t *ip,
- xfs_fsize_t isize)
+int
+xfs_dir_ialloc(
+ xfs_trans_t **tpp, /* input: current transaction;
+ output: may be a new transaction. */
+ xfs_inode_t *dp, /* directory within whose allocate
+ the inode. */
+ umode_t mode,
+ xfs_nlink_t nlink,
+ xfs_dev_t rdev,
+ prid_t prid, /* project id */
+ int okalloc, /* ok to allocate new space */
+ xfs_inode_t **ipp, /* pointer to inode; it will be
+ locked. */
+ int *committed)
+
{
- xfs_fileoff_t map_first;
- int nimaps;
- xfs_bmbt_irec_t imaps[2];
+ xfs_trans_t *tp;
+ xfs_trans_t *ntp;
+ xfs_inode_t *ip;
+ xfs_buf_t *ialloc_context = NULL;
+ int code;
+ void *dqinfo;
+ uint tflags;
+
+ tp = *tpp;
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ /*
+ * xfs_ialloc will return a pointer to an incore inode if
+ * the Space Manager has an available inode on the free
+ * list. Otherwise, it will do an allocation and replenish
+ * the freelist. Since we can only do one allocation per
+ * transaction without deadlocks, we will need to commit the
+ * current transaction and start a new one. We will then
+ * need to call xfs_ialloc again to get the inode.
+ *
+ * If xfs_ialloc did an allocation to replenish the freelist,
+ * it returns the bp containing the head of the freelist as
+ * ialloc_context. We will hold a lock on it across the
+ * transaction commit so that no other process can steal
+ * the inode(s) that we've just allocated.
+ */
+ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
+ &ialloc_context, &ip);
- if ((ip->i_d.di_mode & S_IFMT) != S_IFREG)
- return;
+ /*
+ * Return an error if we were unable to allocate a new inode.
+ * This should only happen if we run out of space on disk or
+ * encounter a disk error.
+ */
+ if (code) {
+ *ipp = NULL;
+ return code;
+ }
+ if (!ialloc_context && !ip) {
+ *ipp = NULL;
+ return XFS_ERROR(ENOSPC);
+ }
- if (ip->i_d.di_flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_EXTSIZE))
- return;
+ /*
+ * If the AGI buffer is non-NULL, then we were unable to get an
+ * inode in one operation. We need to commit the current
+ * transaction and call xfs_ialloc() again. It is guaranteed
+ * to succeed the second time.
+ */
+ if (ialloc_context) {
+ struct xfs_trans_res tres;
+
+ /*
+ * Normally, xfs_trans_commit releases all the locks.
+ * We call bhold to hang on to the ialloc_context across
+ * the commit. Holding this buffer prevents any other
+ * processes from doing any allocations in this
+ * allocation group.
+ */
+ xfs_trans_bhold(tp, ialloc_context);
+ /*
+ * Save the log reservation so we can use
+ * them in the next transaction.
+ */
+ tres.tr_logres = xfs_trans_get_log_res(tp);
+ tres.tr_logcount = xfs_trans_get_log_count(tp);
+
+ /*
+ * We want the quota changes to be associated with the next
+ * transaction, NOT this one. So, detach the dqinfo from this
+ * and attach it to the next transaction.
+ */
+ dqinfo = NULL;
+ tflags = 0;
+ if (tp->t_dqinfo) {
+ dqinfo = (void *)tp->t_dqinfo;
+ tp->t_dqinfo = NULL;
+ tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
+ tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
+ }
+
+ ntp = xfs_trans_dup(tp);
+ code = xfs_trans_commit(tp, 0);
+ tp = ntp;
+ if (committed != NULL) {
+ *committed = 1;
+ }
+ /*
+ * If we get an error during the commit processing,
+ * release the buffer that is still held and return
+ * to the caller.
+ */
+ if (code) {
+ xfs_buf_relse(ialloc_context);
+ if (dqinfo) {
+ tp->t_dqinfo = dqinfo;
+ xfs_trans_free_dqinfo(tp);
+ }
+ *tpp = ntp;
+ *ipp = NULL;
+ return code;
+ }
+
+ /*
+ * transaction commit worked ok so we can drop the extra ticket
+ * reference that we gained in xfs_trans_dup()
+ */
+ xfs_log_ticket_put(tp->t_ticket);
+ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ code = xfs_trans_reserve(tp, &tres, 0, 0);
+
+ /*
+ * Re-attach the quota info that we detached from prev trx.
+ */
+ if (dqinfo) {
+ tp->t_dqinfo = dqinfo;
+ tp->t_flags |= tflags;
+ }
+
+ if (code) {
+ xfs_buf_relse(ialloc_context);
+ *tpp = ntp;
+ *ipp = NULL;
+ return code;
+ }
+ xfs_trans_bjoin(tp, ialloc_context);
+
+ /*
+ * Call ialloc again. Since we've locked out all
+ * other allocations in this allocation group,
+ * this call should always succeed.
+ */
+ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
+ okalloc, &ialloc_context, &ip);
+
+ /*
+ * If we get an error at this point, return to the caller
+ * so that the current transaction can be aborted.
+ */
+ if (code) {
+ *tpp = tp;
+ *ipp = NULL;
+ return code;
+ }
+ ASSERT(!ialloc_context && ip);
+
+ } else {
+ if (committed != NULL)
+ *committed = 0;
+ }
+
+ *ipp = ip;
+ *tpp = tp;
- nimaps = 2;
- map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
- /*
- * The filesystem could be shutting down, so bmapi may return
- * an error.
- */
- if (xfs_bmapi(NULL, ip, map_first,
- (XFS_B_TO_FSB(mp,
- (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
- map_first),
- XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
- NULL))
- return;
- ASSERT(nimaps == 1);
- ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
+ return 0;
}
-#endif /* DEBUG */
/*
- * Calculate the last possible buffered byte in a file. This must
- * include data that was buffered beyond the EOF by the write code.
- * This also needs to deal with overflowing the xfs_fsize_t type
- * which can happen for sizes near the limit.
- *
- * We also need to take into account any blocks beyond the EOF. It
- * may be the case that they were buffered by a write which failed.
- * In that case the pages will still be in memory, but the inode size
- * will never have been updated.
+ * Decrement the link count on an inode & log the change.
+ * If this causes the link count to go to zero, initiate the
+ * logging activity required to truncate a file.
*/
-xfs_fsize_t
-xfs_file_last_byte(
- xfs_inode_t *ip)
+int /* error */
+xfs_droplink(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip)
{
- xfs_mount_t *mp;
- xfs_fsize_t last_byte;
- xfs_fileoff_t last_block;
- xfs_fileoff_t size_last_block;
- int error;
+ int error;
- ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE | MR_ACCESS));
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+ ASSERT (ip->i_d.di_nlink > 0);
+ ip->i_d.di_nlink--;
+ drop_nlink(VFS_I(ip));
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = 0;
+ if (ip->i_d.di_nlink == 0) {
+ /*
+ * We're dropping the last link to this file.
+ * Move the on-disk inode to the AGI unlinked list.
+ * From xfs_inactive() we will pull the inode from
+ * the list and free it.
+ */
+ error = xfs_iunlink(tp, ip);
+ }
+ return error;
+}
+
+/*
+ * Increment the link count on an inode & log the change.
+ */
+int
+xfs_bumplink(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip)
+{
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+ ASSERT(ip->i_d.di_version > 1);
+ ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE));
+ ip->i_d.di_nlink++;
+ inc_nlink(VFS_I(ip));
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ return 0;
+}
+
+int
+xfs_create(
+ xfs_inode_t *dp,
+ struct xfs_name *name,
+ umode_t mode,
+ xfs_dev_t rdev,
+ xfs_inode_t **ipp)
+{
+ int is_dir = S_ISDIR(mode);
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_inode *ip = NULL;
+ struct xfs_trans *tp = NULL;
+ int error;
+ xfs_bmap_free_t free_list;
+ xfs_fsblock_t first_block;
+ bool unlock_dp_on_error = false;
+ uint cancel_flags;
+ int committed;
+ prid_t prid;
+ struct xfs_dquot *udqp = NULL;
+ struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
+ struct xfs_trans_res tres;
+ uint resblks;
+
+ trace_xfs_create(dp, name);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ prid = xfs_get_initial_prid(dp);
- mp = ip->i_mount;
/*
- * Only check for blocks beyond the EOF if the extents have
- * been read in. This eliminates the need for the inode lock,
- * and it also saves us from looking when it really isn't
- * necessary.
+ * Make sure that we have allocated dquot(s) on disk.
*/
- if (ip->i_df.if_flags & XFS_IFEXTENTS) {
- error = xfs_bmap_last_offset(NULL, ip, &last_block,
- XFS_DATA_FORK);
- if (error) {
- last_block = 0;
- }
+ error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
+ xfs_kgid_to_gid(current_fsgid()), prid,
+ XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+ &udqp, &gdqp, &pdqp);
+ if (error)
+ return error;
+
+ if (is_dir) {
+ rdev = 0;
+ resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
+ tres.tr_logres = M_RES(mp)->tr_mkdir.tr_logres;
+ tres.tr_logcount = XFS_MKDIR_LOG_COUNT;
+ tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
} else {
- last_block = 0;
+ resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+ tres.tr_logres = M_RES(mp)->tr_create.tr_logres;
+ tres.tr_logcount = XFS_CREATE_LOG_COUNT;
+ tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
}
- size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_d.di_size);
- last_block = XFS_FILEOFF_MAX(last_block, size_last_block);
- last_byte = XFS_FSB_TO_B(mp, last_block);
- if (last_byte < 0) {
- return XFS_MAXIOFFSET(mp);
+ cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+
+ /*
+ * Initially assume that the file does not exist and
+ * reserve the resources for that case. If that is not
+ * the case we'll drop the one we have and get a more
+ * appropriate transaction later.
+ */
+ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ error = xfs_trans_reserve(tp, &tres, resblks, 0);
+ if (error == ENOSPC) {
+ /* flush outstanding delalloc blocks and retry */
+ xfs_flush_inodes(mp);
+ error = xfs_trans_reserve(tp, &tres, resblks, 0);
}
- last_byte += (1 << mp->m_writeio_log);
- if (last_byte < 0) {
- return XFS_MAXIOFFSET(mp);
+ if (error == ENOSPC) {
+ /* No space at all so try a "no-allocation" reservation */
+ resblks = 0;
+ error = xfs_trans_reserve(tp, &tres, 0, 0);
+ }
+ if (error) {
+ cancel_flags = 0;
+ goto out_trans_cancel;
}
- return last_byte;
-}
-#if defined(XFS_RW_TRACE)
-STATIC void
-xfs_itrunc_trace(
- int tag,
- xfs_inode_t *ip,
- int flag,
- xfs_fsize_t new_size,
- xfs_off_t toss_start,
- xfs_off_t toss_finish)
-{
- if (ip->i_rwtrace == NULL) {
- return;
+ xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+ unlock_dp_on_error = true;
+
+ xfs_bmap_init(&free_list, &first_block);
+
+ /*
+ * Reserve disk quota and the inode.
+ */
+ error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+ pdqp, resblks, 1, 0);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_dir_canenter(tp, dp, name, resblks);
+ if (error)
+ goto out_trans_cancel;
+
+ /*
+ * A newly created regular or special file just has one directory
+ * entry pointing to them, but a directory also the "." entry
+ * pointing to itself.
+ */
+ error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
+ prid, resblks > 0, &ip, &committed);
+ if (error) {
+ if (error == ENOSPC)
+ goto out_trans_cancel;
+ goto out_trans_abort;
+ }
+
+ /*
+ * Now we join the directory inode to the transaction. We do not do it
+ * earlier because xfs_dir_ialloc might commit the previous transaction
+ * (and release all the locks). An error from here on will result in
+ * the transaction cancel unlocking dp so don't do it explicitly in the
+ * error path.
+ */
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ unlock_dp_on_error = false;
+
+ error = xfs_dir_createname(tp, dp, name, ip->i_ino,
+ &first_block, &free_list, resblks ?
+ resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+ if (error) {
+ ASSERT(error != ENOSPC);
+ goto out_trans_abort;
}
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+ if (is_dir) {
+ error = xfs_dir_init(tp, ip, dp);
+ if (error)
+ goto out_bmap_cancel;
- ktrace_enter(ip->i_rwtrace,
- (void*)((long)tag),
- (void*)ip,
- (void*)(unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff),
- (void*)(unsigned long)(ip->i_d.di_size & 0xffffffff),
- (void*)((long)flag),
- (void*)(unsigned long)((new_size >> 32) & 0xffffffff),
- (void*)(unsigned long)(new_size & 0xffffffff),
- (void*)(unsigned long)((toss_start >> 32) & 0xffffffff),
- (void*)(unsigned long)(toss_start & 0xffffffff),
- (void*)(unsigned long)((toss_finish >> 32) & 0xffffffff),
- (void*)(unsigned long)(toss_finish & 0xffffffff),
- (void*)(unsigned long)current_cpu(),
- (void*)0,
- (void*)0,
- (void*)0,
- (void*)0);
+ error = xfs_bumplink(tp, dp);
+ if (error)
+ goto out_bmap_cancel;
+ }
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * create transaction goes to disk before returning to
+ * the user.
+ */
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+ xfs_trans_set_sync(tp);
+
+ /*
+ * Attach the dquot(s) to the inodes and modify them incore.
+ * These ids of the inode couldn't have changed since the new
+ * inode has been locked ever since it was created.
+ */
+ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ goto out_bmap_cancel;
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto out_release_inode;
+
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ *ipp = ip;
+ return 0;
+
+ out_bmap_cancel:
+ xfs_bmap_cancel(&free_list);
+ out_trans_abort:
+ cancel_flags |= XFS_TRANS_ABORT;
+ out_trans_cancel:
+ xfs_trans_cancel(tp, cancel_flags);
+ out_release_inode:
+ /*
+ * Wait until after the current transaction is aborted to
+ * release the inode. This prevents recursive transactions
+ * and deadlocks from xfs_inactive.
+ */
+ if (ip)
+ IRELE(ip);
+
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ if (unlock_dp_on_error)
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ return error;
}
-#else
-#define xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish)
-#endif
-/*
- * Start the truncation of the file to new_size. The new size
- * must be smaller than the current size. This routine will
- * clear the buffer and page caches of file data in the removed
- * range, and xfs_itruncate_finish() will remove the underlying
- * disk blocks.
- *
- * The inode must have its I/O lock locked EXCLUSIVELY, and it
- * must NOT have the inode lock held at all. This is because we're
- * calling into the buffer/page cache code and we can't hold the
- * inode lock when we do so.
- *
- * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
- * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used
- * in the case that the caller is locking things out of order and
- * may not be able to call xfs_itruncate_finish() with the inode lock
- * held without dropping the I/O lock. If the caller must drop the
- * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start()
- * must be called again with all the same restrictions as the initial
- * call.
- */
-void
-xfs_itruncate_start(
- xfs_inode_t *ip,
- uint flags,
- xfs_fsize_t new_size)
+int
+xfs_create_tmpfile(
+ struct xfs_inode *dp,
+ struct dentry *dentry,
+ umode_t mode,
+ struct xfs_inode **ipp)
{
- xfs_fsize_t last_byte;
- xfs_off_t toss_start;
- xfs_mount_t *mp;
- vnode_t *vp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_inode *ip = NULL;
+ struct xfs_trans *tp = NULL;
+ int error;
+ uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+ prid_t prid;
+ struct xfs_dquot *udqp = NULL;
+ struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
+ struct xfs_trans_res *tres;
+ uint resblks;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
- ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
- ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
- ASSERT((flags == XFS_ITRUNC_DEFINITE) ||
- (flags == XFS_ITRUNC_MAYBE));
+ prid = xfs_get_initial_prid(dp);
- mp = ip->i_mount;
- vp = XFS_ITOV(ip);
- /*
- * Call VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES() to get rid of pages and buffers
- * overlapping the region being removed. We have to use
- * the less efficient VOP_FLUSHINVAL_PAGES() in the case that the
- * caller may not be able to finish the truncate without
- * dropping the inode's I/O lock. Make sure
- * to catch any pages brought in by buffers overlapping
- * the EOF by searching out beyond the isize by our
- * block size. We round new_size up to a block boundary
- * so that we don't toss things on the same block as
- * new_size but before it.
- *
- * Before calling VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES(), make sure to
- * call remapf() over the same region if the file is mapped.
- * This frees up mapped file references to the pages in the
- * given range and for the VOP_FLUSHINVAL_PAGES() case it ensures
- * that we get the latest mapped changes flushed out.
- */
- toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
- toss_start = XFS_FSB_TO_B(mp, toss_start);
- if (toss_start < 0) {
- /*
- * The place to start tossing is beyond our maximum
- * file size, so there is no way that the data extended
- * out there.
- */
- return;
+ /*
+ * Make sure that we have allocated dquot(s) on disk.
+ */
+ error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
+ xfs_kgid_to_gid(current_fsgid()), prid,
+ XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+ &udqp, &gdqp, &pdqp);
+ if (error)
+ return error;
+
+ resblks = XFS_IALLOC_SPACE_RES(mp);
+ tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE);
+
+ tres = &M_RES(mp)->tr_create_tmpfile;
+ error = xfs_trans_reserve(tp, tres, resblks, 0);
+ if (error == ENOSPC) {
+ /* No space at all so try a "no-allocation" reservation */
+ resblks = 0;
+ error = xfs_trans_reserve(tp, tres, 0, 0);
}
- last_byte = xfs_file_last_byte(ip);
- xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start,
- last_byte);
- if (last_byte > toss_start) {
- if (flags & XFS_ITRUNC_DEFINITE) {
- VOP_TOSS_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
- } else {
- VOP_FLUSHINVAL_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
- }
+ if (error) {
+ cancel_flags = 0;
+ goto out_trans_cancel;
}
-#ifdef DEBUG
- if (new_size == 0) {
- ASSERT(VN_CACHED(vp) == 0);
+ error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+ pdqp, resblks, 1, 0);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
+ prid, resblks > 0, &ip, NULL);
+ if (error) {
+ if (error == ENOSPC)
+ goto out_trans_cancel;
+ goto out_trans_abort;
}
-#endif
+
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
+ xfs_trans_set_sync(tp);
+
+ /*
+ * Attach the dquot(s) to the inodes and modify them incore.
+ * These ids of the inode couldn't have changed since the new
+ * inode has been locked ever since it was created.
+ */
+ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+
+ ip->i_d.di_nlink--;
+ error = xfs_iunlink(tp, ip);
+ if (error)
+ goto out_trans_abort;
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto out_release_inode;
+
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ *ipp = ip;
+ return 0;
+
+ out_trans_abort:
+ cancel_flags |= XFS_TRANS_ABORT;
+ out_trans_cancel:
+ xfs_trans_cancel(tp, cancel_flags);
+ out_release_inode:
+ /*
+ * Wait until after the current transaction is aborted to
+ * release the inode. This prevents recursive transactions
+ * and deadlocks from xfs_inactive.
+ */
+ if (ip)
+ IRELE(ip);
+
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ return error;
}
-/*
- * Shrink the file to the given new_size. The new
- * size must be smaller than the current size.
- * This will free up the underlying blocks
- * in the removed range after a call to xfs_itruncate_start()
- * or xfs_atruncate_start().
- *
- * The transaction passed to this routine must have made
- * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES.
- * This routine may commit the given transaction and
- * start new ones, so make sure everything involved in
- * the transaction is tidy before calling here.
- * Some transaction will be returned to the caller to be
- * committed. The incoming transaction must already include
- * the inode, and both inode locks must be held exclusively.
- * The inode must also be "held" within the transaction. On
- * return the inode will be "held" within the returned transaction.
- * This routine does NOT require any disk space to be reserved
- * for it within the transaction.
- *
- * The fork parameter must be either xfs_attr_fork or xfs_data_fork,
- * and it indicates the fork which is to be truncated. For the
- * attribute fork we only support truncation to size 0.
- *
- * We use the sync parameter to indicate whether or not the first
- * transaction we perform might have to be synchronous. For the attr fork,
- * it needs to be so if the unlink of the inode is not yet known to be
- * permanent in the log. This keeps us from freeing and reusing the
- * blocks of the attribute fork before the unlink of the inode becomes
- * permanent.
- *
- * For the data fork, we normally have to run synchronously if we're
- * being called out of the inactive path or we're being called
- * out of the create path where we're truncating an existing file.
- * Either way, the truncate needs to be sync so blocks don't reappear
- * in the file with altered data in case of a crash. wsync filesystems
- * can run the first case async because anything that shrinks the inode
- * has to run sync so by the time we're called here from inactive, the
- * inode size is permanently set to 0.
- *
- * Calls from the truncate path always need to be sync unless we're
- * in a wsync filesystem and the file has already been unlinked.
- *
- * The caller is responsible for correctly setting the sync parameter.
- * It gets too hard for us to guess here which path we're being called
- * out of just based on inode state.
- */
int
-xfs_itruncate_finish(
- xfs_trans_t **tp,
- xfs_inode_t *ip,
- xfs_fsize_t new_size,
- int fork,
- int sync)
+xfs_link(
+ xfs_inode_t *tdp,
+ xfs_inode_t *sip,
+ struct xfs_name *target_name)
{
- xfs_fsblock_t first_block;
- xfs_fileoff_t first_unmap_block;
- xfs_fileoff_t last_block;
- xfs_filblks_t unmap_len=0;
- xfs_mount_t *mp;
- xfs_trans_t *ntp;
- int done;
- int committed;
- xfs_bmap_free_t free_list;
- int error;
+ xfs_mount_t *mp = tdp->i_mount;
+ xfs_trans_t *tp;
+ int error;
+ xfs_bmap_free_t free_list;
+ xfs_fsblock_t first_block;
+ int cancel_flags;
+ int committed;
+ int resblks;
- ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
- ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
- ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
- ASSERT(*tp != NULL);
- ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
- ASSERT(ip->i_transp == *tp);
- ASSERT(ip->i_itemp != NULL);
- ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);
+ trace_xfs_link(tdp, target_name);
+ ASSERT(!S_ISDIR(sip->i_d.di_mode));
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
- ntp = *tp;
- mp = (ntp)->t_mountp;
- ASSERT(! XFS_NOT_DQATTACHED(mp, ip));
+ error = xfs_qm_dqattach(sip, 0);
+ if (error)
+ goto std_return;
+
+ error = xfs_qm_dqattach(tdp, 0);
+ if (error)
+ goto std_return;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
+ cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+ resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
+ if (error == ENOSPC) {
+ resblks = 0;
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
+ }
+ if (error) {
+ cancel_flags = 0;
+ goto error_return;
+ }
+
+ xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
+
+ xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
/*
- * We only support truncating the entire attribute fork.
+ * If we are using project inheritance, we only allow hard link
+ * creation in our tree when the project IDs are the same; else
+ * the tree quota mechanism could be circumvented.
*/
- if (fork == XFS_ATTR_FORK) {
- new_size = 0LL;
+ if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
+ (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
+ error = XFS_ERROR(EXDEV);
+ goto error_return;
}
- first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
- xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0);
- /*
- * The first thing we do is set the size to new_size permanently
- * on disk. This way we don't have to worry about anyone ever
- * being able to look at the data being freed even in the face
- * of a crash. What we're getting around here is the case where
- * we free a block, it is allocated to another file, it is written
- * to, and then we crash. If the new data gets written to the
- * file but the log buffers containing the free and reallocation
- * don't, then we'd end up with garbage in the blocks being freed.
- * As long as we make the new_size permanent before actually
- * freeing any blocks it doesn't matter if they get writtten to.
- *
- * The callers must signal into us whether or not the size
- * setting here must be synchronous. There are a few cases
- * where it doesn't have to be synchronous. Those cases
- * occur if the file is unlinked and we know the unlink is
- * permanent or if the blocks being truncated are guaranteed
- * to be beyond the inode eof (regardless of the link count)
- * and the eof value is permanent. Both of these cases occur
- * only on wsync-mounted filesystems. In those cases, we're
- * guaranteed that no user will ever see the data in the blocks
- * that are being truncated so the truncate can run async.
- * In the free beyond eof case, the file may wind up with
- * more blocks allocated to it than it needs if we crash
- * and that won't get fixed until the next time the file
- * is re-opened and closed but that's ok as that shouldn't
- * be too many blocks.
- *
- * However, we can't just make all wsync xactions run async
- * because there's one call out of the create path that needs
- * to run sync where it's truncating an existing file to size
- * 0 whose size is > 0.
- *
- * It's probably possible to come up with a test in this
- * routine that would correctly distinguish all the above
- * cases from the values of the function parameters and the
- * inode state but for sanity's sake, I've decided to let the
- * layers above just tell us. It's simpler to correctly figure
- * out in the layer above exactly under what conditions we
- * can run async and I think it's easier for others read and
- * follow the logic in case something has to be changed.
- * cscope is your friend -- rcc.
- *
- * The attribute fork is much simpler.
- *
- * For the attribute fork we allow the caller to tell us whether
- * the unlink of the inode that led to this call is yet permanent
- * in the on disk log. If it is not and we will be freeing extents
- * in this inode then we make the first transaction synchronous
- * to make sure that the unlink is permanent by the time we free
- * the blocks.
- */
- if (fork == XFS_DATA_FORK) {
- if (ip->i_d.di_nextents > 0) {
- ip->i_d.di_size = new_size;
- xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
- }
- } else if (sync) {
- ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC));
- if (ip->i_d.di_anextents > 0)
- xfs_trans_set_sync(ntp);
+
+ error = xfs_dir_canenter(tp, tdp, target_name, resblks);
+ if (error)
+ goto error_return;
+
+ xfs_bmap_init(&free_list, &first_block);
+
+ if (sip->i_d.di_nlink == 0) {
+ error = xfs_iunlink_remove(tp, sip);
+ if (error)
+ goto abort_return;
}
- ASSERT(fork == XFS_DATA_FORK ||
- (fork == XFS_ATTR_FORK &&
- ((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) ||
- (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC)))));
+
+ error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
+ &first_block, &free_list, resblks);
+ if (error)
+ goto abort_return;
+ xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
+
+ error = xfs_bumplink(tp, sip);
+ if (error)
+ goto abort_return;
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * link transaction goes to disk before returning to
+ * the user.
+ */
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+ xfs_trans_set_sync(tp);
+ }
+
+ error = xfs_bmap_finish (&tp, &free_list, &committed);
+ if (error) {
+ xfs_bmap_cancel(&free_list);
+ goto abort_return;
+ }
+
+ return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+ abort_return:
+ cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+ xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+ return error;
+}
+
+/*
+ * Free up the underlying blocks past new_size. The new size must be smaller
+ * than the current size. This routine can be used both for the attribute and
+ * data fork, and does not modify the inode size, which is left to the caller.
+ *
+ * The transaction passed to this routine must have made a permanent log
+ * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the
+ * given transaction and start new ones, so make sure everything involved in
+ * the transaction is tidy before calling here. Some transaction will be
+ * returned to the caller to be committed. The incoming transaction must
+ * already include the inode, and both inode locks must be held exclusively.
+ * The inode must also be "held" within the transaction. On return the inode
+ * will be "held" within the returned transaction. This routine does NOT
+ * require any disk space to be reserved for it within the transaction.
+ *
+ * If we get an error, we must return with the inode locked and linked into the
+ * current transaction. This keeps things simple for the higher level code,
+ * because it always knows that the inode is locked and held in the transaction
+ * that returns to it whether errors occur or not. We don't mark the inode
+ * dirty on error so that transactions can be easily aborted if possible.
+ */
+int
+xfs_itruncate_extents(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fsize_t new_size)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp = *tpp;
+ struct xfs_trans *ntp;
+ xfs_bmap_free_t free_list;
+ xfs_fsblock_t first_block;
+ xfs_fileoff_t first_unmap_block;
+ xfs_fileoff_t last_block;
+ xfs_filblks_t unmap_len;
+ int committed;
+ int error = 0;
+ int done = 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
+ xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ ASSERT(new_size <= XFS_ISIZE(ip));
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ ASSERT(ip->i_itemp != NULL);
+ ASSERT(ip->i_itemp->ili_lock_flags == 0);
+ ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+
+ trace_xfs_itruncate_extents_start(ip, new_size);
/*
* Since it is possible for space to become allocated beyond
@@ -1634,199 +1529,395 @@ xfs_itruncate_finish(
* beyond the maximum file size (ie it is the same as last_block),
* then there is nothing to do.
*/
- last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
- ASSERT(first_unmap_block <= last_block);
- done = 0;
- if (last_block == first_unmap_block) {
- done = 1;
- } else {
- unmap_len = last_block - first_unmap_block + 1;
- }
+ first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
+ last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+ if (first_unmap_block == last_block)
+ return 0;
+
+ ASSERT(first_unmap_block < last_block);
+ unmap_len = last_block - first_unmap_block + 1;
while (!done) {
- /*
- * Free up up to XFS_ITRUNC_MAX_EXTENTS. xfs_bunmapi()
- * will tell us whether it freed the entire range or
- * not. If this is a synchronous mount (wsync),
- * then we can tell bunmapi to keep all the
- * transactions asynchronous since the unlink
- * transaction that made this inode inactive has
- * already hit the disk. There's no danger of
- * the freed blocks being reused, there being a
- * crash, and the reused blocks suddenly reappearing
- * in this file with garbage in them once recovery
- * runs.
- */
- XFS_BMAP_INIT(&free_list, &first_block);
- error = xfs_bunmapi(ntp, ip, first_unmap_block,
- unmap_len,
- XFS_BMAPI_AFLAG(fork) |
- (sync ? 0 : XFS_BMAPI_ASYNC),
+ xfs_bmap_init(&free_list, &first_block);
+ error = xfs_bunmapi(tp, ip,
+ first_unmap_block, unmap_len,
+ xfs_bmapi_aflag(whichfork),
XFS_ITRUNC_MAX_EXTENTS,
- &first_block, &free_list, &done);
- if (error) {
- /*
- * If the bunmapi call encounters an error,
- * return to the caller where the transaction
- * can be properly aborted. We just need to
- * make sure we're not holding any resources
- * that we were not when we came in.
- */
- xfs_bmap_cancel(&free_list);
- return error;
- }
+ &first_block, &free_list,
+ &done);
+ if (error)
+ goto out_bmap_cancel;
/*
* Duplicate the transaction that has the permanent
* reservation and commit the old transaction.
*/
- error = xfs_bmap_finish(tp, &free_list, first_block,
- &committed);
- ntp = *tp;
- if (error) {
- /*
- * If the bmap finish call encounters an error,
- * return to the caller where the transaction
- * can be properly aborted. We just need to
- * make sure we're not holding any resources
- * that we were not when we came in.
- *
- * Aborting from this point might lose some
- * blocks in the file system, but oh well.
- */
- xfs_bmap_cancel(&free_list);
- if (committed) {
- /*
- * If the passed in transaction committed
- * in xfs_bmap_finish(), then we want to
- * add the inode to this one before returning.
- * This keeps things simple for the higher
- * level code, because it always knows that
- * the inode is locked and held in the
- * transaction that returns to it whether
- * errors occur or not. We don't mark the
- * inode dirty so that this transaction can
- * be easily aborted if possible.
- */
- xfs_trans_ijoin(ntp, ip,
- XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_trans_ihold(ntp, ip);
- }
- return error;
- }
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (committed)
+ xfs_trans_ijoin(tp, ip, 0);
+ if (error)
+ goto out_bmap_cancel;
if (committed) {
/*
- * The first xact was committed,
- * so add the inode to the new one.
- * Mark it dirty so it will be logged
- * and moved forward in the log as
- * part of every commit.
+ * Mark the inode dirty so it will be logged and
+ * moved forward in the log as part of every commit.
*/
- xfs_trans_ijoin(ntp, ip,
- XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_trans_ihold(ntp, ip);
- xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
- ntp = xfs_trans_dup(ntp);
- (void) xfs_trans_commit(*tp, 0, NULL);
- *tp = ntp;
- error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_ITRUNCATE_LOG_COUNT);
+
+ ntp = xfs_trans_dup(tp);
+ error = xfs_trans_commit(tp, 0);
+ tp = ntp;
+
+ xfs_trans_ijoin(tp, ip, 0);
+
+ if (error)
+ goto out;
+
/*
- * Add the inode being truncated to the next chained
- * transaction.
+ * Transaction commit worked ok so we can drop the extra ticket
+ * reference that we gained in xfs_trans_dup()
*/
- xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_trans_ihold(ntp, ip);
+ xfs_log_ticket_put(tp->t_ticket);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error)
- return (error);
+ goto out;
}
+
/*
- * Only update the size in the case of the data fork, but
- * always re-log the inode so that our permanent transaction
- * can keep on rolling it forward in the log.
+ * Always re-log the inode so that our permanent transaction can keep
+ * on rolling it forward in the log.
*/
- if (fork == XFS_DATA_FORK) {
- xfs_isize_check(mp, ip, new_size);
- ip->i_d.di_size = new_size;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ trace_xfs_itruncate_extents_end(ip, new_size);
+
+out:
+ *tpp = tp;
+ return error;
+out_bmap_cancel:
+ /*
+ * If the bunmapi call encounters an error, return to the caller where
+ * the transaction can be properly aborted. We just need to make sure
+ * we're not holding any resources that we were not when we came in.
+ */
+ xfs_bmap_cancel(&free_list);
+ goto out;
+}
+
+int
+xfs_release(
+ xfs_inode_t *ip)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ int error;
+
+ if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
+ return 0;
+
+ /* If this is a read-only mount, don't do this (would generate I/O) */
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ return 0;
+
+ if (!XFS_FORCED_SHUTDOWN(mp)) {
+ int truncated;
+
+ /*
+ * If we previously truncated this file and removed old data
+ * in the process, we want to initiate "early" writeout on
+ * the last close. This is an attempt to combat the notorious
+ * NULL files problem which is particularly noticeable from a
+ * truncate down, buffered (re-)write (delalloc), followed by
+ * a crash. What we are effectively doing here is
+ * significantly reducing the time window where we'd otherwise
+ * be exposed to that problem.
+ */
+ truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
+ if (truncated) {
+ xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
+ if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
+ error = -filemap_flush(VFS_I(ip)->i_mapping);
+ if (error)
+ return error;
+ }
+ }
+ }
+
+ if (ip->i_d.di_nlink == 0)
+ return 0;
+
+ if (xfs_can_free_eofblocks(ip, false)) {
+
+ /*
+ * If we can't get the iolock just skip truncating the blocks
+ * past EOF because we could deadlock with the mmap_sem
+ * otherwise. We'll get another chance to drop them once the
+ * last reference to the inode is dropped, so we'll never leak
+ * blocks permanently.
+ *
+ * Further, check if the inode is being opened, written and
+ * closed frequently and we have delayed allocation blocks
+ * outstanding (e.g. streaming writes from the NFS server),
+ * truncating the blocks past EOF will cause fragmentation to
+ * occur.
+ *
+ * In this case don't do the truncation, either, but we have to
+ * be careful how we detect this case. Blocks beyond EOF show
+ * up as i_delayed_blks even when the inode is clean, so we
+ * need to truncate them away first before checking for a dirty
+ * release. Hence on the first dirty close we will still remove
+ * the speculative allocation, but after that we will leave it
+ * in place.
+ */
+ if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+ return 0;
+
+ error = xfs_free_eofblocks(mp, ip, true);
+ if (error && error != EAGAIN)
+ return error;
+
+ /* delalloc blocks after truncation means it really is dirty */
+ if (ip->i_delayed_blks)
+ xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
}
- xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
- ASSERT((new_size != 0) ||
- (fork == XFS_ATTR_FORK) ||
- (ip->i_delayed_blks == 0));
- ASSERT((new_size != 0) ||
- (fork == XFS_ATTR_FORK) ||
- (ip->i_d.di_nextents == 0));
- xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0);
return 0;
}
-
/*
- * xfs_igrow_start
+ * xfs_inactive_truncate
*
- * Do the first part of growing a file: zero any data in the last
- * block that is beyond the old EOF. We need to do this before
- * the inode is joined to the transaction to modify the i_size.
- * That way we can drop the inode lock and call into the buffer
- * cache to get the buffer mapping the EOF.
+ * Called to perform a truncate when an inode becomes unlinked.
*/
-int
-xfs_igrow_start(
- xfs_inode_t *ip,
- xfs_fsize_t new_size,
- cred_t *credp)
+STATIC int
+xfs_inactive_truncate(
+ struct xfs_inode *ip)
{
- int error;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ if (error) {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
- ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
- ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
- ASSERT(new_size > ip->i_d.di_size);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
/*
- * Zero any pages that may have been created by
- * xfs_write_file() beyond the end of the file
- * and any blocks between the old and new file sizes.
+ * Log the inode size first to prevent stale data exposure in the event
+ * of a system crash before the truncate completes. See the related
+ * comment in xfs_setattr_size() for details.
*/
- error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size,
- ip->i_d.di_size, new_size);
+ ip->i_d.di_size = 0;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
+ if (error)
+ goto error_trans_cancel;
+
+ ASSERT(ip->i_d.di_nextents == 0);
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto error_unlock;
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
+
+error_trans_cancel:
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+error_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
/*
- * xfs_igrow_finish
+ * xfs_inactive_ifree()
*
- * This routine is called to extend the size of a file.
- * The inode must have both the iolock and the ilock locked
- * for update and it must be a part of the current transaction.
- * The xfs_igrow_start() function must have been called previously.
- * If the change_flag is not zero, the inode change timestamp will
- * be updated.
+ * Perform the inode free when an inode is unlinked.
*/
-void
-xfs_igrow_finish(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- xfs_fsize_t new_size,
- int change_flag)
+STATIC int
+xfs_inactive_ifree(
+ struct xfs_inode *ip)
{
- ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
- ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
- ASSERT(ip->i_transp == tp);
- ASSERT(new_size > ip->i_d.di_size);
+ xfs_bmap_free_t free_list;
+ xfs_fsblock_t first_block;
+ int committed;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
/*
- * Update the file size. Update the inode change timestamp
- * if change_flag set.
+ * The ifree transaction might need to allocate blocks for record
+ * insertion to the finobt. We don't want to fail here at ENOSPC, so
+ * allow ifree to dip into the reserved block pool if necessary.
+ *
+ * Freeing large sets of inodes generally means freeing inode chunks,
+ * directory and file data blocks, so this should be relatively safe.
+ * Only under severe circumstances should it be possible to free enough
+ * inodes to exhaust the reserve block pool via finobt expansion while
+ * at the same time not creating free space in the filesystem.
+ *
+ * Send a warning if the reservation does happen to fail, as the inode
+ * now remains allocated and sits on the unlinked list until the fs is
+ * repaired.
*/
- ip->i_d.di_size = new_size;
- if (change_flag)
- xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree,
+ XFS_IFREE_SPACE_RES(mp), 0);
+ if (error) {
+ if (error == ENOSPC) {
+ xfs_warn_ratelimited(mp,
+ "Failed to remove inode(s) from unlinked list. "
+ "Please free space, unmount and run xfs_repair.");
+ } else {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ }
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+ return error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+ xfs_bmap_init(&free_list, &first_block);
+ error = xfs_ifree(tp, ip, &free_list);
+ if (error) {
+ /*
+ * If we fail to free the inode, shut down. The cancel
+ * might do that, we need to make sure. Otherwise the
+ * inode might be lost for a long time or forever.
+ */
+ if (!XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_notice(mp, "%s: xfs_ifree returned error %d",
+ __func__, error);
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ }
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+ }
+
+ /*
+ * Credit the quota account(s). The inode is gone.
+ */
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
+
+ /*
+ * Just ignore errors at this point. There is nothing we can
+ * do except to try to keep going. Make sure it's not a silent
+ * error.
+ */
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
+ __func__, error);
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
+ __func__, error);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
}
+/*
+ * xfs_inactive
+ *
+ * This is called when the vnode reference count for the vnode
+ * goes to zero. If the file has been unlinked, then it must
+ * now be truncated. Also, we clear all of the read-ahead state
+ * kept for the inode here since the file is now closed.
+ */
+void
+xfs_inactive(
+ xfs_inode_t *ip)
+{
+ struct xfs_mount *mp;
+ int error;
+ int truncate = 0;
+
+ /*
+ * If the inode is already free, then there can be nothing
+ * to clean up here.
+ */
+ if (ip->i_d.di_mode == 0) {
+ ASSERT(ip->i_df.if_real_bytes == 0);
+ ASSERT(ip->i_df.if_broot_bytes == 0);
+ return;
+ }
+
+ mp = ip->i_mount;
+
+ /* If this is a read-only mount, don't do this (would generate I/O) */
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ return;
+
+ if (ip->i_d.di_nlink != 0) {
+ /*
+ * force is true because we are evicting an inode from the
+ * cache. Post-eof blocks must be freed, lest we end up with
+ * broken free space accounting.
+ */
+ if (xfs_can_free_eofblocks(ip, true))
+ xfs_free_eofblocks(mp, ip, false);
+
+ return;
+ }
+
+ if (S_ISREG(ip->i_d.di_mode) &&
+ (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
+ ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
+ truncate = 1;
+
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return;
+
+ if (S_ISLNK(ip->i_d.di_mode))
+ error = xfs_inactive_symlink(ip);
+ else if (truncate)
+ error = xfs_inactive_truncate(ip);
+ if (error)
+ return;
+
+ /*
+ * If there are attributes associated with the file then blow them away
+ * now. The code calls a routine that recursively deconstructs the
+ * attribute fork. We need to just commit the current transaction
+ * because we can't use it for xfs_attr_inactive().
+ */
+ if (ip->i_d.di_anextents > 0) {
+ ASSERT(ip->i_d.di_forkoff != 0);
+
+ error = xfs_attr_inactive(ip);
+ if (error)
+ return;
+ }
+
+ if (ip->i_afp)
+ xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+ ASSERT(ip->i_d.di_anextents == 0);
+
+ /*
+ * Free the inode.
+ */
+ error = xfs_inactive_ifree(ip);
+ if (error)
+ return;
+
+ /*
+ * Release the dquots held by inode, if any.
+ */
+ xfs_qm_dqdetach(ip);
+}
/*
* This is called when the inode's link count goes to 0.
@@ -1843,45 +1934,25 @@ xfs_iunlink(
xfs_dinode_t *dip;
xfs_buf_t *agibp;
xfs_buf_t *ibp;
- xfs_agnumber_t agno;
- xfs_daddr_t agdaddr;
xfs_agino_t agino;
short bucket_index;
int offset;
int error;
- int agi_ok;
ASSERT(ip->i_d.di_nlink == 0);
ASSERT(ip->i_d.di_mode != 0);
- ASSERT(ip->i_transp == tp);
mp = tp->t_mountp;
- agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
- agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
-
/*
* Get the agi buffer first. It ensures lock ordering
* on the list.
*/
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
- XFS_FSS_TO_BB(mp, 1), 0, &agibp);
- if (error) {
+ error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
+ if (error)
return error;
- }
- /*
- * Validate the magic number of the agi block.
- */
agi = XFS_BUF_TO_AGI(agibp);
- agi_ok =
- be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
- XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
- if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK,
- XFS_RANDOM_IUNLINK))) {
- XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi);
- xfs_trans_brelse(tp, agibp);
- return XFS_ERROR(EFSCORRUPTED);
- }
+
/*
* Get the index into the agi hash table for the
* list this inode will go on.
@@ -1892,23 +1963,26 @@ xfs_iunlink(
ASSERT(agi->agi_unlinked[bucket_index]);
ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
- if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) {
+ if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
/*
* There is already another inode in the bucket we need
* to add ourselves to. Add us at the front of the list.
* Here we put the head pointer into our next pointer,
* and then we fall through to point the head at us.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
- if (error) {
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
+ 0, 0);
+ if (error)
return error;
- }
- ASSERT(INT_GET(dip->di_next_unlinked, ARCH_CONVERT) == NULLAGINO);
- ASSERT(dip->di_next_unlinked);
- /* both on-disk, don't endian flip twice */
+
+ ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
- offset = ip->i_boffset +
+ offset = ip->i_imap.im_boffset +
offsetof(xfs_dinode_t, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, dip);
+
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
@@ -1942,53 +2016,27 @@ xfs_iunlink_remove(
xfs_buf_t *agibp;
xfs_buf_t *ibp;
xfs_agnumber_t agno;
- xfs_daddr_t agdaddr;
xfs_agino_t agino;
xfs_agino_t next_agino;
xfs_buf_t *last_ibp;
- xfs_dinode_t *last_dip;
+ xfs_dinode_t *last_dip = NULL;
short bucket_index;
- int offset, last_offset;
+ int offset, last_offset = 0;
int error;
- int agi_ok;
- /*
- * First pull the on-disk inode from the AGI unlinked list.
- */
mp = tp->t_mountp;
-
agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
- agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
/*
* Get the agi buffer first. It ensures lock ordering
* on the list.
*/
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
- XFS_FSS_TO_BB(mp, 1), 0, &agibp);
- if (error) {
- cmn_err(CE_WARN,
- "xfs_iunlink_remove: xfs_trans_read_buf() returned an error %d on %s. Returning error.",
- error, mp->m_fsname);
+ error = xfs_read_agi(mp, tp, agno, &agibp);
+ if (error)
return error;
- }
- /*
- * Validate the magic number of the agi block.
- */
+
agi = XFS_BUF_TO_AGI(agibp);
- agi_ok =
- be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
- XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
- if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE,
- XFS_RANDOM_IUNLINK_REMOVE))) {
- XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW,
- mp, agi);
- xfs_trans_brelse(tp, agibp);
- cmn_err(CE_WARN,
- "xfs_iunlink_remove: XFS_TEST_ERROR() returned an error on %s. Returning EFSCORRUPTED.",
- mp->m_fsname);
- return XFS_ERROR(EFSCORRUPTED);
- }
+
/*
* Get the index into the agi hash table for the
* list this inode will go on.
@@ -1996,31 +2044,34 @@ xfs_iunlink_remove(
agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
ASSERT(agino != 0);
bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
- ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO);
+ ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO));
ASSERT(agi->agi_unlinked[bucket_index]);
if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
/*
- * We're at the head of the list. Get the inode's
- * on-disk buffer to see if there is anyone after us
- * on the list. Only modify our next pointer if it
- * is not already NULLAGINO. This saves us the overhead
- * of dealing with the buffer when there is no need to
- * change it.
+ * We're at the head of the list. Get the inode's on-disk
+ * buffer to see if there is anyone after us on the list.
+ * Only modify our next pointer if it is not already NULLAGINO.
+ * This saves us the overhead of dealing with the buffer when
+ * there is no need to change it.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
+ 0, 0);
if (error) {
- cmn_err(CE_WARN,
- "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
- error, mp->m_fsname);
+ xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
+ __func__, error);
return error;
}
- next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT);
+ next_agino = be32_to_cpu(dip->di_next_unlinked);
ASSERT(next_agino != 0);
if (next_agino != NULLAGINO) {
- INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
- offset = ip->i_boffset +
+ dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
+ offset = ip->i_imap.im_boffset +
offsetof(xfs_dinode_t, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, dip);
+
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
@@ -2045,45 +2096,59 @@ xfs_iunlink_remove(
next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
last_ibp = NULL;
while (next_agino != agino) {
- /*
- * If the last inode wasn't the one pointing to
- * us, then release its buffer since we're not
- * going to do anything with it.
- */
- if (last_ibp != NULL) {
+ struct xfs_imap imap;
+
+ if (last_ibp)
xfs_trans_brelse(tp, last_ibp);
- }
+
+ imap.im_blkno = 0;
next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
- error = xfs_inotobp(mp, tp, next_ino, &last_dip,
- &last_ibp, &last_offset);
+
+ error = xfs_imap(mp, tp, next_ino, &imap, 0);
if (error) {
- cmn_err(CE_WARN,
- "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.",
- error, mp->m_fsname);
+ xfs_warn(mp,
+ "%s: xfs_imap returned error %d.",
+ __func__, error);
return error;
}
- next_agino = INT_GET(last_dip->di_next_unlinked, ARCH_CONVERT);
+
+ error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
+ &last_ibp, 0, 0);
+ if (error) {
+ xfs_warn(mp,
+ "%s: xfs_imap_to_bp returned error %d.",
+ __func__, error);
+ return error;
+ }
+
+ last_offset = imap.im_boffset;
+ next_agino = be32_to_cpu(last_dip->di_next_unlinked);
ASSERT(next_agino != NULLAGINO);
ASSERT(next_agino != 0);
}
+
/*
- * Now last_ibp points to the buffer previous to us on
- * the unlinked list. Pull us from the list.
+ * Now last_ibp points to the buffer previous to us on the
+ * unlinked list. Pull us from the list.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
+ 0, 0);
if (error) {
- cmn_err(CE_WARN,
- "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
- error, mp->m_fsname);
+ xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
+ __func__, error);
return error;
}
- next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT);
+ next_agino = be32_to_cpu(dip->di_next_unlinked);
ASSERT(next_agino != 0);
ASSERT(next_agino != agino);
if (next_agino != NULLAGINO) {
- INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
- offset = ip->i_boffset +
+ dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
+ offset = ip->i_imap.im_boffset +
offsetof(xfs_dinode_t, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, dip);
+
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
@@ -2094,9 +2159,13 @@ xfs_iunlink_remove(
/*
* Point the previous inode on the list to the next inode.
*/
- INT_SET(last_dip->di_next_unlinked, ARCH_CONVERT, next_agino);
+ last_dip->di_next_unlinked = cpu_to_be32(next_agino);
ASSERT(next_agino != 0);
offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, last_dip);
+
xfs_trans_inode_buf(tp, last_ibp);
xfs_trans_log_buf(tp, last_ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
@@ -2105,14 +2174,12 @@ xfs_iunlink_remove(
return 0;
}
-static __inline__ int xfs_inode_clean(xfs_inode_t *ip)
-{
- return (((ip->i_itemp == NULL) ||
- !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
- (ip->i_update_core == 0));
-}
-
-STATIC void
+/*
+ * A big issue when freeing the inode cluster is that we _cannot_ skip any
+ * inodes that are in memory - they all must be marked stale and attached to
+ * the cluster buffer.
+ */
+STATIC int
xfs_ifree_cluster(
xfs_inode_t *free_ip,
xfs_trans_t *tp,
@@ -2120,163 +2187,160 @@ xfs_ifree_cluster(
{
xfs_mount_t *mp = free_ip->i_mount;
int blks_per_cluster;
+ int inodes_per_cluster;
int nbufs;
- int ninodes;
- int i, j, found, pre_flushed;
+ int i, j;
xfs_daddr_t blkno;
xfs_buf_t *bp;
- xfs_ihash_t *ih;
- xfs_inode_t *ip, **ip_found;
+ xfs_inode_t *ip;
xfs_inode_log_item_t *iip;
xfs_log_item_t *lip;
- SPLDECL(s);
+ struct xfs_perag *pag;
- if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
- blks_per_cluster = 1;
- ninodes = mp->m_sb.sb_inopblock;
- nbufs = XFS_IALLOC_BLOCKS(mp);
- } else {
- blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
- mp->m_sb.sb_blocksize;
- ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
- nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
- }
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
+ inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+ nbufs = mp->m_ialloc_blks / blks_per_cluster;
- ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
-
- for (j = 0; j < nbufs; j++, inum += ninodes) {
+ for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
XFS_INO_TO_AGBNO(mp, inum));
+ /*
+ * We obtain and lock the backing buffer first in the process
+ * here, as we have to ensure that any dirty inode that we
+ * can't get the flush lock on is attached to the buffer.
+ * If we scan the in-memory inodes first, then buffer IO can
+ * complete before we get a lock on it, and hence we may fail
+ * to mark all the active inodes on the buffer stale.
+ */
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
+ mp->m_bsize * blks_per_cluster,
+ XBF_UNMAPPED);
+
+ if (!bp)
+ return ENOMEM;
/*
- * Look for each inode in memory and attempt to lock it,
- * we can be racing with flush and tail pushing here.
- * any inode we get the locks on, add to an array of
- * inode items to process later.
- *
- * The get the buffer lock, we could beat a flush
- * or tail pushing thread to the lock here, in which
- * case they will go looking for the inode buffer
- * and fail, we need some other form of interlock
- * here.
+ * This buffer may not have been correctly initialised as we
+ * didn't read it from disk. That's not important because we are
+ * only using to mark the buffer as stale in the log, and to
+ * attach stale cached inodes on it. That means it will never be
+ * dispatched for IO. If it is, we want to know about it, and we
+ * want it to fail. We can acheive this by adding a write
+ * verifier to the buffer.
*/
- found = 0;
- for (i = 0; i < ninodes; i++) {
- ih = XFS_IHASH(mp, inum + i);
- read_lock(&ih->ih_lock);
- for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
- if (ip->i_ino == inum + i)
- break;
- }
+ bp->b_ops = &xfs_inode_buf_ops;
- /* Inode not in memory or we found it already,
- * nothing to do
- */
- if (!ip || (ip->i_flags & XFS_ISTALE)) {
- read_unlock(&ih->ih_lock);
- continue;
+ /*
+ * Walk the inodes already attached to the buffer and mark them
+ * stale. These will all have the flush locks held, so an
+ * in-memory inode walk can't lock them. By marking them all
+ * stale first, we will not attempt to lock them in the loop
+ * below as the XFS_ISTALE flag will be set.
+ */
+ lip = bp->b_fspriv;
+ while (lip) {
+ if (lip->li_type == XFS_LI_INODE) {
+ iip = (xfs_inode_log_item_t *)lip;
+ ASSERT(iip->ili_logged == 1);
+ lip->li_cb = xfs_istale_done;
+ xfs_trans_ail_copy_lsn(mp->m_ail,
+ &iip->ili_flush_lsn,
+ &iip->ili_item.li_lsn);
+ xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
}
+ lip = lip->li_bio_list;
+ }
- if (xfs_inode_clean(ip)) {
- read_unlock(&ih->ih_lock);
+
+ /*
+ * For each inode in memory attempt to add it to the inode
+ * buffer and set it up for being staled on buffer IO
+ * completion. This is safe as we've locked out tail pushing
+ * and flushing by locking the buffer.
+ *
+ * We have already marked every inode that was part of a
+ * transaction stale above, which means there is no point in
+ * even trying to lock them.
+ */
+ for (i = 0; i < inodes_per_cluster; i++) {
+retry:
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(mp, (inum + i)));
+
+ /* Inode not in memory, nothing to do */
+ if (!ip) {
+ rcu_read_unlock();
continue;
}
- /* If we can get the locks then add it to the
- * list, otherwise by the time we get the bp lock
- * below it will already be attached to the
- * inode buffer.
- */
-
- /* This inode will already be locked - by us, lets
- * keep it that way.
+ /*
+ * because this is an RCU protected lookup, we could
+ * find a recently freed or even reallocated inode
+ * during the lookup. We need to check under the
+ * i_flags_lock for a valid inode here. Skip it if it
+ * is not valid, the wrong inode or stale.
*/
-
- if (ip == free_ip) {
- if (xfs_iflock_nowait(ip)) {
- ip->i_flags |= XFS_ISTALE;
-
- if (xfs_inode_clean(ip)) {
- xfs_ifunlock(ip);
- } else {
- ip_found[found++] = ip;
- }
- }
- read_unlock(&ih->ih_lock);
+ spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != inum + i ||
+ __xfs_iflags_test(ip, XFS_ISTALE)) {
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
continue;
}
+ spin_unlock(&ip->i_flags_lock);
- if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
- if (xfs_iflock_nowait(ip)) {
- ip->i_flags |= XFS_ISTALE;
-
- if (xfs_inode_clean(ip)) {
- xfs_ifunlock(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- } else {
- ip_found[found++] = ip;
- }
- } else {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
+ /*
+ * Don't try to lock/unlock the current inode, but we
+ * _cannot_ skip the other inodes that we did not find
+ * in the list attached to the buffer and are not
+ * already marked stale. If we can't lock it, back off
+ * and retry.
+ */
+ if (ip != free_ip &&
+ !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+ rcu_read_unlock();
+ delay(1);
+ goto retry;
}
+ rcu_read_unlock();
- read_unlock(&ih->ih_lock);
- }
-
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
- mp->m_bsize * blks_per_cluster,
- XFS_BUF_LOCK);
-
- pre_flushed = 0;
- lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
- while (lip) {
- if (lip->li_type == XFS_LI_INODE) {
- iip = (xfs_inode_log_item_t *)lip;
- ASSERT(iip->ili_logged == 1);
- lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
- AIL_LOCK(mp,s);
- iip->ili_flush_lsn = iip->ili_item.li_lsn;
- AIL_UNLOCK(mp, s);
- iip->ili_inode->i_flags |= XFS_ISTALE;
- pre_flushed++;
- }
- lip = lip->li_bio_list;
- }
+ xfs_iflock(ip);
+ xfs_iflags_set(ip, XFS_ISTALE);
- for (i = 0; i < found; i++) {
- ip = ip_found[i];
+ /*
+ * we don't need to attach clean inodes or those only
+ * with unlogged changes (which we throw away, anyway).
+ */
iip = ip->i_itemp;
-
- if (!iip) {
- ip->i_update_core = 0;
+ if (!iip || xfs_inode_clean(ip)) {
+ ASSERT(ip != free_ip);
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
continue;
}
- iip->ili_last_fields = iip->ili_format.ilf_fields;
- iip->ili_format.ilf_fields = 0;
+ iip->ili_last_fields = iip->ili_fields;
+ iip->ili_fields = 0;
iip->ili_logged = 1;
- AIL_LOCK(mp,s);
- iip->ili_flush_lsn = iip->ili_item.li_lsn;
- AIL_UNLOCK(mp, s);
-
- xfs_buf_attach_iodone(bp,
- (void(*)(xfs_buf_t*,xfs_log_item_t*))
- xfs_istale_done, (xfs_log_item_t *)iip);
- if (ip != free_ip) {
+ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
+ &iip->ili_item.li_lsn);
+
+ xfs_buf_attach_iodone(bp, xfs_istale_done,
+ &iip->ili_item);
+
+ if (ip != free_ip)
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
}
- if (found || pre_flushed)
- xfs_trans_stale_inode_buf(tp, bp);
+ xfs_trans_stale_inode_buf(tp, bp);
xfs_trans_binval(tp, bp);
}
- kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *));
+ xfs_perag_put(pag);
+ return 0;
}
/*
@@ -2299,33 +2363,28 @@ xfs_ifree(
int delete;
xfs_ino_t first_ino;
- ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
- ASSERT(ip->i_transp == tp);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(ip->i_d.di_nlink == 0);
ASSERT(ip->i_d.di_nextents == 0);
ASSERT(ip->i_d.di_anextents == 0);
- ASSERT((ip->i_d.di_size == 0) ||
- ((ip->i_d.di_mode & S_IFMT) != S_IFREG));
+ ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
ASSERT(ip->i_d.di_nblocks == 0);
/*
* Pull the on-disk inode from the AGI unlinked list.
*/
error = xfs_iunlink_remove(tp, ip);
- if (error != 0) {
+ if (error)
return error;
- }
error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
- if (error != 0) {
+ if (error)
return error;
- }
+
ip->i_d.di_mode = 0; /* mark incore inode as free */
ip->i_d.di_flags = 0;
ip->i_d.di_dmevmask = 0;
ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
/*
@@ -2335,1101 +2394,873 @@ xfs_ifree(
ip->i_d.di_gen++;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (delete) {
- xfs_ifree_cluster(ip, tp, first_ino);
- }
+ if (delete)
+ error = xfs_ifree_cluster(ip, tp, first_ino);
- return 0;
+ return error;
}
/*
- * Reallocate the space for if_broot based on the number of records
- * being added or deleted as indicated in rec_diff. Move the records
- * and pointers in if_broot to fit the new size. When shrinking this
- * will eliminate holes between the records and pointers created by
- * the caller. When growing this will create holes to be filled in
- * by the caller.
- *
- * The caller must not request to add more records than would fit in
- * the on-disk inode root. If the if_broot is currently NULL, then
- * if we adding records one will be allocated. The caller must also
- * not request that the number of records go below zero, although
- * it can go to zero.
- *
- * ip -- the inode whose if_broot area is changing
- * ext_diff -- the change in the number of records, positive or negative,
- * requested for the if_broot array.
+ * This is called to unpin an inode. The caller must have the inode locked
+ * in at least shared mode so that the buffer cannot be subsequently pinned
+ * once someone is waiting for it to be unpinned.
*/
-void
-xfs_iroot_realloc(
- xfs_inode_t *ip,
- int rec_diff,
- int whichfork)
+static void
+xfs_iunpin(
+ struct xfs_inode *ip)
{
- int cur_max;
- xfs_ifork_t *ifp;
- xfs_bmbt_block_t *new_broot;
- int new_max;
- size_t new_size;
- char *np;
- char *op;
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- /*
- * Handle the degenerate case quietly.
- */
- if (rec_diff == 0) {
- return;
- }
+ trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (rec_diff > 0) {
- /*
- * If there wasn't any memory allocated before, just
- * allocate it now and get out.
- */
- if (ifp->if_broot_bytes == 0) {
- new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
- ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size,
- KM_SLEEP);
- ifp->if_broot_bytes = (int)new_size;
- return;
- }
+ /* Give the log a push to start the unpinning I/O */
+ xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
- /*
- * If there is already an existing if_broot, then we need
- * to realloc() it and shift the pointers to their new
- * location. The records don't change location because
- * they are kept butted up against the btree block header.
- */
- cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
- new_max = cur_max + rec_diff;
- new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
- ifp->if_broot = (xfs_bmbt_block_t *)
- kmem_realloc(ifp->if_broot,
- new_size,
- (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
- KM_SLEEP);
- op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
- ifp->if_broot_bytes);
- np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
- (int)new_size);
- ifp->if_broot_bytes = (int)new_size;
- ASSERT(ifp->if_broot_bytes <=
- XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
- memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
- return;
- }
+}
- /*
- * rec_diff is less than 0. In this case, we are shrinking the
- * if_broot buffer. It must already exist. If we go to zero
- * records, just get rid of the root and clear the status bit.
- */
- ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
- cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
- new_max = cur_max + rec_diff;
- ASSERT(new_max >= 0);
- if (new_max > 0)
- new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
- else
- new_size = 0;
- if (new_size > 0) {
- new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP);
- /*
- * First copy over the btree block header.
- */
- memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t));
- } else {
- new_broot = NULL;
- ifp->if_flags &= ~XFS_IFBROOT;
- }
+static void
+__xfs_iunpin_wait(
+ struct xfs_inode *ip)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
+ DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
- /*
- * Only copy the records and pointers if there are any.
- */
- if (new_max > 0) {
- /*
- * First copy the records.
- */
- op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1,
- ifp->if_broot_bytes);
- np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1,
- (int)new_size);
- memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
+ xfs_iunpin(ip);
- /*
- * Then copy the pointers.
- */
- op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
- ifp->if_broot_bytes);
- np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1,
- (int)new_size);
- memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
- }
- kmem_free(ifp->if_broot, ifp->if_broot_bytes);
- ifp->if_broot = new_broot;
- ifp->if_broot_bytes = (int)new_size;
- ASSERT(ifp->if_broot_bytes <=
- XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
- return;
+ do {
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ if (xfs_ipincount(ip))
+ io_schedule();
+ } while (xfs_ipincount(ip));
+ finish_wait(wq, &wait.wait);
}
+void
+xfs_iunpin_wait(
+ struct xfs_inode *ip)
+{
+ if (xfs_ipincount(ip))
+ __xfs_iunpin_wait(ip);
+}
/*
- * This is called when the amount of space needed for if_extents
- * is increased or decreased. The change in size is indicated by
- * the number of extents that need to be added or deleted in the
- * ext_diff parameter.
+ * Removing an inode from the namespace involves removing the directory entry
+ * and dropping the link count on the inode. Removing the directory entry can
+ * result in locking an AGF (directory blocks were freed) and removing a link
+ * count can result in placing the inode on an unlinked list which results in
+ * locking an AGI.
+ *
+ * The big problem here is that we have an ordering constraint on AGF and AGI
+ * locking - inode allocation locks the AGI, then can allocate a new extent for
+ * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
+ * removes the inode from the unlinked list, requiring that we lock the AGI
+ * first, and then freeing the inode can result in an inode chunk being freed
+ * and hence freeing disk space requiring that we lock an AGF.
*
- * If the amount of space needed has decreased below the size of the
- * inline buffer, then switch to using the inline buffer. Otherwise,
- * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
- * to what is needed.
+ * Hence the ordering that is imposed by other parts of the code is AGI before
+ * AGF. This means we cannot remove the directory entry before we drop the inode
+ * reference count and put it on the unlinked list as this results in a lock
+ * order of AGF then AGI, and this can deadlock against inode allocation and
+ * freeing. Therefore we must drop the link counts before we remove the
+ * directory entry.
*
- * ip -- the inode whose if_extents area is changing
- * ext_diff -- the change in the number of extents, positive or negative,
- * requested for the if_extents array.
+ * This is still safe from a transactional point of view - it is not until we
+ * get to xfs_bmap_finish() that we have the possibility of multiple
+ * transactions in this operation. Hence as long as we remove the directory
+ * entry and drop the link count in the first transaction of the remove
+ * operation, there are no transactional constraints on the ordering here.
*/
-void
-xfs_iext_realloc(
- xfs_inode_t *ip,
- int ext_diff,
- int whichfork)
+int
+xfs_remove(
+ xfs_inode_t *dp,
+ struct xfs_name *name,
+ xfs_inode_t *ip)
{
- int byte_diff;
- xfs_ifork_t *ifp;
- int new_size;
- uint rnew_size;
+ xfs_mount_t *mp = dp->i_mount;
+ xfs_trans_t *tp = NULL;
+ int is_dir = S_ISDIR(ip->i_d.di_mode);
+ int error = 0;
+ xfs_bmap_free_t free_list;
+ xfs_fsblock_t first_block;
+ int cancel_flags;
+ int committed;
+ int link_zero;
+ uint resblks;
+ uint log_count;
+
+ trace_xfs_remove(dp, name);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
- if (ext_diff == 0) {
- return;
- }
+ error = xfs_qm_dqattach(dp, 0);
+ if (error)
+ goto std_return;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- byte_diff = ext_diff * (uint)sizeof(xfs_bmbt_rec_t);
- new_size = (int)ifp->if_bytes + byte_diff;
- ASSERT(new_size >= 0);
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ goto std_return;
- if (new_size == 0) {
- if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) {
- ASSERT(ifp->if_real_bytes != 0);
- kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
- }
- ifp->if_u1.if_extents = NULL;
- rnew_size = 0;
- } else if (new_size <= sizeof(ifp->if_u2.if_inline_ext)) {
- /*
- * If the valid extents can fit in if_inline_ext,
- * copy them from the malloc'd vector and free it.
- */
- if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) {
- /*
- * For now, empty files are format EXTENTS,
- * so the if_extents pointer is null.
- */
- if (ifp->if_u1.if_extents) {
- memcpy(ifp->if_u2.if_inline_ext,
- ifp->if_u1.if_extents, new_size);
- kmem_free(ifp->if_u1.if_extents,
- ifp->if_real_bytes);
- }
- ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
- }
- rnew_size = 0;
+ if (is_dir) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
+ log_count = XFS_DEFAULT_LOG_COUNT;
} else {
- rnew_size = new_size;
- if ((rnew_size & (rnew_size - 1)) != 0)
- rnew_size = xfs_iroundup(rnew_size);
- /*
- * Stuck with malloc/realloc.
- */
- if (ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext) {
- ifp->if_u1.if_extents = (xfs_bmbt_rec_t *)
- kmem_alloc(rnew_size, KM_SLEEP);
- memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
- sizeof(ifp->if_u2.if_inline_ext));
- } else if (rnew_size != ifp->if_real_bytes) {
- ifp->if_u1.if_extents = (xfs_bmbt_rec_t *)
- kmem_realloc(ifp->if_u1.if_extents,
- rnew_size,
- ifp->if_real_bytes,
- KM_NOFS);
- }
+ tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
+ log_count = XFS_REMOVE_LOG_COUNT;
}
- ifp->if_real_bytes = rnew_size;
- ifp->if_bytes = new_size;
-}
-
-
-/*
- * This is called when the amount of space needed for if_data
- * is increased or decreased. The change in size is indicated by
- * the number of bytes that need to be added or deleted in the
- * byte_diff parameter.
- *
- * If the amount of space needed has decreased below the size of the
- * inline buffer, then switch to using the inline buffer. Otherwise,
- * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
- * to what is needed.
- *
- * ip -- the inode whose if_data area is changing
- * byte_diff -- the change in the number of bytes, positive or negative,
- * requested for the if_data array.
- */
-void
-xfs_idata_realloc(
- xfs_inode_t *ip,
- int byte_diff,
- int whichfork)
-{
- xfs_ifork_t *ifp;
- int new_size;
- int real_size;
+ cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
- if (byte_diff == 0) {
- return;
+ /*
+ * We try to get the real space reservation first,
+ * allowing for directory btree deletion(s) implying
+ * possible bmap insert(s). If we can't get the space
+ * reservation then we use 0 instead, and avoid the bmap
+ * btree insert(s) in the directory code by, if the bmap
+ * insert tries to happen, instead trimming the LAST
+ * block from the directory.
+ */
+ resblks = XFS_REMOVE_SPACE_RES(mp);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
+ if (error == ENOSPC) {
+ resblks = 0;
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
+ }
+ if (error) {
+ ASSERT(error != ENOSPC);
+ cancel_flags = 0;
+ goto out_trans_cancel;
}
- ifp = XFS_IFORK_PTR(ip, whichfork);
- new_size = (int)ifp->if_bytes + byte_diff;
- ASSERT(new_size >= 0);
+ xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
- if (new_size == 0) {
- if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
- kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ /*
+ * If we're removing a directory perform some additional validation.
+ */
+ cancel_flags |= XFS_TRANS_ABORT;
+ if (is_dir) {
+ ASSERT(ip->i_d.di_nlink >= 2);
+ if (ip->i_d.di_nlink != 2) {
+ error = XFS_ERROR(ENOTEMPTY);
+ goto out_trans_cancel;
}
- ifp->if_u1.if_data = NULL;
- real_size = 0;
- } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
- /*
- * If the valid extents/data can fit in if_inline_ext/data,
- * copy them from the malloc'd vector and free it.
- */
- if (ifp->if_u1.if_data == NULL) {
- ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
- } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
- ASSERT(ifp->if_real_bytes != 0);
- memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
- new_size);
- kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
- ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+ if (!xfs_dir_isempty(ip)) {
+ error = XFS_ERROR(ENOTEMPTY);
+ goto out_trans_cancel;
}
- real_size = 0;
+
+ /* Drop the link from ip's "..". */
+ error = xfs_droplink(tp, dp);
+ if (error)
+ goto out_trans_cancel;
+
+ /* Drop the "." link from ip to self. */
+ error = xfs_droplink(tp, ip);
+ if (error)
+ goto out_trans_cancel;
} else {
/*
- * Stuck with malloc/realloc.
- * For inline data, the underlying buffer must be
- * a multiple of 4 bytes in size so that it can be
- * logged and stay on word boundaries. We enforce
- * that here.
+ * When removing a non-directory we need to log the parent
+ * inode here. For a directory this is done implicitly
+ * by the xfs_droplink call for the ".." entry.
*/
- real_size = roundup(new_size, 4);
- if (ifp->if_u1.if_data == NULL) {
- ASSERT(ifp->if_real_bytes == 0);
- ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
- } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
- /*
- * Only do the realloc if the underlying size
- * is really changing.
- */
- if (ifp->if_real_bytes != real_size) {
- ifp->if_u1.if_data =
- kmem_realloc(ifp->if_u1.if_data,
- real_size,
- ifp->if_real_bytes,
- KM_SLEEP);
- }
- } else {
- ASSERT(ifp->if_real_bytes == 0);
- ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
- memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
- ifp->if_bytes);
- }
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
}
- ifp->if_real_bytes = real_size;
- ifp->if_bytes = new_size;
- ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
-}
-
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ /* Drop the link from dp to ip. */
+ error = xfs_droplink(tp, ip);
+ if (error)
+ goto out_trans_cancel;
+ /* Determine if this is the last link while the inode is locked */
+ link_zero = (ip->i_d.di_nlink == 0);
-/*
- * Map inode to disk block and offset.
- *
- * mp -- the mount point structure for the current file system
- * tp -- the current transaction
- * ino -- the inode number of the inode to be located
- * imap -- this structure is filled in with the information necessary
- * to retrieve the given inode from disk
- * flags -- flags to pass to xfs_dilocate indicating whether or not
- * lookups in the inode btree were OK or not
- */
-int
-xfs_imap(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_ino_t ino,
- xfs_imap_t *imap,
- uint flags)
-{
- xfs_fsblock_t fsbno;
- int len;
- int off;
- int error;
-
- fsbno = imap->im_blkno ?
- XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
- error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
- if (error != 0) {
- return error;
- }
- imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
- imap->im_len = XFS_FSB_TO_BB(mp, len);
- imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
- imap->im_ioffset = (ushort)off;
- imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
- return 0;
-}
-
-void
-xfs_idestroy_fork(
- xfs_inode_t *ip,
- int whichfork)
-{
- xfs_ifork_t *ifp;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (ifp->if_broot != NULL) {
- kmem_free(ifp->if_broot, ifp->if_broot_bytes);
- ifp->if_broot = NULL;
+ xfs_bmap_init(&free_list, &first_block);
+ error = xfs_dir_removename(tp, dp, name, ip->i_ino,
+ &first_block, &free_list, resblks);
+ if (error) {
+ ASSERT(error != ENOENT);
+ goto out_bmap_cancel;
}
/*
- * If the format is local, then we can't have an extents
- * array so just look for an inline data array. If we're
- * not local then we may or may not have an extents list,
- * so check and free it up if we do.
+ * If this is a synchronous mount, make sure that the
+ * remove transaction goes to disk before returning to
+ * the user.
*/
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
- (ifp->if_u1.if_data != NULL)) {
- ASSERT(ifp->if_real_bytes != 0);
- kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
- ifp->if_u1.if_data = NULL;
- ifp->if_real_bytes = 0;
- }
- } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
- (ifp->if_u1.if_extents != NULL) &&
- (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)) {
- ASSERT(ifp->if_real_bytes != 0);
- kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
- ifp->if_u1.if_extents = NULL;
- ifp->if_real_bytes = 0;
- }
- ASSERT(ifp->if_u1.if_extents == NULL ||
- ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
- ASSERT(ifp->if_real_bytes == 0);
- if (whichfork == XFS_ATTR_FORK) {
- kmem_zone_free(xfs_ifork_zone, ip->i_afp);
- ip->i_afp = NULL;
- }
-}
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+ xfs_trans_set_sync(tp);
-/*
- * This is called free all the memory associated with an inode.
- * It must free the inode itself and any buffers allocated for
- * if_extents/if_data and if_broot. It must also free the lock
- * associated with the inode.
- */
-void
-xfs_idestroy(
- xfs_inode_t *ip)
-{
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ goto out_bmap_cancel;
- switch (ip->i_d.di_mode & S_IFMT) {
- case S_IFREG:
- case S_IFDIR:
- case S_IFLNK:
- xfs_idestroy_fork(ip, XFS_DATA_FORK);
- break;
- }
- if (ip->i_afp)
- xfs_idestroy_fork(ip, XFS_ATTR_FORK);
- mrfree(&ip->i_lock);
- mrfree(&ip->i_iolock);
- freesema(&ip->i_flock);
-#ifdef XFS_BMAP_TRACE
- ktrace_free(ip->i_xtrace);
-#endif
-#ifdef XFS_BMBT_TRACE
- ktrace_free(ip->i_btrace);
-#endif
-#ifdef XFS_RW_TRACE
- ktrace_free(ip->i_rwtrace);
-#endif
-#ifdef XFS_ILOCK_TRACE
- ktrace_free(ip->i_lock_trace);
-#endif
-#ifdef XFS_DIR2_TRACE
- ktrace_free(ip->i_dir_trace);
-#endif
- if (ip->i_itemp) {
- /* XXXdpd should be able to assert this but shutdown
- * is leaving the AIL behind. */
- ASSERT(((ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL) == 0) ||
- XFS_FORCED_SHUTDOWN(ip->i_mount));
- xfs_inode_item_destroy(ip);
- }
- kmem_zone_free(xfs_inode_zone, ip);
-}
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto std_return;
+ if (is_dir && xfs_inode_is_filestream(ip))
+ xfs_filestream_deassociate(ip);
-/*
- * Increment the pin count of the given buffer.
- * This value is protected by ipinlock spinlock in the mount structure.
- */
-void
-xfs_ipin(
- xfs_inode_t *ip)
-{
- ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
+ return 0;
- atomic_inc(&ip->i_pincount);
+ out_bmap_cancel:
+ xfs_bmap_cancel(&free_list);
+ out_trans_cancel:
+ xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+ return error;
}
/*
- * Decrement the pin count of the given inode, and wake up
- * anyone in xfs_iwait_unpin() if the count goes to 0. The
- * inode must have been previoulsy pinned with a call to xfs_ipin().
+ * Enter all inodes for a rename transaction into a sorted array.
*/
-void
-xfs_iunpin(
- xfs_inode_t *ip)
+STATIC void
+xfs_sort_for_rename(
+ xfs_inode_t *dp1, /* in: old (source) directory inode */
+ xfs_inode_t *dp2, /* in: new (target) directory inode */
+ xfs_inode_t *ip1, /* in: inode of old entry */
+ xfs_inode_t *ip2, /* in: inode of new entry, if it
+ already exists, NULL otherwise. */
+ xfs_inode_t **i_tab,/* out: array of inode returned, sorted */
+ int *num_inodes) /* out: number of inodes in array */
{
- ASSERT(atomic_read(&ip->i_pincount) > 0);
+ xfs_inode_t *temp;
+ int i, j;
- if (atomic_dec_and_test(&ip->i_pincount)) {
- vnode_t *vp = XFS_ITOV_NULL(ip);
-
- /* make sync come back and flush this inode */
- if (vp) {
- struct inode *inode = LINVFS_GET_IP(vp);
+ /*
+ * i_tab contains a list of pointers to inodes. We initialize
+ * the table here & we'll sort it. We will then use it to
+ * order the acquisition of the inode locks.
+ *
+ * Note that the table may contain duplicates. e.g., dp1 == dp2.
+ */
+ i_tab[0] = dp1;
+ i_tab[1] = dp2;
+ i_tab[2] = ip1;
+ if (ip2) {
+ *num_inodes = 4;
+ i_tab[3] = ip2;
+ } else {
+ *num_inodes = 3;
+ i_tab[3] = NULL;
+ }
- if (!(inode->i_state & I_NEW))
- mark_inode_dirty_sync(inode);
+ /*
+ * Sort the elements via bubble sort. (Remember, there are at
+ * most 4 elements to sort, so this is adequate.)
+ */
+ for (i = 0; i < *num_inodes; i++) {
+ for (j = 1; j < *num_inodes; j++) {
+ if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
+ temp = i_tab[j];
+ i_tab[j] = i_tab[j-1];
+ i_tab[j-1] = temp;
+ }
}
-
- wake_up(&ip->i_ipin_wait);
}
}
/*
- * This is called to wait for the given inode to be unpinned.
- * It will sleep until this happens. The caller must have the
- * inode locked in at least shared mode so that the buffer cannot
- * be subsequently pinned once someone is waiting for it to be
- * unpinned.
+ * xfs_rename
*/
-STATIC void
-xfs_iunpin_wait(
- xfs_inode_t *ip)
+int
+xfs_rename(
+ xfs_inode_t *src_dp,
+ struct xfs_name *src_name,
+ xfs_inode_t *src_ip,
+ xfs_inode_t *target_dp,
+ struct xfs_name *target_name,
+ xfs_inode_t *target_ip)
{
- xfs_inode_log_item_t *iip;
- xfs_lsn_t lsn;
+ xfs_trans_t *tp = NULL;
+ xfs_mount_t *mp = src_dp->i_mount;
+ int new_parent; /* moving to a new dir */
+ int src_is_directory; /* src_name is a directory */
+ int error;
+ xfs_bmap_free_t free_list;
+ xfs_fsblock_t first_block;
+ int cancel_flags;
+ int committed;
+ xfs_inode_t *inodes[4];
+ int spaceres;
+ int num_inodes;
- ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS));
+ trace_xfs_rename(src_dp, target_dp, src_name, target_name);
- if (atomic_read(&ip->i_pincount) == 0) {
- return;
- }
+ new_parent = (src_dp != target_dp);
+ src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
- iip = ip->i_itemp;
- if (iip && iip->ili_last_lsn) {
- lsn = iip->ili_last_lsn;
- } else {
- lsn = (xfs_lsn_t)0;
+ xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
+ inodes, &num_inodes);
+
+ xfs_bmap_init(&free_list, &first_block);
+ tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
+ cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+ spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
+ if (error == ENOSPC) {
+ spaceres = 0;
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
+ }
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto std_return;
}
/*
- * Give the log a push so we don't wait here too long.
+ * Attach the dquots to the inodes
*/
- xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE);
-
- wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
-}
-
-
-/*
- * xfs_iextents_copy()
- *
- * This is called to copy the REAL extents (as opposed to the delayed
- * allocation extents) from the inode into the given buffer. It
- * returns the number of bytes copied into the buffer.
- *
- * If there are no delayed allocation extents, then we can just
- * memcpy() the extents into the buffer. Otherwise, we need to
- * examine each extent in turn and skip those which are delayed.
- */
-int
-xfs_iextents_copy(
- xfs_inode_t *ip,
- xfs_bmbt_rec_t *buffer,
- int whichfork)
-{
- int copied;
- xfs_bmbt_rec_t *dest_ep;
- xfs_bmbt_rec_t *ep;
-#ifdef XFS_BMAP_TRACE
- static char fname[] = "xfs_iextents_copy";
-#endif
- int i;
- xfs_ifork_t *ifp;
- int nrecs;
- xfs_fsblock_t start_block;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
- ASSERT(ifp->if_bytes > 0);
-
- nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- xfs_bmap_trace_exlist(fname, ip, nrecs, whichfork);
- ASSERT(nrecs > 0);
-
- /*
- * There are some delayed allocation extents in the
- * inode, so copy the extents one at a time and skip
- * the delayed ones. There must be at least one
- * non-delayed extent.
- */
- ep = ifp->if_u1.if_extents;
- dest_ep = buffer;
- copied = 0;
- for (i = 0; i < nrecs; i++) {
- start_block = xfs_bmbt_get_startblock(ep);
- if (ISNULLSTARTBLOCK(start_block)) {
- /*
- * It's a delayed allocation extent, so skip it.
- */
- ep++;
- continue;
- }
-
- /* Translate to on disk format */
- put_unaligned(INT_GET(ep->l0, ARCH_CONVERT),
- (__uint64_t*)&dest_ep->l0);
- put_unaligned(INT_GET(ep->l1, ARCH_CONVERT),
- (__uint64_t*)&dest_ep->l1);
- dest_ep++;
- ep++;
- copied++;
+ error = xfs_qm_vop_rename_dqattach(inodes);
+ if (error) {
+ xfs_trans_cancel(tp, cancel_flags);
+ goto std_return;
}
- ASSERT(copied != 0);
- xfs_validate_extents(buffer, copied, 1, XFS_EXTFMT_INODE(ip));
- return (copied * (uint)sizeof(xfs_bmbt_rec_t));
-}
+ /*
+ * Lock all the participating inodes. Depending upon whether
+ * the target_name exists in the target directory, and
+ * whether the target directory is the same as the source
+ * directory, we can lock from 2 to 4 inodes.
+ */
+ xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
-/*
- * Each of the following cases stores data into the same region
- * of the on-disk inode, so only one of them can be valid at
- * any given time. While it is possible to have conflicting formats
- * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
- * in EXTENTS format, this can only happen when the fork has
- * changed formats after being modified but before being flushed.
- * In these cases, the format always takes precedence, because the
- * format indicates the current state of the fork.
- */
-/*ARGSUSED*/
-STATIC int
-xfs_iflush_fork(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
- xfs_inode_log_item_t *iip,
- int whichfork,
- xfs_buf_t *bp)
-{
- char *cp;
- xfs_ifork_t *ifp;
- xfs_mount_t *mp;
-#ifdef XFS_TRANS_DEBUG
- int first;
-#endif
- static const short brootflag[2] =
- { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
- static const short dataflag[2] =
- { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
- static const short extflag[2] =
- { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
-
- if (iip == NULL)
- return 0;
- ifp = XFS_IFORK_PTR(ip, whichfork);
/*
- * This can happen if we gave up in iformat in an error path,
- * for the attribute fork.
+ * Join all the inodes to the transaction. From this point on,
+ * we can rely on either trans_commit or trans_cancel to unlock
+ * them.
*/
- if (ifp == NULL) {
- ASSERT(whichfork == XFS_ATTR_FORK);
- return 0;
+ xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
+ if (new_parent)
+ xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
+ if (target_ip)
+ xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
+
+ /*
+ * If we are using project inheritance, we only allow renames
+ * into our tree when the project IDs are the same; else the
+ * tree quota mechanism would be circumvented.
+ */
+ if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
+ (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
+ error = XFS_ERROR(EXDEV);
+ goto error_return;
}
- cp = XFS_DFORK_PTR(dip, whichfork);
- mp = ip->i_mount;
- switch (XFS_IFORK_FORMAT(ip, whichfork)) {
- case XFS_DINODE_FMT_LOCAL:
- if ((iip->ili_format.ilf_fields & dataflag[whichfork]) &&
- (ifp->if_bytes > 0)) {
- ASSERT(ifp->if_u1.if_data != NULL);
- ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
- memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
- }
- if (whichfork == XFS_DATA_FORK) {
- if (unlikely(XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp, dip))) {
- XFS_ERROR_REPORT("xfs_iflush_fork",
- XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
- }
- }
- break;
- case XFS_DINODE_FMT_EXTENTS:
- ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
- !(iip->ili_format.ilf_fields & extflag[whichfork]));
- ASSERT((ifp->if_u1.if_extents != NULL) || (ifp->if_bytes == 0));
- ASSERT((ifp->if_u1.if_extents == NULL) || (ifp->if_bytes > 0));
- if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
- (ifp->if_bytes > 0)) {
- ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
- (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
- whichfork);
- }
- break;
+ /*
+ * Set up the target.
+ */
+ if (target_ip == NULL) {
+ /*
+ * If there's no space reservation, check the entry will
+ * fit before actually inserting it.
+ */
+ error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
+ if (error)
+ goto error_return;
+ /*
+ * If target does not exist and the rename crosses
+ * directories, adjust the target directory link count
+ * to account for the ".." reference from the new entry.
+ */
+ error = xfs_dir_createname(tp, target_dp, target_name,
+ src_ip->i_ino, &first_block,
+ &free_list, spaceres);
+ if (error == ENOSPC)
+ goto error_return;
+ if (error)
+ goto abort_return;
- case XFS_DINODE_FMT_BTREE:
- if ((iip->ili_format.ilf_fields & brootflag[whichfork]) &&
- (ifp->if_broot_bytes > 0)) {
- ASSERT(ifp->if_broot != NULL);
- ASSERT(ifp->if_broot_bytes <=
- (XFS_IFORK_SIZE(ip, whichfork) +
- XFS_BROOT_SIZE_ADJ));
- xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes,
- (xfs_bmdr_block_t *)cp,
- XFS_DFORK_SIZE(dip, mp, whichfork));
- }
- break;
+ xfs_trans_ichgtime(tp, target_dp,
+ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- case XFS_DINODE_FMT_DEV:
- if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
- ASSERT(whichfork == XFS_DATA_FORK);
- INT_SET(dip->di_u.di_dev, ARCH_CONVERT, ip->i_df.if_u2.if_rdev);
+ if (new_parent && src_is_directory) {
+ error = xfs_bumplink(tp, target_dp);
+ if (error)
+ goto abort_return;
}
- break;
-
- case XFS_DINODE_FMT_UUID:
- if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
- ASSERT(whichfork == XFS_DATA_FORK);
- memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid,
- sizeof(uuid_t));
+ } else { /* target_ip != NULL */
+ /*
+ * If target exists and it's a directory, check that both
+ * target and source are directories and that target can be
+ * destroyed, or that neither is a directory.
+ */
+ if (S_ISDIR(target_ip->i_d.di_mode)) {
+ /*
+ * Make sure target dir is empty.
+ */
+ if (!(xfs_dir_isempty(target_ip)) ||
+ (target_ip->i_d.di_nlink > 2)) {
+ error = XFS_ERROR(EEXIST);
+ goto error_return;
+ }
}
- break;
-
- default:
- ASSERT(0);
- break;
- }
- return 0;
-}
-
-/*
- * xfs_iflush() will write a modified inode's changes out to the
- * inode's on disk home. The caller must have the inode lock held
- * in at least shared mode and the inode flush semaphore must be
- * held as well. The inode lock will still be held upon return from
- * the call and the caller is free to unlock it.
- * The inode flush lock will be unlocked when the inode reaches the disk.
- * The flags indicate how the inode's buffer should be written out.
- */
-int
-xfs_iflush(
- xfs_inode_t *ip,
- uint flags)
-{
- xfs_inode_log_item_t *iip;
- xfs_buf_t *bp;
- xfs_dinode_t *dip;
- xfs_mount_t *mp;
- int error;
- /* REFERENCED */
- xfs_chash_t *ch;
- xfs_inode_t *iq;
- int clcount; /* count of inodes clustered */
- int bufwasdelwri;
- enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
- SPLDECL(s);
+ /*
+ * Link the source inode under the target name.
+ * If the source inode is a directory and we are moving
+ * it across directories, its ".." entry will be
+ * inconsistent until we replace that down below.
+ *
+ * In case there is already an entry with the same
+ * name at the destination directory, remove it first.
+ */
+ error = xfs_dir_replace(tp, target_dp, target_name,
+ src_ip->i_ino,
+ &first_block, &free_list, spaceres);
+ if (error)
+ goto abort_return;
- XFS_STATS_INC(xs_iflush_count);
+ xfs_trans_ichgtime(tp, target_dp,
+ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
- ASSERT(valusema(&ip->i_flock) <= 0);
- ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
- ip->i_d.di_nextents > ip->i_df.if_ext_max);
+ /*
+ * Decrement the link count on the target since the target
+ * dir no longer points to it.
+ */
+ error = xfs_droplink(tp, target_ip);
+ if (error)
+ goto abort_return;
- iip = ip->i_itemp;
- mp = ip->i_mount;
+ if (src_is_directory) {
+ /*
+ * Drop the link from the old "." entry.
+ */
+ error = xfs_droplink(tp, target_ip);
+ if (error)
+ goto abort_return;
+ }
+ } /* target_ip != NULL */
/*
- * If the inode isn't dirty, then just release the inode
- * flush lock and do nothing.
+ * Remove the source.
*/
- if ((ip->i_update_core == 0) &&
- ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
- ASSERT((iip != NULL) ?
- !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1);
- xfs_ifunlock(ip);
- return 0;
+ if (new_parent && src_is_directory) {
+ /*
+ * Rewrite the ".." entry to point to the new
+ * directory.
+ */
+ error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
+ target_dp->i_ino,
+ &first_block, &free_list, spaceres);
+ ASSERT(error != EEXIST);
+ if (error)
+ goto abort_return;
}
/*
- * We can't flush the inode until it is unpinned, so
- * wait for it. We know noone new can pin it, because
- * we are holding the inode lock shared and you need
- * to hold it exclusively to pin the inode.
+ * We always want to hit the ctime on the source inode.
+ *
+ * This isn't strictly required by the standards since the source
+ * inode isn't really being changed, but old unix file systems did
+ * it and some incremental backup programs won't work without it.
*/
- xfs_iunpin_wait(ip);
+ xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
/*
- * This may have been unpinned because the filesystem is shutting
- * down forcibly. If that's the case we must not write this inode
- * to disk, because the log record didn't make it to disk!
+ * Adjust the link count on src_dp. This is necessary when
+ * renaming a directory, either within one parent when
+ * the target existed, or across two parent directories.
*/
- if (XFS_FORCED_SHUTDOWN(mp)) {
- ip->i_update_core = 0;
- if (iip)
- iip->ili_format.ilf_fields = 0;
- xfs_ifunlock(ip);
- return XFS_ERROR(EIO);
- }
+ if (src_is_directory && (new_parent || target_ip != NULL)) {
- /*
- * Get the buffer containing the on-disk inode.
- */
- error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0);
- if (error != 0) {
- xfs_ifunlock(ip);
- return error;
- }
-
- /*
- * Decide how buffer will be flushed out. This is done before
- * the call to xfs_iflush_int because this field is zeroed by it.
- */
- if (iip != NULL && iip->ili_format.ilf_fields != 0) {
/*
- * Flush out the inode buffer according to the directions
- * of the caller. In the cases where the caller has given
- * us a choice choose the non-delwri case. This is because
- * the inode is in the AIL and we need to get it out soon.
+ * Decrement link count on src_directory since the
+ * entry that's moved no longer points to it.
*/
- switch (flags) {
- case XFS_IFLUSH_SYNC:
- case XFS_IFLUSH_DELWRI_ELSE_SYNC:
- flags = 0;
- break;
- case XFS_IFLUSH_ASYNC:
- case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
- flags = INT_ASYNC;
- break;
- case XFS_IFLUSH_DELWRI:
- flags = INT_DELWRI;
- break;
- default:
- ASSERT(0);
- flags = 0;
- break;
- }
- } else {
- switch (flags) {
- case XFS_IFLUSH_DELWRI_ELSE_SYNC:
- case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
- case XFS_IFLUSH_DELWRI:
- flags = INT_DELWRI;
- break;
- case XFS_IFLUSH_ASYNC:
- flags = INT_ASYNC;
- break;
- case XFS_IFLUSH_SYNC:
- flags = 0;
- break;
- default:
- ASSERT(0);
- flags = 0;
- break;
- }
+ error = xfs_droplink(tp, src_dp);
+ if (error)
+ goto abort_return;
}
+ error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
+ &first_block, &free_list, spaceres);
+ if (error)
+ goto abort_return;
+
+ xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
+ if (new_parent)
+ xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
+
/*
- * First flush out the inode that xfs_iflush was called with.
+ * If this is a synchronous mount, make sure that the
+ * rename transaction goes to disk before returning to
+ * the user.
*/
- error = xfs_iflush_int(ip, bp);
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+ xfs_trans_set_sync(tp);
+ }
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
if (error) {
- goto corrupt_out;
+ xfs_bmap_cancel(&free_list);
+ xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
+ XFS_TRANS_ABORT));
+ goto std_return;
}
/*
- * inode clustering:
- * see if other inodes can be gathered into this write
+ * trans_commit will unlock src_ip, target_ip & decrement
+ * the vnode references.
*/
+ return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+ abort_return:
+ cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+ xfs_bmap_cancel(&free_list);
+ xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+ return error;
+}
+
+STATIC int
+xfs_iflush_cluster(
+ xfs_inode_t *ip,
+ xfs_buf_t *bp)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ struct xfs_perag *pag;
+ unsigned long first_index, mask;
+ unsigned long inodes_per_cluster;
+ int ilist_size;
+ xfs_inode_t **ilist;
+ xfs_inode_t *iq;
+ int nr_found;
+ int clcount = 0;
+ int bufwasdelwri;
+ int i;
- ip->i_chash->chl_buf = bp;
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+
+ inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
+ ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
+ ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
+ if (!ilist)
+ goto out_put;
+
+ mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
+ rcu_read_lock();
+ /* really need a gang lookup range call here */
+ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
+ first_index, inodes_per_cluster);
+ if (nr_found == 0)
+ goto out_free;
+
+ for (i = 0; i < nr_found; i++) {
+ iq = ilist[i];
+ if (iq == ip)
+ continue;
- ch = XFS_CHASH(mp, ip->i_blkno);
- s = mutex_spinlock(&ch->ch_lock);
+ /*
+ * because this is an RCU protected lookup, we could find a
+ * recently freed or even reallocated inode during the lookup.
+ * We need to check under the i_flags_lock for a valid inode
+ * here. Skip it if it is not valid or the wrong inode.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (!ip->i_ino ||
+ (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
+ spin_unlock(&ip->i_flags_lock);
+ continue;
+ }
+ spin_unlock(&ip->i_flags_lock);
- clcount = 0;
- for (iq = ip->i_cnext; iq != ip; iq = iq->i_cnext) {
/*
* Do an un-protected check to see if the inode is dirty and
* is a candidate for flushing. These checks will be repeated
* later after the appropriate locks are acquired.
*/
- iip = iq->i_itemp;
- if ((iq->i_update_core == 0) &&
- ((iip == NULL) ||
- !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
- xfs_ipincount(iq) == 0) {
+ if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
continue;
- }
/*
- * Try to get locks. If any are unavailable,
+ * Try to get locks. If any are unavailable or it is pinned,
* then this inode cannot be flushed and is skipped.
*/
- /* get inode locks (just i_lock) */
- if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) {
- /* get inode flush lock */
- if (xfs_iflock_nowait(iq)) {
- /* check if pinned */
- if (xfs_ipincount(iq) == 0) {
- /* arriving here means that
- * this inode can be flushed.
- * first re-check that it's
- * dirty
- */
- iip = iq->i_itemp;
- if ((iq->i_update_core != 0)||
- ((iip != NULL) &&
- (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
- clcount++;
- error = xfs_iflush_int(iq, bp);
- if (error) {
- xfs_iunlock(iq,
- XFS_ILOCK_SHARED);
- goto cluster_corrupt_out;
- }
- } else {
- xfs_ifunlock(iq);
- }
- } else {
- xfs_ifunlock(iq);
- }
- }
+ if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
+ continue;
+ if (!xfs_iflock_nowait(iq)) {
+ xfs_iunlock(iq, XFS_ILOCK_SHARED);
+ continue;
+ }
+ if (xfs_ipincount(iq)) {
+ xfs_ifunlock(iq);
xfs_iunlock(iq, XFS_ILOCK_SHARED);
+ continue;
+ }
+
+ /*
+ * arriving here means that this inode can be flushed. First
+ * re-check that it's dirty before flushing.
+ */
+ if (!xfs_inode_clean(iq)) {
+ int error;
+ error = xfs_iflush_int(iq, bp);
+ if (error) {
+ xfs_iunlock(iq, XFS_ILOCK_SHARED);
+ goto cluster_corrupt_out;
+ }
+ clcount++;
+ } else {
+ xfs_ifunlock(iq);
}
+ xfs_iunlock(iq, XFS_ILOCK_SHARED);
}
- mutex_spinunlock(&ch->ch_lock, s);
if (clcount) {
XFS_STATS_INC(xs_icluster_flushcnt);
XFS_STATS_ADD(xs_icluster_flushinode, clcount);
}
- /*
- * If the buffer is pinned then push on the log so we won't
- * get stuck waiting in the write for too long.
- */
- if (XFS_BUF_ISPINNED(bp)){
- xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
- }
-
- if (flags & INT_DELWRI) {
- xfs_bdwrite(mp, bp);
- } else if (flags & INT_ASYNC) {
- xfs_bawrite(mp, bp);
- } else {
- error = xfs_bwrite(mp, bp);
- }
- return error;
+out_free:
+ rcu_read_unlock();
+ kmem_free(ilist);
+out_put:
+ xfs_perag_put(pag);
+ return 0;
-corrupt_out:
- xfs_buf_relse(bp);
- xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
- xfs_iflush_abort(ip);
- /*
- * Unlocks the flush lock
- */
- return XFS_ERROR(EFSCORRUPTED);
cluster_corrupt_out:
- /* Corruption detected in the clustering loop. Invalidate the
+ /*
+ * Corruption detected in the clustering loop. Invalidate the
* inode buffer and shut down the filesystem.
*/
- mutex_spinunlock(&ch->ch_lock, s);
-
+ rcu_read_unlock();
/*
- * Clean up the buffer. If it was B_DELWRI, just release it --
+ * Clean up the buffer. If it was delwri, just release it --
* brelse can handle it with no problems. If not, shut down the
* filesystem before releasing the buffer.
*/
- if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) {
+ bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
+ if (bufwasdelwri)
xfs_buf_relse(bp);
- }
- xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- if(!bufwasdelwri) {
+ if (!bufwasdelwri) {
/*
* Just like incore_relse: if we have b_iodone functions,
* mark the buffer as an error and call them. Otherwise
* mark it as stale and brelse.
*/
- if (XFS_BUF_IODONE_FUNC(bp)) {
- XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+ if (bp->b_iodone) {
XFS_BUF_UNDONE(bp);
- XFS_BUF_STALE(bp);
- XFS_BUF_SHUT(bp);
- XFS_BUF_ERROR(bp,EIO);
- xfs_biodone(bp);
+ xfs_buf_stale(bp);
+ xfs_buf_ioerror(bp, EIO);
+ xfs_buf_ioend(bp, 0);
} else {
- XFS_BUF_STALE(bp);
+ xfs_buf_stale(bp);
xfs_buf_relse(bp);
}
}
- xfs_iflush_abort(iq);
/*
* Unlocks the flush lock
*/
+ xfs_iflush_abort(iq, false);
+ kmem_free(ilist);
+ xfs_perag_put(pag);
return XFS_ERROR(EFSCORRUPTED);
}
-
-STATIC int
-xfs_iflush_int(
- xfs_inode_t *ip,
- xfs_buf_t *bp)
+/*
+ * Flush dirty inode metadata into the backing buffer.
+ *
+ * The caller must have the inode lock and the inode flush lock held. The
+ * inode lock will still be held upon return to the caller, and the inode
+ * flush lock will be released after the inode has reached the disk.
+ *
+ * The caller must write out the buffer returned in *bpp and release it.
+ */
+int
+xfs_iflush(
+ struct xfs_inode *ip,
+ struct xfs_buf **bpp)
{
- xfs_inode_log_item_t *iip;
- xfs_dinode_t *dip;
- xfs_mount_t *mp;
-#ifdef XFS_TRANS_DEBUG
- int first;
-#endif
- SPLDECL(s);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buf *bp;
+ struct xfs_dinode *dip;
+ int error;
+
+ XFS_STATS_INC(xs_iflush_count);
- ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
- ASSERT(valusema(&ip->i_flock) <= 0);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+ ASSERT(xfs_isiflocked(ip));
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
- ip->i_d.di_nextents > ip->i_df.if_ext_max);
+ ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
- iip = ip->i_itemp;
- mp = ip->i_mount;
+ *bpp = NULL;
+ xfs_iunpin_wait(ip);
/*
- * If the inode isn't dirty, then just release the inode
- * flush lock and do nothing.
+ * For stale inodes we cannot rely on the backing buffer remaining
+ * stale in cache for the remaining life of the stale inode and so
+ * xfs_imap_to_bp() below may give us a buffer that no longer contains
+ * inodes below. We have to check this after ensuring the inode is
+ * unpinned so that it is safe to reclaim the stale inode after the
+ * flush call.
*/
- if ((ip->i_update_core == 0) &&
- ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
+ if (xfs_iflags_test(ip, XFS_ISTALE)) {
xfs_ifunlock(ip);
return 0;
}
- /* set *dip = inode's place in the buffer */
- dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset);
+ /*
+ * This may have been unpinned because the filesystem is shutting
+ * down forcibly. If that's the case we must not write this inode
+ * to disk, because the log record didn't make it to disk.
+ *
+ * We also have to remove the log item from the AIL in this case,
+ * as we wait for an empty AIL as part of the unmount process.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ error = XFS_ERROR(EIO);
+ goto abort_out;
+ }
/*
- * Clear i_update_core before copying out the data.
- * This is for coordination with our timestamp updates
- * that don't hold the inode lock. They will always
- * update the timestamps BEFORE setting i_update_core,
- * so if we clear i_update_core after they set it we
- * are guaranteed to see their updates to the timestamps.
- * I believe that this depends on strongly ordered memory
- * semantics, but we have that. We use the SYNCHRONIZE
- * macro to make sure that the compiler does not reorder
- * the i_update_core access below the data copy below.
+ * Get the buffer containing the on-disk inode.
*/
- ip->i_update_core = 0;
- SYNCHRONIZE();
+ error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
+ 0);
+ if (error || !bp) {
+ xfs_ifunlock(ip);
+ return error;
+ }
/*
- * Make sure to get the latest atime from the Linux inode.
+ * First flush out the inode that xfs_iflush was called with.
*/
- xfs_synchronize_atime(ip);
+ error = xfs_iflush_int(ip, bp);
+ if (error)
+ goto corrupt_out;
+
+ /*
+ * If the buffer is pinned then push on the log now so we won't
+ * get stuck waiting in the write for too long.
+ */
+ if (xfs_buf_ispinned(bp))
+ xfs_log_force(mp, 0);
+
+ /*
+ * inode clustering:
+ * see if other inodes can be gathered into this write
+ */
+ error = xfs_iflush_cluster(ip, bp);
+ if (error)
+ goto cluster_corrupt_out;
+
+ *bpp = bp;
+ return 0;
+
+corrupt_out:
+ xfs_buf_relse(bp);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+cluster_corrupt_out:
+ error = XFS_ERROR(EFSCORRUPTED);
+abort_out:
+ /*
+ * Unlocks the flush lock
+ */
+ xfs_iflush_abort(ip, false);
+ return error;
+}
+
+STATIC int
+xfs_iflush_int(
+ struct xfs_inode *ip,
+ struct xfs_buf *bp)
+{
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct xfs_dinode *dip;
+ struct xfs_mount *mp = ip->i_mount;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+ ASSERT(xfs_isiflocked(ip));
+ ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+ ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
+ ASSERT(iip != NULL && iip->ili_fields != 0);
+ ASSERT(ip->i_d.di_version > 1);
+
+ /* set *dip = inode's place in the buffer */
+ dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
- if (XFS_TEST_ERROR(INT_GET(dip->di_core.di_magic,ARCH_CONVERT) != XFS_DINODE_MAGIC,
+ if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
- xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
- "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p",
- ip->i_ino, (int) INT_GET(dip->di_core.di_magic, ARCH_CONVERT), dip);
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
+ __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
goto corrupt_out;
}
if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
- xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
- "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
- ip->i_ino, ip, ip->i_d.di_magic);
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
+ __func__, ip->i_ino, ip, ip->i_d.di_magic);
goto corrupt_out;
}
- if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
+ if (S_ISREG(ip->i_d.di_mode)) {
if (XFS_TEST_ERROR(
(ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
(ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
- xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
- "xfs_iflush: Bad regular inode %Lu, ptr 0x%p",
- ip->i_ino, ip);
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: Bad regular inode %Lu, ptr 0x%p",
+ __func__, ip->i_ino, ip);
goto corrupt_out;
}
- } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
+ } else if (S_ISDIR(ip->i_d.di_mode)) {
if (XFS_TEST_ERROR(
(ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
(ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
(ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
- xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
- "xfs_iflush: Bad directory inode %Lu, ptr 0x%p",
- ip->i_ino, ip);
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: Bad directory inode %Lu, ptr 0x%p",
+ __func__, ip->i_ino, ip);
goto corrupt_out;
}
}
if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
XFS_RANDOM_IFLUSH_5)) {
- xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
- "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p",
- ip->i_ino,
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: detected corrupt incore inode %Lu, "
+ "total extents = %d, nblocks = %Ld, ptr 0x%p",
+ __func__, ip->i_ino,
ip->i_d.di_nextents + ip->i_d.di_anextents,
- ip->i_d.di_nblocks,
- ip);
+ ip->i_d.di_nblocks, ip);
goto corrupt_out;
}
if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
- xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
- "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
- ip->i_ino, ip->i_d.di_forkoff, ip);
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
+ __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
goto corrupt_out;
}
+
/*
- * bump the flush iteration count, used to detect flushes which
- * postdate a log record during recovery.
+ * Inode item log recovery for v2 inodes are dependent on the
+ * di_flushiter count for correct sequencing. We bump the flush
+ * iteration count so we can detect flushes which postdate a log record
+ * during recovery. This is redundant as we now log every change and
+ * hence this can't happen but we need to still do it to ensure
+ * backwards compatibility with old kernels that predate logging all
+ * inode changes.
*/
-
- ip->i_d.di_flushiter++;
+ if (ip->i_d.di_version < 3)
+ ip->i_d.di_flushiter++;
/*
* Copy the dirty parts of the inode into the on-disk
@@ -3437,270 +3268,68 @@ xfs_iflush_int(
* because if the inode is dirty at all the core must
* be.
*/
- xfs_xlate_dinode_core((xfs_caddr_t)&(dip->di_core), &(ip->i_d), -1);
+ xfs_dinode_to_disk(dip, &ip->i_d);
/* Wrap, we never let the log put out DI_MAX_FLUSH */
if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
ip->i_d.di_flushiter = 0;
- /*
- * If this is really an old format inode and the superblock version
- * has not been updated to support only new format inodes, then
- * convert back to the old inode format. If the superblock version
- * has been updated, then make the conversion permanent.
- */
- ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
- XFS_SB_VERSION_HASNLINK(&mp->m_sb));
- if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
- if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
- /*
- * Convert it back.
- */
- ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
- INT_SET(dip->di_core.di_onlink, ARCH_CONVERT, ip->i_d.di_nlink);
- } else {
- /*
- * The superblock version has already been bumped,
- * so just make the conversion to the new inode
- * format permanent.
- */
- ip->i_d.di_version = XFS_DINODE_VERSION_2;
- INT_SET(dip->di_core.di_version, ARCH_CONVERT, XFS_DINODE_VERSION_2);
- ip->i_d.di_onlink = 0;
- dip->di_core.di_onlink = 0;
- memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
- memset(&(dip->di_core.di_pad[0]), 0,
- sizeof(dip->di_core.di_pad));
- ASSERT(ip->i_d.di_projid == 0);
- }
- }
-
- if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) {
- goto corrupt_out;
- }
-
- if (XFS_IFORK_Q(ip)) {
- /*
- * The only error from xfs_iflush_fork is on the data fork.
- */
- (void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
- }
+ xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
+ if (XFS_IFORK_Q(ip))
+ xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
xfs_inobp_check(mp, bp);
/*
- * We've recorded everything logged in the inode, so we'd
- * like to clear the ilf_fields bits so we don't log and
- * flush things unnecessarily. However, we can't stop
- * logging all this information until the data we've copied
- * into the disk buffer is written to disk. If we did we might
- * overwrite the copy of the inode in the log with all the
- * data after re-logging only part of it, and in the face of
- * a crash we wouldn't have all the data we need to recover.
+ * We've recorded everything logged in the inode, so we'd like to clear
+ * the ili_fields bits so we don't log and flush things unnecessarily.
+ * However, we can't stop logging all this information until the data
+ * we've copied into the disk buffer is written to disk. If we did we
+ * might overwrite the copy of the inode in the log with all the data
+ * after re-logging only part of it, and in the face of a crash we
+ * wouldn't have all the data we need to recover.
*
- * What we do is move the bits to the ili_last_fields field.
- * When logging the inode, these bits are moved back to the
- * ilf_fields field. In the xfs_iflush_done() routine we
- * clear ili_last_fields, since we know that the information
- * those bits represent is permanently on disk. As long as
- * the flush completes before the inode is logged again, then
- * both ilf_fields and ili_last_fields will be cleared.
+ * What we do is move the bits to the ili_last_fields field. When
+ * logging the inode, these bits are moved back to the ili_fields field.
+ * In the xfs_iflush_done() routine we clear ili_last_fields, since we
+ * know that the information those bits represent is permanently on
+ * disk. As long as the flush completes before the inode is logged
+ * again, then both ili_fields and ili_last_fields will be cleared.
*
- * We can play with the ilf_fields bits here, because the inode
- * lock must be held exclusively in order to set bits there
- * and the flush lock protects the ili_last_fields bits.
- * Set ili_logged so the flush done
- * routine can tell whether or not to look in the AIL.
- * Also, store the current LSN of the inode so that we can tell
- * whether the item has moved in the AIL from xfs_iflush_done().
- * In order to read the lsn we need the AIL lock, because
- * it is a 64 bit value that cannot be read atomically.
- */
- if (iip != NULL && iip->ili_format.ilf_fields != 0) {
- iip->ili_last_fields = iip->ili_format.ilf_fields;
- iip->ili_format.ilf_fields = 0;
- iip->ili_logged = 1;
-
- ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
- AIL_LOCK(mp,s);
- iip->ili_flush_lsn = iip->ili_item.li_lsn;
- AIL_UNLOCK(mp, s);
-
- /*
- * Attach the function xfs_iflush_done to the inode's
- * buffer. This will remove the inode from the AIL
- * and unlock the inode's flush lock when the inode is
- * completely written to disk.
- */
- xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*))
- xfs_iflush_done, (xfs_log_item_t *)iip);
-
- ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
- ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL);
- } else {
- /*
- * We're flushing an inode which is not in the AIL and has
- * not been logged but has i_update_core set. For this
- * case we can use a B_DELWRI flush and immediately drop
- * the inode flush lock because we can avoid the whole
- * AIL state thing. It's OK to drop the flush lock now,
- * because we've already locked the buffer and to do anything
- * you really need both.
- */
- if (iip != NULL) {
- ASSERT(iip->ili_logged == 0);
- ASSERT(iip->ili_last_fields == 0);
- ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
- }
- xfs_ifunlock(ip);
- }
-
- return 0;
-
-corrupt_out:
- return XFS_ERROR(EFSCORRUPTED);
-}
-
-
-/*
- * Flush all inactive inodes in mp.
- */
-void
-xfs_iflush_all(
- xfs_mount_t *mp)
-{
- xfs_inode_t *ip;
- vnode_t *vp;
-
- again:
- XFS_MOUNT_ILOCK(mp);
- ip = mp->m_inodes;
- if (ip == NULL)
- goto out;
-
- do {
- /* Make sure we skip markers inserted by sync */
- if (ip->i_mount == NULL) {
- ip = ip->i_mnext;
- continue;
- }
-
- vp = XFS_ITOV_NULL(ip);
- if (!vp) {
- XFS_MOUNT_IUNLOCK(mp);
- xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
- goto again;
- }
-
- ASSERT(vn_count(vp) == 0);
-
- ip = ip->i_mnext;
- } while (ip != mp->m_inodes);
- out:
- XFS_MOUNT_IUNLOCK(mp);
-}
-
-/*
- * xfs_iaccess: check accessibility of inode for mode.
- */
-int
-xfs_iaccess(
- xfs_inode_t *ip,
- mode_t mode,
- cred_t *cr)
-{
- int error;
- mode_t orgmode = mode;
- struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip));
-
- if (mode & S_IWUSR) {
- umode_t imode = inode->i_mode;
-
- if (IS_RDONLY(inode) &&
- (S_ISREG(imode) || S_ISDIR(imode) || S_ISLNK(imode)))
- return XFS_ERROR(EROFS);
-
- if (IS_IMMUTABLE(inode))
- return XFS_ERROR(EACCES);
- }
-
- /*
- * If there's an Access Control List it's used instead of
- * the mode bits.
+ * We can play with the ili_fields bits here, because the inode lock
+ * must be held exclusively in order to set bits there and the flush
+ * lock protects the ili_last_fields bits. Set ili_logged so the flush
+ * done routine can tell whether or not to look in the AIL. Also, store
+ * the current LSN of the inode so that we can tell whether the item has
+ * moved in the AIL from xfs_iflush_done(). In order to read the lsn we
+ * need the AIL lock, because it is a 64 bit value that cannot be read
+ * atomically.
*/
- if ((error = _ACL_XFS_IACCESS(ip, mode, cr)) != -1)
- return error ? XFS_ERROR(error) : 0;
+ iip->ili_last_fields = iip->ili_fields;
+ iip->ili_fields = 0;
+ iip->ili_logged = 1;
- if (current_fsuid(cr) != ip->i_d.di_uid) {
- mode >>= 3;
- if (!in_group_p((gid_t)ip->i_d.di_gid))
- mode >>= 3;
- }
+ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
+ &iip->ili_item.li_lsn);
/*
- * If the DACs are ok we don't need any capability check.
- */
- if ((ip->i_d.di_mode & mode) == mode)
- return 0;
- /*
- * Read/write DACs are always overridable.
- * Executable DACs are overridable if at least one exec bit is set.
+ * Attach the function xfs_iflush_done to the inode's
+ * buffer. This will remove the inode from the AIL
+ * and unlock the inode's flush lock when the inode is
+ * completely written to disk.
*/
- if (!(orgmode & S_IXUSR) ||
- (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
- if (capable_cred(cr, CAP_DAC_OVERRIDE))
- return 0;
+ xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
- if ((orgmode == S_IRUSR) ||
- (S_ISDIR(inode->i_mode) && (!(orgmode & S_IWUSR)))) {
- if (capable_cred(cr, CAP_DAC_READ_SEARCH))
- return 0;
-#ifdef NOISE
- cmn_err(CE_NOTE, "Ick: mode=%o, orgmode=%o", mode, orgmode);
-#endif /* NOISE */
- return XFS_ERROR(EACCES);
- }
- return XFS_ERROR(EACCES);
-}
+ /* update the lsn in the on disk inode if required */
+ if (ip->i_d.di_version == 3)
+ dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn);
-/*
- * xfs_iroundup: round up argument to next power of two
- */
-uint
-xfs_iroundup(
- uint v)
-{
- int i;
- uint m;
-
- if ((v & (v - 1)) == 0)
- return v;
- ASSERT((v & 0x80000000) == 0);
- if ((v & (v + 1)) == 0)
- return v + 1;
- for (i = 0, m = 1; i < 31; i++, m <<= 1) {
- if (v & m)
- continue;
- v |= m;
- if ((v & (v + 1)) == 0)
- return v + 1;
- }
- ASSERT(0);
- return( 0 );
-}
+ /* generate the checksum. */
+ xfs_dinode_calc_crc(mp, dip);
-#ifdef XFS_ILOCK_TRACE
-ktrace_t *xfs_ilock_trace_buf;
+ ASSERT(bp->b_fspriv != NULL);
+ ASSERT(bp->b_iodone != NULL);
+ return 0;
-void
-xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
-{
- ktrace_enter(ip->i_lock_trace,
- (void *)ip,
- (void *)(unsigned long)lock, /* 1 = LOCK, 3=UNLOCK, etc */
- (void *)(unsigned long)lockflags, /* XFS_ILOCK_EXCL etc */
- (void *)ra, /* caller of ilock */
- (void *)(unsigned long)current_cpu(),
- (void *)(unsigned long)current_pid(),
- NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL);
+corrupt_out:
+ return XFS_ERROR(EFSCORRUPTED);
}
-#endif
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h