aboutsummaryrefslogtreecommitdiff
path: root/fs/xfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/Kconfig17
-rw-r--r--fs/xfs/Makefile29
-rw-r--r--fs/xfs/kmem.c40
-rw-r--r--fs/xfs/kmem.h30
-rw-r--r--fs/xfs/mrlock.h12
-rw-r--r--fs/xfs/xfs.h5
-rw-r--r--fs/xfs/xfs_acl.c218
-rw-r--r--fs/xfs/xfs_acl.h40
-rw-r--r--fs/xfs/xfs_ag.h149
-rw-r--r--fs/xfs/xfs_alloc.c266
-rw-r--r--fs/xfs/xfs_alloc.h3
-rw-r--r--fs/xfs/xfs_alloc_btree.c126
-rw-r--r--fs/xfs/xfs_alloc_btree.h39
-rw-r--r--fs/xfs/xfs_aops.c322
-rw-r--r--fs/xfs/xfs_aops.h3
-rw-r--r--fs/xfs/xfs_attr.c1166
-rw-r--r--fs/xfs/xfs_attr.h10
-rw-r--r--fs/xfs/xfs_attr_inactive.c452
-rw-r--r--fs/xfs/xfs_attr_leaf.c2363
-rw-r--r--fs/xfs/xfs_attr_leaf.h204
-rw-r--r--fs/xfs/xfs_attr_list.c653
-rw-r--r--fs/xfs/xfs_attr_remote.c628
-rw-r--r--fs/xfs/xfs_attr_remote.h (renamed from fs/xfs/xfs_utils.h)18
-rw-r--r--fs/xfs/xfs_bit.c4
-rw-r--r--fs/xfs/xfs_bit.h7
-rw-r--r--fs/xfs/xfs_bmap.c4930
-rw-r--r--fs/xfs/xfs_bmap.h80
-rw-r--r--fs/xfs/xfs_bmap_btree.c180
-rw-r--r--fs/xfs/xfs_bmap_btree.h128
-rw-r--r--fs/xfs/xfs_bmap_util.c1897
-rw-r--r--fs/xfs/xfs_bmap_util.h112
-rw-r--r--fs/xfs/xfs_btree.c587
-rw-r--r--fs/xfs/xfs_btree.h105
-rw-r--r--fs/xfs/xfs_buf.c417
-rw-r--r--fs/xfs/xfs_buf.h69
-rw-r--r--fs/xfs/xfs_buf_item.c444
-rw-r--r--fs/xfs/xfs_buf_item.h68
-rw-r--r--fs/xfs/xfs_da_btree.c1591
-rw-r--r--fs/xfs/xfs_da_btree.h95
-rw-r--r--fs/xfs/xfs_da_format.c911
-rw-r--r--fs/xfs/xfs_da_format.h861
-rw-r--r--fs/xfs/xfs_dfrag.c449
-rw-r--r--fs/xfs/xfs_dfrag.h53
-rw-r--r--fs/xfs/xfs_dinode.h47
-rw-r--r--fs/xfs/xfs_dir2.c511
-rw-r--r--fs/xfs/xfs_dir2.h122
-rw-r--r--fs/xfs/xfs_dir2_block.c435
-rw-r--r--fs/xfs/xfs_dir2_data.c465
-rw-r--r--fs/xfs/xfs_dir2_format.h597
-rw-r--r--fs/xfs/xfs_dir2_leaf.c1391
-rw-r--r--fs/xfs/xfs_dir2_node.c1162
-rw-r--r--fs/xfs/xfs_dir2_priv.h232
-rw-r--r--fs/xfs/xfs_dir2_readdir.c700
-rw-r--r--fs/xfs/xfs_dir2_sf.c410
-rw-r--r--fs/xfs/xfs_discard.c17
-rw-r--r--fs/xfs/xfs_dquot.c245
-rw-r--r--fs/xfs/xfs_dquot.h29
-rw-r--r--fs/xfs/xfs_dquot_buf.c290
-rw-r--r--fs/xfs/xfs_dquot_item.c107
-rw-r--r--fs/xfs/xfs_dquot_item.h3
-rw-r--r--fs/xfs/xfs_error.c41
-rw-r--r--fs/xfs/xfs_error.h1
-rw-r--r--fs/xfs/xfs_export.c19
-rw-r--r--fs/xfs/xfs_extent_busy.c16
-rw-r--r--fs/xfs/xfs_extent_busy.h4
-rw-r--r--fs/xfs/xfs_extfree_item.c99
-rw-r--r--fs/xfs/xfs_extfree_item.h102
-rw-r--r--fs/xfs/xfs_file.c293
-rw-r--r--fs/xfs/xfs_filestream.c702
-rw-r--r--fs/xfs/xfs_filestream.h38
-rw-r--r--fs/xfs/xfs_format.h428
-rw-r--r--fs/xfs/xfs_fs.h46
-rw-r--r--fs/xfs/xfs_fsops.c144
-rw-r--r--fs/xfs/xfs_ialloc.c965
-rw-r--r--fs/xfs/xfs_ialloc.h34
-rw-r--r--fs/xfs/xfs_ialloc_btree.c175
-rw-r--r--fs/xfs/xfs_ialloc_btree.h59
-rw-r--r--fs/xfs/xfs_icache.c68
-rw-r--r--fs/xfs/xfs_icache.h63
-rw-r--r--fs/xfs/xfs_icreate_item.c189
-rw-r--r--fs/xfs/xfs_icreate_item.h34
-rw-r--r--fs/xfs/xfs_inode.c4095
-rw-r--r--fs/xfs/xfs_inode.h310
-rw-r--r--fs/xfs/xfs_inode_buf.c479
-rw-r--r--fs/xfs/xfs_inode_buf.h50
-rw-r--r--fs/xfs/xfs_inode_fork.c1906
-rw-r--r--fs/xfs/xfs_inode_fork.h171
-rw-r--r--fs/xfs/xfs_inode_item.c445
-rw-r--r--fs/xfs/xfs_inode_item.h124
-rw-r--r--fs/xfs/xfs_ioctl.c393
-rw-r--r--fs/xfs/xfs_ioctl.h10
-rw-r--r--fs/xfs/xfs_ioctl32.c29
-rw-r--r--fs/xfs/xfs_iomap.c299
-rw-r--r--fs/xfs/xfs_iomap.h8
-rw-r--r--fs/xfs/xfs_iops.c417
-rw-r--r--fs/xfs/xfs_iops.h9
-rw-r--r--fs/xfs/xfs_itable.c83
-rw-r--r--fs/xfs/xfs_linux.h87
-rw-r--r--fs/xfs/xfs_log.c299
-rw-r--r--fs/xfs/xfs_log.h148
-rw-r--r--fs/xfs/xfs_log_cil.c477
-rw-r--r--fs/xfs/xfs_log_format.h679
-rw-r--r--fs/xfs/xfs_log_priv.h173
-rw-r--r--fs/xfs/xfs_log_recover.c1223
-rw-r--r--fs/xfs/xfs_log_rlimit.c150
-rw-r--r--fs/xfs/xfs_message.c13
-rw-r--r--fs/xfs/xfs_message.h27
-rw-r--r--fs/xfs/xfs_mount.c741
-rw-r--r--fs/xfs/xfs_mount.h127
-rw-r--r--fs/xfs/xfs_mru_cache.c151
-rw-r--r--fs/xfs/xfs_mru_cache.h31
-rw-r--r--fs/xfs/xfs_qm.c912
-rw-r--r--fs/xfs/xfs_qm.h123
-rw-r--r--fs/xfs/xfs_qm_bhv.c25
-rw-r--r--fs/xfs/xfs_qm_syscalls.c302
-rw-r--r--fs/xfs/xfs_quota.h281
-rw-r--r--fs/xfs/xfs_quota_defs.h161
-rw-r--r--fs/xfs/xfs_quota_priv.h42
-rw-r--r--fs/xfs/xfs_quotaops.c55
-rw-r--r--fs/xfs/xfs_rename.c346
-rw-r--r--fs/xfs/xfs_rtalloc.c1564
-rw-r--r--fs/xfs/xfs_rtalloc.h77
-rw-r--r--fs/xfs/xfs_rtbitmap.c973
-rw-r--r--fs/xfs/xfs_sb.c836
-rw-r--r--fs/xfs/xfs_sb.h386
-rw-r--r--fs/xfs/xfs_shared.h246
-rw-r--r--fs/xfs/xfs_stats.c1
-rw-r--r--fs/xfs/xfs_stats.h18
-rw-r--r--fs/xfs/xfs_super.c188
-rw-r--r--fs/xfs/xfs_symlink.c599
-rw-r--r--fs/xfs/xfs_symlink.h27
-rw-r--r--fs/xfs/xfs_symlink_remote.c201
-rw-r--r--fs/xfs/xfs_sysctl.c26
-rw-r--r--fs/xfs/xfs_trace.c20
-rw-r--r--fs/xfs/xfs_trace.h190
-rw-r--r--fs/xfs/xfs_trans.c621
-rw-r--r--fs/xfs/xfs_trans.h325
-rw-r--r--fs/xfs/xfs_trans_ail.c47
-rw-r--r--fs/xfs/xfs_trans_buf.c135
-rw-r--r--fs/xfs/xfs_trans_dquot.c158
-rw-r--r--fs/xfs/xfs_trans_extfree.c7
-rw-r--r--fs/xfs/xfs_trans_inode.c67
-rw-r--r--fs/xfs/xfs_trans_priv.h17
-rw-r--r--fs/xfs/xfs_trans_resv.c894
-rw-r--r--fs/xfs/xfs_trans_resv.h117
-rw-r--r--fs/xfs/xfs_trans_space.h12
-rw-r--r--fs/xfs/xfs_types.h63
-rw-r--r--fs/xfs/xfs_utils.c314
-rw-r--r--fs/xfs/xfs_vnode.h17
-rw-r--r--fs/xfs/xfs_vnodeops.c2348
-rw-r--r--fs/xfs/xfs_vnodeops.h56
-rw-r--r--fs/xfs/xfs_xattr.c14
152 files changed, 32082 insertions, 23947 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 5a7ffe54f5d..399e8cec6e6 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -69,9 +69,22 @@ config XFS_RT
If unsure, say N.
+config XFS_WARN
+ bool "XFS Verbose Warnings"
+ depends on XFS_FS && !XFS_DEBUG
+ help
+ Say Y here to get an XFS build with many additional warnings.
+ It converts ASSERT checks to WARN, so will log any out-of-bounds
+ conditions that occur that would otherwise be missed. It is much
+ lighter weight than XFS_DEBUG and does not modify algorithms and will
+ not cause the kernel to panic on non-fatal errors.
+
+ However, similar to XFS_DEBUG, it is only advisable to use this if you
+ are debugging a particular problem.
+
config XFS_DEBUG
- bool "XFS Debugging support (EXPERIMENTAL)"
- depends on XFS_FS && EXPERIMENTAL
+ bool "XFS Debugging support"
+ depends on XFS_FS
help
Say Y here to get an XFS build with many debugging features,
including ASSERT checks, function wrappers around macros,
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d02201df855..c21f4350666 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -27,9 +27,12 @@ xfs-y += xfs_trace.o
# highlevel code
xfs-y += xfs_aops.o \
+ xfs_attr_inactive.o \
+ xfs_attr_list.o \
xfs_bit.o \
+ xfs_bmap_util.o \
xfs_buf.o \
- xfs_dfrag.o \
+ xfs_dir2_readdir.o \
xfs_discard.o \
xfs_error.o \
xfs_export.o \
@@ -44,12 +47,12 @@ xfs-y += xfs_aops.o \
xfs_iops.o \
xfs_itable.o \
xfs_message.o \
+ xfs_mount.o \
xfs_mru_cache.o \
xfs_super.o \
+ xfs_symlink.o \
+ xfs_trans.o \
xfs_xattr.o \
- xfs_rename.o \
- xfs_utils.o \
- xfs_vnodeops.o \
kmem.o \
uuid.o
@@ -58,22 +61,30 @@ xfs-y += xfs_alloc.o \
xfs_alloc_btree.o \
xfs_attr.o \
xfs_attr_leaf.o \
+ xfs_attr_remote.o \
xfs_bmap.o \
xfs_bmap_btree.o \
xfs_btree.o \
xfs_da_btree.o \
+ xfs_da_format.o \
xfs_dir2.o \
xfs_dir2_block.o \
xfs_dir2_data.o \
xfs_dir2_leaf.o \
xfs_dir2_node.o \
xfs_dir2_sf.o \
+ xfs_dquot_buf.o \
xfs_ialloc.o \
xfs_ialloc_btree.o \
+ xfs_icreate_item.o \
xfs_inode.o \
+ xfs_inode_fork.o \
+ xfs_inode_buf.o \
xfs_log_recover.o \
- xfs_mount.o \
- xfs_trans.o
+ xfs_log_rlimit.o \
+ xfs_sb.o \
+ xfs_symlink_remote.o \
+ xfs_trans_resv.o
# low-level transaction/log code
xfs-y += xfs_log.o \
@@ -94,7 +105,11 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
xfs_qm_bhv.o \
xfs_qm.o \
xfs_quotaops.o
-xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
+
+# xfs_rtbitmap is shared with libxfs
+xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
+ xfs_rtbitmap.o
+
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_PROC_FS) += xfs_stats.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 4a7286c1dc8..844e288b957 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -27,8 +27,6 @@
/*
* Greedy allocation. May fail and may return vmalloced memory.
- *
- * Must be freed using kmem_free_large.
*/
void *
kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
@@ -36,7 +34,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
void *ptr;
size_t kmsize = maxsize;
- while (!(ptr = kmem_zalloc_large(kmsize))) {
+ while (!(ptr = vzalloc(kmsize))) {
if ((kmsize >>= 1) <= minsize)
kmsize = minsize;
}
@@ -65,13 +63,32 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
}
void *
-kmem_zalloc(size_t size, xfs_km_flags_t flags)
+kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
{
+ unsigned noio_flag = 0;
void *ptr;
+ gfp_t lflags;
- ptr = kmem_alloc(size, flags);
+ ptr = kmem_zalloc(size, flags | KM_MAYFAIL);
if (ptr)
- memset((char *)ptr, 0, (int)size);
+ return ptr;
+
+ /*
+ * __vmalloc() will allocate data pages and auxillary structures (e.g.
+ * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
+ * here. Hence we need to tell memory reclaim that we are in such a
+ * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
+ * the filesystem here and potentially deadlocking.
+ */
+ if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+ noio_flag = memalloc_noio_save();
+
+ lflags = kmem_flags_convert(flags);
+ ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+
+ if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+ memalloc_noio_restore(noio_flag);
+
return ptr;
}
@@ -119,14 +136,3 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
congestion_wait(BLK_RW_ASYNC, HZ/50);
} while (1);
}
-
-void *
-kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
-{
- void *ptr;
-
- ptr = kmem_zone_alloc(zone, flags);
- if (ptr)
- memset((char *)ptr, 0, kmem_cache_size(zone));
- return ptr;
-}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index b2f2620f9a8..64db0e53ede 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -32,6 +32,7 @@ typedef unsigned __bitwise xfs_km_flags_t;
#define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u)
#define KM_NOFS ((__force xfs_km_flags_t)0x0004u)
#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u)
+#define KM_ZERO ((__force xfs_km_flags_t)0x0010u)
/*
* We use a special process flag to avoid recursive callbacks into
@@ -43,7 +44,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
{
gfp_t lflags;
- BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL));
+ BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO));
if (flags & KM_NOSLEEP) {
lflags = GFP_ATOMIC | __GFP_NOWARN;
@@ -52,25 +53,27 @@ kmem_flags_convert(xfs_km_flags_t flags)
if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
lflags &= ~__GFP_FS;
}
+
+ if (flags & KM_ZERO)
+ lflags |= __GFP_ZERO;
+
return lflags;
}
extern void *kmem_alloc(size_t, xfs_km_flags_t);
-extern void *kmem_zalloc(size_t, xfs_km_flags_t);
+extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
extern void kmem_free(const void *);
-static inline void *kmem_zalloc_large(size_t size)
-{
- return vzalloc(size);
-}
-static inline void kmem_free_large(void *ptr)
-{
- vfree(ptr);
-}
extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
+static inline void *
+kmem_zalloc(size_t size, xfs_km_flags_t flags)
+{
+ return kmem_alloc(size, flags | KM_ZERO);
+}
+
/*
* Zone interfaces
*/
@@ -109,6 +112,11 @@ kmem_zone_destroy(kmem_zone_t *zone)
}
extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
-extern void *kmem_zone_zalloc(kmem_zone_t *, xfs_km_flags_t);
+
+static inline void *
+kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
+{
+ return kmem_zone_alloc(zone, flags | KM_ZERO);
+}
#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/mrlock.h b/fs/xfs/mrlock.h
index ff6a19873e5..e3c92d19e54 100644
--- a/fs/xfs/mrlock.h
+++ b/fs/xfs/mrlock.h
@@ -22,12 +22,12 @@
typedef struct {
struct rw_semaphore mr_lock;
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
int mr_writer;
#endif
} mrlock_t;
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
#define mrinit(mrp, name) \
do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
#else
@@ -46,7 +46,7 @@ static inline void mraccess_nested(mrlock_t *mrp, int subclass)
static inline void mrupdate_nested(mrlock_t *mrp, int subclass)
{
down_write_nested(&mrp->mr_lock, subclass);
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
mrp->mr_writer = 1;
#endif
}
@@ -60,7 +60,7 @@ static inline int mrtryupdate(mrlock_t *mrp)
{
if (!down_write_trylock(&mrp->mr_lock))
return 0;
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
mrp->mr_writer = 1;
#endif
return 1;
@@ -68,7 +68,7 @@ static inline int mrtryupdate(mrlock_t *mrp)
static inline void mrunlock_excl(mrlock_t *mrp)
{
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
mrp->mr_writer = 0;
#endif
up_write(&mrp->mr_lock);
@@ -81,7 +81,7 @@ static inline void mrunlock_shared(mrlock_t *mrp)
static inline void mrdemote(mrlock_t *mrp)
{
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
mrp->mr_writer = 0;
#endif
downgrade_write(&mrp->mr_lock);
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index d8b11b7f94a..a742c47f7d5 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -24,6 +24,11 @@
#define XFS_BUF_LOCK_TRACKING 1
#endif
+#ifdef CONFIG_XFS_WARN
+#define XFS_WARN 1
+#endif
+
+
#include "xfs_linux.h"
#endif /* __XFS_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 1d32f1d5276..6888ad886ff 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -16,11 +16,15 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_ag.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
#include "xfs_acl.h"
#include "xfs_attr.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_vnodeops.h"
#include "xfs_trace.h"
#include <linux/slab.h>
#include <linux/xattr.h>
@@ -34,7 +38,9 @@
*/
STATIC struct posix_acl *
-xfs_acl_from_disk(struct xfs_acl *aclp)
+xfs_acl_from_disk(
+ struct xfs_acl *aclp,
+ int max_entries)
{
struct posix_acl_entry *acl_e;
struct posix_acl *acl;
@@ -42,7 +48,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp)
unsigned int count, i;
count = be32_to_cpu(aclp->acl_cnt);
- if (count > XFS_ACL_MAX_ENTRIES)
+ if (count > max_entries)
return ERR_PTR(-EFSCORRUPTED);
acl = posix_acl_alloc(count, GFP_KERNEL);
@@ -64,14 +70,15 @@ xfs_acl_from_disk(struct xfs_acl *aclp)
switch (acl_e->e_tag) {
case ACL_USER:
+ acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id));
+ break;
case ACL_GROUP:
- acl_e->e_id = be32_to_cpu(ace->ae_id);
+ acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id));
break;
case ACL_USER_OBJ:
case ACL_GROUP_OBJ:
case ACL_MASK:
case ACL_OTHER:
- acl_e->e_id = ACL_UNDEFINED_ID;
break;
default:
goto fail;
@@ -97,7 +104,18 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
acl_e = &acl->a_entries[i];
ace->ae_tag = cpu_to_be32(acl_e->e_tag);
- ace->ae_id = cpu_to_be32(acl_e->e_id);
+ switch (acl_e->e_tag) {
+ case ACL_USER:
+ ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid));
+ break;
+ case ACL_GROUP:
+ ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid));
+ break;
+ default:
+ ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID);
+ break;
+ }
+
ace->ae_perm = cpu_to_be16(acl_e->e_perm);
}
}
@@ -106,15 +124,11 @@ struct posix_acl *
xfs_get_acl(struct inode *inode, int type)
{
struct xfs_inode *ip = XFS_I(inode);
- struct posix_acl *acl;
+ struct posix_acl *acl = NULL;
struct xfs_acl *xfs_acl;
- int len = sizeof(struct xfs_acl);
unsigned char *ea_name;
int error;
-
- acl = get_cached_acl(inode, type);
- if (acl != ACL_NOT_CACHED)
- return acl;
+ int len;
trace_xfs_get_acl(ip);
@@ -133,8 +147,8 @@ xfs_get_acl(struct inode *inode, int type)
* If we have a cached ACLs value just return it, not need to
* go out to the disk.
*/
-
- xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
+ len = XFS_ACL_MAX_SIZE(ip->i_mount);
+ xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
if (!xfs_acl)
return ERR_PTR(-ENOMEM);
@@ -146,34 +160,29 @@ xfs_get_acl(struct inode *inode, int type)
* cache entry, for any other error assume it is transient and
* leave the cache entry as ACL_NOT_CACHED.
*/
- if (error == -ENOATTR) {
- acl = NULL;
+ if (error == -ENOATTR)
goto out_update_cache;
- }
goto out;
}
- acl = xfs_acl_from_disk(xfs_acl);
+ acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount));
if (IS_ERR(acl))
goto out;
- out_update_cache:
+out_update_cache:
set_cached_acl(inode, type, acl);
- out:
- kfree(xfs_acl);
+out:
+ kmem_free(xfs_acl);
return acl;
}
STATIC int
-xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
{
struct xfs_inode *ip = XFS_I(inode);
unsigned char *ea_name;
int error;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
switch (type) {
case ACL_TYPE_ACCESS:
ea_name = SGI_ACL_FILE;
@@ -189,21 +198,22 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
if (acl) {
struct xfs_acl *xfs_acl;
- int len;
+ int len = XFS_ACL_MAX_SIZE(ip->i_mount);
- xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
+ xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
if (!xfs_acl)
return -ENOMEM;
xfs_acl_to_disk(xfs_acl, acl);
- len = sizeof(struct xfs_acl) -
- (sizeof(struct xfs_acl_entry) *
- (XFS_ACL_MAX_ENTRIES - acl->a_count));
+
+ /* subtract away the unused acl entries */
+ len -= sizeof(struct xfs_acl_entry) *
+ (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
len, ATTR_ROOT);
- kfree(xfs_acl);
+ kmem_free(xfs_acl);
} else {
/*
* A NULL ACL argument means we want to remove the ACL.
@@ -243,7 +253,7 @@ xfs_set_mode(struct inode *inode, umode_t mode)
static int
xfs_acl_exists(struct inode *inode, unsigned char *name)
{
- int len = sizeof(struct xfs_acl);
+ int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb));
return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
ATTR_ROOT|ATTR_KERNOVAL) == 0);
@@ -263,131 +273,23 @@ posix_acl_default_exists(struct inode *inode)
return xfs_acl_exists(inode, SGI_ACL_DEFAULT);
}
-/*
- * No need for i_mutex because the inode is not yet exposed to the VFS.
- */
-int
-xfs_inherit_acl(struct inode *inode, struct posix_acl *acl)
-{
- umode_t mode = inode->i_mode;
- int error = 0, inherit = 0;
-
- if (S_ISDIR(inode->i_mode)) {
- error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
- if (error)
- goto out;
- }
-
- error = posix_acl_create(&acl, GFP_KERNEL, &mode);
- if (error < 0)
- return error;
-
- /*
- * If posix_acl_create returns a positive value we need to
- * inherit a permission that can't be represented using the Unix
- * mode bits and we actually need to set an ACL.
- */
- if (error > 0)
- inherit = 1;
-
- error = xfs_set_mode(inode, mode);
- if (error)
- goto out;
-
- if (inherit)
- error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl);
-
-out:
- posix_acl_release(acl);
- return error;
-}
-
int
-xfs_acl_chmod(struct inode *inode)
-{
- struct posix_acl *acl;
- int error;
-
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
- acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
-
- error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
- if (error)
- return error;
-
- error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl);
- posix_acl_release(acl);
- return error;
-}
-
-static int
-xfs_xattr_acl_get(struct dentry *dentry, const char *name,
- void *value, size_t size, int type)
-{
- struct posix_acl *acl;
- int error;
-
- acl = xfs_get_acl(dentry->d_inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
-
- error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
- posix_acl_release(acl);
-
- return error;
-}
-
-static int
-xfs_xattr_acl_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- struct inode *inode = dentry->d_inode;
- struct posix_acl *acl = NULL;
int error = 0;
- if (flags & XATTR_CREATE)
- return -EINVAL;
- if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
- return value ? -EACCES : 0;
- if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
- return -EPERM;
-
- if (!value)
+ if (!acl)
goto set_acl;
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (!acl) {
- /*
- * acl_set_file(3) may request that we set default ACLs with
- * zero length -- defend (gracefully) against that here.
- */
- goto out;
- }
- if (IS_ERR(acl)) {
- error = PTR_ERR(acl);
- goto out;
- }
-
- error = posix_acl_valid(acl);
- if (error)
- goto out_release;
-
- error = -EINVAL;
- if (acl->a_count > XFS_ACL_MAX_ENTRIES)
- goto out_release;
+ error = -E2BIG;
+ if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
+ return error;
if (type == ACL_TYPE_ACCESS) {
umode_t mode = inode->i_mode;
error = posix_acl_equiv_mode(acl, &mode);
if (error <= 0) {
- posix_acl_release(acl);
acl = NULL;
if (error < 0)
@@ -396,27 +298,9 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
error = xfs_set_mode(inode, mode);
if (error)
- goto out_release;
+ return error;
}
set_acl:
- error = xfs_set_acl(inode, type, acl);
- out_release:
- posix_acl_release(acl);
- out:
- return error;
+ return __xfs_set_acl(inode, type, acl);
}
-
-const struct xattr_handler xfs_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .get = xfs_xattr_acl_get,
- .set = xfs_xattr_acl_set,
-};
-
-const struct xattr_handler xfs_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .get = xfs_xattr_acl_get,
- .set = xfs_xattr_acl_set,
-};
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 39632d94135..5dc16374451 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -22,19 +22,36 @@ struct inode;
struct posix_acl;
struct xfs_inode;
-#define XFS_ACL_MAX_ENTRIES 25
#define XFS_ACL_NOT_PRESENT (-1)
/* On-disk XFS access control list structure */
+struct xfs_acl_entry {
+ __be32 ae_tag;
+ __be32 ae_id;
+ __be16 ae_perm;
+ __be16 ae_pad; /* fill the implicit hole in the structure */
+};
+
struct xfs_acl {
- __be32 acl_cnt;
- struct xfs_acl_entry {
- __be32 ae_tag;
- __be32 ae_id;
- __be16 ae_perm;
- } acl_entry[XFS_ACL_MAX_ENTRIES];
+ __be32 acl_cnt;
+ struct xfs_acl_entry acl_entry[0];
};
+/*
+ * The number of ACL entries allowed is defined by the on-disk format.
+ * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
+ * limited only by the maximum size of the xattr that stores the information.
+ */
+#define XFS_ACL_MAX_ENTRIES(mp) \
+ (xfs_sb_version_hascrc(&mp->m_sb) \
+ ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
+ sizeof(struct xfs_acl_entry) \
+ : 25)
+
+#define XFS_ACL_MAX_SIZE(mp) \
+ (sizeof(struct xfs_acl) + \
+ sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
+
/* On-disk XFS extended attribute names */
#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE"
#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT"
@@ -43,20 +60,15 @@ struct xfs_acl {
#ifdef CONFIG_XFS_POSIX_ACL
extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
-extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
-extern int xfs_acl_chmod(struct inode *inode);
+extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
extern int posix_acl_access_exists(struct inode *inode);
extern int posix_acl_default_exists(struct inode *inode);
-
-extern const struct xattr_handler xfs_xattr_acl_access_handler;
-extern const struct xattr_handler xfs_xattr_acl_default_handler;
#else
static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
{
return NULL;
}
-# define xfs_inherit_acl(inode, default_acl) 0
-# define xfs_acl_chmod(inode) 0
+# define xfs_set_acl NULL
# define posix_acl_access_exists(inode) 0
# define posix_acl_default_exists(inode) 0
#endif /* CONFIG_XFS_POSIX_ACL */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f2aeedb6a57..6e247a99f5d 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -30,6 +30,7 @@ struct xfs_trans;
#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */
#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */
+#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */
#define XFS_AGF_VERSION 1
#define XFS_AGI_VERSION 1
@@ -63,14 +64,33 @@ typedef struct xfs_agf {
__be32 agf_spare0; /* spare field */
__be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */
__be32 agf_spare1; /* spare field */
+
__be32 agf_flfirst; /* first freelist block's index */
__be32 agf_fllast; /* last freelist block's index */
__be32 agf_flcount; /* count of blocks in freelist */
__be32 agf_freeblks; /* total free blocks */
+
__be32 agf_longest; /* longest free space */
__be32 agf_btreeblks; /* # of blocks held in AGF btrees */
+ uuid_t agf_uuid; /* uuid of filesystem */
+
+ /*
+ * reserve some contiguous space for future logged fields before we add
+ * the unlogged fields. This makes the range logging via flags and
+ * structure offsets much simpler.
+ */
+ __be64 agf_spare64[16];
+
+ /* unlogged fields, written during buffer writeback. */
+ __be64 agf_lsn; /* last write sequence */
+ __be32 agf_crc; /* crc of agf sector */
+ __be32 agf_spare2;
+
+ /* structure must be padded to 64 bit alignment */
} xfs_agf_t;
+#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
+
#define XFS_AGF_MAGICNUM 0x00000001
#define XFS_AGF_VERSIONNUM 0x00000002
#define XFS_AGF_SEQNO 0x00000004
@@ -83,7 +103,8 @@ typedef struct xfs_agf {
#define XFS_AGF_FREEBLKS 0x00000200
#define XFS_AGF_LONGEST 0x00000400
#define XFS_AGF_BTREEBLKS 0x00000800
-#define XFS_AGF_NUM_BITS 12
+#define XFS_AGF_UUID 0x00001000
+#define XFS_AGF_NUM_BITS 13
#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
#define XFS_AGF_FLAGS \
@@ -98,7 +119,8 @@ typedef struct xfs_agf {
{ XFS_AGF_FLCOUNT, "FLCOUNT" }, \
{ XFS_AGF_FREEBLKS, "FREEBLKS" }, \
{ XFS_AGF_LONGEST, "LONGEST" }, \
- { XFS_AGF_BTREEBLKS, "BTREEBLKS" }
+ { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \
+ { XFS_AGF_UUID, "UUID" }
/* disk block (xfs_daddr_t) in the AG */
#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
@@ -108,8 +130,6 @@ typedef struct xfs_agf {
extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
-extern const struct xfs_buf_ops xfs_agf_buf_ops;
-
/*
* Size of the unlinked inode hash table in the agi.
*/
@@ -132,6 +152,7 @@ typedef struct xfs_agi {
__be32 agi_root; /* root of inode btree */
__be32 agi_level; /* levels in inode btree */
__be32 agi_freecount; /* number of free inodes */
+
__be32 agi_newino; /* new inode just allocated */
__be32 agi_dirino; /* last directory inode chunk */
/*
@@ -139,21 +160,38 @@ typedef struct xfs_agi {
* still being referenced.
*/
__be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
+ /*
+ * This marks the end of logging region 1 and start of logging region 2.
+ */
+ uuid_t agi_uuid; /* uuid of filesystem */
+ __be32 agi_crc; /* crc of agi sector */
+ __be32 agi_pad32;
+ __be64 agi_lsn; /* last write sequence */
+
+ __be32 agi_free_root; /* root of the free inode btree */
+ __be32 agi_free_level;/* levels in free inode btree */
+
+ /* structure must be padded to 64 bit alignment */
} xfs_agi_t;
-#define XFS_AGI_MAGICNUM 0x00000001
-#define XFS_AGI_VERSIONNUM 0x00000002
-#define XFS_AGI_SEQNO 0x00000004
-#define XFS_AGI_LENGTH 0x00000008
-#define XFS_AGI_COUNT 0x00000010
-#define XFS_AGI_ROOT 0x00000020
-#define XFS_AGI_LEVEL 0x00000040
-#define XFS_AGI_FREECOUNT 0x00000080
-#define XFS_AGI_NEWINO 0x00000100
-#define XFS_AGI_DIRINO 0x00000200
-#define XFS_AGI_UNLINKED 0x00000400
-#define XFS_AGI_NUM_BITS 11
-#define XFS_AGI_ALL_BITS ((1 << XFS_AGI_NUM_BITS) - 1)
+#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
+
+#define XFS_AGI_MAGICNUM (1 << 0)
+#define XFS_AGI_VERSIONNUM (1 << 1)
+#define XFS_AGI_SEQNO (1 << 2)
+#define XFS_AGI_LENGTH (1 << 3)
+#define XFS_AGI_COUNT (1 << 4)
+#define XFS_AGI_ROOT (1 << 5)
+#define XFS_AGI_LEVEL (1 << 6)
+#define XFS_AGI_FREECOUNT (1 << 7)
+#define XFS_AGI_NEWINO (1 << 8)
+#define XFS_AGI_DIRINO (1 << 9)
+#define XFS_AGI_UNLINKED (1 << 10)
+#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */
+#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1)
+#define XFS_AGI_FREE_ROOT (1 << 11)
+#define XFS_AGI_FREE_LEVEL (1 << 12)
+#define XFS_AGI_NUM_BITS_R2 13
/* disk block (xfs_daddr_t) in the AG */
#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
@@ -163,73 +201,40 @@ typedef struct xfs_agi {
extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, struct xfs_buf **bpp);
-extern const struct xfs_buf_ops xfs_agi_buf_ops;
-
/*
* The third a.g. block contains the a.g. freelist, an array
* of block pointers to blocks owned by the allocation btree code.
*/
#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
-#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t))
#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr))
-typedef struct xfs_agfl {
- __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */
-} xfs_agfl_t;
+#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
+ (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
+ (__be32 *)(bp)->b_addr)
/*
- * Per-ag incore structure, copies of information in agf and agi,
- * to improve the performance of allocation group selection.
+ * Size of the AGFL. For CRC-enabled filesystes we steal a couple of
+ * slots in the beginning of the block for a proper header with the
+ * location information and CRC.
*/
-#define XFS_PAGB_NUM_SLOTS 128
-
-typedef struct xfs_perag {
- struct xfs_mount *pag_mount; /* owner filesystem */
- xfs_agnumber_t pag_agno; /* AG this structure belongs to */
- atomic_t pag_ref; /* perag reference count */
- char pagf_init; /* this agf's entry is initialized */
- char pagi_init; /* this agi's entry is initialized */
- char pagf_metadata; /* the agf is preferred to be metadata */
- char pagi_inodeok; /* The agi is ok for inodes */
- __uint8_t pagf_levels[XFS_BTNUM_AGF];
- /* # of levels in bno & cnt btree */
- __uint32_t pagf_flcount; /* count of blocks in freelist */
- xfs_extlen_t pagf_freeblks; /* total free blocks */
- xfs_extlen_t pagf_longest; /* longest free space */
- __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
- xfs_agino_t pagi_freecount; /* number of free inodes */
- xfs_agino_t pagi_count; /* number of allocated inodes */
+#define XFS_AGFL_SIZE(mp) \
+ (((mp)->m_sb.sb_sectsize - \
+ (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ sizeof(struct xfs_agfl) : 0)) / \
+ sizeof(xfs_agblock_t))
- /*
- * Inode allocation search lookup optimisation.
- * If the pagino matches, the search for new inodes
- * doesn't need to search the near ones again straight away
- */
- xfs_agino_t pagl_pagino;
- xfs_agino_t pagl_leftrec;
- xfs_agino_t pagl_rightrec;
-#ifdef __KERNEL__
- spinlock_t pagb_lock; /* lock for pagb_tree */
- struct rb_root pagb_tree; /* ordered tree of busy extents */
-
- atomic_t pagf_fstrms; /* # of filestreams active in this AG */
-
- spinlock_t pag_ici_lock; /* incore inode cache lock */
- struct radix_tree_root pag_ici_root; /* incore inode cache root */
- int pag_ici_reclaimable; /* reclaimable inodes */
- struct mutex pag_ici_reclaim_lock; /* serialisation point */
- unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
-
- /* buffer cache index */
- spinlock_t pag_buf_lock; /* lock for pag_buf_tree */
- struct rb_root pag_buf_tree; /* ordered tree of active buffers */
-
- /* for rcu-safe freeing */
- struct rcu_head rcu_head;
-#endif
- int pagb_count; /* pagb slots in use */
-} xfs_perag_t;
+typedef struct xfs_agfl {
+ __be32 agfl_magicnum;
+ __be32 agfl_seqno;
+ uuid_t agfl_uuid;
+ __be64 agfl_lsn;
+ __be32 agfl_crc;
+ __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
+} xfs_agfl_t;
+
+#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
/*
* tags for inode radix tree
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 393055fe3ae..d43813267a8 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -17,23 +17,25 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_error.h"
+#include "xfs_cksum.h"
#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
struct workqueue_struct *xfs_alloc_wq;
@@ -173,6 +175,7 @@ xfs_alloc_compute_diff(
xfs_agblock_t wantbno, /* target starting block */
xfs_extlen_t wantlen, /* target length */
xfs_extlen_t alignment, /* target alignment */
+ char userdata, /* are we allocating data? */
xfs_agblock_t freebno, /* freespace's starting block */
xfs_extlen_t freelen, /* freespace's length */
xfs_agblock_t *newbnop) /* result: best start block from free */
@@ -187,7 +190,14 @@ xfs_alloc_compute_diff(
ASSERT(freelen >= wantlen);
freeend = freebno + freelen;
wantend = wantbno + wantlen;
- if (freebno >= wantbno) {
+ /*
+ * We want to allocate from the start of a free extent if it is past
+ * the desired block or if we are allocating user data and the free
+ * extent is before desired block. The second case is there to allow
+ * for contiguous allocation from the remaining free space if the file
+ * grows in the short term.
+ */
+ if (freebno >= wantbno || (userdata && freeend < wantend)) {
if ((newbno1 = roundup(freebno, alignment)) >= freeend)
newbno1 = NULLAGBLOCK;
} else if (freeend >= wantend && alignment > 1) {
@@ -247,16 +257,14 @@ xfs_alloc_fix_len(
k = rlen % args->prod;
if (k == args->mod)
return;
- if (k > args->mod) {
- if ((int)(rlen = rlen - k - args->mod) < (int)args->minlen)
- return;
- } else {
- if ((int)(rlen = rlen - args->prod - (args->mod - k)) <
- (int)args->minlen)
- return;
- }
- ASSERT(rlen >= args->minlen);
- ASSERT(rlen <= args->maxlen);
+ if (k > args->mod)
+ rlen = rlen - (k - args->mod);
+ else
+ rlen = rlen - args->prod + (args->mod - k);
+ if ((int)rlen < (int)args->minlen)
+ return;
+ ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
+ ASSERT(rlen % args->prod == args->mod);
args->len = rlen;
}
@@ -430,53 +438,80 @@ xfs_alloc_fixup_trees(
return 0;
}
-static void
+static bool
xfs_agfl_verify(
struct xfs_buf *bp)
{
-#ifdef WHEN_CRCS_COME_ALONG
- /*
- * we cannot actually do any verification of the AGFL because mkfs does
- * not initialise the AGFL to zero or NULL. Hence the only valid part of
- * the AGFL is what the AGF says is active. We can't get to the AGF, so
- * we can't verify just those entries are valid.
- *
- * This problem goes away when the CRC format change comes along as that
- * requires the AGFL to be initialised by mkfs. At that point, we can
- * verify the blocks in the agfl -active or not- lie within the bounds
- * of the AG. Until then, just leave this check ifdef'd out.
- */
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
- int agfl_ok = 1;
-
int i;
+ if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+ return false;
+ /*
+ * during growfs operations, the perag is not fully initialised,
+ * so we can't use it for any useful checking. growfs ensures we can't
+ * use it by using uncached buffers that don't have the perag attached
+ * so we can detect and avoid this problem.
+ */
+ if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
+ return false;
+
for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
- if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK ||
+ if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
- agfl_ok = 0;
+ return false;
}
-
- if (!agfl_ok) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
-#endif
+ return true;
}
static void
-xfs_agfl_write_verify(
+xfs_agfl_read_verify(
struct xfs_buf *bp)
{
- xfs_agfl_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ /*
+ * There is no verification of non-crc AGFLs because mkfs does not
+ * initialise the AGFL to zero or NULL. Hence the only valid part of the
+ * AGFL is what the AGF says is active. We can't get to the AGF, so we
+ * can't verify just those entries are valid.
+ */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_agfl_verify(bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
}
static void
-xfs_agfl_read_verify(
+xfs_agfl_write_verify(
struct xfs_buf *bp)
{
- xfs_agfl_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ /* no verification of non-crc AGFLs */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (!xfs_agfl_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (bip)
+ XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
}
const struct xfs_buf_ops xfs_agfl_buf_ops = {
@@ -504,7 +539,6 @@ xfs_alloc_read_agfl(
XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
if (error)
return error;
- ASSERT(!xfs_buf_geterror(bp));
xfs_buf_set_ref(bp, XFS_AGFL_REF);
*bpp = bp;
return 0;
@@ -772,7 +806,8 @@ xfs_alloc_find_best_extent(
xfs_alloc_fix_len(args);
sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, *sbnoa,
+ args->alignment,
+ args->userdata, *sbnoa,
*slena, &new);
/*
@@ -836,13 +871,13 @@ xfs_alloc_ag_vextent_near(
xfs_agblock_t ltnew; /* useful start bno of left side */
xfs_extlen_t rlen; /* length of returned extent */
int forced = 0;
-#if defined(DEBUG) && defined(__KERNEL__)
+#ifdef DEBUG
/*
* Randomly don't execute the first algorithm.
*/
int dofirst; /* set to do first algorithm */
- dofirst = random32() & 1;
+ dofirst = prandom_u32() & 1;
#endif
restart:
@@ -896,8 +931,8 @@ restart:
xfs_extlen_t blen=0;
xfs_agblock_t bnew=0;
-#if defined(DEBUG) && defined(__KERNEL__)
- if (!dofirst)
+#ifdef DEBUG
+ if (dofirst)
break;
#endif
/*
@@ -943,7 +978,8 @@ restart:
if (args->len < blen)
continue;
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, ltbnoa, ltlena, &ltnew);
+ args->alignment, args->userdata, ltbnoa,
+ ltlena, &ltnew);
if (ltnew != NULLAGBLOCK &&
(args->len > blen || ltdiff < bdiff)) {
bdiff = ltdiff;
@@ -1095,7 +1131,8 @@ restart:
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, ltbnoa, ltlena, &ltnew);
+ args->alignment, args->userdata, ltbnoa,
+ ltlena, &ltnew);
error = xfs_alloc_find_best_extent(args,
&bno_cur_lt, &bno_cur_gt,
@@ -1111,7 +1148,8 @@ restart:
args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
xfs_alloc_fix_len(args);
gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, gtbnoa, gtlena, &gtnew);
+ args->alignment, args->userdata, gtbnoa,
+ gtlena, &gtnew);
error = xfs_alloc_find_best_extent(args,
&bno_cur_gt, &bno_cur_lt,
@@ -1170,7 +1208,7 @@ restart:
}
rlen = args->len;
(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
- ltbnoa, ltlena, &ltnew);
+ args->userdata, ltbnoa, ltlena, &ltnew);
ASSERT(ltnew >= ltbno);
ASSERT(ltnew + rlen <= ltbnoa + ltlena);
ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
@@ -1925,8 +1963,6 @@ xfs_alloc_fix_freelist(
targs.mp = mp;
targs.agbp = agbp;
targs.agno = args->agno;
- targs.mod = targs.minleft = targs.wasdel = targs.userdata =
- targs.minalignslop = 0;
targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
targs.type = XFS_ALLOCTYPE_THIS_AG;
targs.pag = pag;
@@ -1984,18 +2020,18 @@ xfs_alloc_get_freelist(
int btreeblk) /* destination is a AGF btree */
{
xfs_agf_t *agf; /* a.g. freespace structure */
- xfs_agfl_t *agfl; /* a.g. freelist structure */
xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */
xfs_agblock_t bno; /* block number returned */
+ __be32 *agfl_bno;
int error;
int logflags;
- xfs_mount_t *mp; /* mount structure */
+ xfs_mount_t *mp = tp->t_mountp;
xfs_perag_t *pag; /* per allocation group data */
- agf = XFS_BUF_TO_AGF(agbp);
/*
* Freelist is empty, give up.
*/
+ agf = XFS_BUF_TO_AGF(agbp);
if (!agf->agf_flcount) {
*bnop = NULLAGBLOCK;
return 0;
@@ -2003,15 +2039,17 @@ xfs_alloc_get_freelist(
/*
* Read the array of free blocks.
*/
- mp = tp->t_mountp;
- if ((error = xfs_alloc_read_agfl(mp, tp,
- be32_to_cpu(agf->agf_seqno), &agflbp)))
+ error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
+ &agflbp);
+ if (error)
return error;
- agfl = XFS_BUF_TO_AGFL(agflbp);
+
+
/*
* Get the block number and update the data structures.
*/
- bno = be32_to_cpu(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
be32_add_cpu(&agf->agf_flfirst, 1);
xfs_trans_brelse(tp, agflbp);
if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
@@ -2060,11 +2098,14 @@ xfs_alloc_log_agf(
offsetof(xfs_agf_t, agf_freeblks),
offsetof(xfs_agf_t, agf_longest),
offsetof(xfs_agf_t, agf_btreeblks),
+ offsetof(xfs_agf_t, agf_uuid),
sizeof(xfs_agf_t)
};
trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
+
xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
}
@@ -2101,12 +2142,13 @@ xfs_alloc_put_freelist(
int btreeblk) /* block came from a AGF btree */
{
xfs_agf_t *agf; /* a.g. freespace structure */
- xfs_agfl_t *agfl; /* a.g. free block array */
__be32 *blockp;/* pointer to array entry */
int error;
int logflags;
xfs_mount_t *mp; /* mount structure */
xfs_perag_t *pag; /* per allocation group data */
+ __be32 *agfl_bno;
+ int startoff;
agf = XFS_BUF_TO_AGF(agbp);
mp = tp->t_mountp;
@@ -2114,7 +2156,6 @@ xfs_alloc_put_freelist(
if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
be32_to_cpu(agf->agf_seqno), &agflbp)))
return error;
- agfl = XFS_BUF_TO_AGFL(agflbp);
be32_add_cpu(&agf->agf_fllast, 1);
if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
agf->agf_fllast = 0;
@@ -2135,32 +2176,38 @@ xfs_alloc_put_freelist(
xfs_alloc_log_agf(tp, agbp, logflags);
ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
- blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)];
+
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
*blockp = cpu_to_be32(bno);
+ startoff = (char *)blockp - (char *)agflbp->b_addr;
+
xfs_alloc_log_agf(tp, agbp, logflags);
- xfs_trans_log_buf(tp, agflbp,
- (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl),
- (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl +
- sizeof(xfs_agblock_t) - 1));
+
+ xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
+ xfs_trans_log_buf(tp, agflbp, startoff,
+ startoff + sizeof(xfs_agblock_t) - 1);
return 0;
}
-static void
+static bool
xfs_agf_verify(
+ struct xfs_mount *mp,
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
- struct xfs_agf *agf;
- int agf_ok;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
- agf = XFS_BUF_TO_AGF(bp);
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid))
+ return false;
- agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
- XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
- be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
- be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp);
+ if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
+ XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+ be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+ be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
+ be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
+ be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
+ return false;
/*
* during growfs operations, the perag is not fully initialised,
@@ -2168,33 +2215,55 @@ xfs_agf_verify(
* use it by using uncached buffers that don't have the perag attached
* so we can detect and avoid this problem.
*/
- if (bp->b_pag)
- agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) ==
- bp->b_pag->pag_agno;
+ if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
+ return false;
- if (xfs_sb_version_haslazysbcount(&mp->m_sb))
- agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
- be32_to_cpu(agf->agf_length);
+ if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
+ be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
+ return false;
+
+ return true;;
- if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
- XFS_RANDOM_ALLOC_READ_AGF))) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
}
static void
xfs_agf_read_verify(
struct xfs_buf *bp)
{
- xfs_agf_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
+ XFS_ERRTAG_ALLOC_READ_AGF,
+ XFS_RANDOM_ALLOC_READ_AGF))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
}
static void
xfs_agf_write_verify(
struct xfs_buf *bp)
{
- xfs_agf_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ if (!xfs_agf_verify(mp, bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
}
const struct xfs_buf_ops xfs_agf_buf_ops = {
@@ -2215,6 +2284,8 @@ xfs_read_agf(
{
int error;
+ trace_xfs_read_agf(mp, agno);
+
ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(
mp, tp, mp->m_ddev_targp,
@@ -2245,8 +2316,9 @@ xfs_alloc_read_agf(
struct xfs_perag *pag; /* per allocation group data */
int error;
- ASSERT(agno != NULLAGNUMBER);
+ trace_xfs_alloc_read_agf(mp, agno);
+ ASSERT(agno != NULLAGNUMBER);
error = xfs_read_agf(mp, tp, agno,
(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
bpp);
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 99d0a610155..feacb061bab 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -231,7 +231,4 @@ xfs_alloc_get_rec(
xfs_extlen_t *len, /* output: length of extent */
int *stat); /* output: success/failure */
-extern const struct xfs_buf_ops xfs_agf_buf_ops;
-extern const struct xfs_buf_ops xfs_agfl_buf_ops;
-
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index b1ddef6b268..8358f1ded94 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -17,22 +17,21 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
STATIC struct xfs_btree_cur *
@@ -71,7 +70,6 @@ xfs_allocbt_alloc_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *start,
union xfs_btree_ptr *new,
- int length,
int *stat)
{
int error;
@@ -272,7 +270,7 @@ xfs_allocbt_key_diff(
return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
}
-static void
+static bool
xfs_allocbt_verify(
struct xfs_buf *bp)
{
@@ -280,66 +278,105 @@ xfs_allocbt_verify(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_perag *pag = bp->b_pag;
unsigned int level;
- int sblock_ok; /* block passes checks */
/*
* magic number and level verification
*
- * During growfs operations, we can't verify the exact level as the
- * perag is not fully initialised and hence not attached to the buffer.
- * In this case, check against the maximum tree depth.
+ * During growfs operations, we can't verify the exact level or owner as
+ * the perag is not fully initialised and hence not attached to the
+ * buffer. In this case, check against the maximum tree depth.
+ *
+ * Similarly, during log recovery we will have a perag structure
+ * attached, but the agf information will not yet have been initialised
+ * from the on disk AGF. Again, we can only check against maximum limits
+ * in this case.
*/
level = be16_to_cpu(block->bb_level);
switch (block->bb_magic) {
+ case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+ return false;
+ if (pag &&
+ be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ return false;
+ /* fall through */
case cpu_to_be32(XFS_ABTB_MAGIC):
- if (pag)
- sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi];
- else
- sblock_ok = level < mp->m_ag_maxlevels;
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
+ return false;
+ } else if (level >= mp->m_ag_maxlevels)
+ return false;
break;
+ case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+ return false;
+ if (pag &&
+ be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ return false;
+ /* fall through */
case cpu_to_be32(XFS_ABTC_MAGIC):
- if (pag)
- sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi];
- else
- sblock_ok = level < mp->m_ag_maxlevels;
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
+ return false;
+ } else if (level >= mp->m_ag_maxlevels)
+ return false;
break;
default:
- sblock_ok = 0;
- break;
+ return false;
}
/* numrecs verification */
- sblock_ok = sblock_ok &&
- be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0];
+ if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0])
+ return false;
/* sibling pointer verification */
- sblock_ok = sblock_ok &&
- (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
- be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
- block->bb_u.s.bb_leftsib &&
- (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
- be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
- block->bb_u.s.bb_rightsib;
-
- if (!sblock_ok) {
- trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
+ if (!block->bb_u.s.bb_leftsib ||
+ (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+ block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+ return false;
+ if (!block->bb_u.s.bb_rightsib ||
+ (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+ block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+ return false;
+
+ return true;
}
static void
xfs_allocbt_read_verify(
struct xfs_buf *bp)
{
- xfs_allocbt_verify(bp);
+ if (!xfs_btree_sblock_verify_crc(bp))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_allocbt_verify(bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp);
+ }
}
static void
xfs_allocbt_write_verify(
struct xfs_buf *bp)
{
- xfs_allocbt_verify(bp);
+ if (!xfs_allocbt_verify(bp)) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+ xfs_btree_sblock_calc_crc(bp);
+
}
const struct xfs_buf_ops xfs_allocbt_buf_ops = {
@@ -348,7 +385,7 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = {
};
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
STATIC int
xfs_allocbt_keys_inorder(
struct xfs_btree_cur *cur,
@@ -404,7 +441,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_allocbt_key_diff,
.buf_ops = &xfs_allocbt_buf_ops,
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
.keys_inorder = xfs_allocbt_keys_inorder,
.recs_inorder = xfs_allocbt_recs_inorder,
#endif
@@ -444,6 +481,9 @@ xfs_allocbt_init_cursor(
cur->bc_private.a.agbp = agbp;
cur->bc_private.a.agno = agno;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
return cur;
}
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index 7e89a2b429d..45e189e7e81 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -27,42 +27,11 @@ struct xfs_btree_cur;
struct xfs_mount;
/*
- * There are two on-disk btrees, one sorted by blockno and one sorted
- * by blockcount and blockno. All blocks look the same to make the code
- * simpler; if we have time later, we'll make the optimizations.
- */
-#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */
-#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */
-
-/*
- * Data record/key structure
- */
-typedef struct xfs_alloc_rec {
- __be32 ar_startblock; /* starting block number */
- __be32 ar_blockcount; /* count of free blocks */
-} xfs_alloc_rec_t, xfs_alloc_key_t;
-
-typedef struct xfs_alloc_rec_incore {
- xfs_agblock_t ar_startblock; /* starting block number */
- xfs_extlen_t ar_blockcount; /* count of free blocks */
-} xfs_alloc_rec_incore_t;
-
-/* btree pointer type */
-typedef __be32 xfs_alloc_ptr_t;
-
-/*
- * Block numbers in the AG:
- * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
- */
-#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
-#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
-
-/*
* Btree block header size depends on a superblock flag.
- *
- * (not quite yet, but soon)
*/
-#define XFS_ALLOC_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN
+#define XFS_ALLOC_BLOCK_LEN(mp) \
+ (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
/*
* Record, key, and pointer address macros for btree blocks.
@@ -93,6 +62,4 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
xfs_agnumber_t, xfs_btnum_t);
extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
-extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
-
#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 4111a40ebe1..faaf716e208 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -16,21 +16,25 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
-#include "xfs_log.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_trans.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_iomap.h"
-#include "xfs_vnodeops.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include <linux/aio.h>
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
@@ -85,14 +89,6 @@ xfs_destroy_ioend(
bh->b_end_io(bh, !ioend->io_error);
}
- if (ioend->io_iocb) {
- if (ioend->io_isasync) {
- aio_complete(ioend->io_iocb, ioend->io_error ?
- ioend->io_error : ioend->io_result, 0);
- }
- inode_dio_done(ioend->io_inode);
- }
-
mempool_free(ioend, xfs_ioend_pool);
}
@@ -115,7 +111,7 @@ xfs_setfilesize_trans_alloc(
tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
if (error) {
xfs_trans_cancel(tp, 0);
return error;
@@ -280,7 +276,6 @@ xfs_alloc_ioend(
* all the I/O from calling the completion routine too early.
*/
atomic_set(&ioend->io_remaining, 1);
- ioend->io_isasync = 0;
ioend->io_isdirect = 0;
ioend->io_error = 0;
ioend->io_list = NULL;
@@ -290,8 +285,6 @@ xfs_alloc_ioend(
ioend->io_buffer_tail = NULL;
ioend->io_offset = 0;
ioend->io_size = 0;
- ioend->io_iocb = NULL;
- ioend->io_result = 0;
ioend->io_append_trans = NULL;
INIT_WORK(&ioend->io_work, xfs_end_io);
@@ -343,7 +336,7 @@ xfs_map_blocks(
if (type == XFS_IO_DELALLOC &&
(!nimaps || isnullstartblock(imap->br_startblock))) {
- error = xfs_iomap_write_allocate(ip, offset, count, imap);
+ error = xfs_iomap_write_allocate(ip, offset, imap);
if (!error)
trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
return -XFS_ERROR(error);
@@ -414,7 +407,7 @@ xfs_alloc_ioend_bio(
struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
ASSERT(bio->bi_private == NULL);
- bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
return bio;
}
@@ -450,7 +443,7 @@ xfs_start_page_writeback(
end_page_writeback(page);
}
-static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
+static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
{
return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
}
@@ -524,7 +517,7 @@ xfs_submit_ioend(
goto retry;
}
- if (bio_add_buffer(bio, bh) != bh->b_size) {
+ if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
xfs_submit_ioend_bio(wbc, ioend, bio);
goto retry;
}
@@ -639,38 +632,46 @@ xfs_map_at_offset(
}
/*
- * Test if a given page is suitable for writing as part of an unwritten
- * or delayed allocate extent.
+ * Test if a given page contains at least one buffer of a given @type.
+ * If @check_all_buffers is true, then we walk all the buffers in the page to
+ * try to find one of the type passed in. If it is not set, then the caller only
+ * needs to check the first buffer on the page for a match.
*/
-STATIC int
+STATIC bool
xfs_check_page_type(
struct page *page,
- unsigned int type)
+ unsigned int type,
+ bool check_all_buffers)
{
- if (PageWriteback(page))
- return 0;
+ struct buffer_head *bh;
+ struct buffer_head *head;
- if (page->mapping && page_has_buffers(page)) {
- struct buffer_head *bh, *head;
- int acceptable = 0;
+ if (PageWriteback(page))
+ return false;
+ if (!page->mapping)
+ return false;
+ if (!page_has_buffers(page))
+ return false;
- bh = head = page_buffers(page);
- do {
- if (buffer_unwritten(bh))
- acceptable += (type == XFS_IO_UNWRITTEN);
- else if (buffer_delay(bh))
- acceptable += (type == XFS_IO_DELALLOC);
- else if (buffer_dirty(bh) && buffer_mapped(bh))
- acceptable += (type == XFS_IO_OVERWRITE);
- else
- break;
- } while ((bh = bh->b_this_page) != head);
+ bh = head = page_buffers(page);
+ do {
+ if (buffer_unwritten(bh)) {
+ if (type == XFS_IO_UNWRITTEN)
+ return true;
+ } else if (buffer_delay(bh)) {
+ if (type == XFS_IO_DELALLOC)
+ return true;
+ } else if (buffer_dirty(bh) && buffer_mapped(bh)) {
+ if (type == XFS_IO_OVERWRITE)
+ return true;
+ }
- if (acceptable)
- return 1;
- }
+ /* If we are only checking the first buffer, we are done now. */
+ if (!check_all_buffers)
+ break;
+ } while ((bh = bh->b_this_page) != head);
- return 0;
+ return false;
}
/*
@@ -704,7 +705,7 @@ xfs_convert_page(
goto fail_unlock_page;
if (page->mapping != inode->i_mapping)
goto fail_unlock_page;
- if (!xfs_check_page_type(page, (*ioendp)->io_type))
+ if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
goto fail_unlock_page;
/*
@@ -724,12 +725,40 @@ xfs_convert_page(
(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
i_size_read(inode));
+ /*
+ * If the current map does not span the entire page we are about to try
+ * to write, then give up. The only way we can write a page that spans
+ * multiple mappings in a single writeback iteration is via the
+ * xfs_vm_writepage() function. Data integrity writeback requires the
+ * entire page to be written in a single attempt, otherwise the part of
+ * the page we don't write here doesn't get written as part of the data
+ * integrity sync.
+ *
+ * For normal writeback, we also don't attempt to write partial pages
+ * here as it simply means that write_cache_pages() will see it under
+ * writeback and ignore the page until some point in the future, at
+ * which time this will be the only page in the file that needs
+ * writeback. Hence for more optimal IO patterns, we should always
+ * avoid partial page writeback due to multiple mappings on a page here.
+ */
+ if (!xfs_imap_valid(inode, imap, end_offset))
+ goto fail_unlock_page;
+
len = 1 << inode->i_blkbits;
p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
PAGE_CACHE_SIZE);
p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
page_dirty = p_offset / len;
+ /*
+ * The moment we find a buffer that doesn't match our current type
+ * specification or can't be written, abort the loop and start
+ * writeback. As per the above xfs_imap_valid() check, only
+ * xfs_vm_writepage() can handle partial page writeback fully - we are
+ * limited here to the buffers that are contiguous with the current
+ * ioend, and hence a buffer we can't write breaks that contiguity and
+ * we have to defer the rest of the IO to xfs_vm_writepage().
+ */
bh = head = page_buffers(page);
do {
if (offset >= end_offset)
@@ -738,7 +767,7 @@ xfs_convert_page(
uptodate = 0;
if (!(PageUptodate(page) || buffer_uptodate(bh))) {
done = 1;
- continue;
+ break;
}
if (buffer_unwritten(bh) || buffer_delay(bh) ||
@@ -750,10 +779,11 @@ xfs_convert_page(
else
type = XFS_IO_OVERWRITE;
- if (!xfs_imap_valid(inode, imap, offset)) {
- done = 1;
- continue;
- }
+ /*
+ * imap should always be valid because of the above
+ * partial page end_offset check on the imap.
+ */
+ ASSERT(xfs_imap_valid(inode, imap, offset));
lock_buffer(bh);
if (type != XFS_IO_OVERWRITE)
@@ -765,6 +795,7 @@ xfs_convert_page(
count++;
} else {
done = 1;
+ break;
}
} while (offset += len, (bh = bh->b_this_page) != head);
@@ -823,10 +854,12 @@ xfs_cluster_write(
STATIC void
xfs_vm_invalidatepage(
struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
- trace_xfs_invalidatepage(page->mapping->host, page, offset);
- block_invalidatepage(page, offset);
+ trace_xfs_invalidatepage(page->mapping->host, page, offset,
+ length);
+ block_invalidatepage(page, offset, length);
}
/*
@@ -854,7 +887,7 @@ xfs_aops_discard_page(
struct buffer_head *bh, *head;
loff_t offset = page_offset(page);
- if (!xfs_check_page_type(page, XFS_IO_DELALLOC))
+ if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
goto out_invalidate;
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -890,7 +923,7 @@ next_buffer:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_invalidate:
- xfs_vm_invalidatepage(page, 0);
+ xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
return;
}
@@ -920,7 +953,7 @@ xfs_vm_writepage(
int count = 0;
int nonblocking = 0;
- trace_xfs_writepage(inode, page, 0);
+ trace_xfs_writepage(inode, page, 0, 0);
ASSERT(page_has_buffers(page));
@@ -942,39 +975,76 @@ xfs_vm_writepage(
* Given that we do not allow direct reclaim to call us, we should
* never be called while in a filesystem transaction.
*/
- if (WARN_ON(current->flags & PF_FSTRANS))
+ if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
goto redirty;
/* Is this page beyond the end of the file? */
offset = i_size_read(inode);
end_index = offset >> PAGE_CACHE_SHIFT;
last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
- if (page->index >= end_index) {
+
+ /*
+ * The page index is less than the end_index, adjust the end_offset
+ * to the highest offset that this page should represent.
+ * -----------------------------------------------------
+ * | file mapping | <EOF> |
+ * -----------------------------------------------------
+ * | Page ... | Page N-2 | Page N-1 | Page N | |
+ * ^--------------------------------^----------|--------
+ * | desired writeback range | see else |
+ * ---------------------------------^------------------|
+ */
+ if (page->index < end_index)
+ end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
+ else {
+ /*
+ * Check whether the page to write out is beyond or straddles
+ * i_size or not.
+ * -------------------------------------------------------
+ * | file mapping | <EOF> |
+ * -------------------------------------------------------
+ * | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
+ * ^--------------------------------^-----------|---------
+ * | | Straddles |
+ * ---------------------------------^-----------|--------|
+ */
unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
/*
- * Just skip the page if it is fully outside i_size, e.g. due
- * to a truncate operation that is in progress.
+ * Skip the page if it is fully outside i_size, e.g. due to a
+ * truncate operation that is in progress. We must redirty the
+ * page so that reclaim stops reclaiming it. Otherwise
+ * xfs_vm_releasepage() is called on it and gets confused.
+ *
+ * Note that the end_index is unsigned long, it would overflow
+ * if the given offset is greater than 16TB on 32-bit system
+ * and if we do check the page is fully outside i_size or not
+ * via "if (page->index >= end_index + 1)" as "end_index + 1"
+ * will be evaluated to 0. Hence this page will be redirtied
+ * and be written out repeatedly which would result in an
+ * infinite loop, the user program that perform this operation
+ * will hang. Instead, we can verify this situation by checking
+ * if the page to write is totally beyond the i_size or if it's
+ * offset is just equal to the EOF.
*/
- if (page->index >= end_index + 1 || offset_into_page == 0) {
- unlock_page(page);
- return 0;
- }
+ if (page->index > end_index ||
+ (page->index == end_index && offset_into_page == 0))
+ goto redirty;
/*
* The page straddles i_size. It must be zeroed out on each
* and every writepage invocation because it may be mmapped.
* "A file is mapped in multiples of the page size. For a file
- * that is not a multiple of the page size, the remaining
+ * that is not a multiple of the page size, the remaining
* memory is zeroed when mapped, and writes to that region are
* not written out to the file."
*/
zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
+
+ /* Adjust the end_offset to the end of file */
+ end_offset = offset;
}
- end_offset = min_t(unsigned long long,
- (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
- offset);
len = 1 << inode->i_blkbits;
bh = head = page_buffers(page);
@@ -1151,13 +1221,13 @@ xfs_vm_releasepage(
{
int delalloc, unwritten;
- trace_xfs_releasepage(page->mapping->host, page, 0);
+ trace_xfs_releasepage(page->mapping->host, page, 0, 0);
xfs_count_page_state(page, &delalloc, &unwritten);
- if (WARN_ON(delalloc))
+ if (WARN_ON_ONCE(delalloc))
return 0;
- if (WARN_ON(unwritten))
+ if (WARN_ON_ONCE(unwritten))
return 0;
return try_to_free_buffers(page);
@@ -1203,7 +1273,7 @@ __xfs_get_blocks(
lockmode = XFS_ILOCK_EXCL;
xfs_ilock(ip, lockmode);
} else {
- lockmode = xfs_ilock_map_shared(ip);
+ lockmode = xfs_ilock_data_map_shared(ip);
}
ASSERT(offset <= mp->m_super->s_maxbytes);
@@ -1270,8 +1340,10 @@ __xfs_get_blocks(
if (create || !ISUNWRITTEN(&imap))
xfs_map_buffer(inode, bh_result, &imap, offset);
if (create && ISUNWRITTEN(&imap)) {
- if (direct)
+ if (direct) {
bh_result->b_private = inode;
+ set_buffer_defer_completion(bh_result);
+ }
set_buffer_unwritten(bh_result);
}
}
@@ -1309,6 +1381,14 @@ __xfs_get_blocks(
/*
* If this is O_DIRECT or the mpage code calling tell them how large
* the mapping is, so that we can avoid repeated get_blocks calls.
+ *
+ * If the mapping spans EOF, then we have to break the mapping up as the
+ * mapping for blocks beyond EOF must be marked new so that sub block
+ * regions can be correctly zeroed. We can't do this for mappings within
+ * EOF unless the mapping was just allocated or is unwritten, otherwise
+ * the callers would overwrite existing data with zeros. Hence we have
+ * to split the mapping into a range up to and including EOF, and a
+ * second mapping for beyond EOF.
*/
if (direct || size > (1 << inode->i_blkbits)) {
xfs_off_t mapping_size;
@@ -1319,6 +1399,12 @@ __xfs_get_blocks(
ASSERT(mapping_size > 0);
if (mapping_size > size)
mapping_size = size;
+ if (offset < i_size_read(inode) &&
+ offset + mapping_size >= i_size_read(inode)) {
+ /* limit mapping to block that spans EOF */
+ mapping_size = roundup_64(i_size_read(inode) - offset,
+ 1 << inode->i_blkbits);
+ }
if (mapping_size > LONG_MAX)
mapping_size = LONG_MAX;
@@ -1368,9 +1454,7 @@ xfs_end_io_direct_write(
struct kiocb *iocb,
loff_t offset,
ssize_t size,
- void *private,
- int ret,
- bool is_async)
+ void *private)
{
struct xfs_ioend *ioend = iocb->private;
@@ -1392,26 +1476,18 @@ xfs_end_io_direct_write(
ioend->io_offset = offset;
ioend->io_size = size;
- ioend->io_iocb = iocb;
- ioend->io_result = ret;
if (private && size > 0)
ioend->io_type = XFS_IO_UNWRITTEN;
- if (is_async) {
- ioend->io_isasync = 1;
- xfs_finish_ioend(ioend);
- } else {
- xfs_finish_ioend_sync(ioend);
- }
+ xfs_finish_ioend_sync(ioend);
}
STATIC ssize_t
xfs_vm_direct_IO(
int rw,
struct kiocb *iocb,
- const struct iovec *iov,
- loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t offset)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct block_device *bdev = xfs_find_bdev_for_inode(inode);
@@ -1419,7 +1495,7 @@ xfs_vm_direct_IO(
ssize_t ret;
if (rw & WRITE) {
- size_t size = iov_length(iov, nr_segs);
+ size_t size = iov_iter_count(iter);
/*
* We cannot preallocate a size update transaction here as we
@@ -1431,16 +1507,15 @@ xfs_vm_direct_IO(
if (offset + size > XFS_I(inode)->i_d.di_size)
ioend->io_isdirect = 1;
- ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
- offset, nr_segs,
- xfs_get_blocks_direct,
- xfs_end_io_direct_write, NULL, 0);
+ ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
+ offset, xfs_get_blocks_direct,
+ xfs_end_io_direct_write, NULL,
+ DIO_ASYNC_EXTEND);
if (ret != -EIOCBQUEUED && iocb->private)
goto out_destroy_ioend;
} else {
- ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
- offset, nr_segs,
- xfs_get_blocks_direct,
+ ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
+ offset, xfs_get_blocks_direct,
NULL, NULL, 0);
}
@@ -1494,13 +1569,26 @@ xfs_vm_write_failed(
loff_t pos,
unsigned len)
{
- loff_t block_offset = pos & PAGE_MASK;
+ loff_t block_offset;
loff_t block_start;
loff_t block_end;
loff_t from = pos & (PAGE_CACHE_SIZE - 1);
loff_t to = from + len;
struct buffer_head *bh, *head;
+ /*
+ * The request pos offset might be 32 or 64 bit, this is all fine
+ * on 64-bit platform. However, for 64-bit pos request on 32-bit
+ * platform, the high 32-bit will be masked off if we evaluate the
+ * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
+ * 0xfffff000 as an unsigned long, hence the result is incorrect
+ * which could cause the following ASSERT failed in most cases.
+ * In order to avoid this, we can evaluate the block_offset of the
+ * start of the page by using shifts rather than masks the mismatch
+ * problem.
+ */
+ block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
+
ASSERT(block_offset + from == pos);
head = page_buffers(page);
@@ -1526,6 +1614,16 @@ xfs_vm_write_failed(
xfs_vm_kill_delalloc_range(inode, block_offset,
block_offset + bh->b_size);
+
+ /*
+ * This buffer does not contain data anymore. make sure anyone
+ * who finds it knows that for certain.
+ */
+ clear_buffer_delay(bh);
+ clear_buffer_uptodate(bh);
+ clear_buffer_mapped(bh);
+ clear_buffer_new(bh);
+ clear_buffer_dirty(bh);
}
}
@@ -1552,20 +1650,28 @@ xfs_vm_write_begin(
ASSERT(len <= PAGE_CACHE_SIZE);
- page = grab_cache_page_write_begin(mapping, index,
- flags | AOP_FLAG_NOFS);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
status = __block_write_begin(page, pos, len, xfs_get_blocks);
if (unlikely(status)) {
struct inode *inode = mapping->host;
+ size_t isize = i_size_read(inode);
xfs_vm_write_failed(inode, page, pos, len);
unlock_page(page);
- if (pos + len > i_size_read(inode))
- truncate_pagecache(inode, pos + len, i_size_read(inode));
+ /*
+ * If the write is beyond EOF, we only want to kill blocks
+ * allocated in this write, not blocks that were previously
+ * written successfully.
+ */
+ if (pos + len > isize) {
+ ssize_t start = max_t(ssize_t, pos, isize);
+
+ truncate_pagecache_range(inode, start, pos + len);
+ }
page_cache_release(page);
page = NULL;
@@ -1576,9 +1682,12 @@ xfs_vm_write_begin(
}
/*
- * On failure, we only need to kill delalloc blocks beyond EOF because they
- * will never be written. For blocks within EOF, generic_write_end() zeros them
- * so they are safe to leave alone and be written with all the other valid data.
+ * On failure, we only need to kill delalloc blocks beyond EOF in the range of
+ * this specific write because they will never be written. Previous writes
+ * beyond EOF where block allocation succeeded do not need to be trashed, so
+ * only new blocks from this write should be trashed. For blocks within
+ * EOF, generic_write_end() zeros them so they are safe to leave alone and be
+ * written with all the other valid data.
*/
STATIC int
xfs_vm_write_end(
@@ -1601,8 +1710,11 @@ xfs_vm_write_end(
loff_t to = pos + len;
if (to > isize) {
- truncate_pagecache(inode, to, isize);
+ /* only kill blocks in this write beyond EOF */
+ if (pos > isize)
+ isize = pos;
xfs_vm_kill_delalloc_range(inode, isize, to);
+ truncate_pagecache_range(inode, isize, to);
}
}
return ret;
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index c325abb8d61..f94dd459dff 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -45,7 +45,6 @@ typedef struct xfs_ioend {
unsigned int io_type; /* delalloc / unwritten */
int io_error; /* I/O error code */
atomic_t io_remaining; /* hold count */
- unsigned int io_isasync : 1; /* needs aio_complete */
unsigned int io_isdirect : 1;/* direct I/O */
struct inode *io_inode; /* file being written to */
struct buffer_head *io_buffer_head;/* buffer linked list head */
@@ -54,8 +53,6 @@ typedef struct xfs_ioend {
xfs_off_t io_offset; /* offset in the file */
struct work_struct io_work; /* xfsdatad work queue */
struct xfs_trans *io_append_trans;/* xact. for size update */
- struct kiocb *io_iocb;
- int io_result;
} xfs_ioend_t;
extern const struct address_space_operations xfs_address_space_operations;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index aaf472532b3..bfe36fc2cdc 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -15,31 +15,34 @@
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
-
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
-#include "xfs_vnodeops.h"
#include "xfs_trace.h"
+#include "xfs_dinode.h"
/*
* xfs_attr.c
@@ -62,7 +65,6 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
-STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context);
/*
* Internal routines when attribute list is more than one block.
@@ -70,34 +72,36 @@ STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context);
STATIC int xfs_attr_node_get(xfs_da_args_t *args);
STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
-STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context);
STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
-/*
- * Routines to manipulate out-of-line attribute values.
- */
-STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args);
-STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
-
-#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
STATIC int
-xfs_attr_name_to_xname(
- struct xfs_name *xname,
- const unsigned char *aname)
+xfs_attr_args_init(
+ struct xfs_da_args *args,
+ struct xfs_inode *dp,
+ const unsigned char *name,
+ int flags)
{
- if (!aname)
+
+ if (!name)
return EINVAL;
- xname->name = aname;
- xname->len = strlen((char *)aname);
- if (xname->len >= MAXNAMELEN)
+
+ memset(args, 0, sizeof(*args));
+ args->geo = dp->i_mount->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->dp = dp;
+ args->flags = flags;
+ args->name = name;
+ args->namelen = strlen((const char *)name);
+ if (args->namelen >= MAXNAMELEN)
return EFAULT; /* match IRIX behaviour */
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
return 0;
}
-STATIC int
+int
xfs_inode_hasattr(
struct xfs_inode *ip)
{
@@ -112,78 +116,46 @@ xfs_inode_hasattr(
* Overall external interface routines.
*========================================================================*/
-STATIC int
-xfs_attr_get_int(
+int
+xfs_attr_get(
struct xfs_inode *ip,
- struct xfs_name *name,
+ const unsigned char *name,
unsigned char *value,
int *valuelenp,
int flags)
{
- xfs_da_args_t args;
- int error;
+ struct xfs_da_args args;
+ uint lock_mode;
+ int error;
+
+ XFS_STATS_INC(xs_attr_get);
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return EIO;
if (!xfs_inode_hasattr(ip))
return ENOATTR;
- /*
- * Fill in the arg structure for this request.
- */
- memset((char *)&args, 0, sizeof(args));
- args.name = name->name;
- args.namelen = name->len;
+ error = xfs_attr_args_init(&args, ip, name, flags);
+ if (error)
+ return error;
+
args.value = value;
args.valuelen = *valuelenp;
- args.flags = flags;
- args.hashval = xfs_da_hashname(args.name, args.namelen);
- args.dp = ip;
- args.whichfork = XFS_ATTR_FORK;
- /*
- * Decide on what work routines to call based on the inode size.
- */
- if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ lock_mode = xfs_ilock_attr_map_shared(ip);
+ if (!xfs_inode_hasattr(ip))
+ error = ENOATTR;
+ else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
error = xfs_attr_shortform_getvalue(&args);
- } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) {
+ else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
error = xfs_attr_leaf_get(&args);
- } else {
+ else
error = xfs_attr_node_get(&args);
- }
+ xfs_iunlock(ip, lock_mode);
- /*
- * Return the number of bytes in the value to the caller.
- */
*valuelenp = args.valuelen;
-
- if (error == EEXIST)
- error = 0;
- return(error);
-}
-
-int
-xfs_attr_get(
- xfs_inode_t *ip,
- const unsigned char *name,
- unsigned char *value,
- int *valuelenp,
- int flags)
-{
- int error;
- struct xfs_name xname;
-
- XFS_STATS_INC(xs_attr_get);
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return(EIO);
-
- error = xfs_attr_name_to_xname(&xname, name);
- if (error)
- return error;
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags);
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return(error);
+ return error == EEXIST ? 0 : error;
}
/*
@@ -191,12 +163,10 @@ xfs_attr_get(
*/
STATIC int
xfs_attr_calc_size(
- struct xfs_inode *ip,
- int namelen,
- int valuelen,
+ struct xfs_da_args *args,
int *local)
{
- struct xfs_mount *mp = ip->i_mount;
+ struct xfs_mount *mp = args->dp->i_mount;
int size;
int nblks;
@@ -204,12 +174,10 @@ xfs_attr_calc_size(
* Determine space new attribute will use, and if it would be
* "local" or "remote" (note: local != inline).
*/
- size = xfs_attr_leaf_newentsize(namelen, valuelen,
- mp->m_sb.sb_blocksize, local);
-
+ size = xfs_attr_leaf_newentsize(args, local);
nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
if (*local) {
- if (size > (mp->m_sb.sb_blocksize >> 1)) {
+ if (size > (args->geo->blksize / 2)) {
/* Double split possible */
nblks *= 2;
}
@@ -218,7 +186,7 @@ xfs_attr_calc_size(
* Out of line attribute, cannot double split, but
* make room for the attribute value itself.
*/
- uint dblocks = XFS_B_TO_FSB(mp, valuelen);
+ uint dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen);
nblks += dblocks;
nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
}
@@ -226,25 +194,38 @@ xfs_attr_calc_size(
return nblks;
}
-STATIC int
-xfs_attr_set_int(
- struct xfs_inode *dp,
- struct xfs_name *name,
- unsigned char *value,
- int valuelen,
- int flags)
+int
+xfs_attr_set(
+ struct xfs_inode *dp,
+ const unsigned char *name,
+ unsigned char *value,
+ int valuelen,
+ int flags)
{
- xfs_da_args_t args;
- xfs_fsblock_t firstblock;
- xfs_bmap_free_t flist;
- int error, err2, committed;
- xfs_mount_t *mp = dp->i_mount;
- int rsvd = (flags & ATTR_ROOT) != 0;
- int local;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_args args;
+ struct xfs_bmap_free flist;
+ struct xfs_trans_res tres;
+ xfs_fsblock_t firstblock;
+ int rsvd = (flags & ATTR_ROOT) != 0;
+ int error, err2, committed, local;
+
+ XFS_STATS_INC(xs_attr_set);
+
+ if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ return EIO;
+
+ error = xfs_attr_args_init(&args, dp, name, flags);
+ if (error)
+ return error;
+
+ args.value = value;
+ args.valuelen = valuelen;
+ args.firstblock = &firstblock;
+ args.flist = &flist;
+ args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+ args.total = xfs_attr_calc_size(&args, &local);
- /*
- * Attach the dquots to the inode.
- */
error = xfs_qm_dqattach(dp, 0);
if (error)
return error;
@@ -255,32 +236,14 @@ xfs_attr_set_int(
*/
if (XFS_IFORK_Q(dp) == 0) {
int sf_size = sizeof(xfs_attr_sf_hdr_t) +
- XFS_ATTR_SF_ENTSIZE_BYNAME(name->len, valuelen);
+ XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen);
- if ((error = xfs_bmap_add_attrfork(dp, sf_size, rsvd)))
- return(error);
+ error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
+ if (error)
+ return error;
}
/*
- * Fill in the arg structure for this request.
- */
- memset((char *)&args, 0, sizeof(args));
- args.name = name->name;
- args.namelen = name->len;
- args.value = value;
- args.valuelen = valuelen;
- args.flags = flags;
- args.hashval = xfs_da_hashname(args.name, args.namelen);
- args.dp = dp;
- args.firstblock = &firstblock;
- args.flist = &flist;
- args.whichfork = XFS_ATTR_FORK;
- args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
-
- /* Size is now blocks for attribute data */
- args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local);
-
- /*
* Start our first transaction of the day.
*
* All future transactions during this code must be "chained" off
@@ -300,11 +263,14 @@ xfs_attr_set_int(
if (rsvd)
args.trans->t_flags |= XFS_TRANS_RESERVE;
- if ((error = xfs_trans_reserve(args.trans, args.total,
- XFS_ATTRSET_LOG_RES(mp, args.total), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) {
+ tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+ M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
+ tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
+ if (error) {
xfs_trans_cancel(args.trans, 0);
- return(error);
+ return error;
}
xfs_ilock(dp, XFS_ILOCK_EXCL);
@@ -314,7 +280,7 @@ xfs_attr_set_int(
if (error) {
xfs_iunlock(dp, XFS_ILOCK_EXCL);
xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
- return (error);
+ return error;
}
xfs_trans_ijoin(args.trans, dp, 0);
@@ -323,9 +289,9 @@ xfs_attr_set_int(
* If the attribute list is non-existent or a shortform list,
* upgrade it to a single-leaf-block attribute list.
*/
- if ((dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
- ((dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) &&
- (dp->i_d.di_anextents == 0))) {
+ if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
+ (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+ dp->i_d.di_anextents == 0)) {
/*
* Build initial attribute list (if required).
@@ -350,9 +316,8 @@ xfs_attr_set_int(
* the transaction goes to disk before returning
* to the user.
*/
- if (mp->m_flags & XFS_MOUNT_WSYNC) {
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(args.trans);
- }
if (!error && (flags & ATTR_KERNOTIME) == 0) {
xfs_trans_ichgtime(args.trans, dp,
@@ -362,7 +327,7 @@ xfs_attr_set_int(
XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error == 0 ? err2 : error);
+ return error ? error : err2;
}
/*
@@ -400,22 +365,19 @@ xfs_attr_set_int(
}
- if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+ if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
error = xfs_attr_leaf_addname(&args);
- } else {
+ else
error = xfs_attr_node_addname(&args);
- }
- if (error) {
+ if (error)
goto out;
- }
/*
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.
*/
- if (mp->m_flags & XFS_MOUNT_WSYNC) {
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(args.trans);
- }
if ((flags & ATTR_KERNOTIME) == 0)
xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
@@ -427,65 +389,47 @@ xfs_attr_set_int(
error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error);
+ return error;
out:
- if (args.trans)
+ if (args.trans) {
xfs_trans_cancel(args.trans,
XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ }
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error);
+ return error;
}
+/*
+ * Generic handler routine to remove a name from an attribute list.
+ * Transitions attribute list from Btree to shortform as necessary.
+ */
int
-xfs_attr_set(
- xfs_inode_t *dp,
- const unsigned char *name,
- unsigned char *value,
- int valuelen,
- int flags)
+xfs_attr_remove(
+ struct xfs_inode *dp,
+ const unsigned char *name,
+ int flags)
{
- int error;
- struct xfs_name xname;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_args args;
+ struct xfs_bmap_free flist;
+ xfs_fsblock_t firstblock;
+ int error;
- XFS_STATS_INC(xs_attr_set);
+ XFS_STATS_INC(xs_attr_remove);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return (EIO);
+ return EIO;
- error = xfs_attr_name_to_xname(&xname, name);
+ if (!xfs_inode_hasattr(dp))
+ return ENOATTR;
+
+ error = xfs_attr_args_init(&args, dp, name, flags);
if (error)
return error;
- return xfs_attr_set_int(dp, &xname, value, valuelen, flags);
-}
-
-/*
- * Generic handler routine to remove a name from an attribute list.
- * Transitions attribute list from Btree to shortform as necessary.
- */
-STATIC int
-xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
-{
- xfs_da_args_t args;
- xfs_fsblock_t firstblock;
- xfs_bmap_free_t flist;
- int error;
- xfs_mount_t *mp = dp->i_mount;
-
- /*
- * Fill in the arg structure for this request.
- */
- memset((char *)&args, 0, sizeof(args));
- args.name = name->name;
- args.namelen = name->len;
- args.flags = flags;
- args.hashval = xfs_da_hashname(args.name, args.namelen);
- args.dp = dp;
args.firstblock = &firstblock;
args.flist = &flist;
- args.total = 0;
- args.whichfork = XFS_ATTR_FORK;
/*
* we have no control over the attribute names that userspace passes us
@@ -494,9 +438,6 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
*/
args.op_flags = XFS_DA_OP_OKNOENT;
- /*
- * Attach the dquots to the inode.
- */
error = xfs_qm_dqattach(dp, 0);
if (error)
return error;
@@ -521,13 +462,11 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
if (flags & ATTR_ROOT)
args.trans->t_flags |= XFS_TRANS_RESERVE;
- if ((error = xfs_trans_reserve(args.trans,
- XFS_ATTRRM_SPACE_RES(mp),
- XFS_ATTRRM_LOG_RES(mp),
- 0, XFS_TRANS_PERM_LOG_RES,
- XFS_ATTRRM_LOG_COUNT))) {
+ error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
+ XFS_ATTRRM_SPACE_RES(mp), 0);
+ if (error) {
xfs_trans_cancel(args.trans, 0);
- return(error);
+ return error;
}
xfs_ilock(dp, XFS_ILOCK_EXCL);
@@ -537,35 +476,26 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
*/
xfs_trans_ijoin(args.trans, dp, 0);
- /*
- * Decide on what work routines to call based on the inode size.
- */
if (!xfs_inode_hasattr(dp)) {
error = XFS_ERROR(ENOATTR);
- goto out;
- }
- if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
error = xfs_attr_shortform_remove(&args);
- if (error) {
- goto out;
- }
} else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
error = xfs_attr_leaf_removename(&args);
} else {
error = xfs_attr_node_removename(&args);
}
- if (error) {
+
+ if (error)
goto out;
- }
/*
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.
*/
- if (mp->m_flags & XFS_MOUNT_WSYNC) {
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(args.trans);
- }
if ((flags & ATTR_KERNOTIME) == 0)
xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
@@ -577,267 +507,17 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error);
+ return error;
out:
- if (args.trans)
+ if (args.trans) {
xfs_trans_cancel(args.trans,
XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error);
-}
-
-int
-xfs_attr_remove(
- xfs_inode_t *dp,
- const unsigned char *name,
- int flags)
-{
- int error;
- struct xfs_name xname;
-
- XFS_STATS_INC(xs_attr_remove);
-
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return (EIO);
-
- error = xfs_attr_name_to_xname(&xname, name);
- if (error)
- return error;
-
- xfs_ilock(dp, XFS_ILOCK_SHARED);
- if (!xfs_inode_hasattr(dp)) {
- xfs_iunlock(dp, XFS_ILOCK_SHARED);
- return XFS_ERROR(ENOATTR);
- }
- xfs_iunlock(dp, XFS_ILOCK_SHARED);
-
- return xfs_attr_remove_int(dp, &xname, flags);
-}
-
-int
-xfs_attr_list_int(xfs_attr_list_context_t *context)
-{
- int error;
- xfs_inode_t *dp = context->dp;
-
- XFS_STATS_INC(xs_attr_list);
-
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return EIO;
-
- xfs_ilock(dp, XFS_ILOCK_SHARED);
-
- /*
- * Decide on what work routines to call based on the inode size.
- */
- if (!xfs_inode_hasattr(dp)) {
- error = 0;
- } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
- error = xfs_attr_shortform_list(context);
- } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
- error = xfs_attr_leaf_list(context);
- } else {
- error = xfs_attr_node_list(context);
- }
-
- xfs_iunlock(dp, XFS_ILOCK_SHARED);
-
- return error;
-}
-
-#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \
- (((struct attrlist_ent *) 0)->a_name - (char *) 0)
-#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \
- ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
- & ~(sizeof(u_int32_t)-1))
-
-/*
- * Format an attribute and copy it out to the user's buffer.
- * Take care to check values and protect against them changing later,
- * we may be reading them directly out of a user buffer.
- */
-/*ARGSUSED*/
-STATIC int
-xfs_attr_put_listent(
- xfs_attr_list_context_t *context,
- int flags,
- unsigned char *name,
- int namelen,
- int valuelen,
- unsigned char *value)
-{
- struct attrlist *alist = (struct attrlist *)context->alist;
- attrlist_ent_t *aep;
- int arraytop;
-
- ASSERT(!(context->flags & ATTR_KERNOVAL));
- ASSERT(context->count >= 0);
- ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
- ASSERT(context->firstu >= sizeof(*alist));
- ASSERT(context->firstu <= context->bufsize);
-
- /*
- * Only list entries in the right namespace.
- */
- if (((context->flags & ATTR_SECURE) == 0) !=
- ((flags & XFS_ATTR_SECURE) == 0))
- return 0;
- if (((context->flags & ATTR_ROOT) == 0) !=
- ((flags & XFS_ATTR_ROOT) == 0))
- return 0;
-
- arraytop = sizeof(*alist) +
- context->count * sizeof(alist->al_offset[0]);
- context->firstu -= ATTR_ENTSIZE(namelen);
- if (context->firstu < arraytop) {
- trace_xfs_attr_list_full(context);
- alist->al_more = 1;
- context->seen_enough = 1;
- return 1;
- }
-
- aep = (attrlist_ent_t *)&context->alist[context->firstu];
- aep->a_valuelen = valuelen;
- memcpy(aep->a_name, name, namelen);
- aep->a_name[namelen] = 0;
- alist->al_offset[context->count++] = context->firstu;
- alist->al_count = context->count;
- trace_xfs_attr_list_add(context);
- return 0;
-}
-
-/*
- * Generate a list of extended attribute names and optionally
- * also value lengths. Positive return value follows the XFS
- * convention of being an error, zero or negative return code
- * is the length of the buffer returned (negated), indicating
- * success.
- */
-int
-xfs_attr_list(
- xfs_inode_t *dp,
- char *buffer,
- int bufsize,
- int flags,
- attrlist_cursor_kern_t *cursor)
-{
- xfs_attr_list_context_t context;
- struct attrlist *alist;
- int error;
-
- /*
- * Validate the cursor.
- */
- if (cursor->pad1 || cursor->pad2)
- return(XFS_ERROR(EINVAL));
- if ((cursor->initted == 0) &&
- (cursor->hashval || cursor->blkno || cursor->offset))
- return XFS_ERROR(EINVAL);
-
- /*
- * Check for a properly aligned buffer.
- */
- if (((long)buffer) & (sizeof(int)-1))
- return XFS_ERROR(EFAULT);
- if (flags & ATTR_KERNOVAL)
- bufsize = 0;
-
- /*
- * Initialize the output buffer.
- */
- memset(&context, 0, sizeof(context));
- context.dp = dp;
- context.cursor = cursor;
- context.resynch = 1;
- context.flags = flags;
- context.alist = buffer;
- context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */
- context.firstu = context.bufsize;
- context.put_listent = xfs_attr_put_listent;
-
- alist = (struct attrlist *)context.alist;
- alist->al_count = 0;
- alist->al_more = 0;
- alist->al_offset[0] = context.bufsize;
-
- error = xfs_attr_list_int(&context);
- ASSERT(error >= 0);
- return error;
-}
-
-int /* error */
-xfs_attr_inactive(xfs_inode_t *dp)
-{
- xfs_trans_t *trans;
- xfs_mount_t *mp;
- int error;
-
- mp = dp->i_mount;
- ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
-
- xfs_ilock(dp, XFS_ILOCK_SHARED);
- if (!xfs_inode_hasattr(dp) ||
- dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
- xfs_iunlock(dp, XFS_ILOCK_SHARED);
- return 0;
- }
- xfs_iunlock(dp, XFS_ILOCK_SHARED);
-
- /*
- * Start our first transaction of the day.
- *
- * All future transactions during this code must be "chained" off
- * this one via the trans_dup() call. All transactions will contain
- * the inode, and the inode will always be marked with trans_ihold().
- * Since the inode will be locked in all transactions, we must log
- * the inode in every transaction to let it float upward through
- * the log.
- */
- trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
- if ((error = xfs_trans_reserve(trans, 0, XFS_ATTRINVAL_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_ATTRINVAL_LOG_COUNT))) {
- xfs_trans_cancel(trans, 0);
- return(error);
- }
- xfs_ilock(dp, XFS_ILOCK_EXCL);
-
- /*
- * No need to make quota reservations here. We expect to release some
- * blocks, not allocate, in the common case.
- */
- xfs_trans_ijoin(trans, dp, 0);
-
- /*
- * Decide on what work routines to call based on the inode size.
- */
- if (!xfs_inode_hasattr(dp) ||
- dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
- error = 0;
- goto out;
}
- error = xfs_attr_root_inactive(&trans, dp);
- if (error)
- goto out;
-
- error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
- if (error)
- goto out;
-
- error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
- return(error);
-
-out:
- xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error);
+ return error;
}
-
-
/*========================================================================
* External routines when attribute list is inside the inode
*========================================================================*/
@@ -903,7 +583,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
*/
dp = args->dp;
args->blkno = 0;
- error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
if (error)
return error;
@@ -911,30 +591,41 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* Look up the given attribute in the leaf block. Figure out if
* the given flags produce an error or call for an atomic rename.
*/
- retval = xfs_attr_leaf_lookup_int(bp, args);
+ retval = xfs_attr3_leaf_lookup_int(bp, args);
if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
xfs_trans_brelse(args->trans, bp);
- return(retval);
+ return retval;
} else if (retval == EEXIST) {
if (args->flags & ATTR_CREATE) { /* pure create op */
xfs_trans_brelse(args->trans, bp);
- return(retval);
+ return retval;
}
trace_xfs_attr_leaf_replace(args);
+ /* save the attribute state for later removal*/
args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */
args->blkno2 = args->blkno; /* set 2nd entry info*/
args->index2 = args->index;
args->rmtblkno2 = args->rmtblkno;
args->rmtblkcnt2 = args->rmtblkcnt;
+ args->rmtvaluelen2 = args->rmtvaluelen;
+
+ /*
+ * clear the remote attr state now that it is saved so that the
+ * values reflect the state of the attribute we are about to
+ * add, not the attribute we just found and will remove later.
+ */
+ args->rmtblkno = 0;
+ args->rmtblkcnt = 0;
+ args->rmtvaluelen = 0;
}
/*
* Add the attribute to the leaf block, transitioning to a Btree
* if required.
*/
- retval = xfs_attr_leaf_add(bp, args);
+ retval = xfs_attr3_leaf_add(bp, args);
if (retval == ENOSPC) {
/*
* Promote the attribute list to the Btree format, then
@@ -942,7 +633,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* can manage its own transactions.
*/
xfs_bmap_init(args->flist, args->firstblock);
- error = xfs_attr_leaf_to_node(args);
+ error = xfs_attr3_leaf_to_node(args);
if (!error) {
error = xfs_bmap_finish(&args->trans, args->flist,
&committed);
@@ -1007,7 +698,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* In a separate transaction, set the incomplete flag on the
* "old" attr and clear the incomplete flag on the "new" attr.
*/
- error = xfs_attr_leaf_flipflags(args);
+ error = xfs_attr3_leaf_flipflags(args);
if (error)
return(error);
@@ -1019,6 +710,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
args->blkno = args->blkno2;
args->rmtblkno = args->rmtblkno2;
args->rmtblkcnt = args->rmtblkcnt2;
+ args->rmtvaluelen = args->rmtvaluelen2;
if (args->rmtblkno) {
error = xfs_attr_rmtval_remove(args);
if (error)
@@ -1029,19 +721,19 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* Read in the block containing the "old" attr, then
* remove the "old" attr from that block (neat, huh!)
*/
- error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno,
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
-1, &bp);
if (error)
return error;
- xfs_attr_leaf_remove(bp, args);
+ xfs_attr3_leaf_remove(bp, args);
/*
* If the result is small enough, shrink it all into the inode.
*/
if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
xfs_bmap_init(args->flist, args->firstblock);
- error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (!error) {
error = xfs_bmap_finish(&args->trans,
@@ -1073,9 +765,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
/*
* Added a "remote" value, just clear the incomplete flag.
*/
- error = xfs_attr_leaf_clearflag(args);
+ error = xfs_attr3_leaf_clearflag(args);
}
- return(error);
+ return error;
}
/*
@@ -1098,24 +790,24 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
*/
dp = args->dp;
args->blkno = 0;
- error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
if (error)
return error;
- error = xfs_attr_leaf_lookup_int(bp, args);
+ error = xfs_attr3_leaf_lookup_int(bp, args);
if (error == ENOATTR) {
xfs_trans_brelse(args->trans, bp);
- return(error);
+ return error;
}
- xfs_attr_leaf_remove(bp, args);
+ xfs_attr3_leaf_remove(bp, args);
/*
* If the result is small enough, shrink it all into the inode.
*/
if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
xfs_bmap_init(args->flist, args->firstblock);
- error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (!error) {
error = xfs_bmap_finish(&args->trans, args->flist,
@@ -1125,7 +817,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
- return(error);
+ return error;
}
/*
@@ -1135,7 +827,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
if (committed)
xfs_trans_ijoin(args->trans, dp, 0);
}
- return(0);
+ return 0;
}
/*
@@ -1153,47 +845,25 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
trace_xfs_attr_leaf_get(args);
args->blkno = 0;
- error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
if (error)
return error;
- error = xfs_attr_leaf_lookup_int(bp, args);
+ error = xfs_attr3_leaf_lookup_int(bp, args);
if (error != EEXIST) {
xfs_trans_brelse(args->trans, bp);
- return(error);
+ return error;
}
- error = xfs_attr_leaf_getvalue(bp, args);
+ error = xfs_attr3_leaf_getvalue(bp, args);
xfs_trans_brelse(args->trans, bp);
if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
error = xfs_attr_rmtval_get(args);
}
- return(error);
-}
-
-/*
- * Copy out attribute entries for attr_list(), for leaf attribute lists.
- */
-STATIC int
-xfs_attr_leaf_list(xfs_attr_list_context_t *context)
-{
- int error;
- struct xfs_buf *bp;
-
- trace_xfs_attr_leaf_list(context);
-
- context->cursor->blkno = 0;
- error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp);
- if (error)
- return XFS_ERROR(error);
-
- error = xfs_attr_leaf_list_int(bp, context);
- xfs_trans_brelse(NULL, bp);
- return XFS_ERROR(error);
+ return error;
}
-
/*========================================================================
- * External routines when attribute list size > XFS_LBSIZE(mp).
+ * External routines when attribute list size > geo->blksize
*========================================================================*/
/*
@@ -1226,14 +896,12 @@ restart:
state = xfs_da_state_alloc();
state->args = args;
state->mp = mp;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_attr_node_ents;
/*
* Search to see if name already exists, and get back a pointer
* to where it should go.
*/
- error = xfs_da_node_lookup_int(state, &retval);
+ error = xfs_da3_node_lookup_int(state, &retval);
if (error)
goto out;
blk = &state->path.blk[ state->path.active-1 ];
@@ -1246,16 +914,25 @@ restart:
trace_xfs_attr_node_replace(args);
+ /* save the attribute state for later removal*/
args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */
args->blkno2 = args->blkno; /* set 2nd entry info*/
args->index2 = args->index;
args->rmtblkno2 = args->rmtblkno;
args->rmtblkcnt2 = args->rmtblkcnt;
+ args->rmtvaluelen2 = args->rmtvaluelen;
+
+ /*
+ * clear the remote attr state now that it is saved so that the
+ * values reflect the state of the attribute we are about to
+ * add, not the attribute we just found and will remove later.
+ */
args->rmtblkno = 0;
args->rmtblkcnt = 0;
+ args->rmtvaluelen = 0;
}
- retval = xfs_attr_leaf_add(blk->bp, state->args);
+ retval = xfs_attr3_leaf_add(blk->bp, state->args);
if (retval == ENOSPC) {
if (state->path.active == 1) {
/*
@@ -1264,8 +941,9 @@ restart:
* have been a b-tree.
*/
xfs_da_state_free(state);
+ state = NULL;
xfs_bmap_init(args->flist, args->firstblock);
- error = xfs_attr_leaf_to_node(args);
+ error = xfs_attr3_leaf_to_node(args);
if (!error) {
error = xfs_bmap_finish(&args->trans,
args->flist,
@@ -1304,7 +982,7 @@ restart:
* in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
*/
xfs_bmap_init(args->flist, args->firstblock);
- error = xfs_da_split(state);
+ error = xfs_da3_split(state);
if (!error) {
error = xfs_bmap_finish(&args->trans, args->flist,
&committed);
@@ -1326,7 +1004,7 @@ restart:
/*
* Addition succeeded, update Btree hashvals.
*/
- xfs_da_fixhashpath(state, &state->path);
+ xfs_da3_fixhashpath(state, &state->path);
}
/*
@@ -1367,7 +1045,7 @@ restart:
* In a separate transaction, set the incomplete flag on the
* "old" attr and clear the incomplete flag on the "new" attr.
*/
- error = xfs_attr_leaf_flipflags(args);
+ error = xfs_attr3_leaf_flipflags(args);
if (error)
goto out;
@@ -1379,6 +1057,7 @@ restart:
args->blkno = args->blkno2;
args->rmtblkno = args->rmtblkno2;
args->rmtblkcnt = args->rmtblkcnt2;
+ args->rmtvaluelen = args->rmtvaluelen2;
if (args->rmtblkno) {
error = xfs_attr_rmtval_remove(args);
if (error)
@@ -1394,10 +1073,8 @@ restart:
state = xfs_da_state_alloc();
state->args = args;
state->mp = mp;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_attr_node_ents;
state->inleaf = 0;
- error = xfs_da_node_lookup_int(state, &retval);
+ error = xfs_da3_node_lookup_int(state, &retval);
if (error)
goto out;
@@ -1406,15 +1083,15 @@ restart:
*/
blk = &state->path.blk[ state->path.active-1 ];
ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- error = xfs_attr_leaf_remove(blk->bp, args);
- xfs_da_fixhashpath(state, &state->path);
+ error = xfs_attr3_leaf_remove(blk->bp, args);
+ xfs_da3_fixhashpath(state, &state->path);
/*
* Check to see if the tree needs to be collapsed.
*/
if (retval && (state->path.active > 1)) {
xfs_bmap_init(args->flist, args->firstblock);
- error = xfs_da_join(state);
+ error = xfs_da3_join(state);
if (!error) {
error = xfs_bmap_finish(&args->trans,
args->flist,
@@ -1447,7 +1124,7 @@ restart:
/*
* Added a "remote" value, just clear the incomplete flag.
*/
- error = xfs_attr_leaf_clearflag(args);
+ error = xfs_attr3_leaf_clearflag(args);
if (error)
goto out;
}
@@ -1486,13 +1163,11 @@ xfs_attr_node_removename(xfs_da_args_t *args)
state = xfs_da_state_alloc();
state->args = args;
state->mp = dp->i_mount;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_attr_node_ents;
/*
* Search to see if name exists, and get back a pointer to it.
*/
- error = xfs_da_node_lookup_int(state, &retval);
+ error = xfs_da3_node_lookup_int(state, &retval);
if (error || (retval != EEXIST)) {
if (error == 0)
error = retval;
@@ -1521,7 +1196,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
* Mark the attribute as INCOMPLETE, then bunmapi() the
* remote value.
*/
- error = xfs_attr_leaf_setflag(args);
+ error = xfs_attr3_leaf_setflag(args);
if (error)
goto out;
error = xfs_attr_rmtval_remove(args);
@@ -1542,15 +1217,15 @@ xfs_attr_node_removename(xfs_da_args_t *args)
*/
blk = &state->path.blk[ state->path.active-1 ];
ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- retval = xfs_attr_leaf_remove(blk->bp, args);
- xfs_da_fixhashpath(state, &state->path);
+ retval = xfs_attr3_leaf_remove(blk->bp, args);
+ xfs_da3_fixhashpath(state, &state->path);
/*
* Check to see if the tree needs to be collapsed.
*/
if (retval && (state->path.active > 1)) {
xfs_bmap_init(args->flist, args->firstblock);
- error = xfs_da_join(state);
+ error = xfs_da3_join(state);
if (!error) {
error = xfs_bmap_finish(&args->trans, args->flist,
&committed);
@@ -1588,13 +1263,13 @@ xfs_attr_node_removename(xfs_da_args_t *args)
ASSERT(state->path.blk[0].bp);
state->path.blk[0].bp = NULL;
- error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp);
if (error)
goto out;
if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
xfs_bmap_init(args->flist, args->firstblock);
- error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (!error) {
error = xfs_bmap_finish(&args->trans,
@@ -1696,7 +1371,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->disk_blkno) {
- error = xfs_da_node_read(state->args->trans,
+ error = xfs_da3_node_read(state->args->trans,
state->args->dp,
blk->blkno, blk->disk_blkno,
&blk->bp, XFS_ATTR_FORK);
@@ -1715,7 +1390,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->disk_blkno) {
- error = xfs_da_node_read(state->args->trans,
+ error = xfs_da3_node_read(state->args->trans,
state->args->dp,
blk->blkno, blk->disk_blkno,
&blk->bp, XFS_ATTR_FORK);
@@ -1749,13 +1424,11 @@ xfs_attr_node_get(xfs_da_args_t *args)
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_attr_node_ents;
/*
* Search to see if name exists, and get back a pointer to it.
*/
- error = xfs_da_node_lookup_int(state, &retval);
+ error = xfs_da3_node_lookup_int(state, &retval);
if (error) {
retval = error;
} else if (retval == EEXIST) {
@@ -1766,7 +1439,7 @@ xfs_attr_node_get(xfs_da_args_t *args)
/*
* Get the value, local or "remote"
*/
- retval = xfs_attr_leaf_getvalue(blk->bp, args);
+ retval = xfs_attr3_leaf_getvalue(blk->bp, args);
if (!retval && (args->rmtblkno > 0)
&& !(args->flags & ATTR_KERNOVAL)) {
retval = xfs_attr_rmtval_get(args);
@@ -1784,420 +1457,3 @@ xfs_attr_node_get(xfs_da_args_t *args)
xfs_da_state_free(state);
return(retval);
}
-
-STATIC int /* error */
-xfs_attr_node_list(xfs_attr_list_context_t *context)
-{
- attrlist_cursor_kern_t *cursor;
- xfs_attr_leafblock_t *leaf;
- xfs_da_intnode_t *node;
- xfs_da_node_entry_t *btree;
- int error, i;
- struct xfs_buf *bp;
-
- trace_xfs_attr_node_list(context);
-
- cursor = context->cursor;
- cursor->initted = 1;
-
- /*
- * Do all sorts of validation on the passed-in cursor structure.
- * If anything is amiss, ignore the cursor and look up the hashval
- * starting from the btree root.
- */
- bp = NULL;
- if (cursor->blkno > 0) {
- error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1,
- &bp, XFS_ATTR_FORK);
- if ((error != 0) && (error != EFSCORRUPTED))
- return(error);
- if (bp) {
- node = bp->b_addr;
- switch (be16_to_cpu(node->hdr.info.magic)) {
- case XFS_DA_NODE_MAGIC:
- trace_xfs_attr_list_wrong_blk(context);
- xfs_trans_brelse(NULL, bp);
- bp = NULL;
- break;
- case XFS_ATTR_LEAF_MAGIC:
- leaf = bp->b_addr;
- if (cursor->hashval > be32_to_cpu(leaf->entries[
- be16_to_cpu(leaf->hdr.count)-1].hashval)) {
- trace_xfs_attr_list_wrong_blk(context);
- xfs_trans_brelse(NULL, bp);
- bp = NULL;
- } else if (cursor->hashval <=
- be32_to_cpu(leaf->entries[0].hashval)) {
- trace_xfs_attr_list_wrong_blk(context);
- xfs_trans_brelse(NULL, bp);
- bp = NULL;
- }
- break;
- default:
- trace_xfs_attr_list_wrong_blk(context);
- xfs_trans_brelse(NULL, bp);
- bp = NULL;
- }
- }
- }
-
- /*
- * We did not find what we expected given the cursor's contents,
- * so we start from the top and work down based on the hash value.
- * Note that start of node block is same as start of leaf block.
- */
- if (bp == NULL) {
- cursor->blkno = 0;
- for (;;) {
- error = xfs_da_node_read(NULL, context->dp,
- cursor->blkno, -1, &bp,
- XFS_ATTR_FORK);
- if (error)
- return(error);
- node = bp->b_addr;
- if (node->hdr.info.magic ==
- cpu_to_be16(XFS_ATTR_LEAF_MAGIC))
- break;
- if (unlikely(node->hdr.info.magic !=
- cpu_to_be16(XFS_DA_NODE_MAGIC))) {
- XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
- XFS_ERRLEVEL_LOW,
- context->dp->i_mount,
- node);
- xfs_trans_brelse(NULL, bp);
- return(XFS_ERROR(EFSCORRUPTED));
- }
- btree = node->btree;
- for (i = 0; i < be16_to_cpu(node->hdr.count);
- btree++, i++) {
- if (cursor->hashval
- <= be32_to_cpu(btree->hashval)) {
- cursor->blkno = be32_to_cpu(btree->before);
- trace_xfs_attr_list_node_descend(context,
- btree);
- break;
- }
- }
- if (i == be16_to_cpu(node->hdr.count)) {
- xfs_trans_brelse(NULL, bp);
- return(0);
- }
- xfs_trans_brelse(NULL, bp);
- }
- }
- ASSERT(bp != NULL);
-
- /*
- * Roll upward through the blocks, processing each leaf block in
- * order. As long as there is space in the result buffer, keep
- * adding the information.
- */
- for (;;) {
- leaf = bp->b_addr;
- error = xfs_attr_leaf_list_int(bp, context);
- if (error) {
- xfs_trans_brelse(NULL, bp);
- return error;
- }
- if (context->seen_enough || leaf->hdr.info.forw == 0)
- break;
- cursor->blkno = be32_to_cpu(leaf->hdr.info.forw);
- xfs_trans_brelse(NULL, bp);
- error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1,
- &bp);
- if (error)
- return error;
- }
- xfs_trans_brelse(NULL, bp);
- return(0);
-}
-
-
-/*========================================================================
- * External routines for manipulating out-of-line attribute values.
- *========================================================================*/
-
-/*
- * Read the value associated with an attribute from the out-of-line buffer
- * that we stored it in.
- */
-int
-xfs_attr_rmtval_get(xfs_da_args_t *args)
-{
- xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE];
- xfs_mount_t *mp;
- xfs_daddr_t dblkno;
- void *dst;
- xfs_buf_t *bp;
- int nmap, error, tmp, valuelen, blkcnt, i;
- xfs_dablk_t lblkno;
-
- trace_xfs_attr_rmtval_get(args);
-
- ASSERT(!(args->flags & ATTR_KERNOVAL));
-
- mp = args->dp->i_mount;
- dst = args->value;
- valuelen = args->valuelen;
- lblkno = args->rmtblkno;
- while (valuelen > 0) {
- nmap = ATTR_RMTVALUE_MAPSIZE;
- error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
- args->rmtblkcnt, map, &nmap,
- XFS_BMAPI_ATTRFORK);
- if (error)
- return(error);
- ASSERT(nmap >= 1);
-
- for (i = 0; (i < nmap) && (valuelen > 0); i++) {
- ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
- (map[i].br_startblock != HOLESTARTBLOCK));
- dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
- blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
- error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
- dblkno, blkcnt, 0, &bp, NULL);
- if (error)
- return(error);
-
- tmp = min_t(int, valuelen, BBTOB(bp->b_length));
- xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ);
- xfs_buf_relse(bp);
- dst += tmp;
- valuelen -= tmp;
-
- lblkno += map[i].br_blockcount;
- }
- }
- ASSERT(valuelen == 0);
- return(0);
-}
-
-/*
- * Write the value associated with an attribute into the out-of-line buffer
- * that we have defined for it.
- */
-STATIC int
-xfs_attr_rmtval_set(xfs_da_args_t *args)
-{
- xfs_mount_t *mp;
- xfs_fileoff_t lfileoff;
- xfs_inode_t *dp;
- xfs_bmbt_irec_t map;
- xfs_daddr_t dblkno;
- void *src;
- xfs_buf_t *bp;
- xfs_dablk_t lblkno;
- int blkcnt, valuelen, nmap, error, tmp, committed;
-
- trace_xfs_attr_rmtval_set(args);
-
- dp = args->dp;
- mp = dp->i_mount;
- src = args->value;
-
- /*
- * Find a "hole" in the attribute address space large enough for
- * us to drop the new attribute's value into.
- */
- blkcnt = XFS_B_TO_FSB(mp, args->valuelen);
- lfileoff = 0;
- error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
- XFS_ATTR_FORK);
- if (error) {
- return(error);
- }
- args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
- args->rmtblkcnt = blkcnt;
-
- /*
- * Roll through the "value", allocating blocks on disk as required.
- */
- while (blkcnt > 0) {
- /*
- * Allocate a single extent, up to the size of the value.
- */
- xfs_bmap_init(args->flist, args->firstblock);
- nmap = 1;
- error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
- blkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- args->firstblock, args->total, &map, &nmap,
- args->flist);
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
- if (error) {
- ASSERT(committed);
- args->trans = NULL;
- xfs_bmap_cancel(args->flist);
- return(error);
- }
-
- /*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
-
- ASSERT(nmap == 1);
- ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
- (map.br_startblock != HOLESTARTBLOCK));
- lblkno += map.br_blockcount;
- blkcnt -= map.br_blockcount;
-
- /*
- * Start the next trans in the chain.
- */
- error = xfs_trans_roll(&args->trans, dp);
- if (error)
- return (error);
- }
-
- /*
- * Roll through the "value", copying the attribute value to the
- * already-allocated blocks. Blocks are written synchronously
- * so that we can know they are all on disk before we turn off
- * the INCOMPLETE flag.
- */
- lblkno = args->rmtblkno;
- valuelen = args->valuelen;
- while (valuelen > 0) {
- int buflen;
-
- /*
- * Try to remember where we decided to put the value.
- */
- xfs_bmap_init(args->flist, args->firstblock);
- nmap = 1;
- error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
- args->rmtblkcnt, &map, &nmap,
- XFS_BMAPI_ATTRFORK);
- if (error)
- return(error);
- ASSERT(nmap == 1);
- ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
- (map.br_startblock != HOLESTARTBLOCK));
-
- dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
- blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
-
- bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0);
- if (!bp)
- return ENOMEM;
-
- buflen = BBTOB(bp->b_length);
- tmp = min_t(int, valuelen, buflen);
- xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
- if (tmp < buflen)
- xfs_buf_zero(bp, tmp, buflen - tmp);
-
- error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
- xfs_buf_relse(bp);
- if (error)
- return error;
- src += tmp;
- valuelen -= tmp;
-
- lblkno += map.br_blockcount;
- }
- ASSERT(valuelen == 0);
- return(0);
-}
-
-/*
- * Remove the value associated with an attribute by deleting the
- * out-of-line buffer that it is stored on.
- */
-STATIC int
-xfs_attr_rmtval_remove(xfs_da_args_t *args)
-{
- xfs_mount_t *mp;
- xfs_bmbt_irec_t map;
- xfs_buf_t *bp;
- xfs_daddr_t dblkno;
- xfs_dablk_t lblkno;
- int valuelen, blkcnt, nmap, error, done, committed;
-
- trace_xfs_attr_rmtval_remove(args);
-
- mp = args->dp->i_mount;
-
- /*
- * Roll through the "value", invalidating the attribute value's
- * blocks.
- */
- lblkno = args->rmtblkno;
- valuelen = args->rmtblkcnt;
- while (valuelen > 0) {
- /*
- * Try to remember where we decided to put the value.
- */
- nmap = 1;
- error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
- args->rmtblkcnt, &map, &nmap,
- XFS_BMAPI_ATTRFORK);
- if (error)
- return(error);
- ASSERT(nmap == 1);
- ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
- (map.br_startblock != HOLESTARTBLOCK));
-
- dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
- blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
-
- /*
- * If the "remote" value is in the cache, remove it.
- */
- bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK);
- if (bp) {
- xfs_buf_stale(bp);
- xfs_buf_relse(bp);
- bp = NULL;
- }
-
- valuelen -= map.br_blockcount;
-
- lblkno += map.br_blockcount;
- }
-
- /*
- * Keep de-allocating extents until the remote-value region is gone.
- */
- lblkno = args->rmtblkno;
- blkcnt = args->rmtblkcnt;
- done = 0;
- while (!done) {
- xfs_bmap_init(args->flist, args->firstblock);
- error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- 1, args->firstblock, args->flist,
- &done);
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
- if (error) {
- ASSERT(committed);
- args->trans = NULL;
- xfs_bmap_cancel(args->flist);
- return(error);
- }
-
- /*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, args->dp, 0);
-
- /*
- * Close out trans and start the next one in the chain.
- */
- error = xfs_trans_roll(&args->trans, args->dp);
- if (error)
- return (error);
- }
- return(0);
-}
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index e920d68ef50..dd482458947 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -140,7 +140,15 @@ typedef struct xfs_attr_list_context {
* Overall external interface routines.
*/
int xfs_attr_inactive(struct xfs_inode *dp);
-int xfs_attr_rmtval_get(struct xfs_da_args *args);
int xfs_attr_list_int(struct xfs_attr_list_context *);
+int xfs_inode_hasattr(struct xfs_inode *ip);
+int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
+ unsigned char *value, int *valuelenp, int flags);
+int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
+ unsigned char *value, int valuelen, int flags);
+int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
+int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
+ int flags, struct attrlist_cursor_kern *cursor);
+
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
new file mode 100644
index 00000000000..09480c57f06
--- /dev/null
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_attr_remote.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_dinode.h"
+#include "xfs_dir2.h"
+
+/*
+ * Look at all the extents for this logical region,
+ * invalidate any buffers that are incore/in transactions.
+ */
+STATIC int
+xfs_attr3_leaf_freextent(
+ struct xfs_trans **trans,
+ struct xfs_inode *dp,
+ xfs_dablk_t blkno,
+ int blkcnt)
+{
+ struct xfs_bmbt_irec map;
+ struct xfs_buf *bp;
+ xfs_dablk_t tblkno;
+ xfs_daddr_t dblkno;
+ int tblkcnt;
+ int dblkcnt;
+ int nmap;
+ int error;
+
+ /*
+ * Roll through the "value", invalidating the attribute value's
+ * blocks.
+ */
+ tblkno = blkno;
+ tblkcnt = blkcnt;
+ while (tblkcnt > 0) {
+ /*
+ * Try to remember where we decided to put the value.
+ */
+ nmap = 1;
+ error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
+ &map, &nmap, XFS_BMAPI_ATTRFORK);
+ if (error) {
+ return(error);
+ }
+ ASSERT(nmap == 1);
+ ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+
+ /*
+ * If it's a hole, these are already unmapped
+ * so there's nothing to invalidate.
+ */
+ if (map.br_startblock != HOLESTARTBLOCK) {
+
+ dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
+ map.br_startblock);
+ dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
+ map.br_blockcount);
+ bp = xfs_trans_get_buf(*trans,
+ dp->i_mount->m_ddev_targp,
+ dblkno, dblkcnt, 0);
+ if (!bp)
+ return ENOMEM;
+ xfs_trans_binval(*trans, bp);
+ /*
+ * Roll to next transaction.
+ */
+ error = xfs_trans_roll(trans, dp);
+ if (error)
+ return (error);
+ }
+
+ tblkno += map.br_blockcount;
+ tblkcnt -= map.br_blockcount;
+ }
+
+ return(0);
+}
+
+/*
+ * Invalidate all of the "remote" value regions pointed to by a particular
+ * leaf block.
+ * Note that we must release the lock on the buffer so that we are not
+ * caught holding something that the logging code wants to flush to disk.
+ */
+STATIC int
+xfs_attr3_leaf_inactive(
+ struct xfs_trans **trans,
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_attr_inactive_list *list;
+ struct xfs_attr_inactive_list *lp;
+ int error;
+ int count;
+ int size;
+ int tmp;
+ int i;
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+
+ /*
+ * Count the number of "remote" value extents.
+ */
+ count = 0;
+ entry = xfs_attr3_leaf_entryp(leaf);
+ for (i = 0; i < ichdr.count; entry++, i++) {
+ if (be16_to_cpu(entry->nameidx) &&
+ ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+ if (name_rmt->valueblk)
+ count++;
+ }
+ }
+
+ /*
+ * If there are no "remote" values, we're done.
+ */
+ if (count == 0) {
+ xfs_trans_brelse(*trans, bp);
+ return 0;
+ }
+
+ /*
+ * Allocate storage for a list of all the "remote" value extents.
+ */
+ size = count * sizeof(xfs_attr_inactive_list_t);
+ list = kmem_alloc(size, KM_SLEEP);
+
+ /*
+ * Identify each of the "remote" value extents.
+ */
+ lp = list;
+ entry = xfs_attr3_leaf_entryp(leaf);
+ for (i = 0; i < ichdr.count; entry++, i++) {
+ if (be16_to_cpu(entry->nameidx) &&
+ ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+ if (name_rmt->valueblk) {
+ lp->valueblk = be32_to_cpu(name_rmt->valueblk);
+ lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
+ be32_to_cpu(name_rmt->valuelen));
+ lp++;
+ }
+ }
+ }
+ xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */
+
+ /*
+ * Invalidate each of the "remote" value extents.
+ */
+ error = 0;
+ for (lp = list, i = 0; i < count; i++, lp++) {
+ tmp = xfs_attr3_leaf_freextent(trans, dp,
+ lp->valueblk, lp->valuelen);
+
+ if (error == 0)
+ error = tmp; /* save only the 1st errno */
+ }
+
+ kmem_free(list);
+ return error;
+}
+
+/*
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ */
+STATIC int
+xfs_attr3_node_inactive(
+ struct xfs_trans **trans,
+ struct xfs_inode *dp,
+ struct xfs_buf *bp,
+ int level)
+{
+ xfs_da_blkinfo_t *info;
+ xfs_da_intnode_t *node;
+ xfs_dablk_t child_fsb;
+ xfs_daddr_t parent_blkno, child_blkno;
+ int error, i;
+ struct xfs_buf *child_bp;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr ichdr;
+
+ /*
+ * Since this code is recursive (gasp!) we must protect ourselves.
+ */
+ if (level > XFS_DA_NODE_MAXDEPTH) {
+ xfs_trans_brelse(*trans, bp); /* no locks for later trans */
+ return XFS_ERROR(EIO);
+ }
+
+ node = bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&ichdr, node);
+ parent_blkno = bp->b_bn;
+ if (!ichdr.count) {
+ xfs_trans_brelse(*trans, bp);
+ return 0;
+ }
+ btree = dp->d_ops->node_tree_p(node);
+ child_fsb = be32_to_cpu(btree[0].before);
+ xfs_trans_brelse(*trans, bp); /* no locks for later trans */
+
+ /*
+ * If this is the node level just above the leaves, simply loop
+ * over the leaves removing all of them. If this is higher up
+ * in the tree, recurse downward.
+ */
+ for (i = 0; i < ichdr.count; i++) {
+ /*
+ * Read the subsidiary block to see what we have to work with.
+ * Don't do this in a transaction. This is a depth-first
+ * traversal of the tree so we may deal with many blocks
+ * before we come back to this one.
+ */
+ error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
+ XFS_ATTR_FORK);
+ if (error)
+ return(error);
+ if (child_bp) {
+ /* save for re-read later */
+ child_blkno = XFS_BUF_ADDR(child_bp);
+
+ /*
+ * Invalidate the subtree, however we have to.
+ */
+ info = child_bp->b_addr;
+ switch (info->magic) {
+ case cpu_to_be16(XFS_DA_NODE_MAGIC):
+ case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+ error = xfs_attr3_node_inactive(trans, dp,
+ child_bp, level + 1);
+ break;
+ case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
+ case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+ error = xfs_attr3_leaf_inactive(trans, dp,
+ child_bp);
+ break;
+ default:
+ error = XFS_ERROR(EIO);
+ xfs_trans_brelse(*trans, child_bp);
+ break;
+ }
+ if (error)
+ return error;
+
+ /*
+ * Remove the subsidiary block from the cache
+ * and from the log.
+ */
+ error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
+ &child_bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ xfs_trans_binval(*trans, child_bp);
+ }
+
+ /*
+ * If we're not done, re-read the parent to get the next
+ * child block number.
+ */
+ if (i + 1 < ichdr.count) {
+ error = xfs_da3_node_read(*trans, dp, 0, parent_blkno,
+ &bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ child_fsb = be32_to_cpu(btree[i + 1].before);
+ xfs_trans_brelse(*trans, bp);
+ }
+ /*
+ * Atomically commit the whole invalidate stuff.
+ */
+ error = xfs_trans_roll(trans, dp);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Indiscriminately delete the entire attribute fork
+ *
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ */
+int
+xfs_attr3_root_inactive(
+ struct xfs_trans **trans,
+ struct xfs_inode *dp)
+{
+ struct xfs_da_blkinfo *info;
+ struct xfs_buf *bp;
+ xfs_daddr_t blkno;
+ int error;
+
+ /*
+ * Read block 0 to see what we have to work with.
+ * We only get here if we have extents, since we remove
+ * the extents in reverse order the extent containing
+ * block 0 must still be there.
+ */
+ error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ blkno = bp->b_bn;
+
+ /*
+ * Invalidate the tree, even if the "tree" is only a single leaf block.
+ * This is a depth-first traversal!
+ */
+ info = bp->b_addr;
+ switch (info->magic) {
+ case cpu_to_be16(XFS_DA_NODE_MAGIC):
+ case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+ error = xfs_attr3_node_inactive(trans, dp, bp, 1);
+ break;
+ case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
+ case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+ error = xfs_attr3_leaf_inactive(trans, dp, bp);
+ break;
+ default:
+ error = XFS_ERROR(EIO);
+ xfs_trans_brelse(*trans, bp);
+ break;
+ }
+ if (error)
+ return error;
+
+ /*
+ * Invalidate the incore copy of the root block.
+ */
+ error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ xfs_trans_binval(*trans, bp); /* remove from cache */
+ /*
+ * Commit the invalidate and start the next transaction.
+ */
+ error = xfs_trans_roll(trans, dp);
+
+ return error;
+}
+
+int
+xfs_attr_inactive(xfs_inode_t *dp)
+{
+ xfs_trans_t *trans;
+ xfs_mount_t *mp;
+ int error;
+
+ mp = dp->i_mount;
+ ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
+
+ xfs_ilock(dp, XFS_ILOCK_SHARED);
+ if (!xfs_inode_hasattr(dp) ||
+ dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ xfs_iunlock(dp, XFS_ILOCK_SHARED);
+ return 0;
+ }
+ xfs_iunlock(dp, XFS_ILOCK_SHARED);
+
+ /*
+ * Start our first transaction of the day.
+ *
+ * All future transactions during this code must be "chained" off
+ * this one via the trans_dup() call. All transactions will contain
+ * the inode, and the inode will always be marked with trans_ihold().
+ * Since the inode will be locked in all transactions, we must log
+ * the inode in every transaction to let it float upward through
+ * the log.
+ */
+ trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
+ error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
+ if (error) {
+ xfs_trans_cancel(trans, 0);
+ return(error);
+ }
+ xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+ /*
+ * No need to make quota reservations here. We expect to release some
+ * blocks, not allocate, in the common case.
+ */
+ xfs_trans_ijoin(trans, dp, 0);
+
+ /*
+ * Decide on what work routines to call based on the inode size.
+ */
+ if (!xfs_inode_hasattr(dp) ||
+ dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ error = 0;
+ goto out;
+ }
+ error = xfs_attr3_root_inactive(&trans, dp);
+ if (error)
+ goto out;
+
+ error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
+ if (error)
+ goto out;
+
+ error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+ return(error);
+
+out:
+ xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ return(error);
+}
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index ee24993c7d1..28712d29e43 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -17,28 +18,32 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
+#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+#include "xfs_dinode.h"
+#include "xfs_dir2.h"
+
/*
* xfs_attr_leaf.c
@@ -53,85 +58,218 @@
/*
* Routines used for growing the Btree.
*/
-STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block,
- struct xfs_buf **bpp);
-STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer,
- xfs_da_args_t *args, int freemap_index);
-STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args,
- struct xfs_buf *leaf_buffer);
-STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state,
+STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args,
+ xfs_dablk_t which_block, struct xfs_buf **bpp);
+STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer,
+ struct xfs_attr3_icleaf_hdr *ichdr,
+ struct xfs_da_args *args, int freemap_index);
+STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args,
+ struct xfs_attr3_icleaf_hdr *ichdr,
+ struct xfs_buf *leaf_buffer);
+STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state,
xfs_da_state_blk_t *blk1,
xfs_da_state_blk_t *blk2);
-STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
- xfs_da_state_blk_t *leaf_blk_1,
- xfs_da_state_blk_t *leaf_blk_2,
- int *number_entries_in_blk1,
- int *number_usedbytes_in_blk1);
-
-/*
- * Routines used for shrinking the Btree.
- */
-STATIC int xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
- struct xfs_buf *bp, int level);
-STATIC int xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
- struct xfs_buf *bp);
-STATIC int xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
- xfs_dablk_t blkno, int blkcnt);
+STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *leaf_blk_1,
+ struct xfs_attr3_icleaf_hdr *ichdr1,
+ xfs_da_state_blk_t *leaf_blk_2,
+ struct xfs_attr3_icleaf_hdr *ichdr2,
+ int *number_entries_in_blk1,
+ int *number_usedbytes_in_blk1);
/*
* Utility routines.
*/
-STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
- int src_start,
- xfs_attr_leafblock_t *dst_leaf,
- int dst_start, int move_count,
- xfs_mount_t *mp);
+STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
+ struct xfs_attr_leafblock *src_leaf,
+ struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start,
+ struct xfs_attr_leafblock *dst_leaf,
+ struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start,
+ int move_count);
STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
-static void
-xfs_attr_leaf_verify(
+void
+xfs_attr3_leaf_hdr_from_disk(
+ struct xfs_attr3_icleaf_hdr *to,
+ struct xfs_attr_leafblock *from)
+{
+ int i;
+
+ ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+ from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+ if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
+ struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from;
+
+ to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+ to->back = be32_to_cpu(hdr3->info.hdr.back);
+ to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+ to->count = be16_to_cpu(hdr3->count);
+ to->usedbytes = be16_to_cpu(hdr3->usedbytes);
+ to->firstused = be16_to_cpu(hdr3->firstused);
+ to->holes = hdr3->holes;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base);
+ to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size);
+ }
+ return;
+ }
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.count);
+ to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
+ to->firstused = be16_to_cpu(from->hdr.firstused);
+ to->holes = from->hdr.holes;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base);
+ to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size);
+ }
+}
+
+void
+xfs_attr3_leaf_hdr_to_disk(
+ struct xfs_attr_leafblock *to,
+ struct xfs_attr3_icleaf_hdr *from)
+{
+ int i;
+
+ ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
+ from->magic == XFS_ATTR3_LEAF_MAGIC);
+
+ if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
+ struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to;
+
+ hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+ hdr3->info.hdr.back = cpu_to_be32(from->back);
+ hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+ hdr3->count = cpu_to_be16(from->count);
+ hdr3->usedbytes = cpu_to_be16(from->usedbytes);
+ hdr3->firstused = cpu_to_be16(from->firstused);
+ hdr3->holes = from->holes;
+ hdr3->pad1 = 0;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base);
+ hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size);
+ }
+ return;
+ }
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.count = cpu_to_be16(from->count);
+ to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
+ to->hdr.firstused = cpu_to_be16(from->firstused);
+ to->hdr.holes = from->holes;
+ to->hdr.pad1 = 0;
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base);
+ to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size);
+ }
+}
+
+static bool
+xfs_attr3_leaf_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- struct xfs_attr_leaf_hdr *hdr = bp->b_addr;
- int block_ok = 0;
+ struct xfs_attr_leafblock *leaf = bp->b_addr;
+ struct xfs_attr3_icleaf_hdr ichdr;
- block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC);
- if (!block_ok) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+ if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
+ return false;
+
+ if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
+ return false;
+ } else {
+ if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
+ return false;
}
+ if (ichdr.count == 0)
+ return false;
+
+ /* XXX: need to range check rest of attr header values */
+ /* XXX: hash order check? */
+
+ return true;
}
static void
-xfs_attr_leaf_read_verify(
+xfs_attr3_leaf_write_verify(
struct xfs_buf *bp)
{
- xfs_attr_leaf_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_attr3_leaf_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
}
+/*
+ * leaf/node format detection on trees is sketchy, so a node read can be done on
+ * leaf level blocks when detection identifies the tree as a node format tree
+ * incorrectly. In this case, we need to swap the verifier to match the correct
+ * format of the block being read.
+ */
static void
-xfs_attr_leaf_write_verify(
- struct xfs_buf *bp)
+xfs_attr3_leaf_read_verify(
+ struct xfs_buf *bp)
{
- xfs_attr_leaf_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_attr3_leaf_verify(bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
}
-const struct xfs_buf_ops xfs_attr_leaf_buf_ops = {
- .verify_read = xfs_attr_leaf_read_verify,
- .verify_write = xfs_attr_leaf_write_verify,
+const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
+ .verify_read = xfs_attr3_leaf_read_verify,
+ .verify_write = xfs_attr3_leaf_write_verify,
};
int
-xfs_attr_leaf_read(
+xfs_attr3_leaf_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
xfs_daddr_t mappedbno,
struct xfs_buf **bpp)
{
- return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
- XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops);
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+ XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
+ if (!err && tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
+ return err;
}
/*========================================================================
@@ -172,7 +310,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
int dsize;
xfs_mount_t *mp = dp->i_mount;
- offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */
+ /* rounded down */
+ offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
switch (dp->i_d.di_format) {
case XFS_DINODE_FMT_DEV:
@@ -231,7 +370,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
return 0;
return dp->i_d.di_forkoff;
}
- dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot);
+ dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
break;
}
@@ -243,7 +382,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
minforkoff = roundup(minforkoff, 8) >> 3;
/* attr fork btree root can have at least this many key/ptr pairs */
- maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) -
+ XFS_BMDR_SPACE_CALC(MINABTPTRS);
maxforkoff = maxforkoff >> 3; /* rounded down */
if (offset >= maxforkoff)
@@ -487,7 +627,7 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
xfs_attr_sf_entry_t *sfe;
int i;
- ASSERT(args->dp->i_d.di_aformat == XFS_IFINLINE);
+ ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
sfe = &sf->list[0];
for (i = 0; i < sf->hdr.count;
@@ -542,6 +682,8 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
sf = (xfs_attr_shortform_t *)tmpbuffer;
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+ xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
+
bp = NULL;
error = xfs_da_grow_inode(args, &blkno);
if (error) {
@@ -557,7 +699,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
}
ASSERT(blkno == 0);
- error = xfs_attr_leaf_create(args, blkno, &bp);
+ error = xfs_attr3_leaf_create(args, blkno, &bp);
if (error) {
error = xfs_da_shrink_inode(args, 0, bp);
bp = NULL;
@@ -570,6 +712,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
memset((char *)&nargs, 0, sizeof(nargs));
nargs.dp = dp;
+ nargs.geo = args->geo;
nargs.firstblock = args->firstblock;
nargs.flist = args->flist;
nargs.total = args->total;
@@ -586,9 +729,9 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
nargs.hashval = xfs_da_hashname(sfe->nameval,
sfe->namelen);
nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
- error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */
+ error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
ASSERT(error == ENOATTR);
- error = xfs_attr_leaf_add(bp, &nargs);
+ error = xfs_attr3_leaf_add(bp, &nargs);
ASSERT(error != ENOSPC);
if (error)
goto out;
@@ -601,250 +744,81 @@ out:
return(error);
}
-STATIC int
-xfs_attr_shortform_compare(const void *a, const void *b)
-{
- xfs_attr_sf_sort_t *sa, *sb;
-
- sa = (xfs_attr_sf_sort_t *)a;
- sb = (xfs_attr_sf_sort_t *)b;
- if (sa->hash < sb->hash) {
- return(-1);
- } else if (sa->hash > sb->hash) {
- return(1);
- } else {
- return(sa->entno - sb->entno);
- }
-}
-
-
-#define XFS_ISRESET_CURSOR(cursor) \
- (!((cursor)->initted) && !((cursor)->hashval) && \
- !((cursor)->blkno) && !((cursor)->offset))
-/*
- * Copy out entries of shortform attribute lists for attr_list().
- * Shortform attribute lists are not stored in hashval sorted order.
- * If the output buffer is not large enough to hold them all, then we
- * we have to calculate each entries' hashvalue and sort them before
- * we can begin returning them to the user.
- */
-/*ARGSUSED*/
-int
-xfs_attr_shortform_list(xfs_attr_list_context_t *context)
-{
- attrlist_cursor_kern_t *cursor;
- xfs_attr_sf_sort_t *sbuf, *sbp;
- xfs_attr_shortform_t *sf;
- xfs_attr_sf_entry_t *sfe;
- xfs_inode_t *dp;
- int sbsize, nsbuf, count, i;
- int error;
-
- ASSERT(context != NULL);
- dp = context->dp;
- ASSERT(dp != NULL);
- ASSERT(dp->i_afp != NULL);
- sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
- ASSERT(sf != NULL);
- if (!sf->hdr.count)
- return(0);
- cursor = context->cursor;
- ASSERT(cursor != NULL);
-
- trace_xfs_attr_list_sf(context);
-
- /*
- * If the buffer is large enough and the cursor is at the start,
- * do not bother with sorting since we will return everything in
- * one buffer and another call using the cursor won't need to be
- * made.
- * Note the generous fudge factor of 16 overhead bytes per entry.
- * If bufsize is zero then put_listent must be a search function
- * and can just scan through what we have.
- */
- if (context->bufsize == 0 ||
- (XFS_ISRESET_CURSOR(cursor) &&
- (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
- for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
- error = context->put_listent(context,
- sfe->flags,
- sfe->nameval,
- (int)sfe->namelen,
- (int)sfe->valuelen,
- &sfe->nameval[sfe->namelen]);
-
- /*
- * Either search callback finished early or
- * didn't fit it all in the buffer after all.
- */
- if (context->seen_enough)
- break;
-
- if (error)
- return error;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
- }
- trace_xfs_attr_list_sf_all(context);
- return(0);
- }
-
- /* do no more for a search callback */
- if (context->bufsize == 0)
- return 0;
-
- /*
- * It didn't all fit, so we have to sort everything on hashval.
- */
- sbsize = sf->hdr.count * sizeof(*sbuf);
- sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
-
- /*
- * Scan the attribute list for the rest of the entries, storing
- * the relevant info from only those that match into a buffer.
- */
- nsbuf = 0;
- for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
- if (unlikely(
- ((char *)sfe < (char *)sf) ||
- ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
- XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
- XFS_ERRLEVEL_LOW,
- context->dp->i_mount, sfe);
- kmem_free(sbuf);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- sbp->entno = i;
- sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
- sbp->name = sfe->nameval;
- sbp->namelen = sfe->namelen;
- /* These are bytes, and both on-disk, don't endian-flip */
- sbp->valuelen = sfe->valuelen;
- sbp->flags = sfe->flags;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
- sbp++;
- nsbuf++;
- }
-
- /*
- * Sort the entries on hash then entno.
- */
- xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
-
- /*
- * Re-find our place IN THE SORTED LIST.
- */
- count = 0;
- cursor->initted = 1;
- cursor->blkno = 0;
- for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
- if (sbp->hash == cursor->hashval) {
- if (cursor->offset == count) {
- break;
- }
- count++;
- } else if (sbp->hash > cursor->hashval) {
- break;
- }
- }
- if (i == nsbuf) {
- kmem_free(sbuf);
- return(0);
- }
-
- /*
- * Loop putting entries into the user buffer.
- */
- for ( ; i < nsbuf; i++, sbp++) {
- if (cursor->hashval != sbp->hash) {
- cursor->hashval = sbp->hash;
- cursor->offset = 0;
- }
- error = context->put_listent(context,
- sbp->flags,
- sbp->name,
- sbp->namelen,
- sbp->valuelen,
- &sbp->name[sbp->namelen]);
- if (error)
- return error;
- if (context->seen_enough)
- break;
- cursor->offset++;
- }
-
- kmem_free(sbuf);
- return(0);
-}
-
/*
* Check a leaf attribute block to see if all the entries would fit into
* a shortform attribute list.
*/
int
xfs_attr_shortform_allfit(
- struct xfs_buf *bp,
- struct xfs_inode *dp)
+ struct xfs_buf *bp,
+ struct xfs_inode *dp)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
xfs_attr_leaf_name_local_t *name_loc;
- int bytes, i;
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ int bytes;
+ int i;
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+ xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+ entry = xfs_attr3_leaf_entryp(leaf);
- entry = &leaf->entries[0];
bytes = sizeof(struct xfs_attr_sf_hdr);
- for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+ for (i = 0; i < leafhdr.count; entry++, i++) {
if (entry->flags & XFS_ATTR_INCOMPLETE)
continue; /* don't copy partial entries */
if (!(entry->flags & XFS_ATTR_LOCAL))
return(0);
- name_loc = xfs_attr_leaf_name_local(leaf, i);
+ name_loc = xfs_attr3_leaf_name_local(leaf, i);
if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
return(0);
if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
return(0);
- bytes += sizeof(struct xfs_attr_sf_entry)-1
+ bytes += sizeof(struct xfs_attr_sf_entry) - 1
+ name_loc->namelen
+ be16_to_cpu(name_loc->valuelen);
}
if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
(dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
(bytes == sizeof(struct xfs_attr_sf_hdr)))
- return(-1);
- return(xfs_attr_shortform_bytesfit(dp, bytes));
+ return -1;
+ return xfs_attr_shortform_bytesfit(dp, bytes);
}
/*
* Convert a leaf attribute list to shortform attribute list
*/
int
-xfs_attr_leaf_to_shortform(
- struct xfs_buf *bp,
- xfs_da_args_t *args,
- int forkoff)
+xfs_attr3_leaf_to_shortform(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args,
+ int forkoff)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_local_t *name_loc;
- xfs_da_args_t nargs;
- xfs_inode_t *dp;
- char *tmpbuffer;
- int error, i;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_da_args nargs;
+ struct xfs_inode *dp = args->dp;
+ char *tmpbuffer;
+ int error;
+ int i;
trace_xfs_attr_leaf_to_sf(args);
- dp = args->dp;
- tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
- ASSERT(tmpbuffer != NULL);
+ tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+ if (!tmpbuffer)
+ return ENOMEM;
+
+ memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
- ASSERT(bp != NULL);
- memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(dp->i_mount));
leaf = (xfs_attr_leafblock_t *)tmpbuffer;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- memset(bp->b_addr, 0, XFS_LBSIZE(dp->i_mount));
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ entry = xfs_attr3_leaf_entryp(leaf);
+
+ /* XXX (dgc): buffer is about to be marked stale - why zero it? */
+ memset(bp->b_addr, 0, args->geo->blksize);
/*
* Clean out the prior contents of the attribute list.
@@ -866,6 +840,7 @@ xfs_attr_leaf_to_shortform(
* Copy the attributes
*/
memset((char *)&nargs, 0, sizeof(nargs));
+ nargs.geo = args->geo;
nargs.dp = dp;
nargs.firstblock = args->firstblock;
nargs.flist = args->flist;
@@ -873,14 +848,14 @@ xfs_attr_leaf_to_shortform(
nargs.whichfork = XFS_ATTR_FORK;
nargs.trans = args->trans;
nargs.op_flags = XFS_DA_OP_OKNOENT;
- entry = &leaf->entries[0];
- for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+
+ for (i = 0; i < ichdr.count; entry++, i++) {
if (entry->flags & XFS_ATTR_INCOMPLETE)
continue; /* don't copy partial entries */
if (!entry->nameidx)
continue;
ASSERT(entry->flags & XFS_ATTR_LOCAL);
- name_loc = xfs_attr_leaf_name_local(leaf, i);
+ name_loc = xfs_attr3_leaf_name_local(leaf, i);
nargs.name = name_loc->nameval;
nargs.namelen = name_loc->namelen;
nargs.value = &name_loc->nameval[nargs.namelen];
@@ -893,64 +868,77 @@ xfs_attr_leaf_to_shortform(
out:
kmem_free(tmpbuffer);
- return(error);
+ return error;
}
/*
* Convert from using a single leaf to a root node and a leaf.
*/
int
-xfs_attr_leaf_to_node(xfs_da_args_t *args)
+xfs_attr3_leaf_to_node(
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf;
- xfs_da_intnode_t *node;
- xfs_inode_t *dp;
- struct xfs_buf *bp1, *bp2;
- xfs_dablk_t blkno;
- int error;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr icleafhdr;
+ struct xfs_attr_leaf_entry *entries;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr icnodehdr;
+ struct xfs_da_intnode *node;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp1 = NULL;
+ struct xfs_buf *bp2 = NULL;
+ xfs_dablk_t blkno;
+ int error;
trace_xfs_attr_leaf_to_node(args);
- dp = args->dp;
- bp1 = bp2 = NULL;
error = xfs_da_grow_inode(args, &blkno);
if (error)
goto out;
- error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1);
+ error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1);
if (error)
goto out;
- bp2 = NULL;
- error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
- XFS_ATTR_FORK);
+ error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK);
if (error)
goto out;
+
+ /* copy leaf to new buffer, update identifiers */
+ xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
bp2->b_ops = bp1->b_ops;
- memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount));
- bp1 = NULL;
- xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
+ memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
+ hdr3->blkno = cpu_to_be64(bp2->b_bn);
+ }
+ xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1);
/*
* Set up the new root node.
*/
- error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
+ error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
if (error)
goto out;
node = bp1->b_addr;
+ dp->d_ops->node_hdr_from_disk(&icnodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
+
leaf = bp2->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+ xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+
/* both on-disk, don't endian-flip twice */
- node->btree[0].hashval =
- leaf->entries[be16_to_cpu(leaf->hdr.count)-1 ].hashval;
- node->btree[0].before = cpu_to_be32(blkno);
- node->hdr.count = cpu_to_be16(1);
- xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1);
+ btree[0].hashval = entries[icleafhdr.count - 1].hashval;
+ btree[0].before = cpu_to_be32(blkno);
+ icnodehdr.count = 1;
+ dp->d_ops->node_hdr_to_disk(node, &icnodehdr);
+ xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1);
error = 0;
out:
- return(error);
+ return error;
}
-
/*========================================================================
* Routines used for growing the Btree.
*========================================================================*/
@@ -960,52 +948,63 @@ out:
* or a leaf in a node attribute list.
*/
STATIC int
-xfs_attr_leaf_create(
- xfs_da_args_t *args,
- xfs_dablk_t blkno,
- struct xfs_buf **bpp)
+xfs_attr3_leaf_create(
+ struct xfs_da_args *args,
+ xfs_dablk_t blkno,
+ struct xfs_buf **bpp)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_hdr_t *hdr;
- xfs_inode_t *dp;
- struct xfs_buf *bp;
- int error;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ int error;
trace_xfs_attr_leaf_create(args);
- dp = args->dp;
- ASSERT(dp != NULL);
error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
XFS_ATTR_FORK);
if (error)
- return(error);
- bp->b_ops = &xfs_attr_leaf_buf_ops;
+ return error;
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
+ xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF);
leaf = bp->b_addr;
- memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
- hdr = &leaf->hdr;
- hdr->info.magic = cpu_to_be16(XFS_ATTR_LEAF_MAGIC);
- hdr->firstused = cpu_to_be16(XFS_LBSIZE(dp->i_mount));
- if (!hdr->firstused) {
- hdr->firstused = cpu_to_be16(
- XFS_LBSIZE(dp->i_mount) - XFS_ATTR_LEAF_NAME_ALIGN);
- }
+ memset(leaf, 0, args->geo->blksize);
- hdr->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t));
- hdr->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr->firstused) -
- sizeof(xfs_attr_leaf_hdr_t));
+ memset(&ichdr, 0, sizeof(ichdr));
+ ichdr.firstused = args->geo->blksize;
- xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
+
+ ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
+
+ hdr3->blkno = cpu_to_be64(bp->b_bn);
+ hdr3->owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+
+ ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
+ } else {
+ ichdr.magic = XFS_ATTR_LEAF_MAGIC;
+ ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr);
+ }
+ ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
+
+ xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+ xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
*bpp = bp;
- return(0);
+ return 0;
}
/*
* Split the leaf node, rebalance, then add the new entry.
*/
int
-xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
- xfs_da_state_blk_t *newblk)
+xfs_attr3_leaf_split(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk)
{
xfs_dablk_t blkno;
int error;
@@ -1019,7 +1018,7 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
error = xfs_da_grow_inode(state->args, &blkno);
if (error)
return(error);
- error = xfs_attr_leaf_create(state->args, blkno, &newblk->bp);
+ error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp);
if (error)
return(error);
newblk->blkno = blkno;
@@ -1029,8 +1028,8 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* Rebalance the entries across the two leaves.
* NOTE: rebalance() currently depends on the 2nd block being empty.
*/
- xfs_attr_leaf_rebalance(state, oldblk, newblk);
- error = xfs_da_blk_link(state, oldblk, newblk);
+ xfs_attr3_leaf_rebalance(state, oldblk, newblk);
+ error = xfs_da3_blk_link(state, oldblk, newblk);
if (error)
return(error);
@@ -1043,10 +1042,10 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
*/
if (state->inleaf) {
trace_xfs_attr_leaf_add_old(state->args);
- error = xfs_attr_leaf_add(oldblk->bp, state->args);
+ error = xfs_attr3_leaf_add(oldblk->bp, state->args);
} else {
trace_xfs_attr_leaf_add_new(state->args);
- error = xfs_attr_leaf_add(newblk->bp, state->args);
+ error = xfs_attr3_leaf_add(newblk->bp, state->args);
}
/*
@@ -1061,48 +1060,46 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* Add a name to the leaf attribute list structure.
*/
int
-xfs_attr_leaf_add(
+xfs_attr3_leaf_add(
struct xfs_buf *bp,
struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_hdr_t *hdr;
- xfs_attr_leaf_map_t *map;
- int tablesize, entsize, sum, tmp, i;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ int tablesize;
+ int entsize;
+ int sum;
+ int tmp;
+ int i;
trace_xfs_attr_leaf_add(args);
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- ASSERT((args->index >= 0)
- && (args->index <= be16_to_cpu(leaf->hdr.count)));
- hdr = &leaf->hdr;
- entsize = xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
- args->trans->t_mountp->m_sb.sb_blocksize, NULL);
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ ASSERT(args->index >= 0 && args->index <= ichdr.count);
+ entsize = xfs_attr_leaf_newentsize(args, NULL);
/*
* Search through freemap for first-fit on new name length.
* (may need to figure in size of entry struct too)
*/
- tablesize = (be16_to_cpu(hdr->count) + 1)
- * sizeof(xfs_attr_leaf_entry_t)
- + sizeof(xfs_attr_leaf_hdr_t);
- map = &hdr->freemap[XFS_ATTR_LEAF_MAPSIZE-1];
- for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
- if (tablesize > be16_to_cpu(hdr->firstused)) {
- sum += be16_to_cpu(map->size);
+ tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf);
+ for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) {
+ if (tablesize > ichdr.firstused) {
+ sum += ichdr.freemap[i].size;
continue;
}
- if (!map->size)
+ if (!ichdr.freemap[i].size)
continue; /* no space in this map */
tmp = entsize;
- if (be16_to_cpu(map->base) < be16_to_cpu(hdr->firstused))
+ if (ichdr.freemap[i].base < ichdr.firstused)
tmp += sizeof(xfs_attr_leaf_entry_t);
- if (be16_to_cpu(map->size) >= tmp) {
- tmp = xfs_attr_leaf_add_work(bp, args, i);
- return(tmp);
+ if (ichdr.freemap[i].size >= tmp) {
+ tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i);
+ goto out_log_hdr;
}
- sum += be16_to_cpu(map->size);
+ sum += ichdr.freemap[i].size;
}
/*
@@ -1110,82 +1107,86 @@ xfs_attr_leaf_add(
* and we don't have enough freespace, then compaction will do us
* no good and we should just give up.
*/
- if (!hdr->holes && (sum < entsize))
- return(XFS_ERROR(ENOSPC));
+ if (!ichdr.holes && sum < entsize)
+ return XFS_ERROR(ENOSPC);
/*
* Compact the entries to coalesce free space.
* This may change the hdr->count via dropping INCOMPLETE entries.
*/
- xfs_attr_leaf_compact(args, bp);
+ xfs_attr3_leaf_compact(args, &ichdr, bp);
/*
* After compaction, the block is guaranteed to have only one
* free region, in freemap[0]. If it is not big enough, give up.
*/
- if (be16_to_cpu(hdr->freemap[0].size)
- < (entsize + sizeof(xfs_attr_leaf_entry_t)))
- return(XFS_ERROR(ENOSPC));
+ if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) {
+ tmp = ENOSPC;
+ goto out_log_hdr;
+ }
+
+ tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
- return(xfs_attr_leaf_add_work(bp, args, 0));
+out_log_hdr:
+ xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+ xfs_trans_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+ xfs_attr3_leaf_hdr_size(leaf)));
+ return tmp;
}
/*
* Add a name to a leaf attribute list structure.
*/
STATIC int
-xfs_attr_leaf_add_work(
- struct xfs_buf *bp,
- xfs_da_args_t *args,
- int mapindex)
+xfs_attr3_leaf_add_work(
+ struct xfs_buf *bp,
+ struct xfs_attr3_icleaf_hdr *ichdr,
+ struct xfs_da_args *args,
+ int mapindex)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_hdr_t *hdr;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_local_t *name_loc;
- xfs_attr_leaf_name_remote_t *name_rmt;
- xfs_attr_leaf_map_t *map;
- xfs_mount_t *mp;
- int tmp, i;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_mount *mp;
+ int tmp;
+ int i;
trace_xfs_attr_leaf_add_work(args);
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- hdr = &leaf->hdr;
- ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE));
- ASSERT((args->index >= 0) && (args->index <= be16_to_cpu(hdr->count)));
+ ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE);
+ ASSERT(args->index >= 0 && args->index <= ichdr->count);
/*
* Force open some space in the entry array and fill it in.
*/
- entry = &leaf->entries[args->index];
- if (args->index < be16_to_cpu(hdr->count)) {
- tmp = be16_to_cpu(hdr->count) - args->index;
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+ if (args->index < ichdr->count) {
+ tmp = ichdr->count - args->index;
tmp *= sizeof(xfs_attr_leaf_entry_t);
- memmove((char *)(entry+1), (char *)entry, tmp);
+ memmove(entry + 1, entry, tmp);
xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
}
- be16_add_cpu(&hdr->count, 1);
+ ichdr->count++;
/*
* Allocate space for the new string (at the end of the run).
*/
- map = &hdr->freemap[mapindex];
mp = args->trans->t_mountp;
- ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp));
- ASSERT((be16_to_cpu(map->base) & 0x3) == 0);
- ASSERT(be16_to_cpu(map->size) >=
- xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
- mp->m_sb.sb_blocksize, NULL));
- ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp));
- ASSERT((be16_to_cpu(map->size) & 0x3) == 0);
- be16_add_cpu(&map->size,
- -xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
- mp->m_sb.sb_blocksize, &tmp));
- entry->nameidx = cpu_to_be16(be16_to_cpu(map->base) +
- be16_to_cpu(map->size));
+ ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize);
+ ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0);
+ ASSERT(ichdr->freemap[mapindex].size >=
+ xfs_attr_leaf_newentsize(args, NULL));
+ ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize);
+ ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0);
+
+ ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp);
+
+ entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
+ ichdr->freemap[mapindex].size);
entry->hashval = cpu_to_be32(args->hashval);
entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
@@ -1200,7 +1201,7 @@ xfs_attr_leaf_add_work(
XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
ASSERT((args->index == 0) ||
(be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval)));
- ASSERT((args->index == be16_to_cpu(hdr->count)-1) ||
+ ASSERT((args->index == ichdr->count - 1) ||
(be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));
/*
@@ -1211,14 +1212,14 @@ xfs_attr_leaf_add_work(
* as part of this transaction (a split operation for example).
*/
if (entry->flags & XFS_ATTR_LOCAL) {
- name_loc = xfs_attr_leaf_name_local(leaf, args->index);
+ name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
name_loc->namelen = args->namelen;
name_loc->valuelen = cpu_to_be16(args->valuelen);
memcpy((char *)name_loc->nameval, args->name, args->namelen);
memcpy((char *)&name_loc->nameval[args->namelen], args->value,
be16_to_cpu(name_loc->valuelen));
} else {
- name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
name_rmt->namelen = args->namelen;
memcpy((char *)name_rmt->name, args->name, args->namelen);
entry->flags |= XFS_ATTR_INCOMPLETE;
@@ -1226,91 +1227,132 @@ xfs_attr_leaf_add_work(
name_rmt->valuelen = 0;
name_rmt->valueblk = 0;
args->rmtblkno = 1;
- args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen);
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+ args->rmtvaluelen = args->valuelen;
}
xfs_trans_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index),
+ XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
xfs_attr_leaf_entsize(leaf, args->index)));
/*
* Update the control info for this leaf node
*/
- if (be16_to_cpu(entry->nameidx) < be16_to_cpu(hdr->firstused)) {
- /* both on-disk, don't endian-flip twice */
- hdr->firstused = entry->nameidx;
- }
- ASSERT(be16_to_cpu(hdr->firstused) >=
- ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr)));
- tmp = (be16_to_cpu(hdr->count)-1) * sizeof(xfs_attr_leaf_entry_t)
- + sizeof(xfs_attr_leaf_hdr_t);
- map = &hdr->freemap[0];
- for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
- if (be16_to_cpu(map->base) == tmp) {
- be16_add_cpu(&map->base, sizeof(xfs_attr_leaf_entry_t));
- be16_add_cpu(&map->size,
- -((int)sizeof(xfs_attr_leaf_entry_t)));
+ if (be16_to_cpu(entry->nameidx) < ichdr->firstused)
+ ichdr->firstused = be16_to_cpu(entry->nameidx);
+
+ ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf));
+ tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf);
+
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ if (ichdr->freemap[i].base == tmp) {
+ ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t);
+ ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t);
}
}
- be16_add_cpu(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index));
- xfs_trans_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
- return(0);
+ ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index);
+ return 0;
}
/*
* Garbage collect a leaf attribute list block by copying it to a new buffer.
*/
STATIC void
-xfs_attr_leaf_compact(
+xfs_attr3_leaf_compact(
struct xfs_da_args *args,
+ struct xfs_attr3_icleaf_hdr *ichdr_dst,
struct xfs_buf *bp)
{
- xfs_attr_leafblock_t *leaf_s, *leaf_d;
- xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
+ struct xfs_attr_leafblock *leaf_src;
+ struct xfs_attr_leafblock *leaf_dst;
+ struct xfs_attr3_icleaf_hdr ichdr_src;
struct xfs_trans *trans = args->trans;
- struct xfs_mount *mp = trans->t_mountp;
char *tmpbuffer;
trace_xfs_attr_leaf_compact(args);
- tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
- ASSERT(tmpbuffer != NULL);
- memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp));
- memset(bp->b_addr, 0, XFS_LBSIZE(mp));
+ tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+ memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
+ memset(bp->b_addr, 0, args->geo->blksize);
+ leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
+ leaf_dst = bp->b_addr;
/*
- * Copy basic information
+ * Copy the on-disk header back into the destination buffer to ensure
+ * all the information in the header that is not part of the incore
+ * header structure is preserved.
*/
- leaf_s = (xfs_attr_leafblock_t *)tmpbuffer;
- leaf_d = bp->b_addr;
- hdr_s = &leaf_s->hdr;
- hdr_d = &leaf_d->hdr;
- hdr_d->info = hdr_s->info; /* struct copy */
- hdr_d->firstused = cpu_to_be16(XFS_LBSIZE(mp));
- /* handle truncation gracefully */
- if (!hdr_d->firstused) {
- hdr_d->firstused = cpu_to_be16(
- XFS_LBSIZE(mp) - XFS_ATTR_LEAF_NAME_ALIGN);
- }
- hdr_d->usedbytes = 0;
- hdr_d->count = 0;
- hdr_d->holes = 0;
- hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t));
- hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) -
- sizeof(xfs_attr_leaf_hdr_t));
+ memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src));
+
+ /* Initialise the incore headers */
+ ichdr_src = *ichdr_dst; /* struct copy */
+ ichdr_dst->firstused = args->geo->blksize;
+ ichdr_dst->usedbytes = 0;
+ ichdr_dst->count = 0;
+ ichdr_dst->holes = 0;
+ ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src);
+ ichdr_dst->freemap[0].size = ichdr_dst->firstused -
+ ichdr_dst->freemap[0].base;
+
+ /* write the header back to initialise the underlying buffer */
+ xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
/*
* Copy all entry's in the same (sorted) order,
* but allocate name/value pairs packed and in sequence.
*/
- xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0,
- be16_to_cpu(hdr_s->count), mp);
- xfs_trans_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1);
+ xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0,
+ leaf_dst, ichdr_dst, 0, ichdr_src.count);
+ /*
+ * this logs the entire buffer, but the caller must write the header
+ * back to the buffer when it is finished modifying it.
+ */
+ xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
kmem_free(tmpbuffer);
}
/*
+ * Compare two leaf blocks "order".
+ * Return 0 unless leaf2 should go before leaf1.
+ */
+static int
+xfs_attr3_leaf_order(
+ struct xfs_buf *leaf1_bp,
+ struct xfs_attr3_icleaf_hdr *leaf1hdr,
+ struct xfs_buf *leaf2_bp,
+ struct xfs_attr3_icleaf_hdr *leaf2hdr)
+{
+ struct xfs_attr_leaf_entry *entries1;
+ struct xfs_attr_leaf_entry *entries2;
+
+ entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr);
+ entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr);
+ if (leaf1hdr->count > 0 && leaf2hdr->count > 0 &&
+ ((be32_to_cpu(entries2[0].hashval) <
+ be32_to_cpu(entries1[0].hashval)) ||
+ (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) <
+ be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) {
+ return 1;
+ }
+ return 0;
+}
+
+int
+xfs_attr_leaf_order(
+ struct xfs_buf *leaf1_bp,
+ struct xfs_buf *leaf2_bp)
+{
+ struct xfs_attr3_icleaf_hdr ichdr1;
+ struct xfs_attr3_icleaf_hdr ichdr2;
+
+ xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr);
+ xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr);
+ return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2);
+}
+
+/*
* Redistribute the attribute list entries between two leaf nodes,
* taking into account the size of the new entry.
*
@@ -1323,14 +1365,23 @@ xfs_attr_leaf_compact(
* the "new" and "old" values can end up in different blocks.
*/
STATIC void
-xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
- xfs_da_state_blk_t *blk2)
+xfs_attr3_leaf_rebalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_da_state_blk *blk2)
{
- xfs_da_args_t *args;
- xfs_da_state_blk_t *tmp_blk;
- xfs_attr_leafblock_t *leaf1, *leaf2;
- xfs_attr_leaf_hdr_t *hdr1, *hdr2;
- int count, totallen, max, space, swap;
+ struct xfs_da_args *args;
+ struct xfs_attr_leafblock *leaf1;
+ struct xfs_attr_leafblock *leaf2;
+ struct xfs_attr3_icleaf_hdr ichdr1;
+ struct xfs_attr3_icleaf_hdr ichdr2;
+ struct xfs_attr_leaf_entry *entries1;
+ struct xfs_attr_leaf_entry *entries2;
+ int count;
+ int totallen;
+ int max;
+ int space;
+ int swap;
/*
* Set up environment.
@@ -1339,9 +1390,9 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
leaf1 = blk1->bp->b_addr;
leaf2 = blk2->bp->b_addr;
- ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- ASSERT(leaf2->hdr.count == 0);
+ xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
+ xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+ ASSERT(ichdr2.count == 0);
args = state->args;
trace_xfs_attr_leaf_rebalance(args);
@@ -1353,16 +1404,23 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* second block, this code should never set "swap".
*/
swap = 0;
- if (xfs_attr_leaf_order(blk1->bp, blk2->bp)) {
+ if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) {
+ struct xfs_da_state_blk *tmp_blk;
+ struct xfs_attr3_icleaf_hdr tmp_ichdr;
+
tmp_blk = blk1;
blk1 = blk2;
blk2 = tmp_blk;
+
+ /* struct copies to swap them rather than reconverting */
+ tmp_ichdr = ichdr1;
+ ichdr1 = ichdr2;
+ ichdr2 = tmp_ichdr;
+
leaf1 = blk1->bp->b_addr;
leaf2 = blk2->bp->b_addr;
swap = 1;
}
- hdr1 = &leaf1->hdr;
- hdr2 = &leaf2->hdr;
/*
* Examine entries until we reduce the absolute difference in
@@ -1372,41 +1430,39 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* "inleaf" is true if the new entry should be inserted into blk1.
* If "swap" is also true, then reverse the sense of "inleaf".
*/
- state->inleaf = xfs_attr_leaf_figure_balance(state, blk1, blk2,
- &count, &totallen);
+ state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1,
+ blk2, &ichdr2,
+ &count, &totallen);
if (swap)
state->inleaf = !state->inleaf;
/*
* Move any entries required from leaf to leaf:
*/
- if (count < be16_to_cpu(hdr1->count)) {
+ if (count < ichdr1.count) {
/*
* Figure the total bytes to be added to the destination leaf.
*/
/* number entries being moved */
- count = be16_to_cpu(hdr1->count) - count;
- space = be16_to_cpu(hdr1->usedbytes) - totallen;
+ count = ichdr1.count - count;
+ space = ichdr1.usedbytes - totallen;
space += count * sizeof(xfs_attr_leaf_entry_t);
/*
* leaf2 is the destination, compact it if it looks tight.
*/
- max = be16_to_cpu(hdr2->firstused)
- - sizeof(xfs_attr_leaf_hdr_t);
- max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t);
+ max = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+ max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t);
if (space > max)
- xfs_attr_leaf_compact(args, blk2->bp);
+ xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp);
/*
* Move high entries from leaf1 to low end of leaf2.
*/
- xfs_attr_leaf_moveents(leaf1, be16_to_cpu(hdr1->count) - count,
- leaf2, 0, count, state->mp);
+ xfs_attr3_leaf_moveents(args, leaf1, &ichdr1,
+ ichdr1.count - count, leaf2, &ichdr2, 0, count);
- xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
- xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
- } else if (count > be16_to_cpu(hdr1->count)) {
+ } else if (count > ichdr1.count) {
/*
* I assert that since all callers pass in an empty
* second buffer, this code should never execute.
@@ -1417,36 +1473,37 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* Figure the total bytes to be added to the destination leaf.
*/
/* number entries being moved */
- count -= be16_to_cpu(hdr1->count);
- space = totallen - be16_to_cpu(hdr1->usedbytes);
+ count -= ichdr1.count;
+ space = totallen - ichdr1.usedbytes;
space += count * sizeof(xfs_attr_leaf_entry_t);
/*
* leaf1 is the destination, compact it if it looks tight.
*/
- max = be16_to_cpu(hdr1->firstused)
- - sizeof(xfs_attr_leaf_hdr_t);
- max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t);
+ max = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+ max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t);
if (space > max)
- xfs_attr_leaf_compact(args, blk1->bp);
+ xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp);
/*
* Move low entries from leaf2 to high end of leaf1.
*/
- xfs_attr_leaf_moveents(leaf2, 0, leaf1,
- be16_to_cpu(hdr1->count), count, state->mp);
-
- xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
- xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
+ xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1,
+ ichdr1.count, count);
}
+ xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1);
+ xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2);
+ xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
+ xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);
+
/*
* Copy out last hashval in each block for B-tree code.
*/
- blk1->hashval = be32_to_cpu(
- leaf1->entries[be16_to_cpu(leaf1->hdr.count)-1].hashval);
- blk2->hashval = be32_to_cpu(
- leaf2->entries[be16_to_cpu(leaf2->hdr.count)-1].hashval);
+ entries1 = xfs_attr3_leaf_entryp(leaf1);
+ entries2 = xfs_attr3_leaf_entryp(leaf2);
+ blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval);
+ blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval);
/*
* Adjust the expected index for insertion.
@@ -1460,12 +1517,12 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* inserting. The index/blkno fields refer to the "old" entry,
* while the index2/blkno2 fields refer to the "new" entry.
*/
- if (blk1->index > be16_to_cpu(leaf1->hdr.count)) {
+ if (blk1->index > ichdr1.count) {
ASSERT(state->inleaf == 0);
- blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count);
+ blk2->index = blk1->index - ichdr1.count;
args->index = args->index2 = blk2->index;
args->blkno = args->blkno2 = blk2->blkno;
- } else if (blk1->index == be16_to_cpu(leaf1->hdr.count)) {
+ } else if (blk1->index == ichdr1.count) {
if (state->inleaf) {
args->index = blk1->index;
args->blkno = blk1->blkno;
@@ -1477,8 +1534,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* is already stored in blkno2/index2, so don't
* overwrite it overwise we corrupt the tree.
*/
- blk2->index = blk1->index
- - be16_to_cpu(leaf1->hdr.count);
+ blk2->index = blk1->index - ichdr1.count;
args->index = blk2->index;
args->blkno = blk2->blkno;
if (!state->extravalid) {
@@ -1506,42 +1562,38 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* GROT: Do a double-split for this case?
*/
STATIC int
-xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
- xfs_da_state_blk_t *blk1,
- xfs_da_state_blk_t *blk2,
- int *countarg, int *usedbytesarg)
+xfs_attr3_leaf_figure_balance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_attr3_icleaf_hdr *ichdr1,
+ struct xfs_da_state_blk *blk2,
+ struct xfs_attr3_icleaf_hdr *ichdr2,
+ int *countarg,
+ int *usedbytesarg)
{
- xfs_attr_leafblock_t *leaf1, *leaf2;
- xfs_attr_leaf_hdr_t *hdr1, *hdr2;
- xfs_attr_leaf_entry_t *entry;
- int count, max, index, totallen, half;
- int lastdelta, foundit, tmp;
-
- /*
- * Set up environment.
- */
- leaf1 = blk1->bp->b_addr;
- leaf2 = blk2->bp->b_addr;
- hdr1 = &leaf1->hdr;
- hdr2 = &leaf2->hdr;
- foundit = 0;
- totallen = 0;
+ struct xfs_attr_leafblock *leaf1 = blk1->bp->b_addr;
+ struct xfs_attr_leafblock *leaf2 = blk2->bp->b_addr;
+ struct xfs_attr_leaf_entry *entry;
+ int count;
+ int max;
+ int index;
+ int totallen = 0;
+ int half;
+ int lastdelta;
+ int foundit = 0;
+ int tmp;
/*
* Examine entries until we reduce the absolute difference in
* byte usage between the two blocks to a minimum.
*/
- max = be16_to_cpu(hdr1->count) + be16_to_cpu(hdr2->count);
- half = (max+1) * sizeof(*entry);
- half += be16_to_cpu(hdr1->usedbytes) +
- be16_to_cpu(hdr2->usedbytes) +
- xfs_attr_leaf_newentsize(
- state->args->namelen,
- state->args->valuelen,
- state->blocksize, NULL);
+ max = ichdr1->count + ichdr2->count;
+ half = (max + 1) * sizeof(*entry);
+ half += ichdr1->usedbytes + ichdr2->usedbytes +
+ xfs_attr_leaf_newentsize(state->args, NULL);
half /= 2;
- lastdelta = state->blocksize;
- entry = &leaf1->entries[0];
+ lastdelta = state->args->geo->blksize;
+ entry = xfs_attr3_leaf_entryp(leaf1);
for (count = index = 0; count < max; entry++, index++, count++) {
#define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A))
@@ -1550,10 +1602,7 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
*/
if (count == blk1->index) {
tmp = totallen + sizeof(*entry) +
- xfs_attr_leaf_newentsize(
- state->args->namelen,
- state->args->valuelen,
- state->blocksize, NULL);
+ xfs_attr_leaf_newentsize(state->args, NULL);
if (XFS_ATTR_ABS(half - tmp) > lastdelta)
break;
lastdelta = XFS_ATTR_ABS(half - tmp);
@@ -1564,9 +1613,9 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
/*
* Wrap around into the second block if necessary.
*/
- if (count == be16_to_cpu(hdr1->count)) {
+ if (count == ichdr1->count) {
leaf1 = leaf2;
- entry = &leaf1->entries[0];
+ entry = xfs_attr3_leaf_entryp(leaf1);
index = 0;
}
@@ -1589,15 +1638,12 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
totallen -= count * sizeof(*entry);
if (foundit) {
totallen -= sizeof(*entry) +
- xfs_attr_leaf_newentsize(
- state->args->namelen,
- state->args->valuelen,
- state->blocksize, NULL);
+ xfs_attr_leaf_newentsize(state->args, NULL);
}
*countarg = count;
*usedbytesarg = totallen;
- return(foundit);
+ return foundit;
}
/*========================================================================
@@ -1616,14 +1662,20 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
* GROT: allow for INCOMPLETE entries in calculation.
*/
int
-xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
+xfs_attr3_leaf_toosmall(
+ struct xfs_da_state *state,
+ int *action)
{
- xfs_attr_leafblock_t *leaf;
- xfs_da_state_blk_t *blk;
- xfs_da_blkinfo_t *info;
- int count, bytes, forward, error, retval, i;
- xfs_dablk_t blkno;
- struct xfs_buf *bp;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_da_state_blk *blk;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_buf *bp;
+ xfs_dablk_t blkno;
+ int bytes;
+ int forward;
+ int error;
+ int retval;
+ int i;
trace_xfs_attr_leaf_toosmall(state->args);
@@ -1633,14 +1685,12 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
* to coalesce with a sibling.
*/
blk = &state->path.blk[ state->path.active-1 ];
- info = blk->bp->b_addr;
- ASSERT(info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- leaf = (xfs_attr_leafblock_t *)info;
- count = be16_to_cpu(leaf->hdr.count);
- bytes = sizeof(xfs_attr_leaf_hdr_t) +
- count * sizeof(xfs_attr_leaf_entry_t) +
- be16_to_cpu(leaf->hdr.usedbytes);
- if (bytes > (state->blocksize >> 1)) {
+ leaf = blk->bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ bytes = xfs_attr3_leaf_hdr_size(leaf) +
+ ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
+ ichdr.usedbytes;
+ if (bytes > (state->args->geo->blksize >> 1)) {
*action = 0; /* blk over 50%, don't try to join */
return(0);
}
@@ -1651,14 +1701,14 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
* coalesce it with a sibling block. We choose (arbitrarily)
* to merge with the forward block unless it is NULL.
*/
- if (count == 0) {
+ if (ichdr.count == 0) {
/*
* Make altpath point to the block we want to keep and
* path point to the block we want to drop (this one).
*/
- forward = (info->forw != 0);
+ forward = (ichdr.forw != 0);
memcpy(&state->altpath, &state->path, sizeof(state->path));
- error = xfs_da_path_shift(state, &state->altpath, forward,
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
0, &retval);
if (error)
return(error);
@@ -1667,7 +1717,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
} else {
*action = 2;
}
- return(0);
+ return 0;
}
/*
@@ -1678,28 +1728,29 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
* to shrink an attribute list over time.
*/
/* start with smaller blk num */
- forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back));
+ forward = ichdr.forw < ichdr.back;
for (i = 0; i < 2; forward = !forward, i++) {
+ struct xfs_attr3_icleaf_hdr ichdr2;
if (forward)
- blkno = be32_to_cpu(info->forw);
+ blkno = ichdr.forw;
else
- blkno = be32_to_cpu(info->back);
+ blkno = ichdr.back;
if (blkno == 0)
continue;
- error = xfs_attr_leaf_read(state->args->trans, state->args->dp,
+ error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
blkno, -1, &bp);
if (error)
return(error);
- leaf = (xfs_attr_leafblock_t *)info;
- count = be16_to_cpu(leaf->hdr.count);
- bytes = state->blocksize - (state->blocksize>>2);
- bytes -= be16_to_cpu(leaf->hdr.usedbytes);
- leaf = bp->b_addr;
- count += be16_to_cpu(leaf->hdr.count);
- bytes -= be16_to_cpu(leaf->hdr.usedbytes);
- bytes -= count * sizeof(xfs_attr_leaf_entry_t);
- bytes -= sizeof(xfs_attr_leaf_hdr_t);
+ xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
+
+ bytes = state->args->geo->blksize -
+ (state->args->geo->blksize >> 2) -
+ ichdr.usedbytes - ichdr2.usedbytes -
+ ((ichdr.count + ichdr2.count) *
+ sizeof(xfs_attr_leaf_entry_t)) -
+ xfs_attr3_leaf_hdr_size(leaf);
+
xfs_trans_brelse(state->args->trans, bp);
if (bytes >= 0)
break; /* fits with at least 25% to spare */
@@ -1715,10 +1766,10 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
*/
memcpy(&state->altpath, &state->path, sizeof(state->path));
if (blkno < blk->blkno) {
- error = xfs_da_path_shift(state, &state->altpath, forward,
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
0, &retval);
} else {
- error = xfs_da_path_shift(state, &state->path, forward,
+ error = xfs_da3_path_shift(state, &state->path, forward,
0, &retval);
}
if (error)
@@ -1738,33 +1789,35 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
* If two leaves are 37% full, when combined they will leave 25% free.
*/
int
-xfs_attr_leaf_remove(
- struct xfs_buf *bp,
- xfs_da_args_t *args)
+xfs_attr3_leaf_remove(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_hdr_t *hdr;
- xfs_attr_leaf_map_t *map;
- xfs_attr_leaf_entry_t *entry;
- int before, after, smallest, entsize;
- int tablesize, tmp, i;
- xfs_mount_t *mp;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ int before;
+ int after;
+ int smallest;
+ int entsize;
+ int tablesize;
+ int tmp;
+ int i;
trace_xfs_attr_leaf_remove(args);
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- hdr = &leaf->hdr;
- mp = args->trans->t_mountp;
- ASSERT((be16_to_cpu(hdr->count) > 0)
- && (be16_to_cpu(hdr->count) < (XFS_LBSIZE(mp)/8)));
- ASSERT((args->index >= 0)
- && (args->index < be16_to_cpu(hdr->count)));
- ASSERT(be16_to_cpu(hdr->firstused) >=
- ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr)));
- entry = &leaf->entries[args->index];
- ASSERT(be16_to_cpu(entry->nameidx) >= be16_to_cpu(hdr->firstused));
- ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp));
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+
+ ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
+ ASSERT(args->index >= 0 && args->index < ichdr.count);
+ ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) +
+ xfs_attr3_leaf_hdr_size(leaf));
+
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+
+ ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
+ ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
/*
* Scan through free region table:
@@ -1772,30 +1825,28 @@ xfs_attr_leaf_remove(
* find smallest free region in case we need to replace it,
* adjust any map that borders the entry table,
*/
- tablesize = be16_to_cpu(hdr->count) * sizeof(xfs_attr_leaf_entry_t)
- + sizeof(xfs_attr_leaf_hdr_t);
- map = &hdr->freemap[0];
- tmp = be16_to_cpu(map->size);
+ tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf);
+ tmp = ichdr.freemap[0].size;
before = after = -1;
smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
entsize = xfs_attr_leaf_entsize(leaf, args->index);
- for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
- ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp));
- ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp));
- if (be16_to_cpu(map->base) == tablesize) {
- be16_add_cpu(&map->base,
- -((int)sizeof(xfs_attr_leaf_entry_t)));
- be16_add_cpu(&map->size, sizeof(xfs_attr_leaf_entry_t));
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+ ASSERT(ichdr.freemap[i].base < args->geo->blksize);
+ ASSERT(ichdr.freemap[i].size < args->geo->blksize);
+ if (ichdr.freemap[i].base == tablesize) {
+ ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t);
+ ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t);
}
- if ((be16_to_cpu(map->base) + be16_to_cpu(map->size))
- == be16_to_cpu(entry->nameidx)) {
+ if (ichdr.freemap[i].base + ichdr.freemap[i].size ==
+ be16_to_cpu(entry->nameidx)) {
before = i;
- } else if (be16_to_cpu(map->base)
- == (be16_to_cpu(entry->nameidx) + entsize)) {
+ } else if (ichdr.freemap[i].base ==
+ (be16_to_cpu(entry->nameidx) + entsize)) {
after = i;
- } else if (be16_to_cpu(map->size) < tmp) {
- tmp = be16_to_cpu(map->size);
+ } else if (ichdr.freemap[i].size < tmp) {
+ tmp = ichdr.freemap[i].size;
smallest = i;
}
}
@@ -1806,36 +1857,30 @@ xfs_attr_leaf_remove(
*/
if ((before >= 0) || (after >= 0)) {
if ((before >= 0) && (after >= 0)) {
- map = &hdr->freemap[before];
- be16_add_cpu(&map->size, entsize);
- be16_add_cpu(&map->size,
- be16_to_cpu(hdr->freemap[after].size));
- hdr->freemap[after].base = 0;
- hdr->freemap[after].size = 0;
+ ichdr.freemap[before].size += entsize;
+ ichdr.freemap[before].size += ichdr.freemap[after].size;
+ ichdr.freemap[after].base = 0;
+ ichdr.freemap[after].size = 0;
} else if (before >= 0) {
- map = &hdr->freemap[before];
- be16_add_cpu(&map->size, entsize);
+ ichdr.freemap[before].size += entsize;
} else {
- map = &hdr->freemap[after];
- /* both on-disk, don't endian flip twice */
- map->base = entry->nameidx;
- be16_add_cpu(&map->size, entsize);
+ ichdr.freemap[after].base = be16_to_cpu(entry->nameidx);
+ ichdr.freemap[after].size += entsize;
}
} else {
/*
* Replace smallest region (if it is smaller than free'd entry)
*/
- map = &hdr->freemap[smallest];
- if (be16_to_cpu(map->size) < entsize) {
- map->base = cpu_to_be16(be16_to_cpu(entry->nameidx));
- map->size = cpu_to_be16(entsize);
+ if (ichdr.freemap[smallest].size < entsize) {
+ ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx);
+ ichdr.freemap[smallest].size = entsize;
}
}
/*
* Did we remove the first entry?
*/
- if (be16_to_cpu(entry->nameidx) == be16_to_cpu(hdr->firstused))
+ if (be16_to_cpu(entry->nameidx) == ichdr.firstused)
smallest = 1;
else
smallest = 0;
@@ -1843,20 +1888,20 @@ xfs_attr_leaf_remove(
/*
* Compress the remaining entries and zero out the removed stuff.
*/
- memset(xfs_attr_leaf_name(leaf, args->index), 0, entsize);
- be16_add_cpu(&hdr->usedbytes, -entsize);
+ memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize);
+ ichdr.usedbytes -= entsize;
xfs_trans_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index),
+ XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
entsize));
- tmp = (be16_to_cpu(hdr->count) - args->index)
- * sizeof(xfs_attr_leaf_entry_t);
- memmove((char *)entry, (char *)(entry+1), tmp);
- be16_add_cpu(&hdr->count, -1);
+ tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t);
+ memmove(entry, entry + 1, tmp);
+ ichdr.count--;
xfs_trans_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
- entry = &leaf->entries[be16_to_cpu(hdr->count)];
- memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t));
+ XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t)));
+
+ entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count];
+ memset(entry, 0, sizeof(xfs_attr_leaf_entry_t));
/*
* If we removed the first entry, re-find the first used byte
@@ -1865,131 +1910,146 @@ xfs_attr_leaf_remove(
* removing the name.
*/
if (smallest) {
- tmp = XFS_LBSIZE(mp);
- entry = &leaf->entries[0];
- for (i = be16_to_cpu(hdr->count)-1; i >= 0; entry++, i--) {
- ASSERT(be16_to_cpu(entry->nameidx) >=
- be16_to_cpu(hdr->firstused));
- ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp));
+ tmp = args->geo->blksize;
+ entry = xfs_attr3_leaf_entryp(leaf);
+ for (i = ichdr.count - 1; i >= 0; entry++, i--) {
+ ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
+ ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
if (be16_to_cpu(entry->nameidx) < tmp)
tmp = be16_to_cpu(entry->nameidx);
}
- hdr->firstused = cpu_to_be16(tmp);
- if (!hdr->firstused) {
- hdr->firstused = cpu_to_be16(
- tmp - XFS_ATTR_LEAF_NAME_ALIGN);
- }
+ ichdr.firstused = tmp;
+ if (!ichdr.firstused)
+ ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN;
} else {
- hdr->holes = 1; /* mark as needing compaction */
+ ichdr.holes = 1; /* mark as needing compaction */
}
+ xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
xfs_trans_log_buf(args->trans, bp,
- XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
+ XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+ xfs_attr3_leaf_hdr_size(leaf)));
/*
* Check if leaf is less than 50% full, caller may want to
* "join" the leaf with a sibling if so.
*/
- tmp = sizeof(xfs_attr_leaf_hdr_t);
- tmp += be16_to_cpu(leaf->hdr.count) * sizeof(xfs_attr_leaf_entry_t);
- tmp += be16_to_cpu(leaf->hdr.usedbytes);
- return(tmp < mp->m_attr_magicpct); /* leaf is < 37% full */
+ tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) +
+ ichdr.count * sizeof(xfs_attr_leaf_entry_t);
+
+ return tmp < args->geo->magicpct; /* leaf is < 37% full */
}
/*
* Move all the attribute list entries from drop_leaf into save_leaf.
*/
void
-xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
- xfs_da_state_blk_t *save_blk)
+xfs_attr3_leaf_unbalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk)
{
- xfs_attr_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf;
- xfs_attr_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr;
- xfs_mount_t *mp;
- char *tmpbuffer;
+ struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr;
+ struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr;
+ struct xfs_attr3_icleaf_hdr drophdr;
+ struct xfs_attr3_icleaf_hdr savehdr;
+ struct xfs_attr_leaf_entry *entry;
trace_xfs_attr_leaf_unbalance(state->args);
- /*
- * Set up environment.
- */
- mp = state->mp;
- ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC);
- ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC);
drop_leaf = drop_blk->bp->b_addr;
save_leaf = save_blk->bp->b_addr;
- ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- drop_hdr = &drop_leaf->hdr;
- save_hdr = &save_leaf->hdr;
+ xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf);
+ xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf);
+ entry = xfs_attr3_leaf_entryp(drop_leaf);
/*
* Save last hashval from dying block for later Btree fixup.
*/
- drop_blk->hashval = be32_to_cpu(
- drop_leaf->entries[be16_to_cpu(drop_leaf->hdr.count)-1].hashval);
+ drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval);
/*
* Check if we need a temp buffer, or can we do it in place.
* Note that we don't check "leaf" for holes because we will
* always be dropping it, toosmall() decided that for us already.
*/
- if (save_hdr->holes == 0) {
+ if (savehdr.holes == 0) {
/*
* dest leaf has no holes, so we add there. May need
* to make some room in the entry array.
*/
- if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) {
- xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, 0,
- be16_to_cpu(drop_hdr->count), mp);
+ if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+ drop_blk->bp, &drophdr)) {
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
+ save_leaf, &savehdr, 0,
+ drophdr.count);
} else {
- xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf,
- be16_to_cpu(save_hdr->count),
- be16_to_cpu(drop_hdr->count), mp);
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
+ save_leaf, &savehdr,
+ savehdr.count, drophdr.count);
}
} else {
/*
* Destination has holes, so we make a temporary copy
* of the leaf and add them both to that.
*/
- tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP);
- ASSERT(tmpbuffer != NULL);
- memset(tmpbuffer, 0, state->blocksize);
- tmp_leaf = (xfs_attr_leafblock_t *)tmpbuffer;
- tmp_hdr = &tmp_leaf->hdr;
- tmp_hdr->info = save_hdr->info; /* struct copy */
- tmp_hdr->count = 0;
- tmp_hdr->firstused = cpu_to_be16(state->blocksize);
- if (!tmp_hdr->firstused) {
- tmp_hdr->firstused = cpu_to_be16(
- state->blocksize - XFS_ATTR_LEAF_NAME_ALIGN);
- }
- tmp_hdr->usedbytes = 0;
- if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) {
- xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, 0,
- be16_to_cpu(drop_hdr->count), mp);
- xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf,
- be16_to_cpu(tmp_leaf->hdr.count),
- be16_to_cpu(save_hdr->count), mp);
+ struct xfs_attr_leafblock *tmp_leaf;
+ struct xfs_attr3_icleaf_hdr tmphdr;
+
+ tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP);
+
+ /*
+ * Copy the header into the temp leaf so that all the stuff
+ * not in the incore header is present and gets copied back in
+ * once we've moved all the entries.
+ */
+ memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf));
+
+ memset(&tmphdr, 0, sizeof(tmphdr));
+ tmphdr.magic = savehdr.magic;
+ tmphdr.forw = savehdr.forw;
+ tmphdr.back = savehdr.back;
+ tmphdr.firstused = state->args->geo->blksize;
+
+ /* write the header to the temp buffer to initialise it */
+ xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr);
+
+ if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+ drop_blk->bp, &drophdr)) {
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
+ tmp_leaf, &tmphdr, 0,
+ drophdr.count);
+ xfs_attr3_leaf_moveents(state->args,
+ save_leaf, &savehdr, 0,
+ tmp_leaf, &tmphdr, tmphdr.count,
+ savehdr.count);
} else {
- xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, 0,
- be16_to_cpu(save_hdr->count), mp);
- xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf,
- be16_to_cpu(tmp_leaf->hdr.count),
- be16_to_cpu(drop_hdr->count), mp);
+ xfs_attr3_leaf_moveents(state->args,
+ save_leaf, &savehdr, 0,
+ tmp_leaf, &tmphdr, 0,
+ savehdr.count);
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
+ tmp_leaf, &tmphdr, tmphdr.count,
+ drophdr.count);
}
- memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize);
- kmem_free(tmpbuffer);
+ memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
+ savehdr = tmphdr; /* struct copy */
+ kmem_free(tmp_leaf);
}
+ xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr);
xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
- state->blocksize - 1);
+ state->args->geo->blksize - 1);
/*
* Copy out last hashval in each block for B-tree code.
*/
- save_blk->hashval = be32_to_cpu(
- save_leaf->entries[be16_to_cpu(save_leaf->hdr.count)-1].hashval);
+ entry = xfs_attr3_leaf_entryp(save_leaf);
+ save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval);
}
/*========================================================================
@@ -2010,31 +2070,33 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
* Don't change the args->value unless we find the attribute.
*/
int
-xfs_attr_leaf_lookup_int(
- struct xfs_buf *bp,
- xfs_da_args_t *args)
+xfs_attr3_leaf_lookup_int(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_local_t *name_loc;
- xfs_attr_leaf_name_remote_t *name_rmt;
- int probe, span;
- xfs_dahash_t hashval;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_entry *entries;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ xfs_dahash_t hashval;
+ int probe;
+ int span;
trace_xfs_attr_leaf_lookup(args);
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- ASSERT(be16_to_cpu(leaf->hdr.count)
- < (XFS_LBSIZE(args->dp->i_mount)/8));
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+ ASSERT(ichdr.count < args->geo->blksize / 8);
/*
* Binary search. (note: small blocks will skip this loop)
*/
hashval = args->hashval;
- probe = span = be16_to_cpu(leaf->hdr.count) / 2;
- for (entry = &leaf->entries[probe]; span > 4;
- entry = &leaf->entries[probe]) {
+ probe = span = ichdr.count / 2;
+ for (entry = &entries[probe]; span > 4; entry = &entries[probe]) {
span /= 2;
if (be32_to_cpu(entry->hashval) < hashval)
probe += span;
@@ -2043,35 +2105,31 @@ xfs_attr_leaf_lookup_int(
else
break;
}
- ASSERT((probe >= 0) &&
- (!leaf->hdr.count
- || (probe < be16_to_cpu(leaf->hdr.count))));
- ASSERT((span <= 4) || (be32_to_cpu(entry->hashval) == hashval));
+ ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count));
+ ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval);
/*
* Since we may have duplicate hashval's, find the first matching
* hashval in the leaf.
*/
- while ((probe > 0) && (be32_to_cpu(entry->hashval) >= hashval)) {
+ while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) {
entry--;
probe--;
}
- while ((probe < be16_to_cpu(leaf->hdr.count)) &&
- (be32_to_cpu(entry->hashval) < hashval)) {
+ while (probe < ichdr.count &&
+ be32_to_cpu(entry->hashval) < hashval) {
entry++;
probe++;
}
- if ((probe == be16_to_cpu(leaf->hdr.count)) ||
- (be32_to_cpu(entry->hashval) != hashval)) {
+ if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
args->index = probe;
- return(XFS_ERROR(ENOATTR));
+ return XFS_ERROR(ENOATTR);
}
/*
* Duplicate keys may be present, so search all of them for a match.
*/
- for ( ; (probe < be16_to_cpu(leaf->hdr.count)) &&
- (be32_to_cpu(entry->hashval) == hashval);
+ for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval);
entry++, probe++) {
/*
* GROT: Add code to remove incomplete entries.
@@ -2085,33 +2143,36 @@ xfs_attr_leaf_lookup_int(
continue;
}
if (entry->flags & XFS_ATTR_LOCAL) {
- name_loc = xfs_attr_leaf_name_local(leaf, probe);
+ name_loc = xfs_attr3_leaf_name_local(leaf, probe);
if (name_loc->namelen != args->namelen)
continue;
- if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0)
+ if (memcmp(args->name, name_loc->nameval,
+ args->namelen) != 0)
continue;
if (!xfs_attr_namesp_match(args->flags, entry->flags))
continue;
args->index = probe;
- return(XFS_ERROR(EEXIST));
+ return XFS_ERROR(EEXIST);
} else {
- name_rmt = xfs_attr_leaf_name_remote(leaf, probe);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
if (name_rmt->namelen != args->namelen)
continue;
- if (memcmp(args->name, (char *)name_rmt->name,
- args->namelen) != 0)
+ if (memcmp(args->name, name_rmt->name,
+ args->namelen) != 0)
continue;
if (!xfs_attr_namesp_match(args->flags, entry->flags))
continue;
args->index = probe;
+ args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
- args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount,
- be32_to_cpu(name_rmt->valuelen));
- return(XFS_ERROR(EEXIST));
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(
+ args->dp->i_mount,
+ args->rmtvaluelen);
+ return XFS_ERROR(EEXIST);
}
}
args->index = probe;
- return(XFS_ERROR(ENOATTR));
+ return XFS_ERROR(ENOATTR);
}
/*
@@ -2119,56 +2180,57 @@ xfs_attr_leaf_lookup_int(
* list structure.
*/
int
-xfs_attr_leaf_getvalue(
- struct xfs_buf *bp,
- xfs_da_args_t *args)
+xfs_attr3_leaf_getvalue(
+ struct xfs_buf *bp,
+ struct xfs_da_args *args)
{
- int valuelen;
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_local_t *name_loc;
- xfs_attr_leaf_name_remote_t *name_rmt;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_local *name_loc;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ int valuelen;
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- ASSERT(be16_to_cpu(leaf->hdr.count)
- < (XFS_LBSIZE(args->dp->i_mount)/8));
- ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ ASSERT(ichdr.count < args->geo->blksize / 8);
+ ASSERT(args->index < ichdr.count);
- entry = &leaf->entries[args->index];
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
if (entry->flags & XFS_ATTR_LOCAL) {
- name_loc = xfs_attr_leaf_name_local(leaf, args->index);
+ name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
ASSERT(name_loc->namelen == args->namelen);
ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
valuelen = be16_to_cpu(name_loc->valuelen);
if (args->flags & ATTR_KERNOVAL) {
args->valuelen = valuelen;
- return(0);
+ return 0;
}
if (args->valuelen < valuelen) {
args->valuelen = valuelen;
- return(XFS_ERROR(ERANGE));
+ return XFS_ERROR(ERANGE);
}
args->valuelen = valuelen;
memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
} else {
- name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
ASSERT(name_rmt->namelen == args->namelen);
ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
- valuelen = be32_to_cpu(name_rmt->valuelen);
+ args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
- args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen);
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
+ args->rmtvaluelen);
if (args->flags & ATTR_KERNOVAL) {
- args->valuelen = valuelen;
- return(0);
+ args->valuelen = args->rmtvaluelen;
+ return 0;
}
- if (args->valuelen < valuelen) {
- args->valuelen = valuelen;
- return(XFS_ERROR(ERANGE));
+ if (args->valuelen < args->rmtvaluelen) {
+ args->valuelen = args->rmtvaluelen;
+ return XFS_ERROR(ERANGE);
}
- args->valuelen = valuelen;
+ args->valuelen = args->rmtvaluelen;
}
- return(0);
+ return 0;
}
/*========================================================================
@@ -2181,13 +2243,21 @@ xfs_attr_leaf_getvalue(
*/
/*ARGSUSED*/
STATIC void
-xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
- xfs_attr_leafblock_t *leaf_d, int start_d,
- int count, xfs_mount_t *mp)
+xfs_attr3_leaf_moveents(
+ struct xfs_da_args *args,
+ struct xfs_attr_leafblock *leaf_s,
+ struct xfs_attr3_icleaf_hdr *ichdr_s,
+ int start_s,
+ struct xfs_attr_leafblock *leaf_d,
+ struct xfs_attr3_icleaf_hdr *ichdr_d,
+ int start_d,
+ int count)
{
- xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
- xfs_attr_leaf_entry_t *entry_s, *entry_d;
- int desti, tmp, i;
+ struct xfs_attr_leaf_entry *entry_s;
+ struct xfs_attr_leaf_entry *entry_d;
+ int desti;
+ int tmp;
+ int i;
/*
* Check for nothing to do.
@@ -2198,45 +2268,41 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
/*
* Set up environment.
*/
- ASSERT(leaf_s->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- ASSERT(leaf_d->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- hdr_s = &leaf_s->hdr;
- hdr_d = &leaf_d->hdr;
- ASSERT((be16_to_cpu(hdr_s->count) > 0) &&
- (be16_to_cpu(hdr_s->count) < (XFS_LBSIZE(mp)/8)));
- ASSERT(be16_to_cpu(hdr_s->firstused) >=
- ((be16_to_cpu(hdr_s->count)
- * sizeof(*entry_s))+sizeof(*hdr_s)));
- ASSERT(be16_to_cpu(hdr_d->count) < (XFS_LBSIZE(mp)/8));
- ASSERT(be16_to_cpu(hdr_d->firstused) >=
- ((be16_to_cpu(hdr_d->count)
- * sizeof(*entry_d))+sizeof(*hdr_d)));
-
- ASSERT(start_s < be16_to_cpu(hdr_s->count));
- ASSERT(start_d <= be16_to_cpu(hdr_d->count));
- ASSERT(count <= be16_to_cpu(hdr_s->count));
+ ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC ||
+ ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC);
+ ASSERT(ichdr_s->magic == ichdr_d->magic);
+ ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8);
+ ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s))
+ + xfs_attr3_leaf_hdr_size(leaf_s));
+ ASSERT(ichdr_d->count < args->geo->blksize / 8);
+ ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d))
+ + xfs_attr3_leaf_hdr_size(leaf_d));
+
+ ASSERT(start_s < ichdr_s->count);
+ ASSERT(start_d <= ichdr_d->count);
+ ASSERT(count <= ichdr_s->count);
+
/*
* Move the entries in the destination leaf up to make a hole?
*/
- if (start_d < be16_to_cpu(hdr_d->count)) {
- tmp = be16_to_cpu(hdr_d->count) - start_d;
+ if (start_d < ichdr_d->count) {
+ tmp = ichdr_d->count - start_d;
tmp *= sizeof(xfs_attr_leaf_entry_t);
- entry_s = &leaf_d->entries[start_d];
- entry_d = &leaf_d->entries[start_d + count];
- memmove((char *)entry_d, (char *)entry_s, tmp);
+ entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
+ entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count];
+ memmove(entry_d, entry_s, tmp);
}
/*
* Copy all entry's in the same (sorted) order,
* but allocate attribute info packed and in sequence.
*/
- entry_s = &leaf_s->entries[start_s];
- entry_d = &leaf_d->entries[start_d];
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+ entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
desti = start_d;
for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
- ASSERT(be16_to_cpu(entry_s->nameidx)
- >= be16_to_cpu(hdr_s->firstused));
+ ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused);
tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
#ifdef GROT
/*
@@ -2245,36 +2311,34 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
* off for 6.2, should be revisited later.
*/
if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
- memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp);
- be16_add_cpu(&hdr_s->usedbytes, -tmp);
- be16_add_cpu(&hdr_s->count, -1);
+ memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+ ichdr_s->usedbytes -= tmp;
+ ichdr_s->count -= 1;
entry_d--; /* to compensate for ++ in loop hdr */
desti--;
if ((start_s + i) < offset)
result++; /* insertion index adjustment */
} else {
#endif /* GROT */
- be16_add_cpu(&hdr_d->firstused, -tmp);
+ ichdr_d->firstused -= tmp;
/* both on-disk, don't endian flip twice */
entry_d->hashval = entry_s->hashval;
- /* both on-disk, don't endian flip twice */
- entry_d->nameidx = hdr_d->firstused;
+ entry_d->nameidx = cpu_to_be16(ichdr_d->firstused);
entry_d->flags = entry_s->flags;
ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
- <= XFS_LBSIZE(mp));
- memmove(xfs_attr_leaf_name(leaf_d, desti),
- xfs_attr_leaf_name(leaf_s, start_s + i), tmp);
+ <= args->geo->blksize);
+ memmove(xfs_attr3_leaf_name(leaf_d, desti),
+ xfs_attr3_leaf_name(leaf_s, start_s + i), tmp);
ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
- <= XFS_LBSIZE(mp));
- memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp);
- be16_add_cpu(&hdr_s->usedbytes, -tmp);
- be16_add_cpu(&hdr_d->usedbytes, tmp);
- be16_add_cpu(&hdr_s->count, -1);
- be16_add_cpu(&hdr_d->count, 1);
- tmp = be16_to_cpu(hdr_d->count)
- * sizeof(xfs_attr_leaf_entry_t)
- + sizeof(xfs_attr_leaf_hdr_t);
- ASSERT(be16_to_cpu(hdr_d->firstused) >= tmp);
+ <= args->geo->blksize);
+ memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+ ichdr_s->usedbytes -= tmp;
+ ichdr_d->usedbytes += tmp;
+ ichdr_s->count -= 1;
+ ichdr_d->count += 1;
+ tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t)
+ + xfs_attr3_leaf_hdr_size(leaf_d);
+ ASSERT(ichdr_d->firstused >= tmp);
#ifdef GROT
}
#endif /* GROT */
@@ -2283,71 +2347,40 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
/*
* Zero out the entries we just copied.
*/
- if (start_s == be16_to_cpu(hdr_s->count)) {
+ if (start_s == ichdr_s->count) {
tmp = count * sizeof(xfs_attr_leaf_entry_t);
- entry_s = &leaf_s->entries[start_s];
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
ASSERT(((char *)entry_s + tmp) <=
- ((char *)leaf_s + XFS_LBSIZE(mp)));
- memset((char *)entry_s, 0, tmp);
+ ((char *)leaf_s + args->geo->blksize));
+ memset(entry_s, 0, tmp);
} else {
/*
* Move the remaining entries down to fill the hole,
* then zero the entries at the top.
*/
- tmp = be16_to_cpu(hdr_s->count) - count;
- tmp *= sizeof(xfs_attr_leaf_entry_t);
- entry_s = &leaf_s->entries[start_s + count];
- entry_d = &leaf_s->entries[start_s];
- memmove((char *)entry_d, (char *)entry_s, tmp);
+ tmp = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t);
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count];
+ entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+ memmove(entry_d, entry_s, tmp);
tmp = count * sizeof(xfs_attr_leaf_entry_t);
- entry_s = &leaf_s->entries[be16_to_cpu(hdr_s->count)];
+ entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count];
ASSERT(((char *)entry_s + tmp) <=
- ((char *)leaf_s + XFS_LBSIZE(mp)));
- memset((char *)entry_s, 0, tmp);
+ ((char *)leaf_s + args->geo->blksize));
+ memset(entry_s, 0, tmp);
}
/*
* Fill in the freemap information
*/
- hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t));
- be16_add_cpu(&hdr_d->freemap[0].base, be16_to_cpu(hdr_d->count) *
- sizeof(xfs_attr_leaf_entry_t));
- hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused)
- - be16_to_cpu(hdr_d->freemap[0].base));
- hdr_d->freemap[1].base = 0;
- hdr_d->freemap[2].base = 0;
- hdr_d->freemap[1].size = 0;
- hdr_d->freemap[2].size = 0;
- hdr_s->holes = 1; /* leaf may not be compact */
-}
-
-/*
- * Compare two leaf blocks "order".
- * Return 0 unless leaf2 should go before leaf1.
- */
-int
-xfs_attr_leaf_order(
- struct xfs_buf *leaf1_bp,
- struct xfs_buf *leaf2_bp)
-{
- xfs_attr_leafblock_t *leaf1, *leaf2;
-
- leaf1 = leaf1_bp->b_addr;
- leaf2 = leaf2_bp->b_addr;
- ASSERT((leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) &&
- (leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)));
- if ((be16_to_cpu(leaf1->hdr.count) > 0) &&
- (be16_to_cpu(leaf2->hdr.count) > 0) &&
- ((be32_to_cpu(leaf2->entries[0].hashval) <
- be32_to_cpu(leaf1->entries[0].hashval)) ||
- (be32_to_cpu(leaf2->entries[
- be16_to_cpu(leaf2->hdr.count)-1].hashval) <
- be32_to_cpu(leaf1->entries[
- be16_to_cpu(leaf1->hdr.count)-1].hashval)))) {
- return(1);
- }
- return(0);
+ ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d);
+ ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t);
+ ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base;
+ ichdr_d->freemap[1].base = 0;
+ ichdr_d->freemap[2].base = 0;
+ ichdr_d->freemap[1].size = 0;
+ ichdr_d->freemap[2].size = 0;
+ ichdr_s->holes = 1; /* leaf may not be compact */
}
/*
@@ -2358,15 +2391,16 @@ xfs_attr_leaf_lasthash(
struct xfs_buf *bp,
int *count)
{
- xfs_attr_leafblock_t *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entries;
- leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr);
+ entries = xfs_attr3_leaf_entryp(bp->b_addr);
if (count)
- *count = be16_to_cpu(leaf->hdr.count);
- if (!leaf->hdr.count)
- return(0);
- return be32_to_cpu(leaf->entries[be16_to_cpu(leaf->hdr.count)-1].hashval);
+ *count = ichdr.count;
+ if (!ichdr.count)
+ return 0;
+ return be32_to_cpu(entries[ichdr.count - 1].hashval);
}
/*
@@ -2376,20 +2410,21 @@ xfs_attr_leaf_lasthash(
STATIC int
xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
{
+ struct xfs_attr_leaf_entry *entries;
xfs_attr_leaf_name_local_t *name_loc;
xfs_attr_leaf_name_remote_t *name_rmt;
int size;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- if (leaf->entries[index].flags & XFS_ATTR_LOCAL) {
- name_loc = xfs_attr_leaf_name_local(leaf, index);
+ entries = xfs_attr3_leaf_entryp(leaf);
+ if (entries[index].flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr3_leaf_name_local(leaf, index);
size = xfs_attr_leaf_entsize_local(name_loc->namelen,
be16_to_cpu(name_loc->valuelen));
} else {
- name_rmt = xfs_attr_leaf_name_remote(leaf, index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, index);
size = xfs_attr_leaf_entsize_remote(name_rmt->namelen);
}
- return(size);
+ return size;
}
/*
@@ -2399,140 +2434,21 @@ xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
* a "local" or a "remote" attribute.
*/
int
-xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local)
+xfs_attr_leaf_newentsize(
+ struct xfs_da_args *args,
+ int *local)
{
- int size;
+ int size;
- size = xfs_attr_leaf_entsize_local(namelen, valuelen);
- if (size < xfs_attr_leaf_entsize_local_max(blocksize)) {
- if (local) {
+ size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen);
+ if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) {
+ if (local)
*local = 1;
- }
- } else {
- size = xfs_attr_leaf_entsize_remote(namelen);
- if (local) {
- *local = 0;
- }
+ return size;
}
- return(size);
-}
-
-/*
- * Copy out attribute list entries for attr_list(), for leaf attribute lists.
- */
-int
-xfs_attr_leaf_list_int(
- struct xfs_buf *bp,
- xfs_attr_list_context_t *context)
-{
- attrlist_cursor_kern_t *cursor;
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- int retval, i;
-
- ASSERT(bp != NULL);
- leaf = bp->b_addr;
- cursor = context->cursor;
- cursor->initted = 1;
-
- trace_xfs_attr_list_leaf(context);
-
- /*
- * Re-find our place in the leaf block if this is a new syscall.
- */
- if (context->resynch) {
- entry = &leaf->entries[0];
- for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
- if (be32_to_cpu(entry->hashval) == cursor->hashval) {
- if (cursor->offset == context->dupcnt) {
- context->dupcnt = 0;
- break;
- }
- context->dupcnt++;
- } else if (be32_to_cpu(entry->hashval) >
- cursor->hashval) {
- context->dupcnt = 0;
- break;
- }
- }
- if (i == be16_to_cpu(leaf->hdr.count)) {
- trace_xfs_attr_list_notfound(context);
- return(0);
- }
- } else {
- entry = &leaf->entries[0];
- i = 0;
- }
- context->resynch = 0;
-
- /*
- * We have found our place, start copying out the new attributes.
- */
- retval = 0;
- for ( ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) {
- if (be32_to_cpu(entry->hashval) != cursor->hashval) {
- cursor->hashval = be32_to_cpu(entry->hashval);
- cursor->offset = 0;
- }
-
- if (entry->flags & XFS_ATTR_INCOMPLETE)
- continue; /* skip incomplete entries */
-
- if (entry->flags & XFS_ATTR_LOCAL) {
- xfs_attr_leaf_name_local_t *name_loc =
- xfs_attr_leaf_name_local(leaf, i);
-
- retval = context->put_listent(context,
- entry->flags,
- name_loc->nameval,
- (int)name_loc->namelen,
- be16_to_cpu(name_loc->valuelen),
- &name_loc->nameval[name_loc->namelen]);
- if (retval)
- return retval;
- } else {
- xfs_attr_leaf_name_remote_t *name_rmt =
- xfs_attr_leaf_name_remote(leaf, i);
-
- int valuelen = be32_to_cpu(name_rmt->valuelen);
-
- if (context->put_value) {
- xfs_da_args_t args;
-
- memset((char *)&args, 0, sizeof(args));
- args.dp = context->dp;
- args.whichfork = XFS_ATTR_FORK;
- args.valuelen = valuelen;
- args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
- args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
- args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
- retval = xfs_attr_rmtval_get(&args);
- if (retval)
- return retval;
- retval = context->put_listent(context,
- entry->flags,
- name_rmt->name,
- (int)name_rmt->namelen,
- valuelen,
- args.value);
- kmem_free(args.value);
- } else {
- retval = context->put_listent(context,
- entry->flags,
- name_rmt->name,
- (int)name_rmt->namelen,
- valuelen,
- NULL);
- }
- if (retval)
- return retval;
- }
- if (context->seen_enough)
- break;
- cursor->offset++;
- }
- trace_xfs_attr_list_leaf_end(context);
- return(retval);
+ if (local)
+ *local = 0;
+ return xfs_attr_leaf_entsize_remote(args->namelen);
}
@@ -2544,14 +2460,16 @@ xfs_attr_leaf_list_int(
* Clear the INCOMPLETE flag on an entry in a leaf block.
*/
int
-xfs_attr_leaf_clearflag(xfs_da_args_t *args)
+xfs_attr3_leaf_clearflag(
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_remote_t *name_rmt;
- struct xfs_buf *bp;
- int error;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_buf *bp;
+ int error;
#ifdef DEBUG
+ struct xfs_attr3_icleaf_hdr ichdr;
xfs_attr_leaf_name_local_t *name_loc;
int namelen;
char *name;
@@ -2561,23 +2479,25 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
/*
* Set up the operation.
*/
- error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
if (error)
return(error);
leaf = bp->b_addr;
- ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
- ASSERT(args->index >= 0);
- entry = &leaf->entries[ args->index ];
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
#ifdef DEBUG
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ ASSERT(args->index < ichdr.count);
+ ASSERT(args->index >= 0);
+
if (entry->flags & XFS_ATTR_LOCAL) {
- name_loc = xfs_attr_leaf_name_local(leaf, args->index);
+ name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
namelen = name_loc->namelen;
name = (char *)name_loc->nameval;
} else {
- name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
namelen = name_rmt->namelen;
name = (char *)name_rmt->name;
}
@@ -2592,9 +2512,9 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
if (args->rmtblkno) {
ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
- name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
- name_rmt->valuelen = cpu_to_be32(args->valuelen);
+ name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
}
@@ -2609,34 +2529,41 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
* Set the INCOMPLETE flag on an entry in a leaf block.
*/
int
-xfs_attr_leaf_setflag(xfs_da_args_t *args)
+xfs_attr3_leaf_setflag(
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_remote_t *name_rmt;
- struct xfs_buf *bp;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_buf *bp;
int error;
+#ifdef DEBUG
+ struct xfs_attr3_icleaf_hdr ichdr;
+#endif
trace_xfs_attr_leaf_setflag(args);
/*
* Set up the operation.
*/
- error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
if (error)
return(error);
leaf = bp->b_addr;
- ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
+#ifdef DEBUG
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ ASSERT(args->index < ichdr.count);
ASSERT(args->index >= 0);
- entry = &leaf->entries[ args->index ];
+#endif
+ entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
entry->flags |= XFS_ATTR_INCOMPLETE;
xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
- name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
name_rmt->valueblk = 0;
name_rmt->valuelen = 0;
xfs_trans_log_buf(args->trans, bp,
@@ -2657,14 +2584,20 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
* Note that they could be in different blocks, or in the same block.
*/
int
-xfs_attr_leaf_flipflags(xfs_da_args_t *args)
+xfs_attr3_leaf_flipflags(
+ struct xfs_da_args *args)
{
- xfs_attr_leafblock_t *leaf1, *leaf2;
- xfs_attr_leaf_entry_t *entry1, *entry2;
- xfs_attr_leaf_name_remote_t *name_rmt;
- struct xfs_buf *bp1, *bp2;
+ struct xfs_attr_leafblock *leaf1;
+ struct xfs_attr_leafblock *leaf2;
+ struct xfs_attr_leaf_entry *entry1;
+ struct xfs_attr_leaf_entry *entry2;
+ struct xfs_attr_leaf_name_remote *name_rmt;
+ struct xfs_buf *bp1;
+ struct xfs_buf *bp2;
int error;
#ifdef DEBUG
+ struct xfs_attr3_icleaf_hdr ichdr1;
+ struct xfs_attr3_icleaf_hdr ichdr2;
xfs_attr_leaf_name_local_t *name_loc;
int namelen1, namelen2;
char *name1, *name2;
@@ -2675,7 +2608,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
/*
* Read the block containing the "old" attr
*/
- error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
if (error)
return error;
@@ -2683,7 +2616,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
* Read the block containing the "new" attr, if it is different
*/
if (args->blkno2 != args->blkno) {
- error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2,
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
-1, &bp2);
if (error)
return error;
@@ -2692,31 +2625,35 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
}
leaf1 = bp1->b_addr;
- ASSERT(args->index < be16_to_cpu(leaf1->hdr.count));
- ASSERT(args->index >= 0);
- entry1 = &leaf1->entries[ args->index ];
+ entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index];
leaf2 = bp2->b_addr;
- ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count));
- ASSERT(args->index2 >= 0);
- entry2 = &leaf2->entries[ args->index2 ];
+ entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
#ifdef DEBUG
+ xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
+ ASSERT(args->index < ichdr1.count);
+ ASSERT(args->index >= 0);
+
+ xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+ ASSERT(args->index2 < ichdr2.count);
+ ASSERT(args->index2 >= 0);
+
if (entry1->flags & XFS_ATTR_LOCAL) {
- name_loc = xfs_attr_leaf_name_local(leaf1, args->index);
+ name_loc = xfs_attr3_leaf_name_local(leaf1, args->index);
namelen1 = name_loc->namelen;
name1 = (char *)name_loc->nameval;
} else {
- name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
namelen1 = name_rmt->namelen;
name1 = (char *)name_rmt->name;
}
if (entry2->flags & XFS_ATTR_LOCAL) {
- name_loc = xfs_attr_leaf_name_local(leaf2, args->index2);
+ name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2);
namelen2 = name_loc->namelen;
name2 = (char *)name_loc->nameval;
} else {
- name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
namelen2 = name_rmt->namelen;
name2 = (char *)name_rmt->name;
}
@@ -2733,9 +2670,9 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
if (args->rmtblkno) {
ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
- name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
- name_rmt->valuelen = cpu_to_be32(args->valuelen);
+ name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
xfs_trans_log_buf(args->trans, bp1,
XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
}
@@ -2744,7 +2681,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
xfs_trans_log_buf(args->trans, bp2,
XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
- name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
name_rmt->valueblk = 0;
name_rmt->valuelen = 0;
xfs_trans_log_buf(args->trans, bp2,
@@ -2756,319 +2693,5 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
*/
error = xfs_trans_roll(&args->trans, args->dp);
- return(error);
-}
-
-/*========================================================================
- * Indiscriminately delete the entire attribute fork
- *========================================================================*/
-
-/*
- * Recurse (gasp!) through the attribute nodes until we find leaves.
- * We're doing a depth-first traversal in order to invalidate everything.
- */
-int
-xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
-{
- xfs_da_blkinfo_t *info;
- xfs_daddr_t blkno;
- struct xfs_buf *bp;
- int error;
-
- /*
- * Read block 0 to see what we have to work with.
- * We only get here if we have extents, since we remove
- * the extents in reverse order the extent containing
- * block 0 must still be there.
- */
- error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
- if (error)
- return(error);
- blkno = XFS_BUF_ADDR(bp);
-
- /*
- * Invalidate the tree, even if the "tree" is only a single leaf block.
- * This is a depth-first traversal!
- */
- info = bp->b_addr;
- if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) {
- error = xfs_attr_node_inactive(trans, dp, bp, 1);
- } else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) {
- error = xfs_attr_leaf_inactive(trans, dp, bp);
- } else {
- error = XFS_ERROR(EIO);
- xfs_trans_brelse(*trans, bp);
- }
- if (error)
- return(error);
-
- /*
- * Invalidate the incore copy of the root block.
- */
- error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
- if (error)
- return(error);
- xfs_trans_binval(*trans, bp); /* remove from cache */
- /*
- * Commit the invalidate and start the next transaction.
- */
- error = xfs_trans_roll(trans, dp);
-
- return (error);
-}
-
-/*
- * Recurse (gasp!) through the attribute nodes until we find leaves.
- * We're doing a depth-first traversal in order to invalidate everything.
- */
-STATIC int
-xfs_attr_node_inactive(
- struct xfs_trans **trans,
- struct xfs_inode *dp,
- struct xfs_buf *bp,
- int level)
-{
- xfs_da_blkinfo_t *info;
- xfs_da_intnode_t *node;
- xfs_dablk_t child_fsb;
- xfs_daddr_t parent_blkno, child_blkno;
- int error, count, i;
- struct xfs_buf *child_bp;
-
- /*
- * Since this code is recursive (gasp!) we must protect ourselves.
- */
- if (level > XFS_DA_NODE_MAXDEPTH) {
- xfs_trans_brelse(*trans, bp); /* no locks for later trans */
- return(XFS_ERROR(EIO));
- }
-
- node = bp->b_addr;
- ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- parent_blkno = XFS_BUF_ADDR(bp); /* save for re-read later */
- count = be16_to_cpu(node->hdr.count);
- if (!count) {
- xfs_trans_brelse(*trans, bp);
- return(0);
- }
- child_fsb = be32_to_cpu(node->btree[0].before);
- xfs_trans_brelse(*trans, bp); /* no locks for later trans */
-
- /*
- * If this is the node level just above the leaves, simply loop
- * over the leaves removing all of them. If this is higher up
- * in the tree, recurse downward.
- */
- for (i = 0; i < count; i++) {
- /*
- * Read the subsidiary block to see what we have to work with.
- * Don't do this in a transaction. This is a depth-first
- * traversal of the tree so we may deal with many blocks
- * before we come back to this one.
- */
- error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp,
- XFS_ATTR_FORK);
- if (error)
- return(error);
- if (child_bp) {
- /* save for re-read later */
- child_blkno = XFS_BUF_ADDR(child_bp);
-
- /*
- * Invalidate the subtree, however we have to.
- */
- info = child_bp->b_addr;
- if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) {
- error = xfs_attr_node_inactive(trans, dp,
- child_bp, level+1);
- } else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) {
- error = xfs_attr_leaf_inactive(trans, dp,
- child_bp);
- } else {
- error = XFS_ERROR(EIO);
- xfs_trans_brelse(*trans, child_bp);
- }
- if (error)
- return(error);
-
- /*
- * Remove the subsidiary block from the cache
- * and from the log.
- */
- error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
- &child_bp, XFS_ATTR_FORK);
- if (error)
- return(error);
- xfs_trans_binval(*trans, child_bp);
- }
-
- /*
- * If we're not done, re-read the parent to get the next
- * child block number.
- */
- if ((i+1) < count) {
- error = xfs_da_node_read(*trans, dp, 0, parent_blkno,
- &bp, XFS_ATTR_FORK);
- if (error)
- return(error);
- child_fsb = be32_to_cpu(node->btree[i+1].before);
- xfs_trans_brelse(*trans, bp);
- }
- /*
- * Atomically commit the whole invalidate stuff.
- */
- error = xfs_trans_roll(trans, dp);
- if (error)
- return (error);
- }
-
- return(0);
-}
-
-/*
- * Invalidate all of the "remote" value regions pointed to by a particular
- * leaf block.
- * Note that we must release the lock on the buffer so that we are not
- * caught holding something that the logging code wants to flush to disk.
- */
-STATIC int
-xfs_attr_leaf_inactive(
- struct xfs_trans **trans,
- struct xfs_inode *dp,
- struct xfs_buf *bp)
-{
- xfs_attr_leafblock_t *leaf;
- xfs_attr_leaf_entry_t *entry;
- xfs_attr_leaf_name_remote_t *name_rmt;
- xfs_attr_inactive_list_t *list, *lp;
- int error, count, size, tmp, i;
-
- leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
-
- /*
- * Count the number of "remote" value extents.
- */
- count = 0;
- entry = &leaf->entries[0];
- for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
- if (be16_to_cpu(entry->nameidx) &&
- ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
- name_rmt = xfs_attr_leaf_name_remote(leaf, i);
- if (name_rmt->valueblk)
- count++;
- }
- }
-
- /*
- * If there are no "remote" values, we're done.
- */
- if (count == 0) {
- xfs_trans_brelse(*trans, bp);
- return(0);
- }
-
- /*
- * Allocate storage for a list of all the "remote" value extents.
- */
- size = count * sizeof(xfs_attr_inactive_list_t);
- list = (xfs_attr_inactive_list_t *)kmem_alloc(size, KM_SLEEP);
-
- /*
- * Identify each of the "remote" value extents.
- */
- lp = list;
- entry = &leaf->entries[0];
- for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
- if (be16_to_cpu(entry->nameidx) &&
- ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
- name_rmt = xfs_attr_leaf_name_remote(leaf, i);
- if (name_rmt->valueblk) {
- lp->valueblk = be32_to_cpu(name_rmt->valueblk);
- lp->valuelen = XFS_B_TO_FSB(dp->i_mount,
- be32_to_cpu(name_rmt->valuelen));
- lp++;
- }
- }
- }
- xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */
-
- /*
- * Invalidate each of the "remote" value extents.
- */
- error = 0;
- for (lp = list, i = 0; i < count; i++, lp++) {
- tmp = xfs_attr_leaf_freextent(trans, dp,
- lp->valueblk, lp->valuelen);
-
- if (error == 0)
- error = tmp; /* save only the 1st errno */
- }
-
- kmem_free((xfs_caddr_t)list);
- return(error);
-}
-
-/*
- * Look at all the extents for this logical region,
- * invalidate any buffers that are incore/in transactions.
- */
-STATIC int
-xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
- xfs_dablk_t blkno, int blkcnt)
-{
- xfs_bmbt_irec_t map;
- xfs_dablk_t tblkno;
- int tblkcnt, dblkcnt, nmap, error;
- xfs_daddr_t dblkno;
- xfs_buf_t *bp;
-
- /*
- * Roll through the "value", invalidating the attribute value's
- * blocks.
- */
- tblkno = blkno;
- tblkcnt = blkcnt;
- while (tblkcnt > 0) {
- /*
- * Try to remember where we decided to put the value.
- */
- nmap = 1;
- error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
- &map, &nmap, XFS_BMAPI_ATTRFORK);
- if (error) {
- return(error);
- }
- ASSERT(nmap == 1);
- ASSERT(map.br_startblock != DELAYSTARTBLOCK);
-
- /*
- * If it's a hole, these are already unmapped
- * so there's nothing to invalidate.
- */
- if (map.br_startblock != HOLESTARTBLOCK) {
-
- dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
- map.br_startblock);
- dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
- map.br_blockcount);
- bp = xfs_trans_get_buf(*trans,
- dp->i_mount->m_ddev_targp,
- dblkno, dblkcnt, 0);
- if (!bp)
- return ENOMEM;
- xfs_trans_binval(*trans, bp);
- /*
- * Roll to next transaction.
- */
- error = xfs_trans_roll(trans, dp);
- if (error)
- return (error);
- }
-
- tblkno += map.br_blockcount;
- tblkcnt -= map.br_blockcount;
- }
-
- return(0);
+ return error;
}
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 77de139a58f..e2929da7c3b 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -18,16 +19,6 @@
#ifndef __XFS_ATTR_LEAF_H__
#define __XFS_ATTR_LEAF_H__
-/*
- * Attribute storage layout, internal structure, access macros, etc.
- *
- * Attribute lists are structured around Btrees where all the data
- * elements are in the leaf nodes. Attribute names are hashed into an int,
- * then that int is used as the index into the Btree. Since the hashval
- * of an attribute name may not be unique, we may have duplicate keys. The
- * internal links in the Btree are logical block offsets into the file.
- */
-
struct attrlist;
struct attrlist_cursor_kern;
struct xfs_attr_list_context;
@@ -37,160 +28,6 @@ struct xfs_da_state_blk;
struct xfs_inode;
struct xfs_trans;
-/*========================================================================
- * Attribute structure when equal to XFS_LBSIZE(mp) bytes.
- *========================================================================*/
-
-/*
- * This is the structure of the leaf nodes in the Btree.
- *
- * Struct leaf_entry's are packed from the top. Name/values grow from the
- * bottom but are not packed. The freemap contains run-length-encoded entries
- * for the free bytes after the leaf_entry's, but only the N largest such,
- * smaller runs are dropped. When the freemap doesn't show enough space
- * for an allocation, we compact the name/value area and try again. If we
- * still don't have enough space, then we have to split the block. The
- * name/value structs (both local and remote versions) must be 32bit aligned.
- *
- * Since we have duplicate hash keys, for each key that matches, compare
- * the actual name string. The root and intermediate node search always
- * takes the first-in-the-block key match found, so we should only have
- * to work "forw"ard. If none matches, continue with the "forw"ard leaf
- * nodes until the hash key changes or the attribute name is found.
- *
- * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
- * the leaf_entry. The namespaces are independent only because we also look
- * at the namespace bit when we are looking for a matching attribute name.
- *
- * We also store an "incomplete" bit in the leaf_entry. It shows that an
- * attribute is in the middle of being created and should not be shown to
- * the user if we crash during the time that the bit is set. We clear the
- * bit when we have finished setting up the attribute. We do this because
- * we cannot create some large attributes inside a single transaction, and we
- * need some indication that we weren't finished if we crash in the middle.
- */
-#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
-
-typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
- __be16 base; /* base of free region */
- __be16 size; /* length of free region */
-} xfs_attr_leaf_map_t;
-
-typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */
- xfs_da_blkinfo_t info; /* block type, links, etc. */
- __be16 count; /* count of active leaf_entry's */
- __be16 usedbytes; /* num bytes of names/values stored */
- __be16 firstused; /* first used byte in name area */
- __u8 holes; /* != 0 if blk needs compaction */
- __u8 pad1;
- xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
- /* N largest free regions */
-} xfs_attr_leaf_hdr_t;
-
-typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */
- __be32 hashval; /* hash value of name */
- __be16 nameidx; /* index into buffer of name/value */
- __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
- __u8 pad2; /* unused pad byte */
-} xfs_attr_leaf_entry_t;
-
-typedef struct xfs_attr_leaf_name_local {
- __be16 valuelen; /* number of bytes in value */
- __u8 namelen; /* length of name bytes */
- __u8 nameval[1]; /* name/value bytes */
-} xfs_attr_leaf_name_local_t;
-
-typedef struct xfs_attr_leaf_name_remote {
- __be32 valueblk; /* block number of value bytes */
- __be32 valuelen; /* number of bytes in value */
- __u8 namelen; /* length of name bytes */
- __u8 name[1]; /* name bytes */
-} xfs_attr_leaf_name_remote_t;
-
-typedef struct xfs_attr_leafblock {
- xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */
- xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */
- xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */
- xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */
-} xfs_attr_leafblock_t;
-
-/*
- * Flags used in the leaf_entry[i].flags field.
- * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
- * on the system call, they are "or"ed together for various operations.
- */
-#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
-#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
-#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
-#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
-#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT)
-#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT)
-#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT)
-#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
-
-/*
- * Conversion macros for converting namespace bits from argument flags
- * to ondisk flags.
- */
-#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE)
-#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
-#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK)
-#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK)
-#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
- ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
-#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
- ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
-
-/*
- * Alignment for namelist and valuelist entries (since they are mixed
- * there can be only one alignment value)
- */
-#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t))
-
-/*
- * Cast typed pointers for "local" and "remote" name/value structs.
- */
-static inline xfs_attr_leaf_name_remote_t *
-xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
-{
- return (xfs_attr_leaf_name_remote_t *)
- &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
-}
-
-static inline xfs_attr_leaf_name_local_t *
-xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
-{
- return (xfs_attr_leaf_name_local_t *)
- &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
-}
-
-static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
-{
- return &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
-}
-
-/*
- * Calculate total bytes used (including trailing pad for alignment) for
- * a "local" name/value structure, a "remote" name/value structure, and
- * a pointer which might be either.
- */
-static inline int xfs_attr_leaf_entsize_remote(int nlen)
-{
- return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
- XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
-}
-
-static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
-{
- return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
- XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
-}
-
-static inline int xfs_attr_leaf_entsize_local_max(int bsize)
-{
- return (((bsize) >> 1) + ((bsize) >> 2));
-}
-
/*
* Used to keep a list of "remote value" extents when unlinking an inode.
*/
@@ -221,37 +58,37 @@ int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
/*
* Internal routines when attribute fork size == XFS_LBSIZE(mp).
*/
-int xfs_attr_leaf_to_node(struct xfs_da_args *args);
-int xfs_attr_leaf_to_shortform(struct xfs_buf *bp,
+int xfs_attr3_leaf_to_node(struct xfs_da_args *args);
+int xfs_attr3_leaf_to_shortform(struct xfs_buf *bp,
struct xfs_da_args *args, int forkoff);
-int xfs_attr_leaf_clearflag(struct xfs_da_args *args);
-int xfs_attr_leaf_setflag(struct xfs_da_args *args);
-int xfs_attr_leaf_flipflags(xfs_da_args_t *args);
+int xfs_attr3_leaf_clearflag(struct xfs_da_args *args);
+int xfs_attr3_leaf_setflag(struct xfs_da_args *args);
+int xfs_attr3_leaf_flipflags(struct xfs_da_args *args);
/*
* Routines used for growing the Btree.
*/
-int xfs_attr_leaf_split(struct xfs_da_state *state,
+int xfs_attr3_leaf_split(struct xfs_da_state *state,
struct xfs_da_state_blk *oldblk,
struct xfs_da_state_blk *newblk);
-int xfs_attr_leaf_lookup_int(struct xfs_buf *leaf,
+int xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf,
struct xfs_da_args *args);
-int xfs_attr_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args);
-int xfs_attr_leaf_add(struct xfs_buf *leaf_buffer,
+int xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args);
+int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer,
struct xfs_da_args *args);
-int xfs_attr_leaf_remove(struct xfs_buf *leaf_buffer,
+int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer,
struct xfs_da_args *args);
-int xfs_attr_leaf_list_int(struct xfs_buf *bp,
+int xfs_attr3_leaf_list_int(struct xfs_buf *bp,
struct xfs_attr_list_context *context);
/*
* Routines used for shrinking the Btree.
*/
-int xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval);
-void xfs_attr_leaf_unbalance(struct xfs_da_state *state,
+int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
+void xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
struct xfs_da_state_blk *drop_blk,
struct xfs_da_state_blk *save_blk);
-int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
+int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
/*
* Utility routines.
@@ -259,12 +96,13 @@ int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count);
int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
struct xfs_buf *leaf2_bp);
-int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
- int *local);
-int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
+int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mappedbno,
struct xfs_buf **bpp);
-
-extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops;
+void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to,
+ struct xfs_attr_leafblock *from);
+void xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to,
+ struct xfs_attr3_icleaf_hdr *from);
#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
new file mode 100644
index 00000000000..90e2eeb2120
--- /dev/null
+++ b/fs/xfs/xfs_attr_list.c
@@ -0,0 +1,653 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+#include "xfs_dinode.h"
+#include "xfs_dir2.h"
+
+STATIC int
+xfs_attr_shortform_compare(const void *a, const void *b)
+{
+ xfs_attr_sf_sort_t *sa, *sb;
+
+ sa = (xfs_attr_sf_sort_t *)a;
+ sb = (xfs_attr_sf_sort_t *)b;
+ if (sa->hash < sb->hash) {
+ return(-1);
+ } else if (sa->hash > sb->hash) {
+ return(1);
+ } else {
+ return(sa->entno - sb->entno);
+ }
+}
+
+#define XFS_ISRESET_CURSOR(cursor) \
+ (!((cursor)->initted) && !((cursor)->hashval) && \
+ !((cursor)->blkno) && !((cursor)->offset))
+/*
+ * Copy out entries of shortform attribute lists for attr_list().
+ * Shortform attribute lists are not stored in hashval sorted order.
+ * If the output buffer is not large enough to hold them all, then we
+ * we have to calculate each entries' hashvalue and sort them before
+ * we can begin returning them to the user.
+ */
+int
+xfs_attr_shortform_list(xfs_attr_list_context_t *context)
+{
+ attrlist_cursor_kern_t *cursor;
+ xfs_attr_sf_sort_t *sbuf, *sbp;
+ xfs_attr_shortform_t *sf;
+ xfs_attr_sf_entry_t *sfe;
+ xfs_inode_t *dp;
+ int sbsize, nsbuf, count, i;
+ int error;
+
+ ASSERT(context != NULL);
+ dp = context->dp;
+ ASSERT(dp != NULL);
+ ASSERT(dp->i_afp != NULL);
+ sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+ ASSERT(sf != NULL);
+ if (!sf->hdr.count)
+ return(0);
+ cursor = context->cursor;
+ ASSERT(cursor != NULL);
+
+ trace_xfs_attr_list_sf(context);
+
+ /*
+ * If the buffer is large enough and the cursor is at the start,
+ * do not bother with sorting since we will return everything in
+ * one buffer and another call using the cursor won't need to be
+ * made.
+ * Note the generous fudge factor of 16 overhead bytes per entry.
+ * If bufsize is zero then put_listent must be a search function
+ * and can just scan through what we have.
+ */
+ if (context->bufsize == 0 ||
+ (XFS_ISRESET_CURSOR(cursor) &&
+ (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
+ for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+ error = context->put_listent(context,
+ sfe->flags,
+ sfe->nameval,
+ (int)sfe->namelen,
+ (int)sfe->valuelen,
+ &sfe->nameval[sfe->namelen]);
+
+ /*
+ * Either search callback finished early or
+ * didn't fit it all in the buffer after all.
+ */
+ if (context->seen_enough)
+ break;
+
+ if (error)
+ return error;
+ sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ }
+ trace_xfs_attr_list_sf_all(context);
+ return(0);
+ }
+
+ /* do no more for a search callback */
+ if (context->bufsize == 0)
+ return 0;
+
+ /*
+ * It didn't all fit, so we have to sort everything on hashval.
+ */
+ sbsize = sf->hdr.count * sizeof(*sbuf);
+ sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
+
+ /*
+ * Scan the attribute list for the rest of the entries, storing
+ * the relevant info from only those that match into a buffer.
+ */
+ nsbuf = 0;
+ for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+ if (unlikely(
+ ((char *)sfe < (char *)sf) ||
+ ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
+ XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
+ XFS_ERRLEVEL_LOW,
+ context->dp->i_mount, sfe);
+ kmem_free(sbuf);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ sbp->entno = i;
+ sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
+ sbp->name = sfe->nameval;
+ sbp->namelen = sfe->namelen;
+ /* These are bytes, and both on-disk, don't endian-flip */
+ sbp->valuelen = sfe->valuelen;
+ sbp->flags = sfe->flags;
+ sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ sbp++;
+ nsbuf++;
+ }
+
+ /*
+ * Sort the entries on hash then entno.
+ */
+ xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
+
+ /*
+ * Re-find our place IN THE SORTED LIST.
+ */
+ count = 0;
+ cursor->initted = 1;
+ cursor->blkno = 0;
+ for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
+ if (sbp->hash == cursor->hashval) {
+ if (cursor->offset == count) {
+ break;
+ }
+ count++;
+ } else if (sbp->hash > cursor->hashval) {
+ break;
+ }
+ }
+ if (i == nsbuf) {
+ kmem_free(sbuf);
+ return(0);
+ }
+
+ /*
+ * Loop putting entries into the user buffer.
+ */
+ for ( ; i < nsbuf; i++, sbp++) {
+ if (cursor->hashval != sbp->hash) {
+ cursor->hashval = sbp->hash;
+ cursor->offset = 0;
+ }
+ error = context->put_listent(context,
+ sbp->flags,
+ sbp->name,
+ sbp->namelen,
+ sbp->valuelen,
+ &sbp->name[sbp->namelen]);
+ if (error)
+ return error;
+ if (context->seen_enough)
+ break;
+ cursor->offset++;
+ }
+
+ kmem_free(sbuf);
+ return(0);
+}
+
+STATIC int
+xfs_attr_node_list(xfs_attr_list_context_t *context)
+{
+ attrlist_cursor_kern_t *cursor;
+ xfs_attr_leafblock_t *leaf;
+ xfs_da_intnode_t *node;
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_node_entry *btree;
+ int error, i;
+ struct xfs_buf *bp;
+ struct xfs_inode *dp = context->dp;
+
+ trace_xfs_attr_node_list(context);
+
+ cursor = context->cursor;
+ cursor->initted = 1;
+
+ /*
+ * Do all sorts of validation on the passed-in cursor structure.
+ * If anything is amiss, ignore the cursor and look up the hashval
+ * starting from the btree root.
+ */
+ bp = NULL;
+ if (cursor->blkno > 0) {
+ error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1,
+ &bp, XFS_ATTR_FORK);
+ if ((error != 0) && (error != EFSCORRUPTED))
+ return(error);
+ if (bp) {
+ struct xfs_attr_leaf_entry *entries;
+
+ node = bp->b_addr;
+ switch (be16_to_cpu(node->hdr.info.magic)) {
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_trans_brelse(NULL, bp);
+ bp = NULL;
+ break;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+ if (cursor->hashval > be32_to_cpu(
+ entries[leafhdr.count - 1].hashval)) {
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_trans_brelse(NULL, bp);
+ bp = NULL;
+ } else if (cursor->hashval <= be32_to_cpu(
+ entries[0].hashval)) {
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_trans_brelse(NULL, bp);
+ bp = NULL;
+ }
+ break;
+ default:
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_trans_brelse(NULL, bp);
+ bp = NULL;
+ }
+ }
+ }
+
+ /*
+ * We did not find what we expected given the cursor's contents,
+ * so we start from the top and work down based on the hash value.
+ * Note that start of node block is same as start of leaf block.
+ */
+ if (bp == NULL) {
+ cursor->blkno = 0;
+ for (;;) {
+ __uint16_t magic;
+
+ error = xfs_da3_node_read(NULL, dp,
+ cursor->blkno, -1, &bp,
+ XFS_ATTR_FORK);
+ if (error)
+ return(error);
+ node = bp->b_addr;
+ magic = be16_to_cpu(node->hdr.info.magic);
+ if (magic == XFS_ATTR_LEAF_MAGIC ||
+ magic == XFS_ATTR3_LEAF_MAGIC)
+ break;
+ if (magic != XFS_DA_NODE_MAGIC &&
+ magic != XFS_DA3_NODE_MAGIC) {
+ XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
+ XFS_ERRLEVEL_LOW,
+ context->dp->i_mount,
+ node);
+ xfs_trans_brelse(NULL, bp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
+ for (i = 0; i < nodehdr.count; btree++, i++) {
+ if (cursor->hashval
+ <= be32_to_cpu(btree->hashval)) {
+ cursor->blkno = be32_to_cpu(btree->before);
+ trace_xfs_attr_list_node_descend(context,
+ btree);
+ break;
+ }
+ }
+ if (i == nodehdr.count) {
+ xfs_trans_brelse(NULL, bp);
+ return 0;
+ }
+ xfs_trans_brelse(NULL, bp);
+ }
+ }
+ ASSERT(bp != NULL);
+
+ /*
+ * Roll upward through the blocks, processing each leaf block in
+ * order. As long as there is space in the result buffer, keep
+ * adding the information.
+ */
+ for (;;) {
+ leaf = bp->b_addr;
+ error = xfs_attr3_leaf_list_int(bp, context);
+ if (error) {
+ xfs_trans_brelse(NULL, bp);
+ return error;
+ }
+ xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+ if (context->seen_enough || leafhdr.forw == 0)
+ break;
+ cursor->blkno = leafhdr.forw;
+ xfs_trans_brelse(NULL, bp);
+ error = xfs_attr3_leaf_read(NULL, dp, cursor->blkno, -1, &bp);
+ if (error)
+ return error;
+ }
+ xfs_trans_brelse(NULL, bp);
+ return 0;
+}
+
+/*
+ * Copy out attribute list entries for attr_list(), for leaf attribute lists.
+ */
+int
+xfs_attr3_leaf_list_int(
+ struct xfs_buf *bp,
+ struct xfs_attr_list_context *context)
+{
+ struct attrlist_cursor_kern *cursor;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_attr_leaf_entry *entries;
+ struct xfs_attr_leaf_entry *entry;
+ int retval;
+ int i;
+
+ trace_xfs_attr_list_leaf(context);
+
+ leaf = bp->b_addr;
+ xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ entries = xfs_attr3_leaf_entryp(leaf);
+
+ cursor = context->cursor;
+ cursor->initted = 1;
+
+ /*
+ * Re-find our place in the leaf block if this is a new syscall.
+ */
+ if (context->resynch) {
+ entry = &entries[0];
+ for (i = 0; i < ichdr.count; entry++, i++) {
+ if (be32_to_cpu(entry->hashval) == cursor->hashval) {
+ if (cursor->offset == context->dupcnt) {
+ context->dupcnt = 0;
+ break;
+ }
+ context->dupcnt++;
+ } else if (be32_to_cpu(entry->hashval) >
+ cursor->hashval) {
+ context->dupcnt = 0;
+ break;
+ }
+ }
+ if (i == ichdr.count) {
+ trace_xfs_attr_list_notfound(context);
+ return 0;
+ }
+ } else {
+ entry = &entries[0];
+ i = 0;
+ }
+ context->resynch = 0;
+
+ /*
+ * We have found our place, start copying out the new attributes.
+ */
+ retval = 0;
+ for (; i < ichdr.count; entry++, i++) {
+ if (be32_to_cpu(entry->hashval) != cursor->hashval) {
+ cursor->hashval = be32_to_cpu(entry->hashval);
+ cursor->offset = 0;
+ }
+
+ if (entry->flags & XFS_ATTR_INCOMPLETE)
+ continue; /* skip incomplete entries */
+
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ xfs_attr_leaf_name_local_t *name_loc =
+ xfs_attr3_leaf_name_local(leaf, i);
+
+ retval = context->put_listent(context,
+ entry->flags,
+ name_loc->nameval,
+ (int)name_loc->namelen,
+ be16_to_cpu(name_loc->valuelen),
+ &name_loc->nameval[name_loc->namelen]);
+ if (retval)
+ return retval;
+ } else {
+ xfs_attr_leaf_name_remote_t *name_rmt =
+ xfs_attr3_leaf_name_remote(leaf, i);
+
+ int valuelen = be32_to_cpu(name_rmt->valuelen);
+
+ if (context->put_value) {
+ xfs_da_args_t args;
+
+ memset((char *)&args, 0, sizeof(args));
+ args.geo = context->dp->i_mount->m_attr_geo;
+ args.dp = context->dp;
+ args.whichfork = XFS_ATTR_FORK;
+ args.valuelen = valuelen;
+ args.rmtvaluelen = valuelen;
+ args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
+ args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
+ args.rmtblkcnt = xfs_attr3_rmt_blocks(
+ args.dp->i_mount, valuelen);
+ retval = xfs_attr_rmtval_get(&args);
+ if (retval)
+ return retval;
+ retval = context->put_listent(context,
+ entry->flags,
+ name_rmt->name,
+ (int)name_rmt->namelen,
+ valuelen,
+ args.value);
+ kmem_free(args.value);
+ } else {
+ retval = context->put_listent(context,
+ entry->flags,
+ name_rmt->name,
+ (int)name_rmt->namelen,
+ valuelen,
+ NULL);
+ }
+ if (retval)
+ return retval;
+ }
+ if (context->seen_enough)
+ break;
+ cursor->offset++;
+ }
+ trace_xfs_attr_list_leaf_end(context);
+ return retval;
+}
+
+/*
+ * Copy out attribute entries for attr_list(), for leaf attribute lists.
+ */
+STATIC int
+xfs_attr_leaf_list(xfs_attr_list_context_t *context)
+{
+ int error;
+ struct xfs_buf *bp;
+
+ trace_xfs_attr_leaf_list(context);
+
+ context->cursor->blkno = 0;
+ error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp);
+ if (error)
+ return XFS_ERROR(error);
+
+ error = xfs_attr3_leaf_list_int(bp, context);
+ xfs_trans_brelse(NULL, bp);
+ return XFS_ERROR(error);
+}
+
+int
+xfs_attr_list_int(
+ xfs_attr_list_context_t *context)
+{
+ int error;
+ xfs_inode_t *dp = context->dp;
+ uint lock_mode;
+
+ XFS_STATS_INC(xs_attr_list);
+
+ if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ return EIO;
+
+ /*
+ * Decide on what work routines to call based on the inode size.
+ */
+ lock_mode = xfs_ilock_attr_map_shared(dp);
+ if (!xfs_inode_hasattr(dp)) {
+ error = 0;
+ } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ error = xfs_attr_shortform_list(context);
+ } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+ error = xfs_attr_leaf_list(context);
+ } else {
+ error = xfs_attr_node_list(context);
+ }
+ xfs_iunlock(dp, lock_mode);
+ return error;
+}
+
+#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \
+ (((struct attrlist_ent *) 0)->a_name - (char *) 0)
+#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \
+ ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
+ & ~(sizeof(u_int32_t)-1))
+
+/*
+ * Format an attribute and copy it out to the user's buffer.
+ * Take care to check values and protect against them changing later,
+ * we may be reading them directly out of a user buffer.
+ */
+STATIC int
+xfs_attr_put_listent(
+ xfs_attr_list_context_t *context,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ int valuelen,
+ unsigned char *value)
+{
+ struct attrlist *alist = (struct attrlist *)context->alist;
+ attrlist_ent_t *aep;
+ int arraytop;
+
+ ASSERT(!(context->flags & ATTR_KERNOVAL));
+ ASSERT(context->count >= 0);
+ ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+ ASSERT(context->firstu >= sizeof(*alist));
+ ASSERT(context->firstu <= context->bufsize);
+
+ /*
+ * Only list entries in the right namespace.
+ */
+ if (((context->flags & ATTR_SECURE) == 0) !=
+ ((flags & XFS_ATTR_SECURE) == 0))
+ return 0;
+ if (((context->flags & ATTR_ROOT) == 0) !=
+ ((flags & XFS_ATTR_ROOT) == 0))
+ return 0;
+
+ arraytop = sizeof(*alist) +
+ context->count * sizeof(alist->al_offset[0]);
+ context->firstu -= ATTR_ENTSIZE(namelen);
+ if (context->firstu < arraytop) {
+ trace_xfs_attr_list_full(context);
+ alist->al_more = 1;
+ context->seen_enough = 1;
+ return 1;
+ }
+
+ aep = (attrlist_ent_t *)&context->alist[context->firstu];
+ aep->a_valuelen = valuelen;
+ memcpy(aep->a_name, name, namelen);
+ aep->a_name[namelen] = 0;
+ alist->al_offset[context->count++] = context->firstu;
+ alist->al_count = context->count;
+ trace_xfs_attr_list_add(context);
+ return 0;
+}
+
+/*
+ * Generate a list of extended attribute names and optionally
+ * also value lengths. Positive return value follows the XFS
+ * convention of being an error, zero or negative return code
+ * is the length of the buffer returned (negated), indicating
+ * success.
+ */
+int
+xfs_attr_list(
+ xfs_inode_t *dp,
+ char *buffer,
+ int bufsize,
+ int flags,
+ attrlist_cursor_kern_t *cursor)
+{
+ xfs_attr_list_context_t context;
+ struct attrlist *alist;
+ int error;
+
+ /*
+ * Validate the cursor.
+ */
+ if (cursor->pad1 || cursor->pad2)
+ return(XFS_ERROR(EINVAL));
+ if ((cursor->initted == 0) &&
+ (cursor->hashval || cursor->blkno || cursor->offset))
+ return XFS_ERROR(EINVAL);
+
+ /*
+ * Check for a properly aligned buffer.
+ */
+ if (((long)buffer) & (sizeof(int)-1))
+ return XFS_ERROR(EFAULT);
+ if (flags & ATTR_KERNOVAL)
+ bufsize = 0;
+
+ /*
+ * Initialize the output buffer.
+ */
+ memset(&context, 0, sizeof(context));
+ context.dp = dp;
+ context.cursor = cursor;
+ context.resynch = 1;
+ context.flags = flags;
+ context.alist = buffer;
+ context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */
+ context.firstu = context.bufsize;
+ context.put_listent = xfs_attr_put_listent;
+
+ alist = (struct attrlist *)context.alist;
+ alist->al_count = 0;
+ alist->al_more = 0;
+ alist->al_offset[0] = context.bufsize;
+
+ error = xfs_attr_list_int(&context);
+ ASSERT(error >= 0);
+ return error;
+}
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
new file mode 100644
index 00000000000..b5adfecbb8e
--- /dev/null
+++ b/fs/xfs/xfs_attr_remote.c
@@ -0,0 +1,628 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+#include "xfs_error.h"
+
+#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
+
+/*
+ * Each contiguous block has a header, so it is not just a simple attribute
+ * length to FSB conversion.
+ */
+int
+xfs_attr3_rmt_blocks(
+ struct xfs_mount *mp,
+ int attrlen)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+ return (attrlen + buflen - 1) / buflen;
+ }
+ return XFS_B_TO_FSB(mp, attrlen);
+}
+
+/*
+ * Checking of the remote attribute header is split into two parts. The verifier
+ * does CRC, location and bounds checking, the unpacking function checks the
+ * attribute parameters and owner.
+ */
+static bool
+xfs_attr3_rmt_hdr_ok(
+ void *ptr,
+ xfs_ino_t ino,
+ uint32_t offset,
+ uint32_t size,
+ xfs_daddr_t bno)
+{
+ struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+ if (bno != be64_to_cpu(rmt->rm_blkno))
+ return false;
+ if (offset != be32_to_cpu(rmt->rm_offset))
+ return false;
+ if (size != be32_to_cpu(rmt->rm_bytes))
+ return false;
+ if (ino != be64_to_cpu(rmt->rm_owner))
+ return false;
+
+ /* ok */
+ return true;
+}
+
+static bool
+xfs_attr3_rmt_verify(
+ struct xfs_mount *mp,
+ void *ptr,
+ int fsbsize,
+ xfs_daddr_t bno)
+{
+ struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
+ return false;
+ if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(rmt->rm_blkno) != bno)
+ return false;
+ if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
+ return false;
+ if (be32_to_cpu(rmt->rm_offset) +
+ be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
+ return false;
+ if (rmt->rm_owner == 0)
+ return false;
+
+ return true;
+}
+
+static void
+xfs_attr3_rmt_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ char *ptr;
+ int len;
+ xfs_daddr_t bno;
+ int blksize = mp->m_attr_geo->blksize;
+
+ /* no verification of non-crc buffers */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ ptr = bp->b_addr;
+ bno = bp->b_bn;
+ len = BBTOB(bp->b_length);
+ ASSERT(len >= blksize);
+
+ while (len > 0) {
+ if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) {
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ break;
+ }
+ if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ break;
+ }
+ len -= blksize;
+ ptr += blksize;
+ bno += BTOBB(blksize);
+ }
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
+ else
+ ASSERT(len == 0);
+}
+
+static void
+xfs_attr3_rmt_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ char *ptr;
+ int len;
+ xfs_daddr_t bno;
+ int blksize = mp->m_attr_geo->blksize;
+
+ /* no verification of non-crc buffers */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ ptr = bp->b_addr;
+ bno = bp->b_bn;
+ len = BBTOB(bp->b_length);
+ ASSERT(len >= blksize);
+
+ while (len > 0) {
+ if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+ if (bip) {
+ struct xfs_attr3_rmt_hdr *rmt;
+
+ rmt = (struct xfs_attr3_rmt_hdr *)ptr;
+ rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ }
+ xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF);
+
+ len -= blksize;
+ ptr += blksize;
+ bno += BTOBB(blksize);
+ }
+ ASSERT(len == 0);
+}
+
+const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
+ .verify_read = xfs_attr3_rmt_read_verify,
+ .verify_write = xfs_attr3_rmt_write_verify,
+};
+
+STATIC int
+xfs_attr3_rmt_hdr_set(
+ struct xfs_mount *mp,
+ void *ptr,
+ xfs_ino_t ino,
+ uint32_t offset,
+ uint32_t size,
+ xfs_daddr_t bno)
+{
+ struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return 0;
+
+ rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
+ rmt->rm_offset = cpu_to_be32(offset);
+ rmt->rm_bytes = cpu_to_be32(size);
+ uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid);
+ rmt->rm_owner = cpu_to_be64(ino);
+ rmt->rm_blkno = cpu_to_be64(bno);
+
+ return sizeof(struct xfs_attr3_rmt_hdr);
+}
+
+/*
+ * Helper functions to copy attribute data in and out of the one disk extents
+ */
+STATIC int
+xfs_attr_rmtval_copyout(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_ino_t ino,
+ int *offset,
+ int *valuelen,
+ __uint8_t **dst)
+{
+ char *src = bp->b_addr;
+ xfs_daddr_t bno = bp->b_bn;
+ int len = BBTOB(bp->b_length);
+ int blksize = mp->m_attr_geo->blksize;
+
+ ASSERT(len >= blksize);
+
+ while (len > 0 && *valuelen > 0) {
+ int hdr_size = 0;
+ int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+
+ byte_cnt = min(*valuelen, byte_cnt);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset,
+ byte_cnt, bno)) {
+ xfs_alert(mp,
+"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
+ bno, *offset, byte_cnt, ino);
+ return EFSCORRUPTED;
+ }
+ hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
+ }
+
+ memcpy(*dst, src + hdr_size, byte_cnt);
+
+ /* roll buffer forwards */
+ len -= blksize;
+ src += blksize;
+ bno += BTOBB(blksize);
+
+ /* roll attribute data forwards */
+ *valuelen -= byte_cnt;
+ *dst += byte_cnt;
+ *offset += byte_cnt;
+ }
+ return 0;
+}
+
+STATIC void
+xfs_attr_rmtval_copyin(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_ino_t ino,
+ int *offset,
+ int *valuelen,
+ __uint8_t **src)
+{
+ char *dst = bp->b_addr;
+ xfs_daddr_t bno = bp->b_bn;
+ int len = BBTOB(bp->b_length);
+ int blksize = mp->m_attr_geo->blksize;
+
+ ASSERT(len >= blksize);
+
+ while (len > 0 && *valuelen > 0) {
+ int hdr_size;
+ int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+
+ byte_cnt = min(*valuelen, byte_cnt);
+ hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
+ byte_cnt, bno);
+
+ memcpy(dst + hdr_size, *src, byte_cnt);
+
+ /*
+ * If this is the last block, zero the remainder of it.
+ * Check that we are actually the last block, too.
+ */
+ if (byte_cnt + hdr_size < blksize) {
+ ASSERT(*valuelen - byte_cnt == 0);
+ ASSERT(len == blksize);
+ memset(dst + hdr_size + byte_cnt, 0,
+ blksize - hdr_size - byte_cnt);
+ }
+
+ /* roll buffer forwards */
+ len -= blksize;
+ dst += blksize;
+ bno += BTOBB(blksize);
+
+ /* roll attribute data forwards */
+ *valuelen -= byte_cnt;
+ *src += byte_cnt;
+ *offset += byte_cnt;
+ }
+}
+
+/*
+ * Read the value associated with an attribute from the out-of-line buffer
+ * that we stored it in.
+ */
+int
+xfs_attr_rmtval_get(
+ struct xfs_da_args *args)
+{
+ struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE];
+ struct xfs_mount *mp = args->dp->i_mount;
+ struct xfs_buf *bp;
+ xfs_dablk_t lblkno = args->rmtblkno;
+ __uint8_t *dst = args->value;
+ int valuelen;
+ int nmap;
+ int error;
+ int blkcnt = args->rmtblkcnt;
+ int i;
+ int offset = 0;
+
+ trace_xfs_attr_rmtval_get(args);
+
+ ASSERT(!(args->flags & ATTR_KERNOVAL));
+ ASSERT(args->rmtvaluelen == args->valuelen);
+
+ valuelen = args->rmtvaluelen;
+ while (valuelen > 0) {
+ nmap = ATTR_RMTVALUE_MAPSIZE;
+ error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+ blkcnt, map, &nmap,
+ XFS_BMAPI_ATTRFORK);
+ if (error)
+ return error;
+ ASSERT(nmap >= 1);
+
+ for (i = 0; (i < nmap) && (valuelen > 0); i++) {
+ xfs_daddr_t dblkno;
+ int dblkcnt;
+
+ ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
+ (map[i].br_startblock != HOLESTARTBLOCK));
+ dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
+ dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+ dblkno, dblkcnt, 0, &bp,
+ &xfs_attr3_rmt_buf_ops);
+ if (error)
+ return error;
+
+ error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
+ &offset, &valuelen,
+ &dst);
+ xfs_buf_relse(bp);
+ if (error)
+ return error;
+
+ /* roll attribute extent map forwards */
+ lblkno += map[i].br_blockcount;
+ blkcnt -= map[i].br_blockcount;
+ }
+ }
+ ASSERT(valuelen == 0);
+ return 0;
+}
+
+/*
+ * Write the value associated with an attribute into the out-of-line buffer
+ * that we have defined for it.
+ */
+int
+xfs_attr_rmtval_set(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_bmbt_irec map;
+ xfs_dablk_t lblkno;
+ xfs_fileoff_t lfileoff = 0;
+ __uint8_t *src = args->value;
+ int blkcnt;
+ int valuelen;
+ int nmap;
+ int error;
+ int offset = 0;
+
+ trace_xfs_attr_rmtval_set(args);
+
+ /*
+ * Find a "hole" in the attribute address space large enough for
+ * us to drop the new attribute's value into. Because CRC enable
+ * attributes have headers, we can't just do a straight byte to FSB
+ * conversion and have to take the header space into account.
+ */
+ blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
+ error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
+ XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
+ args->rmtblkcnt = blkcnt;
+
+ /*
+ * Roll through the "value", allocating blocks on disk as required.
+ */
+ while (blkcnt > 0) {
+ int committed;
+
+ /*
+ * Allocate a single extent, up to the size of the value.
+ */
+ xfs_bmap_init(args->flist, args->firstblock);
+ nmap = 1;
+ error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
+ blkcnt,
+ XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+ args->firstblock, args->total, &map, &nmap,
+ args->flist);
+ if (!error) {
+ error = xfs_bmap_finish(&args->trans, args->flist,
+ &committed);
+ }
+ if (error) {
+ ASSERT(committed);
+ args->trans = NULL;
+ xfs_bmap_cancel(args->flist);
+ return(error);
+ }
+
+ /*
+ * bmap_finish() may have committed the last trans and started
+ * a new one. We need the inode to be in all transactions.
+ */
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp, 0);
+
+ ASSERT(nmap == 1);
+ ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+ (map.br_startblock != HOLESTARTBLOCK));
+ lblkno += map.br_blockcount;
+ blkcnt -= map.br_blockcount;
+
+ /*
+ * Start the next trans in the chain.
+ */
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
+ return (error);
+ }
+
+ /*
+ * Roll through the "value", copying the attribute value to the
+ * already-allocated blocks. Blocks are written synchronously
+ * so that we can know they are all on disk before we turn off
+ * the INCOMPLETE flag.
+ */
+ lblkno = args->rmtblkno;
+ blkcnt = args->rmtblkcnt;
+ valuelen = args->rmtvaluelen;
+ while (valuelen > 0) {
+ struct xfs_buf *bp;
+ xfs_daddr_t dblkno;
+ int dblkcnt;
+
+ ASSERT(blkcnt > 0);
+
+ xfs_bmap_init(args->flist, args->firstblock);
+ nmap = 1;
+ error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
+ blkcnt, &map, &nmap,
+ XFS_BMAPI_ATTRFORK);
+ if (error)
+ return(error);
+ ASSERT(nmap == 1);
+ ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+ (map.br_startblock != HOLESTARTBLOCK));
+
+ dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
+ dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+ bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
+ if (!bp)
+ return ENOMEM;
+ bp->b_ops = &xfs_attr3_rmt_buf_ops;
+
+ xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
+ &valuelen, &src);
+
+ error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
+ xfs_buf_relse(bp);
+ if (error)
+ return error;
+
+
+ /* roll attribute extent map forwards */
+ lblkno += map.br_blockcount;
+ blkcnt -= map.br_blockcount;
+ }
+ ASSERT(valuelen == 0);
+ return 0;
+}
+
+/*
+ * Remove the value associated with an attribute by deleting the
+ * out-of-line buffer that it is stored on.
+ */
+int
+xfs_attr_rmtval_remove(
+ struct xfs_da_args *args)
+{
+ struct xfs_mount *mp = args->dp->i_mount;
+ xfs_dablk_t lblkno;
+ int blkcnt;
+ int error;
+ int done;
+
+ trace_xfs_attr_rmtval_remove(args);
+
+ /*
+ * Roll through the "value", invalidating the attribute value's blocks.
+ */
+ lblkno = args->rmtblkno;
+ blkcnt = args->rmtblkcnt;
+ while (blkcnt > 0) {
+ struct xfs_bmbt_irec map;
+ struct xfs_buf *bp;
+ xfs_daddr_t dblkno;
+ int dblkcnt;
+ int nmap;
+
+ /*
+ * Try to remember where we decided to put the value.
+ */
+ nmap = 1;
+ error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+ blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
+ if (error)
+ return(error);
+ ASSERT(nmap == 1);
+ ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+ (map.br_startblock != HOLESTARTBLOCK));
+
+ dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
+ dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+ /*
+ * If the "remote" value is in the cache, remove it.
+ */
+ bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
+ if (bp) {
+ xfs_buf_stale(bp);
+ xfs_buf_relse(bp);
+ bp = NULL;
+ }
+
+ lblkno += map.br_blockcount;
+ blkcnt -= map.br_blockcount;
+ }
+
+ /*
+ * Keep de-allocating extents until the remote-value region is gone.
+ */
+ lblkno = args->rmtblkno;
+ blkcnt = args->rmtblkcnt;
+ done = 0;
+ while (!done) {
+ int committed;
+
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
+ XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+ 1, args->firstblock, args->flist,
+ &done);
+ if (!error) {
+ error = xfs_bmap_finish(&args->trans, args->flist,
+ &committed);
+ }
+ if (error) {
+ ASSERT(committed);
+ args->trans = NULL;
+ xfs_bmap_cancel(args->flist);
+ return error;
+ }
+
+ /*
+ * bmap_finish() may have committed the last trans and started
+ * a new one. We need the inode to be in all transactions.
+ */
+ if (committed)
+ xfs_trans_ijoin(args->trans, args->dp, 0);
+
+ /*
+ * Close out trans and start the next one in the chain.
+ */
+ error = xfs_trans_roll(&args->trans, args->dp);
+ if (error)
+ return (error);
+ }
+ return(0);
+}
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_attr_remote.h
index 5eeab4690cf..5a9acfa156d 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_attr_remote.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -15,13 +15,13 @@
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#ifndef __XFS_UTILS_H__
-#define __XFS_UTILS_H__
+#ifndef __XFS_ATTR_REMOTE_H__
+#define __XFS_ATTR_REMOTE_H__
-extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, umode_t, xfs_nlink_t,
- xfs_dev_t, prid_t, int, xfs_inode_t **, int *);
-extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
-extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
-extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
+int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
-#endif /* __XFS_UTILS_H__ */
+int xfs_attr_rmtval_get(struct xfs_da_args *args);
+int xfs_attr_rmtval_set(struct xfs_da_args *args);
+int xfs_attr_rmtval_remove(struct xfs_da_args *args);
+
+#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c
index 48228848f5a..0e8885a5964 100644
--- a/fs/xfs/xfs_bit.c
+++ b/fs/xfs/xfs_bit.c
@@ -16,10 +16,8 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
+#include "xfs_log_format.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
/*
* XFS bit manipulation routines, used in non-realtime code.
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index f1e3c907044..e1649c0d3e0 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -66,8 +66,11 @@ static inline int xfs_lowbit64(__uint64_t v)
n = ffs(w);
} else { /* upper bits */
w = (__uint32_t)(v >> 32);
- if (w && (n = ffs(w)))
- n += 32;
+ if (w) {
+ n = ffs(w);
+ if (n)
+ n += 32;
+ }
}
return n - 1;
}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 0e92d12765d..75c3fe5f3d9 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -17,207 +17,95 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
+#include "xfs_dir2.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
-#include "xfs_mount.h"
-#include "xfs_itable.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_extfree_item.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
-#include "xfs_attr_leaf.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_buf_item.h"
-#include "xfs_filestream.h"
-#include "xfs_vnodeops.h"
#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_dinode.h"
+#include "xfs_filestream.h"
kmem_zone_t *xfs_bmap_free_item_zone;
/*
- * Prototypes for internal bmap routines.
- */
-
-#ifdef DEBUG
-STATIC void
-xfs_bmap_check_leaf_extents(
- struct xfs_btree_cur *cur,
- struct xfs_inode *ip,
- int whichfork);
-#else
-#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0)
-#endif
-
-
-/*
- * Called from xfs_bmap_add_attrfork to handle extents format files.
- */
-STATIC int /* error */
-xfs_bmap_add_attrfork_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fsblock_t *firstblock, /* first block allocated */
- xfs_bmap_free_t *flist, /* blocks to free at commit */
- int *flags); /* inode logging flags */
-
-/*
- * Called from xfs_bmap_add_attrfork to handle local format files.
- */
-STATIC int /* error */
-xfs_bmap_add_attrfork_local(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fsblock_t *firstblock, /* first block allocated */
- xfs_bmap_free_t *flist, /* blocks to free at commit */
- int *flags); /* inode logging flags */
-
-/*
- * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
- * It figures out where to ask the underlying allocator to put the new extent.
- */
-STATIC int /* error */
-xfs_bmap_alloc(
- xfs_bmalloca_t *ap); /* bmap alloc argument struct */
-
-/*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the file extents are already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
- */
-STATIC int /* error */
-xfs_bmap_btree_to_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_btree_cur_t *cur, /* btree cursor */
- int *logflagsp, /* inode logging flags */
- int whichfork); /* data or attr fork */
-
-/*
- * Remove the entry "free" from the free item list. Prev points to the
- * previous entry, unless "free" is the head of the list.
- */
-STATIC void
-xfs_bmap_del_free(
- xfs_bmap_free_t *flist, /* free item list header */
- xfs_bmap_free_item_t *prev, /* previous item on list, if any */
- xfs_bmap_free_item_t *free); /* list item to be freed */
-
-/*
- * Convert an extents-format file into a btree-format file.
- * The new file will have a root block (in the inode) and a single child block.
- */
-STATIC int /* error */
-xfs_bmap_extents_to_btree(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fsblock_t *firstblock, /* first-block-allocated */
- xfs_bmap_free_t *flist, /* blocks freed in xaction */
- xfs_btree_cur_t **curp, /* cursor returned to caller */
- int wasdel, /* converting a delayed alloc */
- int *logflagsp, /* inode logging flags */
- int whichfork); /* data or attr fork */
-
-/*
- * Convert a local file to an extents file.
- * This code is sort of bogus, since the file data needs to get
- * logged so it won't be lost. The bmap-level manipulations are ok, though.
- */
-STATIC int /* error */
-xfs_bmap_local_to_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fsblock_t *firstblock, /* first block allocated in xaction */
- xfs_extlen_t total, /* total blocks needed by transaction */
- int *logflagsp, /* inode logging flags */
- int whichfork); /* data or attr fork */
-
-/*
- * Search the extents list for the inode, for the extent containing bno.
- * If bno lies in a hole, point to the next entry. If bno lies past eof,
- * *eofp will be set, and *prevp will contain the last entry (null if none).
- * Else, *lastxp will be set to the index of the found
- * entry; *gotp will contain the entry.
+ * Miscellaneous helper functions
*/
-STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
-xfs_bmap_search_extents(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fileoff_t bno, /* block number searched for */
- int whichfork, /* data or attr fork */
- int *eofp, /* out: end of file found */
- xfs_extnum_t *lastxp, /* out: last extent index */
- xfs_bmbt_irec_t *gotp, /* out: extent entry found */
- xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */
/*
- * Compute the worst-case number of indirect blocks that will be used
- * for ip's delayed extent of length "len".
- */
-STATIC xfs_filblks_t
-xfs_bmap_worst_indlen(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_filblks_t len); /* delayed extent length */
-
-#ifdef DEBUG
-/*
- * Perform various validation checks on the values being returned
- * from xfs_bmapi().
+ * Compute and fill in the value of the maximum depth of a bmap btree
+ * in this filesystem. Done once, during mount.
*/
-STATIC void
-xfs_bmap_validate_ret(
- xfs_fileoff_t bno,
- xfs_filblks_t len,
- int flags,
- xfs_bmbt_irec_t *mval,
- int nmap,
- int ret_nmap);
-#else
-#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
-#endif /* DEBUG */
-
-STATIC int
-xfs_bmap_count_tree(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_ifork_t *ifp,
- xfs_fsblock_t blockno,
- int levelin,
- int *count);
-
-STATIC void
-xfs_bmap_count_leaves(
- xfs_ifork_t *ifp,
- xfs_extnum_t idx,
- int numrecs,
- int *count);
-
-STATIC void
-xfs_bmap_disk_count_leaves(
- struct xfs_mount *mp,
- struct xfs_btree_block *block,
- int numrecs,
- int *count);
+void
+xfs_bmap_compute_maxlevels(
+ xfs_mount_t *mp, /* file system mount structure */
+ int whichfork) /* data or attr fork */
+{
+ int level; /* btree level */
+ uint maxblocks; /* max blocks at this level */
+ uint maxleafents; /* max leaf entries possible */
+ int maxrootrecs; /* max records in root block */
+ int minleafrecs; /* min records in leaf block */
+ int minnoderecs; /* min records in node block */
+ int sz; /* root block size */
-/*
- * Bmap internal routines.
- */
+ /*
+ * The maximum number of extents in a file, hence the maximum
+ * number of leaf entries, is controlled by the type of di_nextents
+ * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
+ * (a signed 16-bit number, xfs_aextnum_t).
+ *
+ * Note that we can no longer assume that if we are in ATTR1 that
+ * the fork offset of all the inodes will be
+ * (xfs_default_attroffset(ip) >> 3) because we could have mounted
+ * with ATTR2 and then mounted back with ATTR1, keeping the
+ * di_forkoff's fixed but probably at various positions. Therefore,
+ * for both ATTR1 and ATTR2 we have to assume the worst case scenario
+ * of a minimum size available.
+ */
+ if (whichfork == XFS_DATA_FORK) {
+ maxleafents = MAXEXTNUM;
+ sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+ } else {
+ maxleafents = MAXAEXTNUM;
+ sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ }
+ maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
+ minleafrecs = mp->m_bmap_dmnr[0];
+ minnoderecs = mp->m_bmap_dmnr[1];
+ maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+ for (level = 1; maxblocks > 1; level++) {
+ if (maxblocks <= maxrootrecs)
+ maxblocks = 1;
+ else
+ maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+ }
+ mp->m_bm_maxlevels[whichfork] = level;
+}
STATIC int /* error */
xfs_bmbt_lookup_eq(
@@ -287,6 +175,834 @@ xfs_bmbt_update(
}
/*
+ * Compute the worst-case number of indirect blocks that will be used
+ * for ip's delayed extent of length "len".
+ */
+STATIC xfs_filblks_t
+xfs_bmap_worst_indlen(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_filblks_t len) /* delayed extent length */
+{
+ int level; /* btree level number */
+ int maxrecs; /* maximum record count at this level */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_filblks_t rval; /* return value */
+
+ mp = ip->i_mount;
+ maxrecs = mp->m_bmap_dmxr[0];
+ for (level = 0, rval = 0;
+ level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
+ level++) {
+ len += maxrecs - 1;
+ do_div(len, maxrecs);
+ rval += len;
+ if (len == 1)
+ return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+ level - 1;
+ if (level == 0)
+ maxrecs = mp->m_bmap_dmxr[1];
+ }
+ return rval;
+}
+
+/*
+ * Calculate the default attribute fork offset for newly created inodes.
+ */
+uint
+xfs_default_attroffset(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ uint offset;
+
+ if (mp->m_sb.sb_inodesize == 256) {
+ offset = XFS_LITINO(mp, ip->i_d.di_version) -
+ XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ } else {
+ offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+ }
+
+ ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version));
+ return offset;
+}
+
+/*
+ * Helper routine to reset inode di_forkoff field when switching
+ * attribute fork from local to extent format - we reset it where
+ * possible to make space available for inline data fork extents.
+ */
+STATIC void
+xfs_bmap_forkoff_reset(
+ xfs_inode_t *ip,
+ int whichfork)
+{
+ if (whichfork == XFS_ATTR_FORK &&
+ ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
+ ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
+ ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
+ uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
+
+ if (dfl_forkoff > ip->i_d.di_forkoff)
+ ip->i_d.di_forkoff = dfl_forkoff;
+ }
+}
+
+/*
+ * Debug/sanity checking code
+ */
+
+STATIC int
+xfs_bmap_sanity_check(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ int level)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+
+ if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) &&
+ block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC))
+ return 0;
+
+ if (be16_to_cpu(block->bb_level) != level ||
+ be16_to_cpu(block->bb_numrecs) == 0 ||
+ be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+ return 0;
+
+ return 1;
+}
+
+#ifdef DEBUG
+STATIC struct xfs_buf *
+xfs_bmap_get_bp(
+ struct xfs_btree_cur *cur,
+ xfs_fsblock_t bno)
+{
+ struct xfs_log_item_desc *lidp;
+ int i;
+
+ if (!cur)
+ return NULL;
+
+ for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
+ if (!cur->bc_bufs[i])
+ break;
+ if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
+ return cur->bc_bufs[i];
+ }
+
+ /* Chase down all the log items to see if the bp is there */
+ list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
+ struct xfs_buf_log_item *bip;
+ bip = (struct xfs_buf_log_item *)lidp->lid_item;
+ if (bip->bli_item.li_type == XFS_LI_BUF &&
+ XFS_BUF_ADDR(bip->bli_buf) == bno)
+ return bip->bli_buf;
+ }
+
+ return NULL;
+}
+
+STATIC void
+xfs_check_block(
+ struct xfs_btree_block *block,
+ xfs_mount_t *mp,
+ int root,
+ short sz)
+{
+ int i, j, dmxr;
+ __be64 *pp, *thispa; /* pointer to block address */
+ xfs_bmbt_key_t *prevp, *keyp;
+
+ ASSERT(be16_to_cpu(block->bb_level) > 0);
+
+ prevp = NULL;
+ for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
+ dmxr = mp->m_bmap_dmxr[0];
+ keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
+
+ if (prevp) {
+ ASSERT(be64_to_cpu(prevp->br_startoff) <
+ be64_to_cpu(keyp->br_startoff));
+ }
+ prevp = keyp;
+
+ /*
+ * Compare the block numbers to see if there are dups.
+ */
+ if (root)
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
+ else
+ pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
+
+ for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
+ if (root)
+ thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
+ else
+ thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
+ if (*thispa == *pp) {
+ xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
+ __func__, j, i,
+ (unsigned long long)be64_to_cpu(*thispa));
+ panic("%s: ptrs are equal in node\n",
+ __func__);
+ }
+ }
+ }
+}
+
+/*
+ * Check that the extents for the inode ip are in the right order in all
+ * btree leaves.
+ */
+
+STATIC void
+xfs_bmap_check_leaf_extents(
+ xfs_btree_cur_t *cur, /* btree cursor or null */
+ xfs_inode_t *ip, /* incore inode pointer */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_btree_block *block; /* current btree block */
+ xfs_fsblock_t bno; /* block # of "block" */
+ xfs_buf_t *bp; /* buffer for "block" */
+ int error; /* error return value */
+ xfs_extnum_t i=0, j; /* index into the extents list */
+ xfs_ifork_t *ifp; /* fork structure */
+ int level; /* btree level, for checking */
+ xfs_mount_t *mp; /* file system mount structure */
+ __be64 *pp; /* pointer to block address */
+ xfs_bmbt_rec_t *ep; /* pointer to current extent */
+ xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */
+ xfs_bmbt_rec_t *nextp; /* pointer to next extent */
+ int bp_release = 0;
+
+ if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
+ return;
+ }
+
+ bno = NULLFSBLOCK;
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ block = ifp->if_broot;
+ /*
+ * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+ */
+ level = be16_to_cpu(block->bb_level);
+ ASSERT(level > 0);
+ xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+ bno = be64_to_cpu(*pp);
+
+ ASSERT(bno != NULLDFSBNO);
+ ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+ /*
+ * Go down the tree until leaf level is reached, following the first
+ * pointer (leftmost) at each level.
+ */
+ while (level-- > 0) {
+ /* See if buf is in cur first */
+ bp_release = 0;
+ bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+ if (!bp) {
+ bp_release = 1;
+ error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ goto error_norelse;
+ }
+ block = XFS_BUF_TO_BLOCK(bp);
+ XFS_WANT_CORRUPTED_GOTO(
+ xfs_bmap_sanity_check(mp, bp, level),
+ error0);
+ if (level == 0)
+ break;
+
+ /*
+ * Check this block for basic sanity (increasing keys and
+ * no duplicate blocks).
+ */
+
+ xfs_check_block(block, mp, 0, 0);
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+ bno = be64_to_cpu(*pp);
+ XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+ if (bp_release) {
+ bp_release = 0;
+ xfs_trans_brelse(NULL, bp);
+ }
+ }
+
+ /*
+ * Here with bp and block set to the leftmost leaf node in the tree.
+ */
+ i = 0;
+
+ /*
+ * Loop over all leaf nodes checking that all extents are in the right order.
+ */
+ for (;;) {
+ xfs_fsblock_t nextbno;
+ xfs_extnum_t num_recs;
+
+
+ num_recs = xfs_btree_get_numrecs(block);
+
+ /*
+ * Read-ahead the next leaf block, if any.
+ */
+
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+ /*
+ * Check all the extents to make sure they are OK.
+ * If we had a previous block, the last entry should
+ * conform with the first entry in this one.
+ */
+
+ ep = XFS_BMBT_REC_ADDR(mp, block, 1);
+ if (i) {
+ ASSERT(xfs_bmbt_disk_get_startoff(&last) +
+ xfs_bmbt_disk_get_blockcount(&last) <=
+ xfs_bmbt_disk_get_startoff(ep));
+ }
+ for (j = 1; j < num_recs; j++) {
+ nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
+ ASSERT(xfs_bmbt_disk_get_startoff(ep) +
+ xfs_bmbt_disk_get_blockcount(ep) <=
+ xfs_bmbt_disk_get_startoff(nextp));
+ ep = nextp;
+ }
+
+ last = *ep;
+ i += num_recs;
+ if (bp_release) {
+ bp_release = 0;
+ xfs_trans_brelse(NULL, bp);
+ }
+ bno = nextbno;
+ /*
+ * If we've reached the end, stop.
+ */
+ if (bno == NULLFSBLOCK)
+ break;
+
+ bp_release = 0;
+ bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+ if (!bp) {
+ bp_release = 1;
+ error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ goto error_norelse;
+ }
+ block = XFS_BUF_TO_BLOCK(bp);
+ }
+ if (bp_release) {
+ bp_release = 0;
+ xfs_trans_brelse(NULL, bp);
+ }
+ return;
+
+error0:
+ xfs_warn(mp, "%s: at error0", __func__);
+ if (bp_release)
+ xfs_trans_brelse(NULL, bp);
+error_norelse:
+ xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
+ __func__, i);
+ panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
+ return;
+}
+
+/*
+ * Add bmap trace insert entries for all the contents of the extent records.
+ */
+void
+xfs_bmap_trace_exlist(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extnum_t cnt, /* count of entries in the list */
+ int whichfork, /* data or attr fork */
+ unsigned long caller_ip)
+{
+ xfs_extnum_t idx; /* extent record index */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ int state = 0;
+
+ if (whichfork == XFS_ATTR_FORK)
+ state |= BMAP_ATTRFORK;
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+ for (idx = 0; idx < cnt; idx++)
+ trace_xfs_extlist(ip, idx, whichfork, caller_ip);
+}
+
+/*
+ * Validate that the bmbt_irecs being returned from bmapi are valid
+ * given the caller's original parameters. Specifically check the
+ * ranges of the returned irecs to ensure that they only extend beyond
+ * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
+ */
+STATIC void
+xfs_bmap_validate_ret(
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ int flags,
+ xfs_bmbt_irec_t *mval,
+ int nmap,
+ int ret_nmap)
+{
+ int i; /* index to map values */
+
+ ASSERT(ret_nmap <= nmap);
+
+ for (i = 0; i < ret_nmap; i++) {
+ ASSERT(mval[i].br_blockcount > 0);
+ if (!(flags & XFS_BMAPI_ENTIRE)) {
+ ASSERT(mval[i].br_startoff >= bno);
+ ASSERT(mval[i].br_blockcount <= len);
+ ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
+ bno + len);
+ } else {
+ ASSERT(mval[i].br_startoff < bno + len);
+ ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
+ bno);
+ }
+ ASSERT(i == 0 ||
+ mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
+ mval[i].br_startoff);
+ ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
+ mval[i].br_startblock != HOLESTARTBLOCK);
+ ASSERT(mval[i].br_state == XFS_EXT_NORM ||
+ mval[i].br_state == XFS_EXT_UNWRITTEN);
+ }
+}
+
+#else
+#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0)
+#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
+#endif /* DEBUG */
+
+/*
+ * bmap free list manipulation functions
+ */
+
+/*
+ * Add the extent to the list of extents to be free at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+void
+xfs_bmap_add_free(
+ xfs_fsblock_t bno, /* fs block number of extent */
+ xfs_filblks_t len, /* length of extent */
+ xfs_bmap_free_t *flist, /* list of extents */
+ xfs_mount_t *mp) /* mount point structure */
+{
+ xfs_bmap_free_item_t *cur; /* current (next) element */
+ xfs_bmap_free_item_t *new; /* new element */
+ xfs_bmap_free_item_t *prev; /* previous element */
+#ifdef DEBUG
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+
+ ASSERT(bno != NULLFSBLOCK);
+ ASSERT(len > 0);
+ ASSERT(len <= MAXEXTLEN);
+ ASSERT(!isnullstartblock(bno));
+ agno = XFS_FSB_TO_AGNO(mp, bno);
+ agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(agbno < mp->m_sb.sb_agblocks);
+ ASSERT(len < mp->m_sb.sb_agblocks);
+ ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
+#endif
+ ASSERT(xfs_bmap_free_item_zone != NULL);
+ new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+ new->xbfi_startblock = bno;
+ new->xbfi_blockcount = (xfs_extlen_t)len;
+ for (prev = NULL, cur = flist->xbf_first;
+ cur != NULL;
+ prev = cur, cur = cur->xbfi_next) {
+ if (cur->xbfi_startblock >= bno)
+ break;
+ }
+ if (prev)
+ prev->xbfi_next = new;
+ else
+ flist->xbf_first = new;
+ new->xbfi_next = cur;
+ flist->xbf_count++;
+}
+
+/*
+ * Remove the entry "free" from the free item list. Prev points to the
+ * previous entry, unless "free" is the head of the list.
+ */
+void
+xfs_bmap_del_free(
+ xfs_bmap_free_t *flist, /* free item list header */
+ xfs_bmap_free_item_t *prev, /* previous item on list, if any */
+ xfs_bmap_free_item_t *free) /* list item to be freed */
+{
+ if (prev)
+ prev->xbfi_next = free->xbfi_next;
+ else
+ flist->xbf_first = free->xbfi_next;
+ flist->xbf_count--;
+ kmem_zone_free(xfs_bmap_free_item_zone, free);
+}
+
+/*
+ * Free up any items left in the list.
+ */
+void
+xfs_bmap_cancel(
+ xfs_bmap_free_t *flist) /* list of bmap_free_items */
+{
+ xfs_bmap_free_item_t *free; /* free list item */
+ xfs_bmap_free_item_t *next;
+
+ if (flist->xbf_count == 0)
+ return;
+ ASSERT(flist->xbf_first != NULL);
+ for (free = flist->xbf_first; free; free = next) {
+ next = free->xbfi_next;
+ xfs_bmap_del_free(flist, NULL, free);
+ }
+ ASSERT(flist->xbf_count == 0);
+}
+
+/*
+ * Inode fork format manipulation functions
+ */
+
+/*
+ * Transform a btree format file with only one leaf node, where the
+ * extents list will fit in the inode, into an extents format file.
+ * Since the file extents are already in-core, all we have to do is
+ * give up the space for the btree root and pitch the leaf block.
+ */
+STATIC int /* error */
+xfs_bmap_btree_to_extents(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_btree_cur_t *cur, /* btree cursor */
+ int *logflagsp, /* inode logging flags */
+ int whichfork) /* data or attr fork */
+{
+ /* REFERENCED */
+ struct xfs_btree_block *cblock;/* child btree block */
+ xfs_fsblock_t cbno; /* child block number */
+ xfs_buf_t *cbp; /* child block's buffer */
+ int error; /* error return value */
+ xfs_ifork_t *ifp; /* inode fork data */
+ xfs_mount_t *mp; /* mount point structure */
+ __be64 *pp; /* ptr to block address */
+ struct xfs_btree_block *rblock;/* root btree block */
+
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+ rblock = ifp->if_broot;
+ ASSERT(be16_to_cpu(rblock->bb_level) == 1);
+ ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
+ ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
+ cbno = be64_to_cpu(*pp);
+ *logflagsp = 0;
+#ifdef DEBUG
+ if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
+ return error;
+#endif
+ error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ return error;
+ cblock = XFS_BUF_TO_BLOCK(cbp);
+ if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
+ return error;
+ xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
+ ip->i_d.di_nblocks--;
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+ xfs_trans_binval(tp, cbp);
+ if (cur->bc_bufs[0] == cbp)
+ cur->bc_bufs[0] = NULL;
+ xfs_iroot_realloc(ip, -1, whichfork);
+ ASSERT(ifp->if_broot == NULL);
+ ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
+ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ return 0;
+}
+
+/*
+ * Convert an extents-format file into a btree-format file.
+ * The new file will have a root block (in the inode) and a single child block.
+ */
+STATIC int /* error */
+xfs_bmap_extents_to_btree(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fsblock_t *firstblock, /* first-block-allocated */
+ xfs_bmap_free_t *flist, /* blocks freed in xaction */
+ xfs_btree_cur_t **curp, /* cursor returned to caller */
+ int wasdel, /* converting a delayed alloc */
+ int *logflagsp, /* inode logging flags */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_btree_block *ablock; /* allocated (child) bt block */
+ xfs_buf_t *abp; /* buffer for ablock */
+ xfs_alloc_arg_t args; /* allocation arguments */
+ xfs_bmbt_rec_t *arp; /* child record pointer */
+ struct xfs_btree_block *block; /* btree root block */
+ xfs_btree_cur_t *cur; /* bmap btree cursor */
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
+ int error; /* error return value */
+ xfs_extnum_t i, cnt; /* extent record index */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_bmbt_key_t *kp; /* root block key pointer */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_extnum_t nextents; /* number of file extents */
+ xfs_bmbt_ptr_t *pp; /* root block address pointer */
+
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
+
+ /*
+ * Make space in the inode incore.
+ */
+ xfs_iroot_realloc(ip, 1, whichfork);
+ ifp->if_flags |= XFS_IFBROOT;
+
+ /*
+ * Fill in the root.
+ */
+ block = ifp->if_broot;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+ XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino,
+ XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+ XFS_BMAP_MAGIC, 1, 1, ip->i_ino,
+ XFS_BTREE_LONG_PTRS);
+
+ /*
+ * Need a cursor. Can't allocate until bb_level is filled in.
+ */
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_private.b.firstblock = *firstblock;
+ cur->bc_private.b.flist = flist;
+ cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+ /*
+ * Convert to a btree with two levels, one record in root.
+ */
+ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+ memset(&args, 0, sizeof(args));
+ args.tp = tp;
+ args.mp = mp;
+ args.firstblock = *firstblock;
+ if (*firstblock == NULLFSBLOCK) {
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
+ } else if (flist->xbf_low) {
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.fsbno = *firstblock;
+ } else {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.fsbno = *firstblock;
+ }
+ args.minlen = args.maxlen = args.prod = 1;
+ args.wasdel = wasdel;
+ *logflagsp = 0;
+ if ((error = xfs_alloc_vextent(&args))) {
+ xfs_iroot_realloc(ip, -1, whichfork);
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ /*
+ * Allocation can't fail, the space was reserved.
+ */
+ ASSERT(args.fsbno != NULLFSBLOCK);
+ ASSERT(*firstblock == NULLFSBLOCK ||
+ args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
+ (flist->xbf_low &&
+ args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
+ *firstblock = cur->bc_private.b.firstblock = args.fsbno;
+ cur->bc_private.b.allocated++;
+ ip->i_d.di_nblocks++;
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+ abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
+ /*
+ * Fill in the child block.
+ */
+ abp->b_ops = &xfs_bmbt_buf_ops;
+ ablock = XFS_BUF_TO_BLOCK(abp);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+ XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+ XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+ XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+ XFS_BTREE_LONG_PTRS);
+
+ arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ for (cnt = i = 0; i < nextents; i++) {
+ ep = xfs_iext_get_ext(ifp, i);
+ if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
+ arp->l0 = cpu_to_be64(ep->l0);
+ arp->l1 = cpu_to_be64(ep->l1);
+ arp++; cnt++;
+ }
+ }
+ ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
+ xfs_btree_set_numrecs(ablock, cnt);
+
+ /*
+ * Fill in the root key and pointer.
+ */
+ kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+ arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+ kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+ be16_to_cpu(block->bb_level)));
+ *pp = cpu_to_be64(args.fsbno);
+
+ /*
+ * Do all this logging at the end so that
+ * the root is at the right level.
+ */
+ xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
+ xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
+ ASSERT(*curp == NULL);
+ *curp = cur;
+ *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
+ return 0;
+}
+
+/*
+ * Convert a local file to an extents file.
+ * This code is out of bounds for data forks of regular files,
+ * since the file data needs to get logged so things will stay consistent.
+ * (The bmap-level manipulations are ok, though).
+ */
+void
+xfs_bmap_local_to_extents_empty(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+ ASSERT(ifp->if_bytes == 0);
+ ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
+
+ xfs_bmap_forkoff_reset(ip, whichfork);
+ ifp->if_flags &= ~XFS_IFINLINE;
+ ifp->if_flags |= XFS_IFEXTENTS;
+ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+}
+
+
+STATIC int /* error */
+xfs_bmap_local_to_extents(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fsblock_t *firstblock, /* first block allocated in xaction */
+ xfs_extlen_t total, /* total blocks needed by transaction */
+ int *logflagsp, /* inode logging flags */
+ int whichfork,
+ void (*init_fn)(struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp))
+{
+ int error = 0;
+ int flags; /* logging flags returned */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_alloc_arg_t args; /* allocation arguments */
+ xfs_buf_t *bp; /* buffer for extent block */
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
+
+ /*
+ * We don't want to deal with the case of keeping inode data inline yet.
+ * So sending the data fork of a regular inode is invalid.
+ */
+ ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+
+ if (!ifp->if_bytes) {
+ xfs_bmap_local_to_extents_empty(ip, whichfork);
+ flags = XFS_ILOG_CORE;
+ goto done;
+ }
+
+ flags = 0;
+ error = 0;
+ ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) ==
+ XFS_IFINLINE);
+ memset(&args, 0, sizeof(args));
+ args.tp = tp;
+ args.mp = ip->i_mount;
+ args.firstblock = *firstblock;
+ /*
+ * Allocate a block. We know we need only one, since the
+ * file currently fits in an inode.
+ */
+ if (*firstblock == NULLFSBLOCK) {
+ args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ } else {
+ args.fsbno = *firstblock;
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ }
+ args.total = total;
+ args.minlen = args.maxlen = args.prod = 1;
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto done;
+
+ /* Can't fail, the space was reserved. */
+ ASSERT(args.fsbno != NULLFSBLOCK);
+ ASSERT(args.len == 1);
+ *firstblock = args.fsbno;
+ bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+
+ /* initialise the block and copy the data */
+ init_fn(tp, bp, ip, ifp);
+
+ /* account for the change in fork size and log everything */
+ xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+ xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
+ xfs_bmap_local_to_extents_empty(ip, whichfork);
+ flags |= XFS_ILOG_CORE;
+
+ xfs_iext_add(ifp, 0, 1);
+ ep = xfs_iext_get_ext(ifp, 0);
+ xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
+ trace_xfs_bmap_post_update(ip, 0,
+ whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
+ _THIS_IP_);
+ XFS_IFORK_NEXT_SET(ip, whichfork, 1);
+ ip->i_d.di_nblocks = 1;
+ xfs_trans_mod_dquot_byino(tp, ip,
+ XFS_TRANS_DQ_BCOUNT, 1L);
+ flags |= xfs_ilog_fext(whichfork);
+
+done:
+ *logflagsp = flags;
+ return error;
+}
+
+/*
* Called from xfs_bmap_add_attrfork to handle btree format files.
*/
STATIC int /* error */
@@ -357,7 +1073,15 @@ xfs_bmap_add_attrfork_extents(
}
/*
- * Called from xfs_bmap_add_attrfork to handle local format files.
+ * Called from xfs_bmap_add_attrfork to handle local format files. Each
+ * different data fork content type needs a different callout to do the
+ * conversion. Some are basic and only require special block initialisation
+ * callouts for the data formating, others (directories) are so specialised they
+ * handle everything themselves.
+ *
+ * XXX (dgc): investigate whether directory conversion can use the generic
+ * formatting callout. It should be possible - it's just a very complex
+ * formatter.
*/
STATIC int /* error */
xfs_bmap_add_attrfork_local(
@@ -368,27 +1092,651 @@ xfs_bmap_add_attrfork_local(
int *flags) /* inode logging flags */
{
xfs_da_args_t dargs; /* args for dir/attr code */
- int error; /* error return value */
- xfs_mount_t *mp; /* mount structure pointer */
if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
return 0;
+
if (S_ISDIR(ip->i_d.di_mode)) {
- mp = ip->i_mount;
memset(&dargs, 0, sizeof(dargs));
+ dargs.geo = ip->i_mount->m_dir_geo;
dargs.dp = ip;
dargs.firstblock = firstblock;
dargs.flist = flist;
- dargs.total = mp->m_dirblkfsbs;
+ dargs.total = dargs.geo->fsbcount;
dargs.whichfork = XFS_DATA_FORK;
dargs.trans = tp;
- error = xfs_dir2_sf_to_block(&dargs);
- } else
- error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags,
- XFS_DATA_FORK);
+ return xfs_dir2_sf_to_block(&dargs);
+ }
+
+ if (S_ISLNK(ip->i_d.di_mode))
+ return xfs_bmap_local_to_extents(tp, ip, firstblock, 1,
+ flags, XFS_DATA_FORK,
+ xfs_symlink_local_to_remote);
+
+ /* should only be called for types that support local format data */
+ ASSERT(0);
+ return EFSCORRUPTED;
+}
+
+/*
+ * Convert inode from non-attributed to attributed.
+ * Must not be in a transaction, ip must not be locked.
+ */
+int /* error code */
+xfs_bmap_add_attrfork(
+ xfs_inode_t *ip, /* incore inode pointer */
+ int size, /* space new attribute needs */
+ int rsvd) /* xact may use reserved blks */
+{
+ xfs_fsblock_t firstblock; /* 1st block/ag allocated */
+ xfs_bmap_free_t flist; /* freed extent records */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_trans_t *tp; /* transaction pointer */
+ int blks; /* space reservation */
+ int version = 1; /* superblock attr version */
+ int committed; /* xaction was committed */
+ int logflags; /* logging flags */
+ int error; /* error return value */
+ int cancel_flags = 0;
+
+ ASSERT(XFS_IFORK_Q(ip) == 0);
+
+ mp = ip->i_mount;
+ ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+ tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
+ blks = XFS_ADDAFORK_SPACE_RES(mp);
+ if (rsvd)
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+ cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
+ XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+ XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ goto trans_cancel;
+ cancel_flags |= XFS_TRANS_ABORT;
+ if (XFS_IFORK_Q(ip))
+ goto trans_cancel;
+ if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
+ /*
+ * For inodes coming from pre-6.2 filesystems.
+ */
+ ASSERT(ip->i_d.di_aformat == 0);
+ ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+ }
+ ASSERT(ip->i_d.di_anextents == 0);
+
+ xfs_trans_ijoin(tp, ip, 0);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ switch (ip->i_d.di_format) {
+ case XFS_DINODE_FMT_DEV:
+ ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+ break;
+ case XFS_DINODE_FMT_UUID:
+ ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+ if (!ip->i_d.di_forkoff)
+ ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
+ else if (mp->m_flags & XFS_MOUNT_ATTR2)
+ version = 2;
+ break;
+ default:
+ ASSERT(0);
+ error = XFS_ERROR(EINVAL);
+ goto trans_cancel;
+ }
+
+ ASSERT(ip->i_afp == NULL);
+ ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+ ip->i_afp->if_flags = XFS_IFEXTENTS;
+ logflags = 0;
+ xfs_bmap_init(&flist, &firstblock);
+ switch (ip->i_d.di_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
+ &logflags);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
+ &flist, &logflags);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
+ &logflags);
+ break;
+ default:
+ error = 0;
+ break;
+ }
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ if (error)
+ goto bmap_cancel;
+ if (!xfs_sb_version_hasattr(&mp->m_sb) ||
+ (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
+ __int64_t sbfields = 0;
+
+ spin_lock(&mp->m_sb_lock);
+ if (!xfs_sb_version_hasattr(&mp->m_sb)) {
+ xfs_sb_version_addattr(&mp->m_sb);
+ sbfields |= XFS_SB_VERSIONNUM;
+ }
+ if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
+ xfs_sb_version_addattr2(&mp->m_sb);
+ sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+ }
+ if (sbfields) {
+ spin_unlock(&mp->m_sb_lock);
+ xfs_mod_sb(tp, sbfields);
+ } else
+ spin_unlock(&mp->m_sb_lock);
+ }
+
+ error = xfs_bmap_finish(&tp, &flist, &committed);
+ if (error)
+ goto bmap_cancel;
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
+
+bmap_cancel:
+ xfs_bmap_cancel(&flist);
+trans_cancel:
+ xfs_trans_cancel(tp, cancel_flags);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Internal and external extent tree search functions.
+ */
+
+/*
+ * Read in the extents to if_extents.
+ * All inode fields are set up by caller, we just traverse the btree
+ * and copy the records in. If the file system cannot contain unwritten
+ * extents, the records are checked for no "state" flags.
+ */
+int /* error */
+xfs_bmap_read_extents(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_btree_block *block; /* current btree block */
+ xfs_fsblock_t bno; /* block # of "block" */
+ xfs_buf_t *bp; /* buffer for "block" */
+ int error; /* error return value */
+ xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */
+ xfs_extnum_t i, j; /* index into the extents list */
+ xfs_ifork_t *ifp; /* fork structure */
+ int level; /* btree level, for checking */
+ xfs_mount_t *mp; /* file system mount structure */
+ __be64 *pp; /* pointer to block address */
+ /* REFERENCED */
+ xfs_extnum_t room; /* number of entries there's room for */
+
+ bno = NULLFSBLOCK;
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
+ XFS_EXTFMT_INODE(ip);
+ block = ifp->if_broot;
+ /*
+ * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+ */
+ level = be16_to_cpu(block->bb_level);
+ ASSERT(level > 0);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+ bno = be64_to_cpu(*pp);
+ ASSERT(bno != NULLDFSBNO);
+ ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+ /*
+ * Go down the tree until leaf level is reached, following the first
+ * pointer (leftmost) at each level.
+ */
+ while (level-- > 0) {
+ error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+ if (error)
+ return error;
+ block = XFS_BUF_TO_BLOCK(bp);
+ XFS_WANT_CORRUPTED_GOTO(
+ xfs_bmap_sanity_check(mp, bp, level),
+ error0);
+ if (level == 0)
+ break;
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+ bno = be64_to_cpu(*pp);
+ XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+ xfs_trans_brelse(tp, bp);
+ }
+ /*
+ * Here with bp and block set to the leftmost leaf node in the tree.
+ */
+ room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ i = 0;
+ /*
+ * Loop over all leaf nodes. Copy information to the extent records.
+ */
+ for (;;) {
+ xfs_bmbt_rec_t *frp;
+ xfs_fsblock_t nextbno;
+ xfs_extnum_t num_recs;
+ xfs_extnum_t start;
+
+ num_recs = xfs_btree_get_numrecs(block);
+ if (unlikely(i + num_recs > room)) {
+ ASSERT(i + num_recs <= room);
+ xfs_warn(ip->i_mount,
+ "corrupt dinode %Lu, (btree extents).",
+ (unsigned long long) ip->i_ino);
+ XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
+ XFS_ERRLEVEL_LOW, ip->i_mount, block);
+ goto error0;
+ }
+ XFS_WANT_CORRUPTED_GOTO(
+ xfs_bmap_sanity_check(mp, bp, 0),
+ error0);
+ /*
+ * Read-ahead the next leaf block, if any.
+ */
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+ if (nextbno != NULLFSBLOCK)
+ xfs_btree_reada_bufl(mp, nextbno, 1,
+ &xfs_bmbt_buf_ops);
+ /*
+ * Copy records into the extent records.
+ */
+ frp = XFS_BMBT_REC_ADDR(mp, block, 1);
+ start = i;
+ for (j = 0; j < num_recs; j++, i++, frp++) {
+ xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
+ trp->l0 = be64_to_cpu(frp->l0);
+ trp->l1 = be64_to_cpu(frp->l1);
+ }
+ if (exntf == XFS_EXTFMT_NOSTATE) {
+ /*
+ * Check all attribute bmap btree records and
+ * any "older" data bmap btree records for a
+ * set bit in the "extent flag" position.
+ */
+ if (unlikely(xfs_check_nostate_extents(ifp,
+ start, num_recs))) {
+ XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
+ XFS_ERRLEVEL_LOW,
+ ip->i_mount);
+ goto error0;
+ }
+ }
+ xfs_trans_brelse(tp, bp);
+ bno = nextbno;
+ /*
+ * If we've reached the end, stop.
+ */
+ if (bno == NULLFSBLOCK)
+ break;
+ error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+ if (error)
+ return error;
+ block = XFS_BUF_TO_BLOCK(bp);
+ }
+ ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+ ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
+ XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
+ return 0;
+error0:
+ xfs_trans_brelse(tp, bp);
+ return XFS_ERROR(EFSCORRUPTED);
}
+
+/*
+ * Search the extent records for the entry containing block bno.
+ * If bno lies in a hole, point to the next entry. If bno lies
+ * past eof, *eofp will be set, and *prevp will contain the last
+ * entry (null if none). Else, *lastxp will be set to the index
+ * of the found entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
+xfs_bmap_search_multi_extents(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_fileoff_t bno, /* block number searched for */
+ int *eofp, /* out: end of file found */
+ xfs_extnum_t *lastxp, /* out: last extent index */
+ xfs_bmbt_irec_t *gotp, /* out: extent entry found */
+ xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
+{
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
+ xfs_extnum_t lastx; /* last extent index */
+
+ /*
+ * Initialize the extent entry structure to catch access to
+ * uninitialized br_startblock field.
+ */
+ gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
+ gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
+ gotp->br_state = XFS_EXT_INVALID;
+#if XFS_BIG_BLKNOS
+ gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
+#else
+ gotp->br_startblock = 0xffffa5a5;
+#endif
+ prevp->br_startoff = NULLFILEOFF;
+
+ ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
+ if (lastx > 0) {
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp);
+ }
+ if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
+ xfs_bmbt_get_all(ep, gotp);
+ *eofp = 0;
+ } else {
+ if (lastx > 0) {
+ *gotp = *prevp;
+ }
+ *eofp = 1;
+ ep = NULL;
+ }
+ *lastxp = lastx;
+ return ep;
+}
+
+/*
+ * Search the extents list for the inode, for the extent containing bno.
+ * If bno lies in a hole, point to the next entry. If bno lies past eof,
+ * *eofp will be set, and *prevp will contain the last entry (null if none).
+ * Else, *lastxp will be set to the index of the found
+ * entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
+xfs_bmap_search_extents(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fileoff_t bno, /* block number searched for */
+ int fork, /* data or attr fork */
+ int *eofp, /* out: end of file found */
+ xfs_extnum_t *lastxp, /* out: last extent index */
+ xfs_bmbt_irec_t *gotp, /* out: extent entry found */
+ xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
+{
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
+
+ XFS_STATS_INC(xs_look_exlist);
+ ifp = XFS_IFORK_PTR(ip, fork);
+
+ ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
+
+ if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
+ !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
+ xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+ "Access to block zero in inode %llu "
+ "start_block: %llx start_off: %llx "
+ "blkcnt: %llx extent-state: %x lastx: %x",
+ (unsigned long long)ip->i_ino,
+ (unsigned long long)gotp->br_startblock,
+ (unsigned long long)gotp->br_startoff,
+ (unsigned long long)gotp->br_blockcount,
+ gotp->br_state, *lastxp);
+ *lastxp = NULLEXTNUM;
+ *eofp = 1;
+ return NULL;
+ }
+ return ep;
+}
+
+/*
+ * Returns the file-relative block number of the first unused block(s)
+ * in the file with at least "len" logically contiguous blocks free.
+ * This is the lowest-address hole if the file has holes, else the first block
+ * past the end of file.
+ * Return 0 if the file is currently local (in-inode).
+ */
+int /* error */
+xfs_bmap_first_unused(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode */
+ xfs_extlen_t len, /* size of hole to find */
+ xfs_fileoff_t *first_unused, /* unused block */
+ int whichfork) /* data or attr fork */
+{
+ int error; /* error return value */
+ int idx; /* extent record index */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_fileoff_t lastaddr; /* last block number seen */
+ xfs_fileoff_t lowest; /* lowest useful block */
+ xfs_fileoff_t max; /* starting useful block */
+ xfs_fileoff_t off; /* offset for this block */
+ xfs_extnum_t nextents; /* number of extent entries */
+
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
+ XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
+ XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+ *first_unused = 0;
+ return 0;
+ }
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+ (error = xfs_iread_extents(tp, ip, whichfork)))
+ return error;
+ lowest = *first_unused;
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
+ xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
+ off = xfs_bmbt_get_startoff(ep);
+ /*
+ * See if the hole before this extent will work.
+ */
+ if (off >= lowest + len && off - max >= len) {
+ *first_unused = max;
+ return 0;
+ }
+ lastaddr = off + xfs_bmbt_get_blockcount(ep);
+ max = XFS_FILEOFF_MAX(lastaddr, lowest);
+ }
+ *first_unused = max;
+ return 0;
+}
+
+/*
+ * Returns the file-relative block number of the last block - 1 before
+ * last_block (input value) in the file.
+ * This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ */
+int /* error */
+xfs_bmap_last_before(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode */
+ xfs_fileoff_t *last_block, /* last block */
+ int whichfork) /* data or attr fork */
+{
+ xfs_fileoff_t bno; /* input file offset */
+ int eof; /* hit end of file */
+ xfs_bmbt_rec_host_t *ep; /* pointer to last extent */
+ int error; /* error return value */
+ xfs_bmbt_irec_t got; /* current extent value */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_extnum_t lastx; /* last extent used */
+ xfs_bmbt_irec_t prev; /* previous extent value */
+
+ if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
+ return XFS_ERROR(EIO);
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+ *last_block = 0;
+ return 0;
+ }
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+ (error = xfs_iread_extents(tp, ip, whichfork)))
+ return error;
+ bno = *last_block - 1;
+ ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+ &prev);
+ if (eof || xfs_bmbt_get_startoff(ep) > bno) {
+ if (prev.br_startoff == NULLFILEOFF)
+ *last_block = 0;
+ else
+ *last_block = prev.br_startoff + prev.br_blockcount;
+ }
+ /*
+ * Otherwise *last_block is already the right answer.
+ */
+ return 0;
+}
+
+int
+xfs_bmap_last_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *rec,
+ int *is_empty)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ int error;
+ int nextents;
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+ }
+
+ nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ if (nextents == 0) {
+ *is_empty = 1;
+ return 0;
+ }
+
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
+ *is_empty = 0;
+ return 0;
+}
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file. When we allocate new data
+ * blocks at the end of the file which do not start at the previous data block,
+ * we will try to align the new blocks at stripe unit boundaries.
+ *
+ * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be
+ * at, or past the EOF.
+ */
+STATIC int
+xfs_bmap_isaeof(
+ struct xfs_bmalloca *bma,
+ int whichfork)
+{
+ struct xfs_bmbt_irec rec;
+ int is_empty;
+ int error;
+
+ bma->aeof = 0;
+ error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
+ &is_empty);
+ if (error)
+ return error;
+
+ if (is_empty) {
+ bma->aeof = 1;
+ return 0;
+ }
+
+ /*
+ * Check if we are allocation or past the last extent, or at least into
+ * the last delayed allocated extent.
+ */
+ bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
+ (bma->offset >= rec.br_startoff &&
+ isnullstartblock(rec.br_startblock));
+ return 0;
+}
+
+/*
+ * Returns the file-relative block number of the first block past eof in
+ * the file. This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ */
+int
+xfs_bmap_last_offset(
+ struct xfs_inode *ip,
+ xfs_fileoff_t *last_block,
+ int whichfork)
+{
+ struct xfs_bmbt_irec rec;
+ int is_empty;
+ int error;
+
+ *last_block = 0;
+
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
+ return 0;
+
+ if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ return XFS_ERROR(EIO);
+
+ error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
+ if (error || is_empty)
+ return error;
+
+ *last_block = rec.br_startoff + rec.br_blockcount;
+ return 0;
+}
+
+/*
+ * Returns whether the selected fork of the inode has exactly one
+ * block or not. For the data fork we check this matches di_size,
+ * implying the file's range is 0..bsize-1.
+ */
+int /* 1=>1 block, 0=>otherwise */
+xfs_bmap_one_block(
+ xfs_inode_t *ip, /* incore inode */
+ int whichfork) /* data or attr fork */
+{
+ xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ int rval; /* return value */
+ xfs_bmbt_irec_t s; /* internal version of extent */
+
+#ifndef DEBUG
+ if (whichfork == XFS_DATA_FORK)
+ return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
+#endif /* !DEBUG */
+ if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
+ return 0;
+ if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ return 0;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+ ep = xfs_iext_get_ext(ifp, 0);
+ xfs_bmbt_get_all(ep, &s);
+ rval = s.br_startoff == 0 && s.br_blockcount == 1;
+ if (rval && whichfork == XFS_DATA_FORK)
+ ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
+ return rval;
+}
+
+/*
+ * Extent tree manipulation functions used during allocation.
+ */
+
/*
* Convert a delayed allocation to a real allocation.
*/
@@ -1852,9 +3200,13 @@ done:
}
/*
+ * Functions used in the extent read, allocate and remove paths
+ */
+
+/*
* Adjust the size of the new extent based on di_extsize and rt extsize.
*/
-STATIC int
+int
xfs_bmap_extsize_align(
xfs_mount_t *mp,
xfs_bmbt_irec_t *gotp, /* next extent pointer */
@@ -2016,9 +3368,9 @@ xfs_bmap_extsize_align(
#define XFS_ALLOC_GAP_UNITS 4
-STATIC void
+void
xfs_bmap_adjacent(
- xfs_bmalloca_t *ap) /* bmap alloc argument struct */
+ struct xfs_bmalloca *ap) /* bmap alloc argument struct */
{
xfs_fsblock_t adjust; /* adjustment to block numbers */
xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
@@ -2164,107 +3516,65 @@ xfs_bmap_adjacent(
#undef ISVALID
}
-STATIC int
-xfs_bmap_rtalloc(
- xfs_bmalloca_t *ap) /* bmap alloc argument struct */
+static int
+xfs_bmap_longest_free_extent(
+ struct xfs_trans *tp,
+ xfs_agnumber_t ag,
+ xfs_extlen_t *blen,
+ int *notinit)
{
- xfs_alloctype_t atype = 0; /* type for allocation routines */
- int error; /* error return value */
- xfs_mount_t *mp; /* mount point structure */
- xfs_extlen_t prod = 0; /* product factor for allocators */
- xfs_extlen_t ralen = 0; /* realtime allocation length */
- xfs_extlen_t align; /* minimum allocation alignment */
- xfs_rtblock_t rtb;
-
- mp = ap->ip->i_mount;
- align = xfs_get_extsz_hint(ap->ip);
- prod = align / mp->m_sb.sb_rextsize;
- error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
- align, 1, ap->eof, 0,
- ap->conv, &ap->offset, &ap->length);
- if (error)
- return error;
- ASSERT(ap->length);
- ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
-
- /*
- * If the offset & length are not perfectly aligned
- * then kill prod, it will just get us in trouble.
- */
- if (do_mod(ap->offset, align) || ap->length % align)
- prod = 1;
- /*
- * Set ralen to be the actual requested length in rtextents.
- */
- ralen = ap->length / mp->m_sb.sb_rextsize;
- /*
- * If the old value was close enough to MAXEXTLEN that
- * we rounded up to it, cut it back so it's valid again.
- * Note that if it's a really large request (bigger than
- * MAXEXTLEN), we don't hear about that number, and can't
- * adjust the starting point to match it.
- */
- if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
- ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
-
- /*
- * Lock out other modifications to the RT bitmap inode.
- */
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
-
- /*
- * If it's an allocation to an empty file at offset 0,
- * pick an extent that will space things out in the rt area.
- */
- if (ap->eof && ap->offset == 0) {
- xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ xfs_extlen_t longest;
+ int error = 0;
- error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
+ pag = xfs_perag_get(mp, ag);
+ if (!pag->pagf_init) {
+ error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK);
if (error)
- return error;
- ap->blkno = rtx * mp->m_sb.sb_rextsize;
- } else {
- ap->blkno = 0;
+ goto out;
+
+ if (!pag->pagf_init) {
+ *notinit = 1;
+ goto out;
+ }
}
- xfs_bmap_adjacent(ap);
+ longest = xfs_alloc_longest_free_extent(mp, pag);
+ if (*blen < longest)
+ *blen = longest;
- /*
- * Realtime allocation, done through xfs_rtallocate_extent.
- */
- atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
- do_div(ap->blkno, mp->m_sb.sb_rextsize);
- rtb = ap->blkno;
- ap->length = ralen;
- if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
- &ralen, atype, ap->wasdel, prod, &rtb)))
- return error;
- if (rtb == NULLFSBLOCK && prod > 1 &&
- (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
- ap->length, &ralen, atype,
- ap->wasdel, 1, &rtb)))
- return error;
- ap->blkno = rtb;
- if (ap->blkno != NULLFSBLOCK) {
- ap->blkno *= mp->m_sb.sb_rextsize;
- ralen *= mp->m_sb.sb_rextsize;
- ap->length = ralen;
- ap->ip->i_d.di_nblocks += ralen;
- xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
- if (ap->wasdel)
- ap->ip->i_delayed_blks -= ralen;
+out:
+ xfs_perag_put(pag);
+ return error;
+}
+
+static void
+xfs_bmap_select_minlen(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t *blen,
+ int notinit)
+{
+ if (notinit || *blen < ap->minlen) {
/*
- * Adjust the disk quota also. This was reserved
- * earlier.
+ * Since we did a BUF_TRYLOCK above, it is possible that
+ * there is space for this request.
*/
- xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
- ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
- XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
+ args->minlen = ap->minlen;
+ } else if (*blen < args->maxlen) {
+ /*
+ * If the best seen length is less than the request length,
+ * use the best as the minimum.
+ */
+ args->minlen = *blen;
} else {
- ap->length = 0;
+ /*
+ * Otherwise we've seen an extent as big as maxlen, use that
+ * as the minimum.
+ */
+ args->minlen = args->maxlen;
}
- return 0;
}
STATIC int
@@ -2274,117 +3584,80 @@ xfs_bmap_btalloc_nullfb(
xfs_extlen_t *blen)
{
struct xfs_mount *mp = ap->ip->i_mount;
- struct xfs_perag *pag;
xfs_agnumber_t ag, startag;
int notinit = 0;
int error;
- if (ap->userdata && xfs_inode_is_filestream(ap->ip))
- args->type = XFS_ALLOCTYPE_NEAR_BNO;
- else
- args->type = XFS_ALLOCTYPE_START_BNO;
+ args->type = XFS_ALLOCTYPE_START_BNO;
args->total = ap->total;
- /*
- * Search for an allocation group with a single extent large enough
- * for the request. If one isn't found, then adjust the minimum
- * allocation size to the largest space found.
- */
startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
if (startag == NULLAGNUMBER)
startag = ag = 0;
- pag = xfs_perag_get(mp, ag);
while (*blen < args->maxlen) {
- if (!pag->pagf_init) {
- error = xfs_alloc_pagf_init(mp, args->tp, ag,
- XFS_ALLOC_FLAG_TRYLOCK);
- if (error) {
- xfs_perag_put(pag);
- return error;
- }
- }
-
- /*
- * See xfs_alloc_fix_freelist...
- */
- if (pag->pagf_init) {
- xfs_extlen_t longest;
- longest = xfs_alloc_longest_free_extent(mp, pag);
- if (*blen < longest)
- *blen = longest;
- } else
- notinit = 1;
-
- if (xfs_inode_is_filestream(ap->ip)) {
- if (*blen >= args->maxlen)
- break;
-
- if (ap->userdata) {
- /*
- * If startag is an invalid AG, we've
- * come here once before and
- * xfs_filestream_new_ag picked the
- * best currently available.
- *
- * Don't continue looping, since we
- * could loop forever.
- */
- if (startag == NULLAGNUMBER)
- break;
-
- error = xfs_filestream_new_ag(ap, &ag);
- xfs_perag_put(pag);
- if (error)
- return error;
+ error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
+ &notinit);
+ if (error)
+ return error;
- /* loop again to set 'blen'*/
- startag = NULLAGNUMBER;
- pag = xfs_perag_get(mp, ag);
- continue;
- }
- }
if (++ag == mp->m_sb.sb_agcount)
ag = 0;
if (ag == startag)
break;
- xfs_perag_put(pag);
- pag = xfs_perag_get(mp, ag);
}
- xfs_perag_put(pag);
- /*
- * Since the above loop did a BUF_TRYLOCK, it is
- * possible that there is space for this request.
- */
- if (notinit || *blen < ap->minlen)
- args->minlen = ap->minlen;
- /*
- * If the best seen length is less than the request
- * length, use the best as the minimum.
- */
- else if (*blen < args->maxlen)
- args->minlen = *blen;
- /*
- * Otherwise we've seen an extent as big as maxlen,
- * use that as the minimum.
- */
- else
- args->minlen = args->maxlen;
+ xfs_bmap_select_minlen(ap, args, blen, notinit);
+ return 0;
+}
+
+STATIC int
+xfs_bmap_btalloc_filestreams(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t *blen)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ xfs_agnumber_t ag;
+ int notinit = 0;
+ int error;
+
+ args->type = XFS_ALLOCTYPE_NEAR_BNO;
+ args->total = ap->total;
+
+ ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ if (ag == NULLAGNUMBER)
+ ag = 0;
+
+ error = xfs_bmap_longest_free_extent(args->tp, ag, blen, &notinit);
+ if (error)
+ return error;
+
+ if (*blen < args->maxlen) {
+ error = xfs_filestream_new_ag(ap, &ag);
+ if (error)
+ return error;
+
+ error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
+ &notinit);
+ if (error)
+ return error;
+
+ }
+
+ xfs_bmap_select_minlen(ap, args, blen, notinit);
/*
- * set the failure fallback case to look in the selected
- * AG as the stream may have moved.
+ * Set the failure fallback case to look in the selected AG as stream
+ * may have moved.
*/
- if (xfs_inode_is_filestream(ap->ip))
- ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
-
+ ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
return 0;
}
STATIC int
xfs_bmap_btalloc(
- xfs_bmalloca_t *ap) /* bmap alloc argument struct */
+ struct xfs_bmalloca *ap) /* bmap alloc argument struct */
{
xfs_mount_t *mp; /* mount point structure */
xfs_alloctype_t atype = 0; /* type for allocation routines */
@@ -2398,10 +3671,19 @@ xfs_bmap_btalloc(
int isaligned;
int tryagain;
int error;
+ int stripe_align;
ASSERT(ap->length);
mp = ap->ip->i_mount;
+
+ /* stripe alignment for allocation is determined by mount parameters */
+ stripe_align = 0;
+ if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+ stripe_align = mp->m_swidth;
+ else if (mp->m_dalign)
+ stripe_align = mp->m_dalign;
+
align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
if (unlikely(align)) {
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
@@ -2410,6 +3692,8 @@ xfs_bmap_btalloc(
ASSERT(!error);
ASSERT(ap->length);
}
+
+
nullfb = *ap->firstblock == NULLFSBLOCK;
fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
if (nullfb) {
@@ -2447,7 +3731,15 @@ xfs_bmap_btalloc(
args.firstblock = *ap->firstblock;
blen = 0;
if (nullfb) {
- error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
+ /*
+ * Search for an allocation group with a single extent large
+ * enough for the request. If one isn't found, then adjust
+ * the minimum allocation size to the largest space found.
+ */
+ if (ap->userdata && xfs_inode_is_filestream(ap->ip))
+ error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
+ else
+ error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
if (error)
return error;
} else if (ap->flist->xbf_low) {
@@ -2485,7 +3777,7 @@ xfs_bmap_btalloc(
*/
if (!ap->flist->xbf_low && ap->aeof) {
if (!ap->offset) {
- args.alignment = mp->m_dalign;
+ args.alignment = stripe_align;
atype = args.type;
isaligned = 1;
/*
@@ -2510,13 +3802,13 @@ xfs_bmap_btalloc(
* of minlen+alignment+slop doesn't go up
* between the calls.
*/
- if (blen > mp->m_dalign && blen <= args.maxlen)
- nextminlen = blen - mp->m_dalign;
+ if (blen > stripe_align && blen <= args.maxlen)
+ nextminlen = blen - stripe_align;
else
nextminlen = args.minlen;
- if (nextminlen + mp->m_dalign > args.minlen + 1)
+ if (nextminlen + stripe_align > args.minlen + 1)
args.minalignslop =
- nextminlen + mp->m_dalign -
+ nextminlen + stripe_align -
args.minlen - 1;
else
args.minalignslop = 0;
@@ -2538,7 +3830,7 @@ xfs_bmap_btalloc(
*/
args.type = atype;
args.fsbno = ap->blkno;
- args.alignment = mp->m_dalign;
+ args.alignment = stripe_align;
args.minlen = nextminlen;
args.minalignslop = 0;
isaligned = 1;
@@ -2616,7 +3908,7 @@ xfs_bmap_btalloc(
*/
STATIC int
xfs_bmap_alloc(
- xfs_bmalloca_t *ap) /* bmap alloc argument struct */
+ struct xfs_bmalloca *ap) /* bmap alloc argument struct */
{
if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata)
return xfs_bmap_rtalloc(ap);
@@ -2624,1626 +3916,6 @@ xfs_bmap_alloc(
}
/*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the file extents are already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
- */
-STATIC int /* error */
-xfs_bmap_btree_to_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_btree_cur_t *cur, /* btree cursor */
- int *logflagsp, /* inode logging flags */
- int whichfork) /* data or attr fork */
-{
- /* REFERENCED */
- struct xfs_btree_block *cblock;/* child btree block */
- xfs_fsblock_t cbno; /* child block number */
- xfs_buf_t *cbp; /* child block's buffer */
- int error; /* error return value */
- xfs_ifork_t *ifp; /* inode fork data */
- xfs_mount_t *mp; /* mount point structure */
- __be64 *pp; /* ptr to block address */
- struct xfs_btree_block *rblock;/* root btree block */
-
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
- rblock = ifp->if_broot;
- ASSERT(be16_to_cpu(rblock->bb_level) == 1);
- ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
- ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
- pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
- cbno = be64_to_cpu(*pp);
- *logflagsp = 0;
-#ifdef DEBUG
- if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
- return error;
-#endif
- error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
- if (error)
- return error;
- cblock = XFS_BUF_TO_BLOCK(cbp);
- if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
- return error;
- xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
- ip->i_d.di_nblocks--;
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
- xfs_trans_binval(tp, cbp);
- if (cur->bc_bufs[0] == cbp)
- cur->bc_bufs[0] = NULL;
- xfs_iroot_realloc(ip, -1, whichfork);
- ASSERT(ifp->if_broot == NULL);
- ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
- *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
- return 0;
-}
-
-/*
- * Called by xfs_bmapi to update file extent records and the btree
- * after removing space (or undoing a delayed allocation).
- */
-STATIC int /* error */
-xfs_bmap_del_extent(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_trans_t *tp, /* current transaction pointer */
- xfs_extnum_t *idx, /* extent number to update/delete */
- xfs_bmap_free_t *flist, /* list of extents to be freed */
- xfs_btree_cur_t *cur, /* if null, not a btree */
- xfs_bmbt_irec_t *del, /* data to remove from extents */
- int *logflagsp, /* inode logging flags */
- int whichfork) /* data or attr fork */
-{
- xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
- xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
- xfs_fsblock_t del_endblock=0; /* first block past del */
- xfs_fileoff_t del_endoff; /* first offset past del */
- int delay; /* current block is delayed allocated */
- int do_fx; /* free extent at end of routine */
- xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */
- int error; /* error return value */
- int flags; /* inode logging flags */
- xfs_bmbt_irec_t got; /* current extent entry */
- xfs_fileoff_t got_endoff; /* first offset past got */
- int i; /* temp state */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_mount_t *mp; /* mount structure */
- xfs_filblks_t nblks; /* quota/sb block count */
- xfs_bmbt_irec_t new; /* new record to be inserted */
- /* REFERENCED */
- uint qfield; /* quota field to update */
- xfs_filblks_t temp; /* for indirect length calculations */
- xfs_filblks_t temp2; /* for indirect length calculations */
- int state = 0;
-
- XFS_STATS_INC(xs_del_exlist);
-
- if (whichfork == XFS_ATTR_FORK)
- state |= BMAP_ATTRFORK;
-
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
- (uint)sizeof(xfs_bmbt_rec_t)));
- ASSERT(del->br_blockcount > 0);
- ep = xfs_iext_get_ext(ifp, *idx);
- xfs_bmbt_get_all(ep, &got);
- ASSERT(got.br_startoff <= del->br_startoff);
- del_endoff = del->br_startoff + del->br_blockcount;
- got_endoff = got.br_startoff + got.br_blockcount;
- ASSERT(got_endoff >= del_endoff);
- delay = isnullstartblock(got.br_startblock);
- ASSERT(isnullstartblock(del->br_startblock) == delay);
- flags = 0;
- qfield = 0;
- error = 0;
- /*
- * If deleting a real allocation, must free up the disk space.
- */
- if (!delay) {
- flags = XFS_ILOG_CORE;
- /*
- * Realtime allocation. Free it and record di_nblocks update.
- */
- if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
- xfs_fsblock_t bno;
- xfs_filblks_t len;
-
- ASSERT(do_mod(del->br_blockcount,
- mp->m_sb.sb_rextsize) == 0);
- ASSERT(do_mod(del->br_startblock,
- mp->m_sb.sb_rextsize) == 0);
- bno = del->br_startblock;
- len = del->br_blockcount;
- do_div(bno, mp->m_sb.sb_rextsize);
- do_div(len, mp->m_sb.sb_rextsize);
- error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
- if (error)
- goto done;
- do_fx = 0;
- nblks = len * mp->m_sb.sb_rextsize;
- qfield = XFS_TRANS_DQ_RTBCOUNT;
- }
- /*
- * Ordinary allocation.
- */
- else {
- do_fx = 1;
- nblks = del->br_blockcount;
- qfield = XFS_TRANS_DQ_BCOUNT;
- }
- /*
- * Set up del_endblock and cur for later.
- */
- del_endblock = del->br_startblock + del->br_blockcount;
- if (cur) {
- if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
- got.br_startblock, got.br_blockcount,
- &i)))
- goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- }
- da_old = da_new = 0;
- } else {
- da_old = startblockval(got.br_startblock);
- da_new = 0;
- nblks = 0;
- do_fx = 0;
- }
- /*
- * Set flag value to use in switch statement.
- * Left-contig is 2, right-contig is 1.
- */
- switch (((got.br_startoff == del->br_startoff) << 1) |
- (got_endoff == del_endoff)) {
- case 3:
- /*
- * Matches the whole extent. Delete the entry.
- */
- xfs_iext_remove(ip, *idx, 1,
- whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
- --*idx;
- if (delay)
- break;
-
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
- flags |= XFS_ILOG_CORE;
- if (!cur) {
- flags |= xfs_ilog_fext(whichfork);
- break;
- }
- if ((error = xfs_btree_delete(cur, &i)))
- goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- break;
-
- case 2:
- /*
- * Deleting the first part of the extent.
- */
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_startoff(ep, del_endoff);
- temp = got.br_blockcount - del->br_blockcount;
- xfs_bmbt_set_blockcount(ep, temp);
- if (delay) {
- temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- da_old);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- da_new = temp;
- break;
- }
- xfs_bmbt_set_startblock(ep, del_endblock);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- if (!cur) {
- flags |= xfs_ilog_fext(whichfork);
- break;
- }
- if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
- got.br_blockcount - del->br_blockcount,
- got.br_state)))
- goto done;
- break;
-
- case 1:
- /*
- * Deleting the last part of the extent.
- */
- temp = got.br_blockcount - del->br_blockcount;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep, temp);
- if (delay) {
- temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- da_old);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- da_new = temp;
- break;
- }
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- if (!cur) {
- flags |= xfs_ilog_fext(whichfork);
- break;
- }
- if ((error = xfs_bmbt_update(cur, got.br_startoff,
- got.br_startblock,
- got.br_blockcount - del->br_blockcount,
- got.br_state)))
- goto done;
- break;
-
- case 0:
- /*
- * Deleting the middle of the extent.
- */
- temp = del->br_startoff - got.br_startoff;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep, temp);
- new.br_startoff = del_endoff;
- temp2 = got_endoff - del_endoff;
- new.br_blockcount = temp2;
- new.br_state = got.br_state;
- if (!delay) {
- new.br_startblock = del_endblock;
- flags |= XFS_ILOG_CORE;
- if (cur) {
- if ((error = xfs_bmbt_update(cur,
- got.br_startoff,
- got.br_startblock, temp,
- got.br_state)))
- goto done;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto done;
- cur->bc_rec.b = new;
- error = xfs_btree_insert(cur, &i);
- if (error && error != ENOSPC)
- goto done;
- /*
- * If get no-space back from btree insert,
- * it tried a split, and we have a zero
- * block reservation.
- * Fix up our state and return the error.
- */
- if (error == ENOSPC) {
- /*
- * Reset the cursor, don't trust
- * it after any insert operation.
- */
- if ((error = xfs_bmbt_lookup_eq(cur,
- got.br_startoff,
- got.br_startblock,
- temp, &i)))
- goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- /*
- * Update the btree record back
- * to the original value.
- */
- if ((error = xfs_bmbt_update(cur,
- got.br_startoff,
- got.br_startblock,
- got.br_blockcount,
- got.br_state)))
- goto done;
- /*
- * Reset the extent record back
- * to the original value.
- */
- xfs_bmbt_set_blockcount(ep,
- got.br_blockcount);
- flags = 0;
- error = XFS_ERROR(ENOSPC);
- goto done;
- }
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- } else
- flags |= xfs_ilog_fext(whichfork);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
- } else {
- ASSERT(whichfork == XFS_DATA_FORK);
- temp = xfs_bmap_worst_indlen(ip, temp);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- temp2 = xfs_bmap_worst_indlen(ip, temp2);
- new.br_startblock = nullstartblock((int)temp2);
- da_new = temp + temp2;
- while (da_new > da_old) {
- if (temp) {
- temp--;
- da_new--;
- xfs_bmbt_set_startblock(ep,
- nullstartblock((int)temp));
- }
- if (da_new == da_old)
- break;
- if (temp2) {
- temp2--;
- da_new--;
- new.br_startblock =
- nullstartblock((int)temp2);
- }
- }
- }
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_insert(ip, *idx + 1, 1, &new, state);
- ++*idx;
- break;
- }
- /*
- * If we need to, add to list of extents to delete.
- */
- if (do_fx)
- xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
- mp);
- /*
- * Adjust inode # blocks in the file.
- */
- if (nblks)
- ip->i_d.di_nblocks -= nblks;
- /*
- * Adjust quota data.
- */
- if (qfield)
- xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
-
- /*
- * Account for change in delayed indirect blocks.
- * Nothing to do for disk quota accounting here.
- */
- ASSERT(da_old >= da_new);
- if (da_old > da_new) {
- xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
- (int64_t)(da_old - da_new), 0);
- }
-done:
- *logflagsp = flags;
- return error;
-}
-
-/*
- * Remove the entry "free" from the free item list. Prev points to the
- * previous entry, unless "free" is the head of the list.
- */
-STATIC void
-xfs_bmap_del_free(
- xfs_bmap_free_t *flist, /* free item list header */
- xfs_bmap_free_item_t *prev, /* previous item on list, if any */
- xfs_bmap_free_item_t *free) /* list item to be freed */
-{
- if (prev)
- prev->xbfi_next = free->xbfi_next;
- else
- flist->xbf_first = free->xbfi_next;
- flist->xbf_count--;
- kmem_zone_free(xfs_bmap_free_item_zone, free);
-}
-
-/*
- * Convert an extents-format file into a btree-format file.
- * The new file will have a root block (in the inode) and a single child block.
- */
-STATIC int /* error */
-xfs_bmap_extents_to_btree(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fsblock_t *firstblock, /* first-block-allocated */
- xfs_bmap_free_t *flist, /* blocks freed in xaction */
- xfs_btree_cur_t **curp, /* cursor returned to caller */
- int wasdel, /* converting a delayed alloc */
- int *logflagsp, /* inode logging flags */
- int whichfork) /* data or attr fork */
-{
- struct xfs_btree_block *ablock; /* allocated (child) bt block */
- xfs_buf_t *abp; /* buffer for ablock */
- xfs_alloc_arg_t args; /* allocation arguments */
- xfs_bmbt_rec_t *arp; /* child record pointer */
- struct xfs_btree_block *block; /* btree root block */
- xfs_btree_cur_t *cur; /* bmap btree cursor */
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
- int error; /* error return value */
- xfs_extnum_t i, cnt; /* extent record index */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_bmbt_key_t *kp; /* root block key pointer */
- xfs_mount_t *mp; /* mount structure */
- xfs_extnum_t nextents; /* number of file extents */
- xfs_bmbt_ptr_t *pp; /* root block address pointer */
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
-
- /*
- * Make space in the inode incore.
- */
- xfs_iroot_realloc(ip, 1, whichfork);
- ifp->if_flags |= XFS_IFBROOT;
-
- /*
- * Fill in the root.
- */
- block = ifp->if_broot;
- block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
- block->bb_level = cpu_to_be16(1);
- block->bb_numrecs = cpu_to_be16(1);
- block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
- block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
-
- /*
- * Need a cursor. Can't allocate until bb_level is filled in.
- */
- mp = ip->i_mount;
- cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.firstblock = *firstblock;
- cur->bc_private.b.flist = flist;
- cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
- /*
- * Convert to a btree with two levels, one record in root.
- */
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
- memset(&args, 0, sizeof(args));
- args.tp = tp;
- args.mp = mp;
- args.firstblock = *firstblock;
- if (*firstblock == NULLFSBLOCK) {
- args.type = XFS_ALLOCTYPE_START_BNO;
- args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
- } else if (flist->xbf_low) {
- args.type = XFS_ALLOCTYPE_START_BNO;
- args.fsbno = *firstblock;
- } else {
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.fsbno = *firstblock;
- }
- args.minlen = args.maxlen = args.prod = 1;
- args.total = args.minleft = args.alignment = args.mod = args.isfl =
- args.minalignslop = 0;
- args.wasdel = wasdel;
- *logflagsp = 0;
- if ((error = xfs_alloc_vextent(&args))) {
- xfs_iroot_realloc(ip, -1, whichfork);
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- return error;
- }
- /*
- * Allocation can't fail, the space was reserved.
- */
- ASSERT(args.fsbno != NULLFSBLOCK);
- ASSERT(*firstblock == NULLFSBLOCK ||
- args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
- (flist->xbf_low &&
- args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
- *firstblock = cur->bc_private.b.firstblock = args.fsbno;
- cur->bc_private.b.allocated++;
- ip->i_d.di_nblocks++;
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
- abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
- /*
- * Fill in the child block.
- */
- abp->b_ops = &xfs_bmbt_buf_ops;
- ablock = XFS_BUF_TO_BLOCK(abp);
- ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
- ablock->bb_level = 0;
- ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
- ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
- arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- for (cnt = i = 0; i < nextents; i++) {
- ep = xfs_iext_get_ext(ifp, i);
- if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
- arp->l0 = cpu_to_be64(ep->l0);
- arp->l1 = cpu_to_be64(ep->l1);
- arp++; cnt++;
- }
- }
- ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
- xfs_btree_set_numrecs(ablock, cnt);
-
- /*
- * Fill in the root key and pointer.
- */
- kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
- arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
- kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
- pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
- be16_to_cpu(block->bb_level)));
- *pp = cpu_to_be64(args.fsbno);
-
- /*
- * Do all this logging at the end so that
- * the root is at the right level.
- */
- xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
- xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
- ASSERT(*curp == NULL);
- *curp = cur;
- *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
- return 0;
-}
-
-/*
- * Calculate the default attribute fork offset for newly created inodes.
- */
-uint
-xfs_default_attroffset(
- struct xfs_inode *ip)
-{
- struct xfs_mount *mp = ip->i_mount;
- uint offset;
-
- if (mp->m_sb.sb_inodesize == 256) {
- offset = XFS_LITINO(mp) -
- XFS_BMDR_SPACE_CALC(MINABTPTRS);
- } else {
- offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
- }
-
- ASSERT(offset < XFS_LITINO(mp));
- return offset;
-}
-
-/*
- * Helper routine to reset inode di_forkoff field when switching
- * attribute fork from local to extent format - we reset it where
- * possible to make space available for inline data fork extents.
- */
-STATIC void
-xfs_bmap_forkoff_reset(
- xfs_mount_t *mp,
- xfs_inode_t *ip,
- int whichfork)
-{
- if (whichfork == XFS_ATTR_FORK &&
- ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
- ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
- ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
- uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
-
- if (dfl_forkoff > ip->i_d.di_forkoff)
- ip->i_d.di_forkoff = dfl_forkoff;
- }
-}
-
-/*
- * Convert a local file to an extents file.
- * This code is out of bounds for data forks of regular files,
- * since the file data needs to get logged so things will stay consistent.
- * (The bmap-level manipulations are ok, though).
- */
-STATIC int /* error */
-xfs_bmap_local_to_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fsblock_t *firstblock, /* first block allocated in xaction */
- xfs_extlen_t total, /* total blocks needed by transaction */
- int *logflagsp, /* inode logging flags */
- int whichfork) /* data or attr fork */
-{
- int error; /* error return value */
- int flags; /* logging flags returned */
- xfs_ifork_t *ifp; /* inode fork pointer */
-
- /*
- * We don't want to deal with the case of keeping inode data inline yet.
- * So sending the data fork of a regular inode is invalid.
- */
- ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
- flags = 0;
- error = 0;
- if (ifp->if_bytes) {
- xfs_alloc_arg_t args; /* allocation arguments */
- xfs_buf_t *bp; /* buffer for extent block */
- xfs_bmbt_rec_host_t *ep;/* extent record pointer */
-
- memset(&args, 0, sizeof(args));
- args.tp = tp;
- args.mp = ip->i_mount;
- args.firstblock = *firstblock;
- ASSERT((ifp->if_flags &
- (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);
- /*
- * Allocate a block. We know we need only one, since the
- * file currently fits in an inode.
- */
- if (*firstblock == NULLFSBLOCK) {
- args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
- args.type = XFS_ALLOCTYPE_START_BNO;
- } else {
- args.fsbno = *firstblock;
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- }
- args.total = total;
- args.mod = args.minleft = args.alignment = args.wasdel =
- args.isfl = args.minalignslop = 0;
- args.minlen = args.maxlen = args.prod = 1;
- if ((error = xfs_alloc_vextent(&args)))
- goto done;
- /*
- * Can't fail, the space was reserved.
- */
- ASSERT(args.fsbno != NULLFSBLOCK);
- ASSERT(args.len == 1);
- *firstblock = args.fsbno;
- bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
- bp->b_ops = &xfs_bmbt_buf_ops;
- memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
- xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
- xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
- xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
- xfs_iext_add(ifp, 0, 1);
- ep = xfs_iext_get_ext(ifp, 0);
- xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
- trace_xfs_bmap_post_update(ip, 0,
- whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
- _THIS_IP_);
- XFS_IFORK_NEXT_SET(ip, whichfork, 1);
- ip->i_d.di_nblocks = 1;
- xfs_trans_mod_dquot_byino(tp, ip,
- XFS_TRANS_DQ_BCOUNT, 1L);
- flags |= xfs_ilog_fext(whichfork);
- } else {
- ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
- xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
- }
- ifp->if_flags &= ~XFS_IFINLINE;
- ifp->if_flags |= XFS_IFEXTENTS;
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
- flags |= XFS_ILOG_CORE;
-done:
- *logflagsp = flags;
- return error;
-}
-
-/*
- * Search the extent records for the entry containing block bno.
- * If bno lies in a hole, point to the next entry. If bno lies
- * past eof, *eofp will be set, and *prevp will contain the last
- * entry (null if none). Else, *lastxp will be set to the index
- * of the found entry; *gotp will contain the entry.
- */
-STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
-xfs_bmap_search_multi_extents(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_fileoff_t bno, /* block number searched for */
- int *eofp, /* out: end of file found */
- xfs_extnum_t *lastxp, /* out: last extent index */
- xfs_bmbt_irec_t *gotp, /* out: extent entry found */
- xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
-{
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
- xfs_extnum_t lastx; /* last extent index */
-
- /*
- * Initialize the extent entry structure to catch access to
- * uninitialized br_startblock field.
- */
- gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
- gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
- gotp->br_state = XFS_EXT_INVALID;
-#if XFS_BIG_BLKNOS
- gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
-#else
- gotp->br_startblock = 0xffffa5a5;
-#endif
- prevp->br_startoff = NULLFILEOFF;
-
- ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
- if (lastx > 0) {
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp);
- }
- if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
- xfs_bmbt_get_all(ep, gotp);
- *eofp = 0;
- } else {
- if (lastx > 0) {
- *gotp = *prevp;
- }
- *eofp = 1;
- ep = NULL;
- }
- *lastxp = lastx;
- return ep;
-}
-
-/*
- * Search the extents list for the inode, for the extent containing bno.
- * If bno lies in a hole, point to the next entry. If bno lies past eof,
- * *eofp will be set, and *prevp will contain the last entry (null if none).
- * Else, *lastxp will be set to the index of the found
- * entry; *gotp will contain the entry.
- */
-STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
-xfs_bmap_search_extents(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fileoff_t bno, /* block number searched for */
- int fork, /* data or attr fork */
- int *eofp, /* out: end of file found */
- xfs_extnum_t *lastxp, /* out: last extent index */
- xfs_bmbt_irec_t *gotp, /* out: extent entry found */
- xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
-{
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
-
- XFS_STATS_INC(xs_look_exlist);
- ifp = XFS_IFORK_PTR(ip, fork);
-
- ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
-
- if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
- !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
- xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
- "Access to block zero in inode %llu "
- "start_block: %llx start_off: %llx "
- "blkcnt: %llx extent-state: %x lastx: %x\n",
- (unsigned long long)ip->i_ino,
- (unsigned long long)gotp->br_startblock,
- (unsigned long long)gotp->br_startoff,
- (unsigned long long)gotp->br_blockcount,
- gotp->br_state, *lastxp);
- *lastxp = NULLEXTNUM;
- *eofp = 1;
- return NULL;
- }
- return ep;
-}
-
-/*
- * Compute the worst-case number of indirect blocks that will be used
- * for ip's delayed extent of length "len".
- */
-STATIC xfs_filblks_t
-xfs_bmap_worst_indlen(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_filblks_t len) /* delayed extent length */
-{
- int level; /* btree level number */
- int maxrecs; /* maximum record count at this level */
- xfs_mount_t *mp; /* mount structure */
- xfs_filblks_t rval; /* return value */
-
- mp = ip->i_mount;
- maxrecs = mp->m_bmap_dmxr[0];
- for (level = 0, rval = 0;
- level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
- level++) {
- len += maxrecs - 1;
- do_div(len, maxrecs);
- rval += len;
- if (len == 1)
- return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
- level - 1;
- if (level == 0)
- maxrecs = mp->m_bmap_dmxr[1];
- }
- return rval;
-}
-
-/*
- * Convert inode from non-attributed to attributed.
- * Must not be in a transaction, ip must not be locked.
- */
-int /* error code */
-xfs_bmap_add_attrfork(
- xfs_inode_t *ip, /* incore inode pointer */
- int size, /* space new attribute needs */
- int rsvd) /* xact may use reserved blks */
-{
- xfs_fsblock_t firstblock; /* 1st block/ag allocated */
- xfs_bmap_free_t flist; /* freed extent records */
- xfs_mount_t *mp; /* mount structure */
- xfs_trans_t *tp; /* transaction pointer */
- int blks; /* space reservation */
- int version = 1; /* superblock attr version */
- int committed; /* xaction was committed */
- int logflags; /* logging flags */
- int error; /* error return value */
-
- ASSERT(XFS_IFORK_Q(ip) == 0);
-
- mp = ip->i_mount;
- ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
- tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
- blks = XFS_ADDAFORK_SPACE_RES(mp);
- if (rsvd)
- tp->t_flags |= XFS_TRANS_RESERVE;
- if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT)))
- goto error0;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
- XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
- XFS_QMOPT_RES_REGBLKS);
- if (error) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
- return error;
- }
- if (XFS_IFORK_Q(ip))
- goto error1;
- if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
- /*
- * For inodes coming from pre-6.2 filesystems.
- */
- ASSERT(ip->i_d.di_aformat == 0);
- ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
- }
- ASSERT(ip->i_d.di_anextents == 0);
-
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
- switch (ip->i_d.di_format) {
- case XFS_DINODE_FMT_DEV:
- ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
- break;
- case XFS_DINODE_FMT_UUID:
- ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
- break;
- case XFS_DINODE_FMT_LOCAL:
- case XFS_DINODE_FMT_EXTENTS:
- case XFS_DINODE_FMT_BTREE:
- ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
- if (!ip->i_d.di_forkoff)
- ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
- else if (mp->m_flags & XFS_MOUNT_ATTR2)
- version = 2;
- break;
- default:
- ASSERT(0);
- error = XFS_ERROR(EINVAL);
- goto error1;
- }
-
- ASSERT(ip->i_afp == NULL);
- ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
- ip->i_afp->if_flags = XFS_IFEXTENTS;
- logflags = 0;
- xfs_bmap_init(&flist, &firstblock);
- switch (ip->i_d.di_format) {
- case XFS_DINODE_FMT_LOCAL:
- error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
- &logflags);
- break;
- case XFS_DINODE_FMT_EXTENTS:
- error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
- &flist, &logflags);
- break;
- case XFS_DINODE_FMT_BTREE:
- error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
- &logflags);
- break;
- default:
- error = 0;
- break;
- }
- if (logflags)
- xfs_trans_log_inode(tp, ip, logflags);
- if (error)
- goto error2;
- if (!xfs_sb_version_hasattr(&mp->m_sb) ||
- (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
- __int64_t sbfields = 0;
-
- spin_lock(&mp->m_sb_lock);
- if (!xfs_sb_version_hasattr(&mp->m_sb)) {
- xfs_sb_version_addattr(&mp->m_sb);
- sbfields |= XFS_SB_VERSIONNUM;
- }
- if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
- xfs_sb_version_addattr2(&mp->m_sb);
- sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
- }
- if (sbfields) {
- spin_unlock(&mp->m_sb_lock);
- xfs_mod_sb(tp, sbfields);
- } else
- spin_unlock(&mp->m_sb_lock);
- }
-
- error = xfs_bmap_finish(&tp, &flist, &committed);
- if (error)
- goto error2;
- return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-error2:
- xfs_bmap_cancel(&flist);
-error1:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-error0:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- return error;
-}
-
-/*
- * Add the extent to the list of extents to be free at transaction end.
- * The list is maintained sorted (by block number).
- */
-/* ARGSUSED */
-void
-xfs_bmap_add_free(
- xfs_fsblock_t bno, /* fs block number of extent */
- xfs_filblks_t len, /* length of extent */
- xfs_bmap_free_t *flist, /* list of extents */
- xfs_mount_t *mp) /* mount point structure */
-{
- xfs_bmap_free_item_t *cur; /* current (next) element */
- xfs_bmap_free_item_t *new; /* new element */
- xfs_bmap_free_item_t *prev; /* previous element */
-#ifdef DEBUG
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
-
- ASSERT(bno != NULLFSBLOCK);
- ASSERT(len > 0);
- ASSERT(len <= MAXEXTLEN);
- ASSERT(!isnullstartblock(bno));
- agno = XFS_FSB_TO_AGNO(mp, bno);
- agbno = XFS_FSB_TO_AGBNO(mp, bno);
- ASSERT(agno < mp->m_sb.sb_agcount);
- ASSERT(agbno < mp->m_sb.sb_agblocks);
- ASSERT(len < mp->m_sb.sb_agblocks);
- ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
-#endif
- ASSERT(xfs_bmap_free_item_zone != NULL);
- new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
- new->xbfi_startblock = bno;
- new->xbfi_blockcount = (xfs_extlen_t)len;
- for (prev = NULL, cur = flist->xbf_first;
- cur != NULL;
- prev = cur, cur = cur->xbfi_next) {
- if (cur->xbfi_startblock >= bno)
- break;
- }
- if (prev)
- prev->xbfi_next = new;
- else
- flist->xbf_first = new;
- new->xbfi_next = cur;
- flist->xbf_count++;
-}
-
-/*
- * Compute and fill in the value of the maximum depth of a bmap btree
- * in this filesystem. Done once, during mount.
- */
-void
-xfs_bmap_compute_maxlevels(
- xfs_mount_t *mp, /* file system mount structure */
- int whichfork) /* data or attr fork */
-{
- int level; /* btree level */
- uint maxblocks; /* max blocks at this level */
- uint maxleafents; /* max leaf entries possible */
- int maxrootrecs; /* max records in root block */
- int minleafrecs; /* min records in leaf block */
- int minnoderecs; /* min records in node block */
- int sz; /* root block size */
-
- /*
- * The maximum number of extents in a file, hence the maximum
- * number of leaf entries, is controlled by the type of di_nextents
- * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
- * (a signed 16-bit number, xfs_aextnum_t).
- *
- * Note that we can no longer assume that if we are in ATTR1 that
- * the fork offset of all the inodes will be
- * (xfs_default_attroffset(ip) >> 3) because we could have mounted
- * with ATTR2 and then mounted back with ATTR1, keeping the
- * di_forkoff's fixed but probably at various positions. Therefore,
- * for both ATTR1 and ATTR2 we have to assume the worst case scenario
- * of a minimum size available.
- */
- if (whichfork == XFS_DATA_FORK) {
- maxleafents = MAXEXTNUM;
- sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
- } else {
- maxleafents = MAXAEXTNUM;
- sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
- }
- maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0);
- minleafrecs = mp->m_bmap_dmnr[0];
- minnoderecs = mp->m_bmap_dmnr[1];
- maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
- for (level = 1; maxblocks > 1; level++) {
- if (maxblocks <= maxrootrecs)
- maxblocks = 1;
- else
- maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
- }
- mp->m_bm_maxlevels[whichfork] = level;
-}
-
-/*
- * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
- * caller. Frees all the extents that need freeing, which must be done
- * last due to locking considerations. We never free any extents in
- * the first transaction.
- *
- * Return 1 if the given transaction was committed and a new one
- * started, and 0 otherwise in the committed parameter.
- */
-int /* error */
-xfs_bmap_finish(
- xfs_trans_t **tp, /* transaction pointer addr */
- xfs_bmap_free_t *flist, /* i/o: list extents to free */
- int *committed) /* xact committed or not */
-{
- xfs_efd_log_item_t *efd; /* extent free data */
- xfs_efi_log_item_t *efi; /* extent free intention */
- int error; /* error return value */
- xfs_bmap_free_item_t *free; /* free extent item */
- unsigned int logres; /* new log reservation */
- unsigned int logcount; /* new log count */
- xfs_mount_t *mp; /* filesystem mount structure */
- xfs_bmap_free_item_t *next; /* next item on free list */
- xfs_trans_t *ntp; /* new transaction pointer */
-
- ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
- if (flist->xbf_count == 0) {
- *committed = 0;
- return 0;
- }
- ntp = *tp;
- efi = xfs_trans_get_efi(ntp, flist->xbf_count);
- for (free = flist->xbf_first; free; free = free->xbfi_next)
- xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
- free->xbfi_blockcount);
- logres = ntp->t_log_res;
- logcount = ntp->t_log_count;
- ntp = xfs_trans_dup(*tp);
- error = xfs_trans_commit(*tp, 0);
- *tp = ntp;
- *committed = 1;
- /*
- * We have a new transaction, so we should return committed=1,
- * even though we're returning an error.
- */
- if (error)
- return error;
-
- /*
- * transaction commit worked ok so we can drop the extra ticket
- * reference that we gained in xfs_trans_dup()
- */
- xfs_log_ticket_put(ntp->t_ticket);
-
- if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
- logcount)))
- return error;
- efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
- for (free = flist->xbf_first; free != NULL; free = next) {
- next = free->xbfi_next;
- if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
- free->xbfi_blockcount))) {
- /*
- * The bmap free list will be cleaned up at a
- * higher level. The EFI will be canceled when
- * this transaction is aborted.
- * Need to force shutdown here to make sure it
- * happens, since this transaction may not be
- * dirty yet.
- */
- mp = ntp->t_mountp;
- if (!XFS_FORCED_SHUTDOWN(mp))
- xfs_force_shutdown(mp,
- (error == EFSCORRUPTED) ?
- SHUTDOWN_CORRUPT_INCORE :
- SHUTDOWN_META_IO_ERROR);
- return error;
- }
- xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
- free->xbfi_blockcount);
- xfs_bmap_del_free(flist, NULL, free);
- }
- return 0;
-}
-
-/*
- * Free up any items left in the list.
- */
-void
-xfs_bmap_cancel(
- xfs_bmap_free_t *flist) /* list of bmap_free_items */
-{
- xfs_bmap_free_item_t *free; /* free list item */
- xfs_bmap_free_item_t *next;
-
- if (flist->xbf_count == 0)
- return;
- ASSERT(flist->xbf_first != NULL);
- for (free = flist->xbf_first; free; free = next) {
- next = free->xbfi_next;
- xfs_bmap_del_free(flist, NULL, free);
- }
- ASSERT(flist->xbf_count == 0);
-}
-
-/*
- * Returns the file-relative block number of the first unused block(s)
- * in the file with at least "len" logically contiguous blocks free.
- * This is the lowest-address hole if the file has holes, else the first block
- * past the end of file.
- * Return 0 if the file is currently local (in-inode).
- */
-int /* error */
-xfs_bmap_first_unused(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- xfs_extlen_t len, /* size of hole to find */
- xfs_fileoff_t *first_unused, /* unused block */
- int whichfork) /* data or attr fork */
-{
- int error; /* error return value */
- int idx; /* extent record index */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_fileoff_t lastaddr; /* last block number seen */
- xfs_fileoff_t lowest; /* lowest useful block */
- xfs_fileoff_t max; /* starting useful block */
- xfs_fileoff_t off; /* offset for this block */
- xfs_extnum_t nextents; /* number of extent entries */
-
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- *first_unused = 0;
- return 0;
- }
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(tp, ip, whichfork)))
- return error;
- lowest = *first_unused;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
- xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
- off = xfs_bmbt_get_startoff(ep);
- /*
- * See if the hole before this extent will work.
- */
- if (off >= lowest + len && off - max >= len) {
- *first_unused = max;
- return 0;
- }
- lastaddr = off + xfs_bmbt_get_blockcount(ep);
- max = XFS_FILEOFF_MAX(lastaddr, lowest);
- }
- *first_unused = max;
- return 0;
-}
-
-/*
- * Returns the file-relative block number of the last block + 1 before
- * last_block (input value) in the file.
- * This is not based on i_size, it is based on the extent records.
- * Returns 0 for local files, as they do not have extent records.
- */
-int /* error */
-xfs_bmap_last_before(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- xfs_fileoff_t *last_block, /* last block */
- int whichfork) /* data or attr fork */
-{
- xfs_fileoff_t bno; /* input file offset */
- int eof; /* hit end of file */
- xfs_bmbt_rec_host_t *ep; /* pointer to last extent */
- int error; /* error return value */
- xfs_bmbt_irec_t got; /* current extent value */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extnum_t lastx; /* last extent used */
- xfs_bmbt_irec_t prev; /* previous extent value */
-
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
- return XFS_ERROR(EIO);
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- *last_block = 0;
- return 0;
- }
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(tp, ip, whichfork)))
- return error;
- bno = *last_block - 1;
- ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
- &prev);
- if (eof || xfs_bmbt_get_startoff(ep) > bno) {
- if (prev.br_startoff == NULLFILEOFF)
- *last_block = 0;
- else
- *last_block = prev.br_startoff + prev.br_blockcount;
- }
- /*
- * Otherwise *last_block is already the right answer.
- */
- return 0;
-}
-
-STATIC int
-xfs_bmap_last_extent(
- struct xfs_trans *tp,
- struct xfs_inode *ip,
- int whichfork,
- struct xfs_bmbt_irec *rec,
- int *is_empty)
-{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- int error;
- int nextents;
-
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(tp, ip, whichfork);
- if (error)
- return error;
- }
-
- nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
- if (nextents == 0) {
- *is_empty = 1;
- return 0;
- }
-
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
- *is_empty = 0;
- return 0;
-}
-
-/*
- * Check the last inode extent to determine whether this allocation will result
- * in blocks being allocated at the end of the file. When we allocate new data
- * blocks at the end of the file which do not start at the previous data block,
- * we will try to align the new blocks at stripe unit boundaries.
- *
- * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be
- * at, or past the EOF.
- */
-STATIC int
-xfs_bmap_isaeof(
- struct xfs_bmalloca *bma,
- int whichfork)
-{
- struct xfs_bmbt_irec rec;
- int is_empty;
- int error;
-
- bma->aeof = 0;
- error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
- &is_empty);
- if (error || is_empty)
- return error;
-
- /*
- * Check if we are allocation or past the last extent, or at least into
- * the last delayed allocated extent.
- */
- bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
- (bma->offset >= rec.br_startoff &&
- isnullstartblock(rec.br_startblock));
- return 0;
-}
-
-/*
- * Check if the endoff is outside the last extent. If so the caller will grow
- * the allocation to a stripe unit boundary. All offsets are considered outside
- * the end of file for an empty fork, so 1 is returned in *eof in that case.
- */
-int
-xfs_bmap_eof(
- struct xfs_inode *ip,
- xfs_fileoff_t endoff,
- int whichfork,
- int *eof)
-{
- struct xfs_bmbt_irec rec;
- int error;
-
- error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
- if (error || *eof)
- return error;
-
- *eof = endoff >= rec.br_startoff + rec.br_blockcount;
- return 0;
-}
-
-/*
- * Returns the file-relative block number of the first block past eof in
- * the file. This is not based on i_size, it is based on the extent records.
- * Returns 0 for local files, as they do not have extent records.
- */
-int
-xfs_bmap_last_offset(
- struct xfs_trans *tp,
- struct xfs_inode *ip,
- xfs_fileoff_t *last_block,
- int whichfork)
-{
- struct xfs_bmbt_irec rec;
- int is_empty;
- int error;
-
- *last_block = 0;
-
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
- return 0;
-
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- return XFS_ERROR(EIO);
-
- error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
- if (error || is_empty)
- return error;
-
- *last_block = rec.br_startoff + rec.br_blockcount;
- return 0;
-}
-
-/*
- * Returns whether the selected fork of the inode has exactly one
- * block or not. For the data fork we check this matches di_size,
- * implying the file's range is 0..bsize-1.
- */
-int /* 1=>1 block, 0=>otherwise */
-xfs_bmap_one_block(
- xfs_inode_t *ip, /* incore inode */
- int whichfork) /* data or attr fork */
-{
- xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */
- xfs_ifork_t *ifp; /* inode fork pointer */
- int rval; /* return value */
- xfs_bmbt_irec_t s; /* internal version of extent */
-
-#ifndef DEBUG
- if (whichfork == XFS_DATA_FORK)
- return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
-#endif /* !DEBUG */
- if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
- return 0;
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- return 0;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- ep = xfs_iext_get_ext(ifp, 0);
- xfs_bmbt_get_all(ep, &s);
- rval = s.br_startoff == 0 && s.br_blockcount == 1;
- if (rval && whichfork == XFS_DATA_FORK)
- ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
- return rval;
-}
-
-STATIC int
-xfs_bmap_sanity_check(
- struct xfs_mount *mp,
- struct xfs_buf *bp,
- int level)
-{
- struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
-
- if (block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) ||
- be16_to_cpu(block->bb_level) != level ||
- be16_to_cpu(block->bb_numrecs) == 0 ||
- be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
- return 0;
- return 1;
-}
-
-/*
- * Read in the extents to if_extents.
- * All inode fields are set up by caller, we just traverse the btree
- * and copy the records in. If the file system cannot contain unwritten
- * extents, the records are checked for no "state" flags.
- */
-int /* error */
-xfs_bmap_read_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- int whichfork) /* data or attr fork */
-{
- struct xfs_btree_block *block; /* current btree block */
- xfs_fsblock_t bno; /* block # of "block" */
- xfs_buf_t *bp; /* buffer for "block" */
- int error; /* error return value */
- xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */
- xfs_extnum_t i, j; /* index into the extents list */
- xfs_ifork_t *ifp; /* fork structure */
- int level; /* btree level, for checking */
- xfs_mount_t *mp; /* file system mount structure */
- __be64 *pp; /* pointer to block address */
- /* REFERENCED */
- xfs_extnum_t room; /* number of entries there's room for */
-
- bno = NULLFSBLOCK;
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
- XFS_EXTFMT_INODE(ip);
- block = ifp->if_broot;
- /*
- * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
- */
- level = be16_to_cpu(block->bb_level);
- ASSERT(level > 0);
- pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
- bno = be64_to_cpu(*pp);
- ASSERT(bno != NULLDFSBNO);
- ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
- ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
- /*
- * Go down the tree until leaf level is reached, following the first
- * pointer (leftmost) at each level.
- */
- while (level-- > 0) {
- error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
- XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
- if (error)
- return error;
- block = XFS_BUF_TO_BLOCK(bp);
- XFS_WANT_CORRUPTED_GOTO(
- xfs_bmap_sanity_check(mp, bp, level),
- error0);
- if (level == 0)
- break;
- pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
- bno = be64_to_cpu(*pp);
- XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
- xfs_trans_brelse(tp, bp);
- }
- /*
- * Here with bp and block set to the leftmost leaf node in the tree.
- */
- room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- i = 0;
- /*
- * Loop over all leaf nodes. Copy information to the extent records.
- */
- for (;;) {
- xfs_bmbt_rec_t *frp;
- xfs_fsblock_t nextbno;
- xfs_extnum_t num_recs;
- xfs_extnum_t start;
-
- num_recs = xfs_btree_get_numrecs(block);
- if (unlikely(i + num_recs > room)) {
- ASSERT(i + num_recs <= room);
- xfs_warn(ip->i_mount,
- "corrupt dinode %Lu, (btree extents).",
- (unsigned long long) ip->i_ino);
- XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
- XFS_ERRLEVEL_LOW, ip->i_mount, block);
- goto error0;
- }
- XFS_WANT_CORRUPTED_GOTO(
- xfs_bmap_sanity_check(mp, bp, 0),
- error0);
- /*
- * Read-ahead the next leaf block, if any.
- */
- nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
- if (nextbno != NULLFSBLOCK)
- xfs_btree_reada_bufl(mp, nextbno, 1,
- &xfs_bmbt_buf_ops);
- /*
- * Copy records into the extent records.
- */
- frp = XFS_BMBT_REC_ADDR(mp, block, 1);
- start = i;
- for (j = 0; j < num_recs; j++, i++, frp++) {
- xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
- trp->l0 = be64_to_cpu(frp->l0);
- trp->l1 = be64_to_cpu(frp->l1);
- }
- if (exntf == XFS_EXTFMT_NOSTATE) {
- /*
- * Check all attribute bmap btree records and
- * any "older" data bmap btree records for a
- * set bit in the "extent flag" position.
- */
- if (unlikely(xfs_check_nostate_extents(ifp,
- start, num_recs))) {
- XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
- XFS_ERRLEVEL_LOW,
- ip->i_mount);
- goto error0;
- }
- }
- xfs_trans_brelse(tp, bp);
- bno = nextbno;
- /*
- * If we've reached the end, stop.
- */
- if (bno == NULLFSBLOCK)
- break;
- error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
- XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
- if (error)
- return error;
- block = XFS_BUF_TO_BLOCK(bp);
- }
- ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
- ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
- XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
- return 0;
-error0:
- xfs_trans_brelse(tp, bp);
- return XFS_ERROR(EFSCORRUPTED);
-}
-
-#ifdef DEBUG
-/*
- * Add bmap trace insert entries for all the contents of the extent records.
- */
-void
-xfs_bmap_trace_exlist(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t cnt, /* count of entries in the list */
- int whichfork, /* data or attr fork */
- unsigned long caller_ip)
-{
- xfs_extnum_t idx; /* extent record index */
- xfs_ifork_t *ifp; /* inode fork pointer */
- int state = 0;
-
- if (whichfork == XFS_ATTR_FORK)
- state |= BMAP_ATTRFORK;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
- for (idx = 0; idx < cnt; idx++)
- trace_xfs_extlist(ip, idx, whichfork, caller_ip);
-}
-
-/*
- * Validate that the bmbt_irecs being returned from bmapi are valid
- * given the callers original parameters. Specifically check the
- * ranges of the returned irecs to ensure that they only extent beyond
- * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
- */
-STATIC void
-xfs_bmap_validate_ret(
- xfs_fileoff_t bno,
- xfs_filblks_t len,
- int flags,
- xfs_bmbt_irec_t *mval,
- int nmap,
- int ret_nmap)
-{
- int i; /* index to map values */
-
- ASSERT(ret_nmap <= nmap);
-
- for (i = 0; i < ret_nmap; i++) {
- ASSERT(mval[i].br_blockcount > 0);
- if (!(flags & XFS_BMAPI_ENTIRE)) {
- ASSERT(mval[i].br_startoff >= bno);
- ASSERT(mval[i].br_blockcount <= len);
- ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
- bno + len);
- } else {
- ASSERT(mval[i].br_startoff < bno + len);
- ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
- bno);
- }
- ASSERT(i == 0 ||
- mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
- mval[i].br_startoff);
- ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
- mval[i].br_startblock != HOLESTARTBLOCK);
- ASSERT(mval[i].br_state == XFS_EXT_NORM ||
- mval[i].br_state == XFS_EXT_UNWRITTEN);
- }
-}
-#endif /* DEBUG */
-
-
-/*
* Trim the returned map to the required bounds
*/
STATIC void
@@ -4372,6 +4044,7 @@ xfs_bmapi_read(
ASSERT(*nmap >= 1);
ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
XFS_BMAPI_IGSTATE)));
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -4566,6 +4239,7 @@ xfs_bmapi_delay(
ASSERT(*nmap >= 1);
ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
@@ -4624,8 +4298,8 @@ xfs_bmapi_delay(
}
-STATIC int
-__xfs_bmapi_allocate(
+static int
+xfs_bmapi_allocate(
struct xfs_bmalloca *bma)
{
struct xfs_mount *mp = bma->ip->i_mount;
@@ -4634,12 +4308,9 @@ __xfs_bmapi_allocate(
struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
int tmp_logflags = 0;
int error;
- int rt;
ASSERT(bma->length > 0);
- rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip);
-
/*
* For the wasdelay case, we could also just allocate the stuff asked
* for in this bmap call but that wouldn't be as good.
@@ -4680,9 +4351,6 @@ __xfs_bmapi_allocate(
return error;
}
- if (bma->flags & XFS_BMAPI_STACK_SWITCH)
- bma->stack_switch = 1;
-
error = xfs_bmap_alloc(bma);
if (error)
return error;
@@ -4745,45 +4413,6 @@ __xfs_bmapi_allocate(
return 0;
}
-static void
-xfs_bmapi_allocate_worker(
- struct work_struct *work)
-{
- struct xfs_bmalloca *args = container_of(work,
- struct xfs_bmalloca, work);
- unsigned long pflags;
-
- /* we are in a transaction context here */
- current_set_flags_nested(&pflags, PF_FSTRANS);
-
- args->result = __xfs_bmapi_allocate(args);
- complete(args->done);
-
- current_restore_flags_nested(&pflags, PF_FSTRANS);
-}
-
-/*
- * Some allocation requests often come in with little stack to work on. Push
- * them off to a worker thread so there is lots of stack to use. Otherwise just
- * call directly to avoid the context switch overhead here.
- */
-int
-xfs_bmapi_allocate(
- struct xfs_bmalloca *args)
-{
- DECLARE_COMPLETION_ONSTACK(done);
-
- if (!args->stack_switch)
- return __xfs_bmapi_allocate(args);
-
-
- args->done = &done;
- INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
- queue_work(xfs_alloc_wq, &args->work);
- wait_for_completion(&done);
- return args->result;
-}
-
STATIC int
xfs_bmapi_convert_unwritten(
struct xfs_bmalloca *bma,
@@ -4872,7 +4501,7 @@ xfs_bmapi_write(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
- struct xfs_bmalloca bma = { 0 }; /* args for xfs_bmap_alloc */
+ struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */
xfs_fileoff_t end; /* end of mapped file region */
int eof; /* after the end of extents */
int error; /* error return */
@@ -4895,20 +4524,20 @@ xfs_bmapi_write(
orig_mval = mval;
orig_nmap = *nmap;
#endif
+ whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
ASSERT(*nmap >= 1);
ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
ASSERT(!(flags & XFS_BMAPI_IGSTATE));
ASSERT(tp != NULL);
ASSERT(len > 0);
-
- whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL),
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
return XFS_ERROR(EFSCORRUPTED);
@@ -4921,13 +4550,6 @@ xfs_bmapi_write(
XFS_STATS_INC(xs_blk_mapw);
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- error = xfs_bmap_local_to_extents(tp, ip, firstblock, total,
- &bma.logflags, whichfork);
- if (error)
- goto error0;
- }
-
if (*firstblock == NULLFSBLOCK) {
if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
@@ -5083,6 +4705,328 @@ error0:
}
/*
+ * Called by xfs_bmapi to update file extent records and the btree
+ * after removing space (or undoing a delayed allocation).
+ */
+STATIC int /* error */
+xfs_bmap_del_extent(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_trans_t *tp, /* current transaction pointer */
+ xfs_extnum_t *idx, /* extent number to update/delete */
+ xfs_bmap_free_t *flist, /* list of extents to be freed */
+ xfs_btree_cur_t *cur, /* if null, not a btree */
+ xfs_bmbt_irec_t *del, /* data to remove from extents */
+ int *logflagsp, /* inode logging flags */
+ int whichfork) /* data or attr fork */
+{
+ xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
+ xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
+ xfs_fsblock_t del_endblock=0; /* first block past del */
+ xfs_fileoff_t del_endoff; /* first offset past del */
+ int delay; /* current block is delayed allocated */
+ int do_fx; /* free extent at end of routine */
+ xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */
+ int error; /* error return value */
+ int flags; /* inode logging flags */
+ xfs_bmbt_irec_t got; /* current extent entry */
+ xfs_fileoff_t got_endoff; /* first offset past got */
+ int i; /* temp state */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_filblks_t nblks; /* quota/sb block count */
+ xfs_bmbt_irec_t new; /* new record to be inserted */
+ /* REFERENCED */
+ uint qfield; /* quota field to update */
+ xfs_filblks_t temp; /* for indirect length calculations */
+ xfs_filblks_t temp2; /* for indirect length calculations */
+ int state = 0;
+
+ XFS_STATS_INC(xs_del_exlist);
+
+ if (whichfork == XFS_ATTR_FORK)
+ state |= BMAP_ATTRFORK;
+
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
+ (uint)sizeof(xfs_bmbt_rec_t)));
+ ASSERT(del->br_blockcount > 0);
+ ep = xfs_iext_get_ext(ifp, *idx);
+ xfs_bmbt_get_all(ep, &got);
+ ASSERT(got.br_startoff <= del->br_startoff);
+ del_endoff = del->br_startoff + del->br_blockcount;
+ got_endoff = got.br_startoff + got.br_blockcount;
+ ASSERT(got_endoff >= del_endoff);
+ delay = isnullstartblock(got.br_startblock);
+ ASSERT(isnullstartblock(del->br_startblock) == delay);
+ flags = 0;
+ qfield = 0;
+ error = 0;
+ /*
+ * If deleting a real allocation, must free up the disk space.
+ */
+ if (!delay) {
+ flags = XFS_ILOG_CORE;
+ /*
+ * Realtime allocation. Free it and record di_nblocks update.
+ */
+ if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
+ xfs_fsblock_t bno;
+ xfs_filblks_t len;
+
+ ASSERT(do_mod(del->br_blockcount,
+ mp->m_sb.sb_rextsize) == 0);
+ ASSERT(do_mod(del->br_startblock,
+ mp->m_sb.sb_rextsize) == 0);
+ bno = del->br_startblock;
+ len = del->br_blockcount;
+ do_div(bno, mp->m_sb.sb_rextsize);
+ do_div(len, mp->m_sb.sb_rextsize);
+ error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+ if (error)
+ goto done;
+ do_fx = 0;
+ nblks = len * mp->m_sb.sb_rextsize;
+ qfield = XFS_TRANS_DQ_RTBCOUNT;
+ }
+ /*
+ * Ordinary allocation.
+ */
+ else {
+ do_fx = 1;
+ nblks = del->br_blockcount;
+ qfield = XFS_TRANS_DQ_BCOUNT;
+ }
+ /*
+ * Set up del_endblock and cur for later.
+ */
+ del_endblock = del->br_startblock + del->br_blockcount;
+ if (cur) {
+ if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+ got.br_startblock, got.br_blockcount,
+ &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ }
+ da_old = da_new = 0;
+ } else {
+ da_old = startblockval(got.br_startblock);
+ da_new = 0;
+ nblks = 0;
+ do_fx = 0;
+ }
+ /*
+ * Set flag value to use in switch statement.
+ * Left-contig is 2, right-contig is 1.
+ */
+ switch (((got.br_startoff == del->br_startoff) << 1) |
+ (got_endoff == del_endoff)) {
+ case 3:
+ /*
+ * Matches the whole extent. Delete the entry.
+ */
+ xfs_iext_remove(ip, *idx, 1,
+ whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
+ --*idx;
+ if (delay)
+ break;
+
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ flags |= XFS_ILOG_CORE;
+ if (!cur) {
+ flags |= xfs_ilog_fext(whichfork);
+ break;
+ }
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ break;
+
+ case 2:
+ /*
+ * Deleting the first part of the extent.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_startoff(ep, del_endoff);
+ temp = got.br_blockcount - del->br_blockcount;
+ xfs_bmbt_set_blockcount(ep, temp);
+ if (delay) {
+ temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ da_old);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ da_new = temp;
+ break;
+ }
+ xfs_bmbt_set_startblock(ep, del_endblock);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ if (!cur) {
+ flags |= xfs_ilog_fext(whichfork);
+ break;
+ }
+ if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
+ got.br_blockcount - del->br_blockcount,
+ got.br_state)))
+ goto done;
+ break;
+
+ case 1:
+ /*
+ * Deleting the last part of the extent.
+ */
+ temp = got.br_blockcount - del->br_blockcount;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(ep, temp);
+ if (delay) {
+ temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ da_old);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ da_new = temp;
+ break;
+ }
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ if (!cur) {
+ flags |= xfs_ilog_fext(whichfork);
+ break;
+ }
+ if ((error = xfs_bmbt_update(cur, got.br_startoff,
+ got.br_startblock,
+ got.br_blockcount - del->br_blockcount,
+ got.br_state)))
+ goto done;
+ break;
+
+ case 0:
+ /*
+ * Deleting the middle of the extent.
+ */
+ temp = del->br_startoff - got.br_startoff;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(ep, temp);
+ new.br_startoff = del_endoff;
+ temp2 = got_endoff - del_endoff;
+ new.br_blockcount = temp2;
+ new.br_state = got.br_state;
+ if (!delay) {
+ new.br_startblock = del_endblock;
+ flags |= XFS_ILOG_CORE;
+ if (cur) {
+ if ((error = xfs_bmbt_update(cur,
+ got.br_startoff,
+ got.br_startblock, temp,
+ got.br_state)))
+ goto done;
+ if ((error = xfs_btree_increment(cur, 0, &i)))
+ goto done;
+ cur->bc_rec.b = new;
+ error = xfs_btree_insert(cur, &i);
+ if (error && error != ENOSPC)
+ goto done;
+ /*
+ * If get no-space back from btree insert,
+ * it tried a split, and we have a zero
+ * block reservation.
+ * Fix up our state and return the error.
+ */
+ if (error == ENOSPC) {
+ /*
+ * Reset the cursor, don't trust
+ * it after any insert operation.
+ */
+ if ((error = xfs_bmbt_lookup_eq(cur,
+ got.br_startoff,
+ got.br_startblock,
+ temp, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ /*
+ * Update the btree record back
+ * to the original value.
+ */
+ if ((error = xfs_bmbt_update(cur,
+ got.br_startoff,
+ got.br_startblock,
+ got.br_blockcount,
+ got.br_state)))
+ goto done;
+ /*
+ * Reset the extent record back
+ * to the original value.
+ */
+ xfs_bmbt_set_blockcount(ep,
+ got.br_blockcount);
+ flags = 0;
+ error = XFS_ERROR(ENOSPC);
+ goto done;
+ }
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ } else
+ flags |= xfs_ilog_fext(whichfork);
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+ } else {
+ ASSERT(whichfork == XFS_DATA_FORK);
+ temp = xfs_bmap_worst_indlen(ip, temp);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ temp2 = xfs_bmap_worst_indlen(ip, temp2);
+ new.br_startblock = nullstartblock((int)temp2);
+ da_new = temp + temp2;
+ while (da_new > da_old) {
+ if (temp) {
+ temp--;
+ da_new--;
+ xfs_bmbt_set_startblock(ep,
+ nullstartblock((int)temp));
+ }
+ if (da_new == da_old)
+ break;
+ if (temp2) {
+ temp2--;
+ da_new--;
+ new.br_startblock =
+ nullstartblock((int)temp2);
+ }
+ }
+ }
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_insert(ip, *idx + 1, 1, &new, state);
+ ++*idx;
+ break;
+ }
+ /*
+ * If we need to, add to list of extents to delete.
+ */
+ if (do_fx)
+ xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
+ mp);
+ /*
+ * Adjust inode # blocks in the file.
+ */
+ if (nblks)
+ ip->i_d.di_nblocks -= nblks;
+ /*
+ * Adjust quota data.
+ */
+ if (qfield)
+ xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
+
+ /*
+ * Account for change in delayed indirect blocks.
+ * Nothing to do for disk quota accounting here.
+ */
+ ASSERT(da_old >= da_new);
+ if (da_old > da_new) {
+ xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+ (int64_t)(da_old - da_new), 0);
+ }
+done:
+ *logflagsp = flags;
+ return error;
+}
+
+/*
* Unmap (remove) blocks from a file.
* If nexts is nonzero then the number of extents to remove is limited to
* that value. If not all extents in the block range can be removed then
@@ -5138,6 +5082,7 @@ xfs_bunmapi(
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(len > 0);
ASSERT(nexts >= 0);
@@ -5463,780 +5408,199 @@ error0:
}
/*
- * returns 1 for success, 0 if we failed to map the extent.
- */
-STATIC int
-xfs_getbmapx_fix_eof_hole(
- xfs_inode_t *ip, /* xfs incore inode pointer */
- struct getbmapx *out, /* output structure */
- int prealloced, /* this is a file with
- * preallocated data space */
- __int64_t end, /* last block requested */
- xfs_fsblock_t startblock)
-{
- __int64_t fixlen;
- xfs_mount_t *mp; /* file system mount point */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extnum_t lastx; /* last extent pointer */
- xfs_fileoff_t fileblock;
-
- if (startblock == HOLESTARTBLOCK) {
- mp = ip->i_mount;
- out->bmv_block = -1;
- fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
- fixlen -= out->bmv_offset;
- if (prealloced && out->bmv_offset + out->bmv_length == end) {
- /* Came to hole at EOF. Trim it. */
- if (fixlen <= 0)
- return 0;
- out->bmv_length = fixlen;
- }
- } else {
- if (startblock == DELAYSTARTBLOCK)
- out->bmv_block = -2;
- else
- out->bmv_block = xfs_fsb_to_db(ip, startblock);
- fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
- (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
- out->bmv_oflags |= BMV_OF_LAST;
- }
-
- return 1;
-}
-
-/*
- * Get inode's extents as described in bmv, and format for output.
- * Calls formatter to fill the user's buffer until all extents
- * are mapped, until the passed-in bmv->bmv_count slots have
- * been filled, or until the formatter short-circuits the loop,
- * if it is tracking filled-in extents on its own.
+ * Shift extent records to the left to cover a hole.
+ *
+ * The maximum number of extents to be shifted in a single operation
+ * is @num_exts, and @current_ext keeps track of the current extent
+ * index we have shifted. @offset_shift_fsb is the length by which each
+ * extent is shifted. If there is no hole to shift the extents
+ * into, this will be considered invalid operation and we abort immediately.
*/
-int /* error code */
-xfs_getbmap(
- xfs_inode_t *ip,
- struct getbmapx *bmv, /* user bmap structure */
- xfs_bmap_format_t formatter, /* format to user */
- void *arg) /* formatter arg */
+int
+xfs_bmap_shift_extents(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int *done,
+ xfs_fileoff_t start_fsb,
+ xfs_fileoff_t offset_shift_fsb,
+ xfs_extnum_t *current_ext,
+ xfs_fsblock_t *firstblock,
+ struct xfs_bmap_free *flist,
+ int num_exts)
{
- __int64_t bmvend; /* last block requested */
- int error = 0; /* return value */
- __int64_t fixlen; /* length for -1 case */
- int i; /* extent number */
- int lock; /* lock state */
- xfs_bmbt_irec_t *map; /* buffer for user's data */
- xfs_mount_t *mp; /* file system mount point */
- int nex; /* # of user extents can do */
- int nexleft; /* # of user extents left */
- int subnex; /* # of bmapi's can do */
- int nmap; /* number of map entries */
- struct getbmapx *out; /* output structure */
- int whichfork; /* data or attr fork */
- int prealloced; /* this is a file with
- * preallocated data space */
- int iflags; /* interface flags */
- int bmapi_flags; /* flags for xfs_bmapi */
- int cur_ext = 0;
-
- mp = ip->i_mount;
- iflags = bmv->bmv_iflags;
- whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
-
- if (whichfork == XFS_ATTR_FORK) {
- if (XFS_IFORK_Q(ip)) {
- if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
- ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
- return XFS_ERROR(EINVAL);
- } else if (unlikely(
- ip->i_d.di_aformat != 0 &&
- ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
- XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
- ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- prealloced = 0;
- fixlen = 1LL << 32;
- } else {
- if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
- ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
- return XFS_ERROR(EINVAL);
-
- if (xfs_get_extsz_hint(ip) ||
- ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
- prealloced = 1;
- fixlen = mp->m_super->s_maxbytes;
- } else {
- prealloced = 0;
- fixlen = XFS_ISIZE(ip);
- }
- }
-
- if (bmv->bmv_length == -1) {
- fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
- bmv->bmv_length =
- max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
- } else if (bmv->bmv_length == 0) {
- bmv->bmv_entries = 0;
- return 0;
- } else if (bmv->bmv_length < 0) {
- return XFS_ERROR(EINVAL);
- }
-
- nex = bmv->bmv_count - 1;
- if (nex <= 0)
- return XFS_ERROR(EINVAL);
- bmvend = bmv->bmv_offset + bmv->bmv_length;
-
-
- if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
- return XFS_ERROR(ENOMEM);
- out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
- if (!out) {
- out = kmem_zalloc_large(bmv->bmv_count *
- sizeof(struct getbmapx));
- if (!out)
- return XFS_ERROR(ENOMEM);
- }
+ struct xfs_btree_cur *cur;
+ struct xfs_bmbt_rec_host *gotp;
+ struct xfs_bmbt_irec got;
+ struct xfs_bmbt_irec left;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp;
+ xfs_extnum_t nexts = 0;
+ xfs_fileoff_t startoff;
+ int error = 0;
+ int i;
+ int whichfork = XFS_DATA_FORK;
+ int logflags;
+ xfs_filblks_t blockcount = 0;
+ int total_extents;
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
- if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
- error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
- if (error)
- goto out_unlock_iolock;
- }
- /*
- * even after flushing the inode, there can still be delalloc
- * blocks on the inode beyond EOF due to speculative
- * preallocation. These are not removed until the release
- * function is called or the inode is inactivated. Hence we
- * cannot assert here that ip->i_delayed_blks == 0.
- */
+ if (unlikely(XFS_TEST_ERROR(
+ (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+ mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ XFS_ERROR_REPORT("xfs_bmap_shift_extents",
+ XFS_ERRLEVEL_LOW, mp);
+ return XFS_ERROR(EFSCORRUPTED);
}
- lock = xfs_ilock_map_shared(ip);
-
- /*
- * Don't let nex be bigger than the number of extents
- * we can have assuming alternating holes and real extents.
- */
- if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
- nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
-
- bmapi_flags = xfs_bmapi_aflag(whichfork);
- if (!(iflags & BMV_IF_PREALLOC))
- bmapi_flags |= XFS_BMAPI_IGSTATE;
-
- /*
- * Allocate enough space to handle "subnex" maps at a time.
- */
- error = ENOMEM;
- subnex = 16;
- map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
- if (!map)
- goto out_unlock_ilock;
-
- bmv->bmv_entries = 0;
-
- if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
- (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
- error = 0;
- goto out_free_map;
- }
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
- nexleft = nex;
+ ASSERT(current_ext != NULL);
- do {
- nmap = (nexleft > subnex) ? subnex : nexleft;
- error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
- XFS_BB_TO_FSB(mp, bmv->bmv_length),
- map, &nmap, bmapi_flags);
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ /* Read in all the extents */
+ error = xfs_iread_extents(tp, ip, whichfork);
if (error)
- goto out_free_map;
- ASSERT(nmap <= subnex);
-
- for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
- out[cur_ext].bmv_oflags = 0;
- if (map[i].br_state == XFS_EXT_UNWRITTEN)
- out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
- else if (map[i].br_startblock == DELAYSTARTBLOCK)
- out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
- out[cur_ext].bmv_offset =
- XFS_FSB_TO_BB(mp, map[i].br_startoff);
- out[cur_ext].bmv_length =
- XFS_FSB_TO_BB(mp, map[i].br_blockcount);
- out[cur_ext].bmv_unused1 = 0;
- out[cur_ext].bmv_unused2 = 0;
-
- /*
- * delayed allocation extents that start beyond EOF can
- * occur due to speculative EOF allocation when the
- * delalloc extent is larger than the largest freespace
- * extent at conversion time. These extents cannot be
- * converted by data writeback, so can exist here even
- * if we are not supposed to be finding delalloc
- * extents.
- */
- if (map[i].br_startblock == DELAYSTARTBLOCK &&
- map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
- ASSERT((iflags & BMV_IF_DELALLOC) != 0);
-
- if (map[i].br_startblock == HOLESTARTBLOCK &&
- whichfork == XFS_ATTR_FORK) {
- /* came to the end of attribute fork */
- out[cur_ext].bmv_oflags |= BMV_OF_LAST;
- goto out_free_map;
- }
-
- if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
- prealloced, bmvend,
- map[i].br_startblock))
- goto out_free_map;
-
- bmv->bmv_offset =
- out[cur_ext].bmv_offset +
- out[cur_ext].bmv_length;
- bmv->bmv_length =
- max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
-
- /*
- * In case we don't want to return the hole,
- * don't increase cur_ext so that we can reuse
- * it in the next loop.
- */
- if ((iflags & BMV_IF_NO_HOLES) &&
- map[i].br_startblock == HOLESTARTBLOCK) {
- memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
- continue;
- }
-
- nexleft--;
- bmv->bmv_entries++;
- cur_ext++;
- }
- } while (nmap && nexleft && bmv->bmv_length);
-
- out_free_map:
- kmem_free(map);
- out_unlock_ilock:
- xfs_iunlock_map_shared(ip, lock);
- out_unlock_iolock:
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
- for (i = 0; i < cur_ext; i++) {
- int full = 0; /* user array is full */
-
- /* format results & advance arg */
- error = formatter(&arg, &out[i], &full);
- if (error || full)
- break;
- }
-
- if (is_vmalloc_addr(out))
- kmem_free_large(out);
- else
- kmem_free(out);
- return error;
-}
-
-#ifdef DEBUG
-STATIC struct xfs_buf *
-xfs_bmap_get_bp(
- struct xfs_btree_cur *cur,
- xfs_fsblock_t bno)
-{
- struct xfs_log_item_desc *lidp;
- int i;
-
- if (!cur)
- return NULL;
-
- for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
- if (!cur->bc_bufs[i])
- break;
- if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
- return cur->bc_bufs[i];
- }
-
- /* Chase down all the log items to see if the bp is there */
- list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
- struct xfs_buf_log_item *bip;
- bip = (struct xfs_buf_log_item *)lidp->lid_item;
- if (bip->bli_item.li_type == XFS_LI_BUF &&
- XFS_BUF_ADDR(bip->bli_buf) == bno)
- return bip->bli_buf;
- }
-
- return NULL;
-}
-
-STATIC void
-xfs_check_block(
- struct xfs_btree_block *block,
- xfs_mount_t *mp,
- int root,
- short sz)
-{
- int i, j, dmxr;
- __be64 *pp, *thispa; /* pointer to block address */
- xfs_bmbt_key_t *prevp, *keyp;
-
- ASSERT(be16_to_cpu(block->bb_level) > 0);
-
- prevp = NULL;
- for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
- dmxr = mp->m_bmap_dmxr[0];
- keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
-
- if (prevp) {
- ASSERT(be64_to_cpu(prevp->br_startoff) <
- be64_to_cpu(keyp->br_startoff));
- }
- prevp = keyp;
-
- /*
- * Compare the block numbers to see if there are dups.
- */
- if (root)
- pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
- else
- pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
-
- for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
- if (root)
- thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
- else
- thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
- if (*thispa == *pp) {
- xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
- __func__, j, i,
- (unsigned long long)be64_to_cpu(*thispa));
- panic("%s: ptrs are equal in node\n",
- __func__);
- }
- }
- }
-}
-
-/*
- * Check that the extents for the inode ip are in the right order in all
- * btree leaves.
- */
-
-STATIC void
-xfs_bmap_check_leaf_extents(
- xfs_btree_cur_t *cur, /* btree cursor or null */
- xfs_inode_t *ip, /* incore inode pointer */
- int whichfork) /* data or attr fork */
-{
- struct xfs_btree_block *block; /* current btree block */
- xfs_fsblock_t bno; /* block # of "block" */
- xfs_buf_t *bp; /* buffer for "block" */
- int error; /* error return value */
- xfs_extnum_t i=0, j; /* index into the extents list */
- xfs_ifork_t *ifp; /* fork structure */
- int level; /* btree level, for checking */
- xfs_mount_t *mp; /* file system mount structure */
- __be64 *pp; /* pointer to block address */
- xfs_bmbt_rec_t *ep; /* pointer to current extent */
- xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */
- xfs_bmbt_rec_t *nextp; /* pointer to next extent */
- int bp_release = 0;
-
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
- return;
+ return error;
}
- bno = NULLFSBLOCK;
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- block = ifp->if_broot;
/*
- * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+ * If *current_ext is 0, we would need to lookup the extent
+ * from where we would start shifting and store it in gotp.
*/
- level = be16_to_cpu(block->bb_level);
- ASSERT(level > 0);
- xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
- pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
- bno = be64_to_cpu(*pp);
-
- ASSERT(bno != NULLDFSBNO);
- ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
- ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
-
- /*
- * Go down the tree until leaf level is reached, following the first
- * pointer (leftmost) at each level.
- */
- while (level-- > 0) {
- /* See if buf is in cur first */
- bp_release = 0;
- bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
- if (!bp) {
- bp_release = 1;
- error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
- XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
- if (error)
- goto error_norelse;
- }
- block = XFS_BUF_TO_BLOCK(bp);
- XFS_WANT_CORRUPTED_GOTO(
- xfs_bmap_sanity_check(mp, bp, level),
- error0);
- if (level == 0)
- break;
-
+ if (!*current_ext) {
+ gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
/*
- * Check this block for basic sanity (increasing keys and
- * no duplicate blocks).
+ * gotp can be null in 2 cases: 1) if there are no extents
+ * or 2) start_fsb lies in a hole beyond which there are
+ * no extents. Either way, we are done.
*/
-
- xfs_check_block(block, mp, 0, 0);
- pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
- bno = be64_to_cpu(*pp);
- XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
- if (bp_release) {
- bp_release = 0;
- xfs_trans_brelse(NULL, bp);
+ if (!gotp) {
+ *done = 1;
+ return 0;
}
}
- /*
- * Here with bp and block set to the leftmost leaf node in the tree.
- */
- i = 0;
+ /* We are going to change core inode */
+ logflags = XFS_ILOG_CORE;
+ if (ifp->if_flags & XFS_IFBROOT) {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_private.b.firstblock = *firstblock;
+ cur->bc_private.b.flist = flist;
+ cur->bc_private.b.flags = 0;
+ } else {
+ cur = NULL;
+ logflags |= XFS_ILOG_DEXT;
+ }
/*
- * Loop over all leaf nodes checking that all extents are in the right order.
+ * There may be delalloc extents in the data fork before the range we
+ * are collapsing out, so we cannot
+ * use the count of real extents here. Instead we have to calculate it
+ * from the incore fork.
*/
- for (;;) {
- xfs_fsblock_t nextbno;
- xfs_extnum_t num_recs;
-
-
- num_recs = xfs_btree_get_numrecs(block);
+ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ while (nexts++ < num_exts && *current_ext < total_extents) {
- /*
- * Read-ahead the next leaf block, if any.
- */
-
- nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+ gotp = xfs_iext_get_ext(ifp, *current_ext);
+ xfs_bmbt_get_all(gotp, &got);
+ startoff = got.br_startoff - offset_shift_fsb;
/*
- * Check all the extents to make sure they are OK.
- * If we had a previous block, the last entry should
- * conform with the first entry in this one.
+ * Before shifting extent into hole, make sure that the hole
+ * is large enough to accomodate the shift.
*/
+ if (*current_ext) {
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
+ *current_ext - 1), &left);
- ep = XFS_BMBT_REC_ADDR(mp, block, 1);
- if (i) {
- ASSERT(xfs_bmbt_disk_get_startoff(&last) +
- xfs_bmbt_disk_get_blockcount(&last) <=
- xfs_bmbt_disk_get_startoff(ep));
- }
- for (j = 1; j < num_recs; j++) {
- nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
- ASSERT(xfs_bmbt_disk_get_startoff(ep) +
- xfs_bmbt_disk_get_blockcount(ep) <=
- xfs_bmbt_disk_get_startoff(nextp));
- ep = nextp;
+ if (startoff < left.br_startoff + left.br_blockcount)
+ error = XFS_ERROR(EINVAL);
+ } else if (offset_shift_fsb > got.br_startoff) {
+ /*
+ * When first extent is shifted, offset_shift_fsb
+ * should be less than the stating offset of
+ * the first extent.
+ */
+ error = XFS_ERROR(EINVAL);
}
- last = *ep;
- i += num_recs;
- if (bp_release) {
- bp_release = 0;
- xfs_trans_brelse(NULL, bp);
- }
- bno = nextbno;
- /*
- * If we've reached the end, stop.
- */
- if (bno == NULLFSBLOCK)
- break;
+ if (error)
+ goto del_cursor;
- bp_release = 0;
- bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
- if (!bp) {
- bp_release = 1;
- error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
- XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
+ if (cur) {
+ error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+ got.br_startblock,
+ got.br_blockcount,
+ &i);
if (error)
- goto error_norelse;
+ goto del_cursor;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
}
- block = XFS_BUF_TO_BLOCK(bp);
- }
- if (bp_release) {
- bp_release = 0;
- xfs_trans_brelse(NULL, bp);
- }
- return;
-error0:
- xfs_warn(mp, "%s: at error0", __func__);
- if (bp_release)
- xfs_trans_brelse(NULL, bp);
-error_norelse:
- xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
- __func__, i);
- panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
- return;
-}
-#endif
-
-/*
- * Count fsblocks of the given fork.
- */
-int /* error */
-xfs_bmap_count_blocks(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- int whichfork, /* data or attr fork */
- int *count) /* out: count of blocks */
-{
- struct xfs_btree_block *block; /* current btree block */
- xfs_fsblock_t bno; /* block # of "block" */
- xfs_ifork_t *ifp; /* fork structure */
- int level; /* btree level, for checking */
- xfs_mount_t *mp; /* file system mount structure */
- __be64 *pp; /* pointer to block address */
-
- bno = NULLFSBLOCK;
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
- xfs_bmap_count_leaves(ifp, 0,
- ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
- count);
- return 0;
- }
-
- /*
- * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
- */
- block = ifp->if_broot;
- level = be16_to_cpu(block->bb_level);
- ASSERT(level > 0);
- pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
- bno = be64_to_cpu(*pp);
- ASSERT(bno != NULLDFSBNO);
- ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
- ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
-
- if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
- XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
- mp);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- return 0;
-}
+ /* Check if we can merge 2 adjacent extents */
+ if (*current_ext &&
+ left.br_startoff + left.br_blockcount == startoff &&
+ left.br_startblock + left.br_blockcount ==
+ got.br_startblock &&
+ left.br_state == got.br_state &&
+ left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
+ blockcount = left.br_blockcount +
+ got.br_blockcount;
+ xfs_iext_remove(ip, *current_ext, 1, 0);
+ if (cur) {
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto del_cursor;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+ }
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ gotp = xfs_iext_get_ext(ifp, --*current_ext);
+ xfs_bmbt_get_all(gotp, &got);
-/*
- * Recursively walks each level of a btree
- * to count total fsblocks is use.
- */
-STATIC int /* error */
-xfs_bmap_count_tree(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_fsblock_t blockno, /* file system block number */
- int levelin, /* level in btree */
- int *count) /* Count of blocks */
-{
- int error;
- xfs_buf_t *bp, *nbp;
- int level = levelin;
- __be64 *pp;
- xfs_fsblock_t bno = blockno;
- xfs_fsblock_t nextbno;
- struct xfs_btree_block *block, *nextblock;
- int numrecs;
-
- error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
- if (error)
- return error;
- *count += 1;
- block = XFS_BUF_TO_BLOCK(bp);
+ /* Make cursor point to the extent we will update */
+ if (cur) {
+ error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+ got.br_startblock,
+ got.br_blockcount,
+ &i);
+ if (error)
+ goto del_cursor;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+ }
- if (--level) {
- /* Not at node above leaves, count this level of nodes */
- nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
- while (nextbno != NULLFSBLOCK) {
- error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
- XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
- if (error)
- return error;
- *count += 1;
- nextblock = XFS_BUF_TO_BLOCK(nbp);
- nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
- xfs_trans_brelse(tp, nbp);
+ xfs_bmbt_set_blockcount(gotp, blockcount);
+ got.br_blockcount = blockcount;
+ } else {
+ /* We have to update the startoff */
+ xfs_bmbt_set_startoff(gotp, startoff);
+ got.br_startoff = startoff;
}
- /* Dive to the next level */
- pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
- bno = be64_to_cpu(*pp);
- if (unlikely((error =
- xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
- xfs_trans_brelse(tp, bp);
- XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
- XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
- }
- xfs_trans_brelse(tp, bp);
- } else {
- /* count all level 1 nodes and their leaves */
- for (;;) {
- nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
- numrecs = be16_to_cpu(block->bb_numrecs);
- xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
- xfs_trans_brelse(tp, bp);
- if (nextbno == NULLFSBLOCK)
- break;
- bno = nextbno;
- error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
- XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
+ if (cur) {
+ error = xfs_bmbt_update(cur, got.br_startoff,
+ got.br_startblock,
+ got.br_blockcount,
+ got.br_state);
if (error)
- return error;
- *count += 1;
- block = XFS_BUF_TO_BLOCK(bp);
+ goto del_cursor;
}
- }
- return 0;
-}
-/*
- * Count leaf blocks given a range of extent records.
- */
-STATIC void
-xfs_bmap_count_leaves(
- xfs_ifork_t *ifp,
- xfs_extnum_t idx,
- int numrecs,
- int *count)
-{
- int b;
-
- for (b = 0; b < numrecs; b++) {
- xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
- *count += xfs_bmbt_get_blockcount(frp);
+ (*current_ext)++;
+ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
}
-}
-/*
- * Count leaf blocks given a range of extent records originally
- * in btree format.
- */
-STATIC void
-xfs_bmap_disk_count_leaves(
- struct xfs_mount *mp,
- struct xfs_btree_block *block,
- int numrecs,
- int *count)
-{
- int b;
- xfs_bmbt_rec_t *frp;
-
- for (b = 1; b <= numrecs; b++) {
- frp = XFS_BMBT_REC_ADDR(mp, block, b);
- *count += xfs_bmbt_disk_get_blockcount(frp);
- }
-}
-
-/*
- * dead simple method of punching delalyed allocation blocks from a range in
- * the inode. Walks a block at a time so will be slow, but is only executed in
- * rare error cases so the overhead is not critical. This will alays punch out
- * both the start and end blocks, even if the ranges only partially overlap
- * them, so it is up to the caller to ensure that partial blocks are not
- * passed in.
- */
-int
-xfs_bmap_punch_delalloc_range(
- struct xfs_inode *ip,
- xfs_fileoff_t start_fsb,
- xfs_fileoff_t length)
-{
- xfs_fileoff_t remaining = length;
- int error = 0;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- do {
- int done;
- xfs_bmbt_irec_t imap;
- int nimaps = 1;
- xfs_fsblock_t firstblock;
- xfs_bmap_free_t flist;
-
- /*
- * Map the range first and check that it is a delalloc extent
- * before trying to unmap the range. Otherwise we will be
- * trying to remove a real extent (which requires a
- * transaction) or a hole, which is probably a bad idea...
- */
- error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
- XFS_BMAPI_ENTIRE);
-
- if (error) {
- /* something screwed, just bail */
- if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- xfs_alert(ip->i_mount,
- "Failed delalloc mapping lookup ino %lld fsb %lld.",
- ip->i_ino, start_fsb);
- }
- break;
- }
- if (!nimaps) {
- /* nothing there */
- goto next_block;
- }
- if (imap.br_startblock != DELAYSTARTBLOCK) {
- /* been converted, ignore */
- goto next_block;
- }
- WARN_ON(imap.br_blockcount == 0);
-
- /*
- * Note: while we initialise the firstblock/flist pair, they
- * should never be used because blocks should never be
- * allocated or freed for a delalloc extent and hence we need
- * don't cancel or finish them after the xfs_bunmapi() call.
- */
- xfs_bmap_init(&flist, &firstblock);
- error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
- &flist, &done);
- if (error)
- break;
+ /* Check if we are done */
+ if (*current_ext == total_extents)
+ *done = 1;
- ASSERT(!flist.xbf_count && !flist.xbf_first);
-next_block:
- start_fsb++;
- remaining--;
- } while(remaining > 0);
+del_cursor:
+ if (cur)
+ xfs_btree_del_cursor(cur,
+ error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ xfs_trans_log_inode(tp, ip, logflags);
return error;
}
-
-/*
- * Convert the given file system block to a disk block. We have to treat it
- * differently based on whether the file is a real time file or not, because the
- * bmap code does.
- */
-xfs_daddr_t
-xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
-{
- return (XFS_IS_REALTIME_INODE(ip) ? \
- (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
- XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
-}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 5f469c3516e..b879ca56a64 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -77,7 +77,6 @@ typedef struct xfs_bmap_free
* from written to unwritten, otherwise convert from unwritten to written.
*/
#define XFS_BMAPI_CONVERT 0x040
-#define XFS_BMAPI_STACK_SWITCH 0x080
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
@@ -86,8 +85,7 @@ typedef struct xfs_bmap_free
{ XFS_BMAPI_PREALLOC, "PREALLOC" }, \
{ XFS_BMAPI_IGSTATE, "IGSTATE" }, \
{ XFS_BMAPI_CONTIG, "CONTIG" }, \
- { XFS_BMAPI_CONVERT, "CONVERT" }, \
- { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" }
+ { XFS_BMAPI_CONVERT, "CONVERT" }
static inline int xfs_bmapi_aflag(int w)
@@ -108,41 +106,6 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
}
/*
- * Argument structure for xfs_bmap_alloc.
- */
-typedef struct xfs_bmalloca {
- xfs_fsblock_t *firstblock; /* i/o first block allocated */
- struct xfs_bmap_free *flist; /* bmap freelist */
- struct xfs_trans *tp; /* transaction pointer */
- struct xfs_inode *ip; /* incore inode pointer */
- struct xfs_bmbt_irec prev; /* extent before the new one */
- struct xfs_bmbt_irec got; /* extent after, or delayed */
-
- xfs_fileoff_t offset; /* offset in file filling in */
- xfs_extlen_t length; /* i/o length asked/allocated */
- xfs_fsblock_t blkno; /* starting block of new extent */
-
- struct xfs_btree_cur *cur; /* btree cursor */
- xfs_extnum_t idx; /* current extent index */
- int nallocs;/* number of extents alloc'd */
- int logflags;/* flags for transaction logging */
-
- xfs_extlen_t total; /* total blocks needed for xaction */
- xfs_extlen_t minlen; /* minimum allocation size (blocks) */
- xfs_extlen_t minleft; /* amount must be left after alloc */
- char eof; /* set if allocating past last extent */
- char wasdel; /* replacing a delayed allocation */
- char userdata;/* set if is user data */
- char aeof; /* allocated space at eof */
- char conv; /* overwriting unwritten extents */
- char stack_switch;
- int flags;
- struct completion *done;
- struct work_struct work;
- int result;
-} xfs_bmalloca_t;
-
-/*
* Flags for xfs_bmap_add_extent*.
*/
#define BMAP_LEFT_CONTIG (1 << 0)
@@ -162,7 +125,17 @@ typedef struct xfs_bmalloca {
{ BMAP_RIGHT_FILLING, "RF" }, \
{ BMAP_ATTRFORK, "ATTR" }
-#if defined(__KERNEL) && defined(DEBUG)
+
+/*
+ * This macro is used to determine how many extents will be shifted
+ * in one write transaction. We could require two splits,
+ * an extent move on the first and an extent merge on the second,
+ * So it is proper that one extent is shifted inside write transaction
+ * at a time.
+ */
+#define XFS_BMAP_MAX_SHIFT_EXTENTS 1
+
+#ifdef DEBUG
void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
int whichfork, unsigned long caller_ip);
#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
@@ -172,6 +145,7 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
#endif
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
struct xfs_bmap_free *flist, struct xfs_mount *mp);
void xfs_bmap_cancel(struct xfs_bmap_free *flist);
@@ -180,8 +154,8 @@ int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *last_block, int whichfork);
-int xfs_bmap_last_offset(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t *unused, int whichfork);
+int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
+ int whichfork);
int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork);
int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
int whichfork);
@@ -203,24 +177,10 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
xfs_extnum_t num);
uint xfs_default_attroffset(struct xfs_inode *ip);
-
-#ifdef __KERNEL__
-/* bmap to userspace formatter - copy to user & advance pointer */
-typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
-
-int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
- int *committed);
-int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
- xfs_bmap_format_t formatter, void *arg);
-int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
- int whichfork, int *eof);
-int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
- int whichfork, int *count);
-int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
- xfs_fileoff_t start_fsb, xfs_fileoff_t length);
-
-xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
-
-#endif /* __KERNEL__ */
+int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+ int *done, xfs_fileoff_t start_fsb,
+ xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
+ xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist,
+ int num_exts);
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 061b45cbe61..948836c4fd9 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -17,26 +17,26 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
-#include "xfs_itable.h"
+#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_dinode.h"
/*
* Determine the extent state.
@@ -59,25 +59,32 @@ xfs_extent_state(
*/
void
xfs_bmdr_to_bmbt(
- struct xfs_mount *mp,
+ struct xfs_inode *ip,
xfs_bmdr_block_t *dblock,
int dblocklen,
struct xfs_btree_block *rblock,
int rblocklen)
{
+ struct xfs_mount *mp = ip->i_mount;
int dmxr;
xfs_bmbt_key_t *fkp;
__be64 *fpp;
xfs_bmbt_key_t *tkp;
__be64 *tpp;
- rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+ XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+ XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+ XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+ XFS_BTREE_LONG_PTRS);
+
rblock->bb_level = dblock->bb_level;
ASSERT(be16_to_cpu(rblock->bb_level) > 0);
rblock->bb_numrecs = dblock->bb_numrecs;
- rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
- rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
- dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
+ dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
@@ -424,13 +431,19 @@ xfs_bmbt_to_bmdr(
xfs_bmbt_key_t *tkp;
__be64 *tpp;
- ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
+ ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid));
+ ASSERT(rblock->bb_u.l.bb_blkno ==
+ cpu_to_be64(XFS_BUF_DADDR_NULL));
+ } else
+ ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO));
ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO));
ASSERT(rblock->bb_level != 0);
dblock->bb_level = rblock->bb_level;
dblock->bb_numrecs = rblock->bb_numrecs;
- dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
+ dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
@@ -506,7 +519,6 @@ xfs_bmbt_alloc_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *start,
union xfs_btree_ptr *new,
- int length,
int *stat)
{
xfs_alloc_arg_t args; /* block allocation args */
@@ -659,8 +671,7 @@ xfs_bmbt_get_dmaxrecs(
{
if (level != cur->bc_nlevels - 1)
return cur->bc_mp->m_bmap_dmxr[level != 0];
- return xfs_bmdr_maxrecs(cur->bc_mp, cur->bc_private.b.forksize,
- level == 0);
+ return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0);
}
STATIC void
@@ -708,59 +719,87 @@ xfs_bmbt_key_diff(
cur->bc_rec.b.br_startoff;
}
-static void
+static bool
xfs_bmbt_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
unsigned int level;
- int lblock_ok; /* block passes checks */
- /* magic number and level verification.
+ switch (block->bb_magic) {
+ case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn)
+ return false;
+ /*
+ * XXX: need a better way of verifying the owner here. Right now
+ * just make sure there has been one set.
+ */
+ if (be64_to_cpu(block->bb_u.l.bb_owner) == 0)
+ return false;
+ /* fall through */
+ case cpu_to_be32(XFS_BMAP_MAGIC):
+ break;
+ default:
+ return false;
+ }
+
+ /*
+ * numrecs and level verification.
*
- * We don't know waht fork we belong to, so just verify that the level
+ * We don't know what fork we belong to, so just verify that the level
* is less than the maximum of the two. Later checks will be more
* precise.
*/
level = be16_to_cpu(block->bb_level);
- lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) &&
- level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]);
-
- /* numrecs verification */
- lblock_ok = lblock_ok &&
- be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0];
+ if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]))
+ return false;
+ if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+ return false;
/* sibling pointer verification */
- lblock_ok = lblock_ok &&
- block->bb_u.l.bb_leftsib &&
- (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
- XFS_FSB_SANITY_CHECK(mp,
- be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
- block->bb_u.l.bb_rightsib &&
- (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
- XFS_FSB_SANITY_CHECK(mp,
- be64_to_cpu(block->bb_u.l.bb_rightsib)));
-
- if (!lblock_ok) {
- trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
+ if (!block->bb_u.l.bb_leftsib ||
+ (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) &&
+ !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))))
+ return false;
+ if (!block->bb_u.l.bb_rightsib ||
+ (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) &&
+ !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))))
+ return false;
+
+ return true;
}
static void
xfs_bmbt_read_verify(
struct xfs_buf *bp)
{
- xfs_bmbt_verify(bp);
+ if (!xfs_btree_lblock_verify_crc(bp))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_bmbt_verify(bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp);
+ }
}
static void
xfs_bmbt_write_verify(
struct xfs_buf *bp)
{
- xfs_bmbt_verify(bp);
+ if (!xfs_bmbt_verify(bp)) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+ xfs_btree_lblock_calc_crc(bp);
}
const struct xfs_buf_ops xfs_bmbt_buf_ops = {
@@ -769,7 +808,7 @@ const struct xfs_buf_ops xfs_bmbt_buf_ops = {
};
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
STATIC int
xfs_bmbt_keys_inorder(
struct xfs_btree_cur *cur,
@@ -809,7 +848,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
.init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
.key_diff = xfs_bmbt_key_diff,
.buf_ops = &xfs_bmbt_buf_ops,
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
.keys_inorder = xfs_bmbt_keys_inorder,
.recs_inorder = xfs_bmbt_recs_inorder,
#endif
@@ -838,6 +877,8 @@ xfs_bmbt_init_cursor(
cur->bc_ops = &xfs_bmbt_ops;
cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
cur->bc_private.b.ip = ip;
@@ -871,7 +912,6 @@ xfs_bmbt_maxrecs(
*/
int
xfs_bmdr_maxrecs(
- struct xfs_mount *mp,
int blocklen,
int leaf)
{
@@ -881,3 +921,47 @@ xfs_bmdr_maxrecs(
return blocklen / sizeof(xfs_bmdr_rec_t);
return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
}
+
+/*
+ * Change the owner of a btree format fork fo the inode passed in. Change it to
+ * the owner of that is passed in so that we can change owners before or after
+ * we switch forks between inodes. The operation that the caller is doing will
+ * determine whether is needs to change owner before or after the switch.
+ *
+ * For demand paged transactional modification, the fork switch should be done
+ * after reading in all the blocks, modifying them and pinning them in the
+ * transaction. For modification when the buffers are already pinned in memory,
+ * the fork switch can be done before changing the owner as we won't need to
+ * validate the owner until the btree buffers are unpinned and writes can occur
+ * again.
+ *
+ * For recovery based ownership change, there is no transactional context and
+ * so a buffer list must be supplied so that we can record the buffers that we
+ * modified for the caller to issue IO on.
+ */
+int
+xfs_bmbt_change_owner(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_ino_t new_owner,
+ struct list_head *buffer_list)
+{
+ struct xfs_btree_cur *cur;
+ int error;
+
+ ASSERT(tp || buffer_list);
+ ASSERT(!(tp && buffer_list));
+ if (whichfork == XFS_DATA_FORK)
+ ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE);
+ else
+ ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE);
+
+ cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
+ if (!cur)
+ return ENOMEM;
+
+ error = xfs_btree_change_owner(cur, new_owner, buffer_list);
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ return error;
+}
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 88469ca0869..819a8a4dee9 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -18,8 +18,6 @@
#ifndef __XFS_BMAP_BTREE_H__
#define __XFS_BMAP_BTREE_H__
-#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
-
struct xfs_btree_cur;
struct xfs_btree_block;
struct xfs_mount;
@@ -27,85 +25,6 @@ struct xfs_inode;
struct xfs_trans;
/*
- * Bmap root header, on-disk form only.
- */
-typedef struct xfs_bmdr_block {
- __be16 bb_level; /* 0 is a leaf */
- __be16 bb_numrecs; /* current # of data records */
-} xfs_bmdr_block_t;
-
-/*
- * Bmap btree record and extent descriptor.
- * l0:63 is an extent flag (value 1 indicates non-normal).
- * l0:9-62 are startoff.
- * l0:0-8 and l1:21-63 are startblock.
- * l1:0-20 are blockcount.
- */
-#define BMBT_EXNTFLAG_BITLEN 1
-#define BMBT_STARTOFF_BITLEN 54
-#define BMBT_STARTBLOCK_BITLEN 52
-#define BMBT_BLOCKCOUNT_BITLEN 21
-
-typedef struct xfs_bmbt_rec {
- __be64 l0, l1;
-} xfs_bmbt_rec_t;
-
-typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */
-typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
-
-typedef struct xfs_bmbt_rec_host {
- __uint64_t l0, l1;
-} xfs_bmbt_rec_host_t;
-
-/*
- * Values and macros for delayed-allocation startblock fields.
- */
-#define STARTBLOCKVALBITS 17
-#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20)
-#define DSTARTBLOCKMASKBITS (15 + 20)
-#define STARTBLOCKMASK \
- (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
-#define DSTARTBLOCKMASK \
- (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
-
-static inline int isnullstartblock(xfs_fsblock_t x)
-{
- return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
-}
-
-static inline int isnulldstartblock(xfs_dfsbno_t x)
-{
- return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
-}
-
-static inline xfs_fsblock_t nullstartblock(int k)
-{
- ASSERT(k < (1 << STARTBLOCKVALBITS));
- return STARTBLOCKMASK | (k);
-}
-
-static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
-{
- return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
-}
-
-/*
- * Possible extent formats.
- */
-typedef enum {
- XFS_EXTFMT_NOSTATE = 0,
- XFS_EXTFMT_HASSTATE
-} xfs_exntfmt_t;
-
-/*
- * Possible extent states.
- */
-typedef enum {
- XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
- XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
-} xfs_exntst_t;
-
-/*
* Extent state and extent format macros.
*/
#define XFS_EXTFMT_INODE(x) \
@@ -114,32 +33,11 @@ typedef enum {
#define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN)
/*
- * Incore version of above.
- */
-typedef struct xfs_bmbt_irec
-{
- xfs_fileoff_t br_startoff; /* starting file offset */
- xfs_fsblock_t br_startblock; /* starting block number */
- xfs_filblks_t br_blockcount; /* number of blocks */
- xfs_exntst_t br_state; /* extent state */
-} xfs_bmbt_irec_t;
-
-/*
- * Key structure for non-leaf levels of the tree.
- */
-typedef struct xfs_bmbt_key {
- __be64 br_startoff; /* starting file offset */
-} xfs_bmbt_key_t, xfs_bmdr_key_t;
-
-/* btree pointer type */
-typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
-
-/*
* Btree block header size depends on a superblock flag.
- *
- * (not quite yet, but soon)
*/
-#define XFS_BMBT_BLOCK_LEN(mp) XFS_BTREE_LBLOCK_LEN
+#define XFS_BMBT_BLOCK_LEN(mp) \
+ (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN)
#define XFS_BMBT_REC_ADDR(mp, block, index) \
((xfs_bmbt_rec_t *) \
@@ -186,15 +84,17 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
-#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \
- (int)(XFS_BTREE_LBLOCK_LEN + \
+#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \
+ (int)(XFS_BMBT_BLOCK_LEN(mp) + \
((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
-#define XFS_BMAP_BROOT_SPACE(bb) \
- (XFS_BMAP_BROOT_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
+#define XFS_BMAP_BROOT_SPACE(mp, bb) \
+ (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs)))
#define XFS_BMDR_SPACE_CALC(nrecs) \
(int)(sizeof(xfs_bmdr_block_t) + \
((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+#define XFS_BMAP_BMDR_SPACE(bb) \
+ (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
/*
* Maximum number of bmap btree levels.
@@ -204,7 +104,7 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
/*
* Prototypes for xfs_bmap.c to call.
*/
-extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int,
+extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int,
struct xfs_btree_block *, int);
extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
@@ -230,12 +130,14 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
xfs_bmdr_block_t *, int);
extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
-extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+extern int xfs_bmdr_maxrecs(int blocklen, int leaf);
extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, xfs_ino_t new_owner,
+ struct list_head *buffer_list);
+
extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_inode *, int);
-extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
-
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
new file mode 100644
index 00000000000..64731ef3324
--- /dev/null
+++ b/fs/xfs/xfs_bmap_util.c
@@ -0,0 +1,1897 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_extfree_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_log.h"
+#include "xfs_dinode.h"
+
+/* Kernel only BMAP related definitions and functions */
+
+/*
+ * Convert the given file system block to a disk block. We have to treat it
+ * differently based on whether the file is a real time file or not, because the
+ * bmap code does.
+ */
+xfs_daddr_t
+xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
+{
+ return (XFS_IS_REALTIME_INODE(ip) ? \
+ (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
+ XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
+}
+
+/*
+ * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
+ * caller. Frees all the extents that need freeing, which must be done
+ * last due to locking considerations. We never free any extents in
+ * the first transaction.
+ *
+ * Return 1 if the given transaction was committed and a new one
+ * started, and 0 otherwise in the committed parameter.
+ */
+int /* error */
+xfs_bmap_finish(
+ xfs_trans_t **tp, /* transaction pointer addr */
+ xfs_bmap_free_t *flist, /* i/o: list extents to free */
+ int *committed) /* xact committed or not */
+{
+ xfs_efd_log_item_t *efd; /* extent free data */
+ xfs_efi_log_item_t *efi; /* extent free intention */
+ int error; /* error return value */
+ xfs_bmap_free_item_t *free; /* free extent item */
+ struct xfs_trans_res tres; /* new log reservation */
+ xfs_mount_t *mp; /* filesystem mount structure */
+ xfs_bmap_free_item_t *next; /* next item on free list */
+ xfs_trans_t *ntp; /* new transaction pointer */
+
+ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+ if (flist->xbf_count == 0) {
+ *committed = 0;
+ return 0;
+ }
+ ntp = *tp;
+ efi = xfs_trans_get_efi(ntp, flist->xbf_count);
+ for (free = flist->xbf_first; free; free = free->xbfi_next)
+ xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
+ free->xbfi_blockcount);
+
+ tres.tr_logres = ntp->t_log_res;
+ tres.tr_logcount = ntp->t_log_count;
+ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ ntp = xfs_trans_dup(*tp);
+ error = xfs_trans_commit(*tp, 0);
+ *tp = ntp;
+ *committed = 1;
+ /*
+ * We have a new transaction, so we should return committed=1,
+ * even though we're returning an error.
+ */
+ if (error)
+ return error;
+
+ /*
+ * transaction commit worked ok so we can drop the extra ticket
+ * reference that we gained in xfs_trans_dup()
+ */
+ xfs_log_ticket_put(ntp->t_ticket);
+
+ error = xfs_trans_reserve(ntp, &tres, 0, 0);
+ if (error)
+ return error;
+ efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
+ for (free = flist->xbf_first; free != NULL; free = next) {
+ next = free->xbfi_next;
+ if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
+ free->xbfi_blockcount))) {
+ /*
+ * The bmap free list will be cleaned up at a
+ * higher level. The EFI will be canceled when
+ * this transaction is aborted.
+ * Need to force shutdown here to make sure it
+ * happens, since this transaction may not be
+ * dirty yet.
+ */
+ mp = ntp->t_mountp;
+ if (!XFS_FORCED_SHUTDOWN(mp))
+ xfs_force_shutdown(mp,
+ (error == EFSCORRUPTED) ?
+ SHUTDOWN_CORRUPT_INCORE :
+ SHUTDOWN_META_IO_ERROR);
+ return error;
+ }
+ xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
+ free->xbfi_blockcount);
+ xfs_bmap_del_free(flist, NULL, free);
+ }
+ return 0;
+}
+
+int
+xfs_bmap_rtalloc(
+ struct xfs_bmalloca *ap) /* bmap alloc argument struct */
+{
+ xfs_alloctype_t atype = 0; /* type for allocation routines */
+ int error; /* error return value */
+ xfs_mount_t *mp; /* mount point structure */
+ xfs_extlen_t prod = 0; /* product factor for allocators */
+ xfs_extlen_t ralen = 0; /* realtime allocation length */
+ xfs_extlen_t align; /* minimum allocation alignment */
+ xfs_rtblock_t rtb;
+
+ mp = ap->ip->i_mount;
+ align = xfs_get_extsz_hint(ap->ip);
+ prod = align / mp->m_sb.sb_rextsize;
+ error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
+ align, 1, ap->eof, 0,
+ ap->conv, &ap->offset, &ap->length);
+ if (error)
+ return error;
+ ASSERT(ap->length);
+ ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
+
+ /*
+ * If the offset & length are not perfectly aligned
+ * then kill prod, it will just get us in trouble.
+ */
+ if (do_mod(ap->offset, align) || ap->length % align)
+ prod = 1;
+ /*
+ * Set ralen to be the actual requested length in rtextents.
+ */
+ ralen = ap->length / mp->m_sb.sb_rextsize;
+ /*
+ * If the old value was close enough to MAXEXTLEN that
+ * we rounded up to it, cut it back so it's valid again.
+ * Note that if it's a really large request (bigger than
+ * MAXEXTLEN), we don't hear about that number, and can't
+ * adjust the starting point to match it.
+ */
+ if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
+ ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
+
+ /*
+ * Lock out other modifications to the RT bitmap inode.
+ */
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+
+ /*
+ * If it's an allocation to an empty file at offset 0,
+ * pick an extent that will space things out in the rt area.
+ */
+ if (ap->eof && ap->offset == 0) {
+ xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
+
+ error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
+ if (error)
+ return error;
+ ap->blkno = rtx * mp->m_sb.sb_rextsize;
+ } else {
+ ap->blkno = 0;
+ }
+
+ xfs_bmap_adjacent(ap);
+
+ /*
+ * Realtime allocation, done through xfs_rtallocate_extent.
+ */
+ atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
+ do_div(ap->blkno, mp->m_sb.sb_rextsize);
+ rtb = ap->blkno;
+ ap->length = ralen;
+ if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
+ &ralen, atype, ap->wasdel, prod, &rtb)))
+ return error;
+ if (rtb == NULLFSBLOCK && prod > 1 &&
+ (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
+ ap->length, &ralen, atype,
+ ap->wasdel, 1, &rtb)))
+ return error;
+ ap->blkno = rtb;
+ if (ap->blkno != NULLFSBLOCK) {
+ ap->blkno *= mp->m_sb.sb_rextsize;
+ ralen *= mp->m_sb.sb_rextsize;
+ ap->length = ralen;
+ ap->ip->i_d.di_nblocks += ralen;
+ xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+ if (ap->wasdel)
+ ap->ip->i_delayed_blks -= ralen;
+ /*
+ * Adjust the disk quota also. This was reserved
+ * earlier.
+ */
+ xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+ ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
+ XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
+ } else {
+ ap->length = 0;
+ }
+ return 0;
+}
+
+/*
+ * Check if the endoff is outside the last extent. If so the caller will grow
+ * the allocation to a stripe unit boundary. All offsets are considered outside
+ * the end of file for an empty fork, so 1 is returned in *eof in that case.
+ */
+int
+xfs_bmap_eof(
+ struct xfs_inode *ip,
+ xfs_fileoff_t endoff,
+ int whichfork,
+ int *eof)
+{
+ struct xfs_bmbt_irec rec;
+ int error;
+
+ error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
+ if (error || *eof)
+ return error;
+
+ *eof = endoff >= rec.br_startoff + rec.br_blockcount;
+ return 0;
+}
+
+/*
+ * Extent tree block counting routines.
+ */
+
+/*
+ * Count leaf blocks given a range of extent records.
+ */
+STATIC void
+xfs_bmap_count_leaves(
+ xfs_ifork_t *ifp,
+ xfs_extnum_t idx,
+ int numrecs,
+ int *count)
+{
+ int b;
+
+ for (b = 0; b < numrecs; b++) {
+ xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
+ *count += xfs_bmbt_get_blockcount(frp);
+ }
+}
+
+/*
+ * Count leaf blocks given a range of extent records originally
+ * in btree format.
+ */
+STATIC void
+xfs_bmap_disk_count_leaves(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *block,
+ int numrecs,
+ int *count)
+{
+ int b;
+ xfs_bmbt_rec_t *frp;
+
+ for (b = 1; b <= numrecs; b++) {
+ frp = XFS_BMBT_REC_ADDR(mp, block, b);
+ *count += xfs_bmbt_disk_get_blockcount(frp);
+ }
+}
+
+/*
+ * Recursively walks each level of a btree
+ * to count total fsblocks in use.
+ */
+STATIC int /* error */
+xfs_bmap_count_tree(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_fsblock_t blockno, /* file system block number */
+ int levelin, /* level in btree */
+ int *count) /* Count of blocks */
+{
+ int error;
+ xfs_buf_t *bp, *nbp;
+ int level = levelin;
+ __be64 *pp;
+ xfs_fsblock_t bno = blockno;
+ xfs_fsblock_t nextbno;
+ struct xfs_btree_block *block, *nextblock;
+ int numrecs;
+
+ error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ return error;
+ *count += 1;
+ block = XFS_BUF_TO_BLOCK(bp);
+
+ if (--level) {
+ /* Not at node above leaves, count this level of nodes */
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+ while (nextbno != NULLFSBLOCK) {
+ error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ return error;
+ *count += 1;
+ nextblock = XFS_BUF_TO_BLOCK(nbp);
+ nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
+ xfs_trans_brelse(tp, nbp);
+ }
+
+ /* Dive to the next level */
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+ bno = be64_to_cpu(*pp);
+ if (unlikely((error =
+ xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
+ xfs_trans_brelse(tp, bp);
+ XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
+ XFS_ERRLEVEL_LOW, mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ xfs_trans_brelse(tp, bp);
+ } else {
+ /* count all level 1 nodes and their leaves */
+ for (;;) {
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+ numrecs = be16_to_cpu(block->bb_numrecs);
+ xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
+ xfs_trans_brelse(tp, bp);
+ if (nextbno == NULLFSBLOCK)
+ break;
+ bno = nextbno;
+ error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ return error;
+ *count += 1;
+ block = XFS_BUF_TO_BLOCK(bp);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Count fsblocks of the given fork.
+ */
+int /* error */
+xfs_bmap_count_blocks(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode */
+ int whichfork, /* data or attr fork */
+ int *count) /* out: count of blocks */
+{
+ struct xfs_btree_block *block; /* current btree block */
+ xfs_fsblock_t bno; /* block # of "block" */
+ xfs_ifork_t *ifp; /* fork structure */
+ int level; /* btree level, for checking */
+ xfs_mount_t *mp; /* file system mount structure */
+ __be64 *pp; /* pointer to block address */
+
+ bno = NULLFSBLOCK;
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
+ xfs_bmap_count_leaves(ifp, 0,
+ ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
+ count);
+ return 0;
+ }
+
+ /*
+ * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+ */
+ block = ifp->if_broot;
+ level = be16_to_cpu(block->bb_level);
+ ASSERT(level > 0);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+ bno = be64_to_cpu(*pp);
+ ASSERT(bno != NULLDFSBNO);
+ ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+ if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
+ XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
+ mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ return 0;
+}
+
+/*
+ * returns 1 for success, 0 if we failed to map the extent.
+ */
+STATIC int
+xfs_getbmapx_fix_eof_hole(
+ xfs_inode_t *ip, /* xfs incore inode pointer */
+ struct getbmapx *out, /* output structure */
+ int prealloced, /* this is a file with
+ * preallocated data space */
+ __int64_t end, /* last block requested */
+ xfs_fsblock_t startblock)
+{
+ __int64_t fixlen;
+ xfs_mount_t *mp; /* file system mount point */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_extnum_t lastx; /* last extent pointer */
+ xfs_fileoff_t fileblock;
+
+ if (startblock == HOLESTARTBLOCK) {
+ mp = ip->i_mount;
+ out->bmv_block = -1;
+ fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
+ fixlen -= out->bmv_offset;
+ if (prealloced && out->bmv_offset + out->bmv_length == end) {
+ /* Came to hole at EOF. Trim it. */
+ if (fixlen <= 0)
+ return 0;
+ out->bmv_length = fixlen;
+ }
+ } else {
+ if (startblock == DELAYSTARTBLOCK)
+ out->bmv_block = -2;
+ else
+ out->bmv_block = xfs_fsb_to_db(ip, startblock);
+ fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
+ (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
+ out->bmv_oflags |= BMV_OF_LAST;
+ }
+
+ return 1;
+}
+
+/*
+ * Get inode's extents as described in bmv, and format for output.
+ * Calls formatter to fill the user's buffer until all extents
+ * are mapped, until the passed-in bmv->bmv_count slots have
+ * been filled, or until the formatter short-circuits the loop,
+ * if it is tracking filled-in extents on its own.
+ */
+int /* error code */
+xfs_getbmap(
+ xfs_inode_t *ip,
+ struct getbmapx *bmv, /* user bmap structure */
+ xfs_bmap_format_t formatter, /* format to user */
+ void *arg) /* formatter arg */
+{
+ __int64_t bmvend; /* last block requested */
+ int error = 0; /* return value */
+ __int64_t fixlen; /* length for -1 case */
+ int i; /* extent number */
+ int lock; /* lock state */
+ xfs_bmbt_irec_t *map; /* buffer for user's data */
+ xfs_mount_t *mp; /* file system mount point */
+ int nex; /* # of user extents can do */
+ int nexleft; /* # of user extents left */
+ int subnex; /* # of bmapi's can do */
+ int nmap; /* number of map entries */
+ struct getbmapx *out; /* output structure */
+ int whichfork; /* data or attr fork */
+ int prealloced; /* this is a file with
+ * preallocated data space */
+ int iflags; /* interface flags */
+ int bmapi_flags; /* flags for xfs_bmapi */
+ int cur_ext = 0;
+
+ mp = ip->i_mount;
+ iflags = bmv->bmv_iflags;
+ whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
+
+ if (whichfork == XFS_ATTR_FORK) {
+ if (XFS_IFORK_Q(ip)) {
+ if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
+ ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
+ return XFS_ERROR(EINVAL);
+ } else if (unlikely(
+ ip->i_d.di_aformat != 0 &&
+ ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
+ XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
+ ip->i_mount);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ prealloced = 0;
+ fixlen = 1LL << 32;
+ } else {
+ if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
+ ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
+ return XFS_ERROR(EINVAL);
+
+ if (xfs_get_extsz_hint(ip) ||
+ ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
+ prealloced = 1;
+ fixlen = mp->m_super->s_maxbytes;
+ } else {
+ prealloced = 0;
+ fixlen = XFS_ISIZE(ip);
+ }
+ }
+
+ if (bmv->bmv_length == -1) {
+ fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
+ bmv->bmv_length =
+ max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
+ } else if (bmv->bmv_length == 0) {
+ bmv->bmv_entries = 0;
+ return 0;
+ } else if (bmv->bmv_length < 0) {
+ return XFS_ERROR(EINVAL);
+ }
+
+ nex = bmv->bmv_count - 1;
+ if (nex <= 0)
+ return XFS_ERROR(EINVAL);
+ bmvend = bmv->bmv_offset + bmv->bmv_length;
+
+
+ if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
+ return XFS_ERROR(ENOMEM);
+ out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
+ if (!out)
+ return XFS_ERROR(ENOMEM);
+
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ if (whichfork == XFS_DATA_FORK) {
+ if (!(iflags & BMV_IF_DELALLOC) &&
+ (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
+ error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ if (error)
+ goto out_unlock_iolock;
+
+ /*
+ * Even after flushing the inode, there can still be
+ * delalloc blocks on the inode beyond EOF due to
+ * speculative preallocation. These are not removed
+ * until the release function is called or the inode
+ * is inactivated. Hence we cannot assert here that
+ * ip->i_delayed_blks == 0.
+ */
+ }
+
+ lock = xfs_ilock_data_map_shared(ip);
+ } else {
+ lock = xfs_ilock_attr_map_shared(ip);
+ }
+
+ /*
+ * Don't let nex be bigger than the number of extents
+ * we can have assuming alternating holes and real extents.
+ */
+ if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
+ nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
+
+ bmapi_flags = xfs_bmapi_aflag(whichfork);
+ if (!(iflags & BMV_IF_PREALLOC))
+ bmapi_flags |= XFS_BMAPI_IGSTATE;
+
+ /*
+ * Allocate enough space to handle "subnex" maps at a time.
+ */
+ error = ENOMEM;
+ subnex = 16;
+ map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
+ if (!map)
+ goto out_unlock_ilock;
+
+ bmv->bmv_entries = 0;
+
+ if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
+ (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
+ error = 0;
+ goto out_free_map;
+ }
+
+ nexleft = nex;
+
+ do {
+ nmap = (nexleft > subnex) ? subnex : nexleft;
+ error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
+ XFS_BB_TO_FSB(mp, bmv->bmv_length),
+ map, &nmap, bmapi_flags);
+ if (error)
+ goto out_free_map;
+ ASSERT(nmap <= subnex);
+
+ for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
+ out[cur_ext].bmv_oflags = 0;
+ if (map[i].br_state == XFS_EXT_UNWRITTEN)
+ out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
+ else if (map[i].br_startblock == DELAYSTARTBLOCK)
+ out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
+ out[cur_ext].bmv_offset =
+ XFS_FSB_TO_BB(mp, map[i].br_startoff);
+ out[cur_ext].bmv_length =
+ XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+ out[cur_ext].bmv_unused1 = 0;
+ out[cur_ext].bmv_unused2 = 0;
+
+ /*
+ * delayed allocation extents that start beyond EOF can
+ * occur due to speculative EOF allocation when the
+ * delalloc extent is larger than the largest freespace
+ * extent at conversion time. These extents cannot be
+ * converted by data writeback, so can exist here even
+ * if we are not supposed to be finding delalloc
+ * extents.
+ */
+ if (map[i].br_startblock == DELAYSTARTBLOCK &&
+ map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
+ ASSERT((iflags & BMV_IF_DELALLOC) != 0);
+
+ if (map[i].br_startblock == HOLESTARTBLOCK &&
+ whichfork == XFS_ATTR_FORK) {
+ /* came to the end of attribute fork */
+ out[cur_ext].bmv_oflags |= BMV_OF_LAST;
+ goto out_free_map;
+ }
+
+ if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
+ prealloced, bmvend,
+ map[i].br_startblock))
+ goto out_free_map;
+
+ bmv->bmv_offset =
+ out[cur_ext].bmv_offset +
+ out[cur_ext].bmv_length;
+ bmv->bmv_length =
+ max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+
+ /*
+ * In case we don't want to return the hole,
+ * don't increase cur_ext so that we can reuse
+ * it in the next loop.
+ */
+ if ((iflags & BMV_IF_NO_HOLES) &&
+ map[i].br_startblock == HOLESTARTBLOCK) {
+ memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
+ continue;
+ }
+
+ nexleft--;
+ bmv->bmv_entries++;
+ cur_ext++;
+ }
+ } while (nmap && nexleft && bmv->bmv_length);
+
+ out_free_map:
+ kmem_free(map);
+ out_unlock_ilock:
+ xfs_iunlock(ip, lock);
+ out_unlock_iolock:
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ for (i = 0; i < cur_ext; i++) {
+ int full = 0; /* user array is full */
+
+ /* format results & advance arg */
+ error = formatter(&arg, &out[i], &full);
+ if (error || full)
+ break;
+ }
+
+ kmem_free(out);
+ return error;
+}
+
+/*
+ * dead simple method of punching delalyed allocation blocks from a range in
+ * the inode. Walks a block at a time so will be slow, but is only executed in
+ * rare error cases so the overhead is not critical. This will always punch out
+ * both the start and end blocks, even if the ranges only partially overlap
+ * them, so it is up to the caller to ensure that partial blocks are not
+ * passed in.
+ */
+int
+xfs_bmap_punch_delalloc_range(
+ struct xfs_inode *ip,
+ xfs_fileoff_t start_fsb,
+ xfs_fileoff_t length)
+{
+ xfs_fileoff_t remaining = length;
+ int error = 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ do {
+ int done;
+ xfs_bmbt_irec_t imap;
+ int nimaps = 1;
+ xfs_fsblock_t firstblock;
+ xfs_bmap_free_t flist;
+
+ /*
+ * Map the range first and check that it is a delalloc extent
+ * before trying to unmap the range. Otherwise we will be
+ * trying to remove a real extent (which requires a
+ * transaction) or a hole, which is probably a bad idea...
+ */
+ error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
+ XFS_BMAPI_ENTIRE);
+
+ if (error) {
+ /* something screwed, just bail */
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ xfs_alert(ip->i_mount,
+ "Failed delalloc mapping lookup ino %lld fsb %lld.",
+ ip->i_ino, start_fsb);
+ }
+ break;
+ }
+ if (!nimaps) {
+ /* nothing there */
+ goto next_block;
+ }
+ if (imap.br_startblock != DELAYSTARTBLOCK) {
+ /* been converted, ignore */
+ goto next_block;
+ }
+ WARN_ON(imap.br_blockcount == 0);
+
+ /*
+ * Note: while we initialise the firstblock/flist pair, they
+ * should never be used because blocks should never be
+ * allocated or freed for a delalloc extent and hence we need
+ * don't cancel or finish them after the xfs_bunmapi() call.
+ */
+ xfs_bmap_init(&flist, &firstblock);
+ error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
+ &flist, &done);
+ if (error)
+ break;
+
+ ASSERT(!flist.xbf_count && !flist.xbf_first);
+next_block:
+ start_fsb++;
+ remaining--;
+ } while(remaining > 0);
+
+ return error;
+}
+
+/*
+ * Test whether it is appropriate to check an inode for and free post EOF
+ * blocks. The 'force' parameter determines whether we should also consider
+ * regular files that are marked preallocated or append-only.
+ */
+bool
+xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
+{
+ /* prealloc/delalloc exists only on regular files */
+ if (!S_ISREG(ip->i_d.di_mode))
+ return false;
+
+ /*
+ * Zero sized files with no cached pages and delalloc blocks will not
+ * have speculative prealloc/delalloc blocks to remove.
+ */
+ if (VFS_I(ip)->i_size == 0 &&
+ VN_CACHED(VFS_I(ip)) == 0 &&
+ ip->i_delayed_blks == 0)
+ return false;
+
+ /* If we haven't read in the extent list, then don't do it now. */
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
+ return false;
+
+ /*
+ * Do not free real preallocated or append-only files unless the file
+ * has delalloc blocks and we are forced to remove them.
+ */
+ if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
+ if (!force || ip->i_delayed_blks == 0)
+ return false;
+
+ return true;
+}
+
+/*
+ * This is called by xfs_inactive to free any blocks beyond eof
+ * when the link count isn't zero and by xfs_dm_punch_hole() when
+ * punching a hole to EOF.
+ */
+int
+xfs_free_eofblocks(
+ xfs_mount_t *mp,
+ xfs_inode_t *ip,
+ bool need_iolock)
+{
+ xfs_trans_t *tp;
+ int error;
+ xfs_fileoff_t end_fsb;
+ xfs_fileoff_t last_fsb;
+ xfs_filblks_t map_len;
+ int nimaps;
+ xfs_bmbt_irec_t imap;
+
+ /*
+ * Figure out if there are any blocks beyond the end
+ * of the file. If not, then there is nothing to do.
+ */
+ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
+ last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+ if (last_fsb <= end_fsb)
+ return 0;
+ map_len = last_fsb - end_fsb;
+
+ nimaps = 1;
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!error && (nimaps != 0) &&
+ (imap.br_startblock != HOLESTARTBLOCK ||
+ ip->i_delayed_blks)) {
+ /*
+ * Attach the dquots to the inode up front.
+ */
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return error;
+
+ /*
+ * There are blocks after the end of file.
+ * Free them up now by truncating the file to
+ * its current size.
+ */
+ tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+
+ if (need_iolock) {
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ xfs_trans_cancel(tp, 0);
+ return EAGAIN;
+ }
+ }
+
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ if (error) {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ xfs_trans_cancel(tp, 0);
+ if (need_iolock)
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ /*
+ * Do not update the on-disk file size. If we update the
+ * on-disk file size and then the system crashes before the
+ * contents of the file are flushed to disk then the files
+ * may be full of holes (ie NULL files bug).
+ */
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
+ XFS_ISIZE(ip));
+ if (error) {
+ /*
+ * If we get an error at this point we simply don't
+ * bother truncating the file.
+ */
+ xfs_trans_cancel(tp,
+ (XFS_TRANS_RELEASE_LOG_RES |
+ XFS_TRANS_ABORT));
+ } else {
+ error = xfs_trans_commit(tp,
+ XFS_TRANS_RELEASE_LOG_RES);
+ if (!error)
+ xfs_inode_clear_eofblocks_tag(ip);
+ }
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (need_iolock)
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ }
+ return error;
+}
+
+int
+xfs_alloc_file_space(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len,
+ int alloc_type)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_off_t count;
+ xfs_filblks_t allocated_fsb;
+ xfs_filblks_t allocatesize_fsb;
+ xfs_extlen_t extsz, temp;
+ xfs_fileoff_t startoffset_fsb;
+ xfs_fsblock_t firstfsb;
+ int nimaps;
+ int quota_flag;
+ int rt;
+ xfs_trans_t *tp;
+ xfs_bmbt_irec_t imaps[1], *imapp;
+ xfs_bmap_free_t free_list;
+ uint qblocks, resblks, resrtextents;
+ int committed;
+ int error;
+
+ trace_xfs_alloc_file_space(ip);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return error;
+
+ if (len <= 0)
+ return XFS_ERROR(EINVAL);
+
+ rt = XFS_IS_REALTIME_INODE(ip);
+ extsz = xfs_get_extsz_hint(ip);
+
+ count = len;
+ imapp = &imaps[0];
+ nimaps = 1;
+ startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
+ allocatesize_fsb = XFS_B_TO_FSB(mp, count);
+
+ /*
+ * Allocate file space until done or until there is an error
+ */
+ while (allocatesize_fsb && !error) {
+ xfs_fileoff_t s, e;
+
+ /*
+ * Determine space reservations for data/realtime.
+ */
+ if (unlikely(extsz)) {
+ s = startoffset_fsb;
+ do_div(s, extsz);
+ s *= extsz;
+ e = startoffset_fsb + allocatesize_fsb;
+ if ((temp = do_mod(startoffset_fsb, extsz)))
+ e += temp;
+ if ((temp = do_mod(e, extsz)))
+ e += extsz - temp;
+ } else {
+ s = 0;
+ e = allocatesize_fsb;
+ }
+
+ /*
+ * The transaction reservation is limited to a 32-bit block
+ * count, hence we need to limit the number of blocks we are
+ * trying to reserve to avoid an overflow. We can't allocate
+ * more than @nimaps extents, and an extent is limited on disk
+ * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+ */
+ resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
+ if (unlikely(rt)) {
+ resrtextents = qblocks = resblks;
+ resrtextents /= mp->m_sb.sb_rextsize;
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ quota_flag = XFS_QMOPT_RES_RTBLKS;
+ } else {
+ resrtextents = 0;
+ resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
+ quota_flag = XFS_QMOPT_RES_REGBLKS;
+ }
+
+ /*
+ * Allocate and setup the transaction.
+ */
+ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+ resblks, resrtextents);
+ /*
+ * Check for running out of space
+ */
+ if (error) {
+ /*
+ * Free the transaction structure.
+ */
+ ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+ xfs_trans_cancel(tp, 0);
+ break;
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
+ 0, quota_flag);
+ if (error)
+ goto error1;
+
+ xfs_trans_ijoin(tp, ip, 0);
+
+ xfs_bmap_init(&free_list, &firstfsb);
+ error = xfs_bmapi_write(tp, ip, startoffset_fsb,
+ allocatesize_fsb, alloc_type, &firstfsb,
+ 0, imapp, &nimaps, &free_list);
+ if (error) {
+ goto error0;
+ }
+
+ /*
+ * Complete the transaction
+ */
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error) {
+ goto error0;
+ }
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error) {
+ break;
+ }
+
+ allocated_fsb = imapp->br_blockcount;
+
+ if (nimaps == 0) {
+ error = XFS_ERROR(ENOSPC);
+ break;
+ }
+
+ startoffset_fsb += allocated_fsb;
+ allocatesize_fsb -= allocated_fsb;
+ }
+
+ return error;
+
+error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
+ xfs_bmap_cancel(&free_list);
+ xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
+
+error1: /* Just cancel transaction */
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Zero file bytes between startoff and endoff inclusive.
+ * The iolock is held exclusive and no blocks are buffered.
+ *
+ * This function is used by xfs_free_file_space() to zero
+ * partial blocks when the range to free is not block aligned.
+ * When unreserving space with boundaries that are not block
+ * aligned we round up the start and round down the end
+ * boundaries and then use this function to zero the parts of
+ * the blocks that got dropped during the rounding.
+ */
+STATIC int
+xfs_zero_remaining_bytes(
+ xfs_inode_t *ip,
+ xfs_off_t startoff,
+ xfs_off_t endoff)
+{
+ xfs_bmbt_irec_t imap;
+ xfs_fileoff_t offset_fsb;
+ xfs_off_t lastoffset;
+ xfs_off_t offset;
+ xfs_buf_t *bp;
+ xfs_mount_t *mp = ip->i_mount;
+ int nimap;
+ int error = 0;
+
+ /*
+ * Avoid doing I/O beyond eof - it's not necessary
+ * since nothing can read beyond eof. The space will
+ * be zeroed when the file is extended anyway.
+ */
+ if (startoff >= XFS_ISIZE(ip))
+ return 0;
+
+ if (endoff > XFS_ISIZE(ip))
+ endoff = XFS_ISIZE(ip);
+
+ bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
+ mp->m_rtdev_targp : mp->m_ddev_targp,
+ BTOBB(mp->m_sb.sb_blocksize), 0);
+ if (!bp)
+ return XFS_ERROR(ENOMEM);
+
+ xfs_buf_unlock(bp);
+
+ for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
+ uint lock_mode;
+
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ nimap = 1;
+
+ lock_mode = xfs_ilock_data_map_shared(ip);
+ error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
+ xfs_iunlock(ip, lock_mode);
+
+ if (error || nimap < 1)
+ break;
+ ASSERT(imap.br_blockcount >= 1);
+ ASSERT(imap.br_startoff == offset_fsb);
+ lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
+ if (lastoffset > endoff)
+ lastoffset = endoff;
+ if (imap.br_startblock == HOLESTARTBLOCK)
+ continue;
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ if (imap.br_state == XFS_EXT_UNWRITTEN)
+ continue;
+ XFS_BUF_UNDONE(bp);
+ XFS_BUF_UNWRITE(bp);
+ XFS_BUF_READ(bp);
+ XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
+
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ error = XFS_ERROR(EIO);
+ break;
+ }
+ xfs_buf_iorequest(bp);
+ error = xfs_buf_iowait(bp);
+ if (error) {
+ xfs_buf_ioerror_alert(bp,
+ "xfs_zero_remaining_bytes(read)");
+ break;
+ }
+ memset(bp->b_addr +
+ (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
+ 0, lastoffset - offset + 1);
+ XFS_BUF_UNDONE(bp);
+ XFS_BUF_UNREAD(bp);
+ XFS_BUF_WRITE(bp);
+
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ error = XFS_ERROR(EIO);
+ break;
+ }
+ xfs_buf_iorequest(bp);
+ error = xfs_buf_iowait(bp);
+ if (error) {
+ xfs_buf_ioerror_alert(bp,
+ "xfs_zero_remaining_bytes(write)");
+ break;
+ }
+ }
+ xfs_buf_free(bp);
+ return error;
+}
+
+int
+xfs_free_file_space(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
+{
+ int committed;
+ int done;
+ xfs_fileoff_t endoffset_fsb;
+ int error;
+ xfs_fsblock_t firstfsb;
+ xfs_bmap_free_t free_list;
+ xfs_bmbt_irec_t imap;
+ xfs_off_t ioffset;
+ xfs_extlen_t mod=0;
+ xfs_mount_t *mp;
+ int nimap;
+ uint resblks;
+ xfs_off_t rounding;
+ int rt;
+ xfs_fileoff_t startoffset_fsb;
+ xfs_trans_t *tp;
+
+ mp = ip->i_mount;
+
+ trace_xfs_free_file_space(ip);
+
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return error;
+
+ error = 0;
+ if (len <= 0) /* if nothing being freed */
+ return error;
+ rt = XFS_IS_REALTIME_INODE(ip);
+ startoffset_fsb = XFS_B_TO_FSB(mp, offset);
+ endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
+
+ /* wait for the completion of any pending DIOs */
+ inode_dio_wait(VFS_I(ip));
+
+ rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+ ioffset = offset & ~(rounding - 1);
+ error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ ioffset, -1);
+ if (error)
+ goto out;
+ truncate_pagecache_range(VFS_I(ip), ioffset, -1);
+
+ /*
+ * Need to zero the stuff we're not freeing, on disk.
+ * If it's a realtime file & can't use unwritten extents then we
+ * actually need to zero the extent edges. Otherwise xfs_bunmapi
+ * will take care of it for us.
+ */
+ if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+ nimap = 1;
+ error = xfs_bmapi_read(ip, startoffset_fsb, 1,
+ &imap, &nimap, 0);
+ if (error)
+ goto out;
+ ASSERT(nimap == 0 || nimap == 1);
+ if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+ xfs_daddr_t block;
+
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ block = imap.br_startblock;
+ mod = do_div(block, mp->m_sb.sb_rextsize);
+ if (mod)
+ startoffset_fsb += mp->m_sb.sb_rextsize - mod;
+ }
+ nimap = 1;
+ error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
+ &imap, &nimap, 0);
+ if (error)
+ goto out;
+ ASSERT(nimap == 0 || nimap == 1);
+ if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ mod++;
+ if (mod && (mod != mp->m_sb.sb_rextsize))
+ endoffset_fsb -= mod;
+ }
+ }
+ if ((done = (endoffset_fsb <= startoffset_fsb)))
+ /*
+ * One contiguous piece to clear
+ */
+ error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
+ else {
+ /*
+ * Some full blocks, possibly two pieces to clear
+ */
+ if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
+ error = xfs_zero_remaining_bytes(ip, offset,
+ XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
+ if (!error &&
+ XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
+ error = xfs_zero_remaining_bytes(ip,
+ XFS_FSB_TO_B(mp, endoffset_fsb),
+ offset + len - 1);
+ }
+
+ /*
+ * free file space until done or until there is an error
+ */
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ while (!error && !done) {
+
+ /*
+ * allocate and setup the transaction. Allow this
+ * transaction to dip into the reserve blocks to ensure
+ * the freeing of the space succeeds at ENOSPC.
+ */
+ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
+
+ /*
+ * check for running out of space
+ */
+ if (error) {
+ /*
+ * Free the transaction structure.
+ */
+ ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+ xfs_trans_cancel(tp, 0);
+ break;
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_trans_reserve_quota(tp, mp,
+ ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
+ resblks, 0, XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ goto error1;
+
+ xfs_trans_ijoin(tp, ip, 0);
+
+ /*
+ * issue the bunmapi() call to free the blocks
+ */
+ xfs_bmap_init(&free_list, &firstfsb);
+ error = xfs_bunmapi(tp, ip, startoffset_fsb,
+ endoffset_fsb - startoffset_fsb,
+ 0, 2, &firstfsb, &free_list, &done);
+ if (error) {
+ goto error0;
+ }
+
+ /*
+ * complete the transaction
+ */
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error) {
+ goto error0;
+ }
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+
+ out:
+ return error;
+
+ error0:
+ xfs_bmap_cancel(&free_list);
+ error1:
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ goto out;
+}
+
+
+int
+xfs_zero_file_space(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ uint granularity;
+ xfs_off_t start_boundary;
+ xfs_off_t end_boundary;
+ int error;
+
+ trace_xfs_zero_file_space(ip);
+
+ granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+
+ /*
+ * Round the range of extents we are going to convert inwards. If the
+ * offset is aligned, then it doesn't get changed so we zero from the
+ * start of the block offset points to.
+ */
+ start_boundary = round_up(offset, granularity);
+ end_boundary = round_down(offset + len, granularity);
+
+ ASSERT(start_boundary >= offset);
+ ASSERT(end_boundary <= offset + len);
+
+ if (start_boundary < end_boundary - 1) {
+ /*
+ * punch out delayed allocation blocks and the page cache over
+ * the conversion range
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmap_punch_delalloc_range(ip,
+ XFS_B_TO_FSBT(mp, start_boundary),
+ XFS_B_TO_FSB(mp, end_boundary - start_boundary));
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ truncate_pagecache_range(VFS_I(ip), start_boundary,
+ end_boundary - 1);
+
+ /* convert the blocks */
+ error = xfs_alloc_file_space(ip, start_boundary,
+ end_boundary - start_boundary - 1,
+ XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT);
+ if (error)
+ goto out;
+
+ /* We've handled the interior of the range, now for the edges */
+ if (start_boundary != offset) {
+ error = xfs_iozero(ip, offset, start_boundary - offset);
+ if (error)
+ goto out;
+ }
+
+ if (end_boundary != offset + len)
+ error = xfs_iozero(ip, end_boundary,
+ offset + len - end_boundary);
+
+ } else {
+ /*
+ * It's either a sub-granularity range or the range spanned lies
+ * partially across two adjacent blocks.
+ */
+ error = xfs_iozero(ip, offset, len);
+ }
+
+out:
+ return error;
+
+}
+
+/*
+ * xfs_collapse_file_space()
+ * This routine frees disk space and shift extent for the given file.
+ * The first thing we do is to free data blocks in the specified range
+ * by calling xfs_free_file_space(). It would also sync dirty data
+ * and invalidate page cache over the region on which collapse range
+ * is working. And Shift extent records to the left to cover a hole.
+ * RETURNS:
+ * 0 on success
+ * errno on error
+ *
+ */
+int
+xfs_collapse_file_space(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
+{
+ int done = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+ xfs_extnum_t current_ext = 0;
+ struct xfs_bmap_free free_list;
+ xfs_fsblock_t first_block;
+ int committed;
+ xfs_fileoff_t start_fsb;
+ xfs_fileoff_t shift_fsb;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+
+ trace_xfs_collapse_file_space(ip);
+
+ start_fsb = XFS_B_TO_FSB(mp, offset + len);
+ shift_fsb = XFS_B_TO_FSB(mp, len);
+
+ error = xfs_free_file_space(ip, offset, len);
+ if (error)
+ return error;
+
+ while (!error && !done) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+ /*
+ * We would need to reserve permanent block for transaction.
+ * This will come into picture when after shifting extent into
+ * hole we found that adjacent extents can be merged which
+ * may lead to freeing of a block during record update.
+ */
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+ XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ break;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
+ ip->i_gdquot, ip->i_pdquot,
+ XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
+ XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ goto out;
+
+ xfs_trans_ijoin(tp, ip, 0);
+
+ xfs_bmap_init(&free_list, &first_block);
+
+ /*
+ * We are using the write transaction in which max 2 bmbt
+ * updates are allowed
+ */
+ error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
+ shift_fsb, &current_ext,
+ &first_block, &free_list,
+ XFS_BMAP_MAX_SHIFT_EXTENTS);
+ if (error)
+ goto out;
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ goto out;
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+
+ return error;
+
+out:
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * We need to check that the format of the data fork in the temporary inode is
+ * valid for the target inode before doing the swap. This is not a problem with
+ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
+ * data fork depending on the space the attribute fork is taking so we can get
+ * invalid formats on the target inode.
+ *
+ * E.g. target has space for 7 extents in extent format, temp inode only has
+ * space for 6. If we defragment down to 7 extents, then the tmp format is a
+ * btree, but when swapped it needs to be in extent format. Hence we can't just
+ * blindly swap data forks on attr2 filesystems.
+ *
+ * Note that we check the swap in both directions so that we don't end up with
+ * a corrupt temporary inode, either.
+ *
+ * Note that fixing the way xfs_fsr sets up the attribute fork in the source
+ * inode will prevent this situation from occurring, so all we do here is
+ * reject and log the attempt. basically we are putting the responsibility on
+ * userspace to get this right.
+ */
+static int
+xfs_swap_extents_check_format(
+ xfs_inode_t *ip, /* target inode */
+ xfs_inode_t *tip) /* tmp inode */
+{
+
+ /* Should never get a local format */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
+ tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+ return EINVAL;
+
+ /*
+ * if the target inode has less extents that then temporary inode then
+ * why did userspace call us?
+ */
+ if (ip->i_d.di_nextents < tip->i_d.di_nextents)
+ return EINVAL;
+
+ /*
+ * if the target inode is in extent form and the temp inode is in btree
+ * form then we will end up with the target inode in the wrong format
+ * as we already know there are less extents in the temp inode.
+ */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+ return EINVAL;
+
+ /* Check temp in extent form to max in target */
+ if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
+ XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+ return EINVAL;
+
+ /* Check target in extent form to max in temp */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
+ XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+ return EINVAL;
+
+ /*
+ * If we are in a btree format, check that the temp root block will fit
+ * in the target and that it has enough extents to be in btree format
+ * in the target.
+ *
+ * Note that we have to be careful to allow btree->extent conversions
+ * (a common defrag case) which will occur when the temp inode is in
+ * extent format...
+ */
+ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ if (XFS_IFORK_BOFF(ip) &&
+ XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
+ return EINVAL;
+ if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
+ XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+ return EINVAL;
+ }
+
+ /* Reciprocal target->temp btree format checks */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ if (XFS_IFORK_BOFF(tip) &&
+ XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
+ return EINVAL;
+ if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
+ XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+ return EINVAL;
+ }
+
+ return 0;
+}
+
+int
+xfs_swap_extents(
+ xfs_inode_t *ip, /* target inode */
+ xfs_inode_t *tip, /* tmp inode */
+ xfs_swapext_t *sxp)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_trans_t *tp;
+ xfs_bstat_t *sbp = &sxp->sx_stat;
+ xfs_ifork_t *tempifp, *ifp, *tifp;
+ int src_log_flags, target_log_flags;
+ int error = 0;
+ int aforkblks = 0;
+ int taforkblks = 0;
+ __uint64_t tmp;
+
+ tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
+ if (!tempifp) {
+ error = XFS_ERROR(ENOMEM);
+ goto out;
+ }
+
+ /*
+ * we have to do two separate lock calls here to keep lockdep
+ * happy. If we try to get all the locks in one call, lock will
+ * report false positives when we drop the ILOCK and regain them
+ * below.
+ */
+ xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+ xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+
+ /* Verify that both files have the same format */
+ if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
+ error = XFS_ERROR(EINVAL);
+ goto out_unlock;
+ }
+
+ /* Verify both files are either real-time or non-realtime */
+ if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
+ error = XFS_ERROR(EINVAL);
+ goto out_unlock;
+ }
+
+ error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
+ if (error)
+ goto out_unlock;
+ truncate_pagecache_range(VFS_I(tip), 0, -1);
+
+ /* Verify O_DIRECT for ftmp */
+ if (VN_CACHED(VFS_I(tip)) != 0) {
+ error = XFS_ERROR(EINVAL);
+ goto out_unlock;
+ }
+
+ /* Verify all data are being swapped */
+ if (sxp->sx_offset != 0 ||
+ sxp->sx_length != ip->i_d.di_size ||
+ sxp->sx_length != tip->i_d.di_size) {
+ error = XFS_ERROR(EFAULT);
+ goto out_unlock;
+ }
+
+ trace_xfs_swap_extent_before(ip, 0);
+ trace_xfs_swap_extent_before(tip, 1);
+
+ /* check inode formats now that data is flushed */
+ error = xfs_swap_extents_check_format(ip, tip);
+ if (error) {
+ xfs_notice(mp,
+ "%s: inode 0x%llx format is incompatible for exchanging.",
+ __func__, ip->i_ino);
+ goto out_unlock;
+ }
+
+ /*
+ * Compare the current change & modify times with that
+ * passed in. If they differ, we abort this swap.
+ * This is the mechanism used to ensure the calling
+ * process that the file was not changed out from
+ * under it.
+ */
+ if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
+ (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+ (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
+ (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
+ error = XFS_ERROR(EBUSY);
+ goto out_unlock;
+ }
+
+ /* We need to fail if the file is memory mapped. Once we have tossed
+ * all existing pages, the page fault will have no option
+ * but to go to the filesystem for pages. By making the page fault call
+ * vop_read (or write in the case of autogrow) they block on the iolock
+ * until we have switched the extents.
+ */
+ if (VN_MAPPED(VFS_I(ip))) {
+ error = XFS_ERROR(EBUSY);
+ goto out_unlock;
+ }
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(tip, XFS_ILOCK_EXCL);
+
+ /*
+ * There is a race condition here since we gave up the
+ * ilock. However, the data fork will not change since
+ * we have the iolock (locked for truncation too) so we
+ * are safe. We don't really care if non-io related
+ * fields change.
+ */
+ truncate_pagecache_range(VFS_I(ip), 0, -1);
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+ if (error) {
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ xfs_iunlock(tip, XFS_IOLOCK_EXCL);
+ xfs_trans_cancel(tp, 0);
+ goto out;
+ }
+ xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+
+ /*
+ * Count the number of extended attribute blocks
+ */
+ if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
+ (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+ error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
+ if (error)
+ goto out_trans_cancel;
+ }
+ if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
+ (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+ error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
+ &taforkblks);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+ /*
+ * Before we've swapped the forks, lets set the owners of the forks
+ * appropriately. We have to do this as we are demand paging the btree
+ * buffers, and so the validation done on read will expect the owner
+ * field to be correctly set. Once we change the owners, we can swap the
+ * inode forks.
+ *
+ * Note the trickiness in setting the log flags - we set the owner log
+ * flag on the opposite inode (i.e. the inode we are setting the new
+ * owner to be) because once we swap the forks and log that, log
+ * recovery is going to see the fork as owned by the swapped inode,
+ * not the pre-swapped inodes.
+ */
+ src_log_flags = XFS_ILOG_CORE;
+ target_log_flags = XFS_ILOG_CORE;
+ if (ip->i_d.di_version == 3 &&
+ ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ target_log_flags |= XFS_ILOG_DOWNER;
+ error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
+ tip->i_ino, NULL);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ if (tip->i_d.di_version == 3 &&
+ tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ src_log_flags |= XFS_ILOG_DOWNER;
+ error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
+ ip->i_ino, NULL);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
+ * Swap the data forks of the inodes
+ */
+ ifp = &ip->i_df;
+ tifp = &tip->i_df;
+ *tempifp = *ifp; /* struct copy */
+ *ifp = *tifp; /* struct copy */
+ *tifp = *tempifp; /* struct copy */
+
+ /*
+ * Fix the on-disk inode values
+ */
+ tmp = (__uint64_t)ip->i_d.di_nblocks;
+ ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
+ tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
+
+ tmp = (__uint64_t) ip->i_d.di_nextents;
+ ip->i_d.di_nextents = tip->i_d.di_nextents;
+ tip->i_d.di_nextents = tmp;
+
+ tmp = (__uint64_t) ip->i_d.di_format;
+ ip->i_d.di_format = tip->i_d.di_format;
+ tip->i_d.di_format = tmp;
+
+ /*
+ * The extents in the source inode could still contain speculative
+ * preallocation beyond EOF (e.g. the file is open but not modified
+ * while defrag is in progress). In that case, we need to copy over the
+ * number of delalloc blocks the data fork in the source inode is
+ * tracking beyond EOF so that when the fork is truncated away when the
+ * temporary inode is unlinked we don't underrun the i_delayed_blks
+ * counter on that inode.
+ */
+ ASSERT(tip->i_delayed_blks == 0);
+ tip->i_delayed_blks = ip->i_delayed_blks;
+ ip->i_delayed_blks = 0;
+
+ switch (ip->i_d.di_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ /* If the extents fit in the inode, fix the
+ * pointer. Otherwise it's already NULL or
+ * pointing to the extent.
+ */
+ if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
+ ifp->if_u1.if_extents =
+ ifp->if_u2.if_inline_ext;
+ }
+ src_log_flags |= XFS_ILOG_DEXT;
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ ASSERT(ip->i_d.di_version < 3 ||
+ (src_log_flags & XFS_ILOG_DOWNER));
+ src_log_flags |= XFS_ILOG_DBROOT;
+ break;
+ }
+
+ switch (tip->i_d.di_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ /* If the extents fit in the inode, fix the
+ * pointer. Otherwise it's already NULL or
+ * pointing to the extent.
+ */
+ if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
+ tifp->if_u1.if_extents =
+ tifp->if_u2.if_inline_ext;
+ }
+ target_log_flags |= XFS_ILOG_DEXT;
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ target_log_flags |= XFS_ILOG_DBROOT;
+ ASSERT(tip->i_d.di_version < 3 ||
+ (target_log_flags & XFS_ILOG_DOWNER));
+ break;
+ }
+
+ xfs_trans_log_inode(tp, ip, src_log_flags);
+ xfs_trans_log_inode(tp, tip, target_log_flags);
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * transaction goes to disk before returning to the user.
+ */
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
+ xfs_trans_set_sync(tp);
+
+ error = xfs_trans_commit(tp, 0);
+
+ trace_xfs_swap_extent_after(ip, 0);
+ trace_xfs_swap_extent_after(tip, 1);
+out:
+ kmem_free(tempifp);
+ return error;
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ goto out;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp, 0);
+ goto out_unlock;
+}
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
new file mode 100644
index 00000000000..2fdb72d2c90
--- /dev/null
+++ b/fs/xfs/xfs_bmap_util.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_BMAP_UTIL_H__
+#define __XFS_BMAP_UTIL_H__
+
+/* Kernel only BMAP related definitions and functions */
+
+struct xfs_bmbt_irec;
+struct xfs_bmap_free_item;
+struct xfs_ifork;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Argument structure for xfs_bmap_alloc.
+ */
+struct xfs_bmalloca {
+ xfs_fsblock_t *firstblock; /* i/o first block allocated */
+ struct xfs_bmap_free *flist; /* bmap freelist */
+ struct xfs_trans *tp; /* transaction pointer */
+ struct xfs_inode *ip; /* incore inode pointer */
+ struct xfs_bmbt_irec prev; /* extent before the new one */
+ struct xfs_bmbt_irec got; /* extent after, or delayed */
+
+ xfs_fileoff_t offset; /* offset in file filling in */
+ xfs_extlen_t length; /* i/o length asked/allocated */
+ xfs_fsblock_t blkno; /* starting block of new extent */
+
+ struct xfs_btree_cur *cur; /* btree cursor */
+ xfs_extnum_t idx; /* current extent index */
+ int nallocs;/* number of extents alloc'd */
+ int logflags;/* flags for transaction logging */
+
+ xfs_extlen_t total; /* total blocks needed for xaction */
+ xfs_extlen_t minlen; /* minimum allocation size (blocks) */
+ xfs_extlen_t minleft; /* amount must be left after alloc */
+ bool eof; /* set if allocating past last extent */
+ bool wasdel; /* replacing a delayed allocation */
+ bool userdata;/* set if is user data */
+ bool aeof; /* allocated space at eof */
+ bool conv; /* overwriting unwritten extents */
+ int flags;
+ struct completion *done;
+ struct work_struct work;
+ int result;
+};
+
+int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
+ int *committed);
+int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
+int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
+ int whichfork, int *eof);
+int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, int *count);
+int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
+ xfs_fileoff_t start_fsb, xfs_fileoff_t length);
+
+/* bmap to userspace formatter - copy to user & advance pointer */
+typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
+int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
+ xfs_bmap_format_t formatter, void *arg);
+
+/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
+void xfs_bmap_del_free(struct xfs_bmap_free *flist,
+ struct xfs_bmap_free_item *prev,
+ struct xfs_bmap_free_item *free);
+int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
+ struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
+ int rt, int eof, int delay, int convert,
+ xfs_fileoff_t *offp, xfs_extlen_t *lenp);
+void xfs_bmap_adjacent(struct xfs_bmalloca *ap);
+int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, struct xfs_bmbt_irec *rec,
+ int *is_empty);
+
+/* preallocation and hole punch interface */
+int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t len, int alloc_type);
+int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t len);
+int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t len);
+int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
+ xfs_off_t len);
+
+/* EOF block manipulation functions */
+bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
+int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
+ bool need_iolock);
+
+int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
+ struct xfs_swapext *sx);
+
+xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
+
+#endif /* __XFS_BMAP_UTIL_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index db010408d70..cf893bc1e37 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -17,22 +17,23 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
+#include "xfs_buf_item.h"
#include "xfs_btree.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_alloc.h"
/*
* Cursor allocation zone.
@@ -42,9 +43,14 @@ kmem_zone_t *xfs_btree_cur_zone;
/*
* Btree magic numbers.
*/
-const __uint32_t xfs_magics[XFS_BTNUM_MAX] = {
- XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
+static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
+ { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
+ XFS_FIBT_MAGIC },
+ { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
+ XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
};
+#define xfs_btree_magic(cur) \
+ xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
STATIC int /* error (0 or EFSCORRUPTED) */
@@ -54,30 +60,38 @@ xfs_btree_check_lblock(
int level, /* level of the btree block */
struct xfs_buf *bp) /* buffer for block, if any */
{
- int lblock_ok; /* block passes checks */
+ int lblock_ok = 1; /* block passes checks */
struct xfs_mount *mp; /* file system mount point */
mp = cur->bc_mp;
- lblock_ok =
- be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ lblock_ok = lblock_ok &&
+ uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
+ block->bb_u.l.bb_blkno == cpu_to_be64(
+ bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+ }
+
+ lblock_ok = lblock_ok &&
+ be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
be16_to_cpu(block->bb_level) == level &&
be16_to_cpu(block->bb_numrecs) <=
cur->bc_ops->get_maxrecs(cur, level) &&
block->bb_u.l.bb_leftsib &&
(block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
XFS_FSB_SANITY_CHECK(mp,
- be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
+ be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
block->bb_u.l.bb_rightsib &&
(block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
XFS_FSB_SANITY_CHECK(mp,
- be64_to_cpu(block->bb_u.l.bb_rightsib)));
+ be64_to_cpu(block->bb_u.l.bb_rightsib)));
+
if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
XFS_ERRTAG_BTREE_CHECK_LBLOCK,
XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW,
- mp);
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
return XFS_ERROR(EFSCORRUPTED);
}
return 0;
@@ -90,16 +104,26 @@ xfs_btree_check_sblock(
int level, /* level of the btree block */
struct xfs_buf *bp) /* buffer containing block */
{
+ struct xfs_mount *mp; /* file system mount point */
struct xfs_buf *agbp; /* buffer for ag. freespace struct */
struct xfs_agf *agf; /* ag. freespace structure */
xfs_agblock_t agflen; /* native ag. freespace length */
- int sblock_ok; /* block passes checks */
+ int sblock_ok = 1; /* block passes checks */
+ mp = cur->bc_mp;
agbp = cur->bc_private.a.agbp;
agf = XFS_BUF_TO_AGF(agbp);
agflen = be32_to_cpu(agf->agf_length);
- sblock_ok =
- be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ sblock_ok = sblock_ok &&
+ uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
+ block->bb_u.s.bb_blkno == cpu_to_be64(
+ bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+ }
+
+ sblock_ok = sblock_ok &&
+ be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
be16_to_cpu(block->bb_level) == level &&
be16_to_cpu(block->bb_numrecs) <=
cur->bc_ops->get_maxrecs(cur, level) &&
@@ -109,13 +133,13 @@ xfs_btree_check_sblock(
(block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
block->bb_u.s.bb_rightsib;
- if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp,
+
+ if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
XFS_ERRTAG_BTREE_CHECK_SBLOCK,
XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_CORRUPTION_ERROR("xfs_btree_check_sblock",
- XFS_ERRLEVEL_LOW, cur->bc_mp, block);
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
return XFS_ERROR(EFSCORRUPTED);
}
return 0;
@@ -194,6 +218,70 @@ xfs_btree_check_ptr(
#endif
/*
+ * Calculate CRC on the whole btree block and stuff it into the
+ * long-form btree header.
+ *
+ * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
+ * it into the buffer so recovery knows what the last modifcation was that made
+ * it to disk.
+ */
+void
+xfs_btree_lblock_calc_crc(
+ struct xfs_buf *bp)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ return;
+ if (bip)
+ block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+}
+
+bool
+xfs_btree_lblock_verify_crc(
+ struct xfs_buf *bp)
+{
+ if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+
+ return true;
+}
+
+/*
+ * Calculate CRC on the whole btree block and stuff it into the
+ * short-form btree header.
+ *
+ * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
+ * it into the buffer so recovery knows what the last modifcation was that made
+ * it to disk.
+ */
+void
+xfs_btree_sblock_calc_crc(
+ struct xfs_buf *bp)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ return;
+ if (bip)
+ block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+}
+
+bool
+xfs_btree_sblock_verify_crc(
+ struct xfs_buf *bp)
+{
+ if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+
+ return true;
+}
+
+/*
* Delete the btree cursor.
*/
void
@@ -277,10 +365,8 @@ xfs_btree_dup_cursor(
*ncur = NULL;
return error;
}
- new->bc_bufs[i] = bp;
- ASSERT(!xfs_buf_geterror(bp));
- } else
- new->bc_bufs[i] = NULL;
+ }
+ new->bc_bufs[i] = bp;
}
*ncur = new;
return 0;
@@ -321,9 +407,14 @@ xfs_btree_dup_cursor(
*/
static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
{
- return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
- XFS_BTREE_LBLOCK_LEN :
- XFS_BTREE_SBLOCK_LEN;
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+ return XFS_BTREE_LBLOCK_CRC_LEN;
+ return XFS_BTREE_LBLOCK_LEN;
+ }
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+ return XFS_BTREE_SBLOCK_CRC_LEN;
+ return XFS_BTREE_SBLOCK_LEN;
}
/*
@@ -417,7 +508,7 @@ xfs_btree_ptr_addr(
}
/*
- * Get a the root block which is stored in the inode.
+ * Get the root block which is stored in the inode.
*
* For now this btree implementation assumes the btree root is always
* stored in the if_broot field of an inode fork.
@@ -463,14 +554,11 @@ xfs_btree_get_bufl(
xfs_fsblock_t fsbno, /* file system block number */
uint lock) /* lock flags for get_buf */
{
- xfs_buf_t *bp; /* buffer pointer (return value) */
xfs_daddr_t d; /* real disk block address */
ASSERT(fsbno != NULLFSBLOCK);
d = XFS_FSB_TO_DADDR(mp, fsbno);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
- ASSERT(!xfs_buf_geterror(bp));
- return bp;
+ return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
}
/*
@@ -485,15 +573,12 @@ xfs_btree_get_bufs(
xfs_agblock_t agbno, /* allocation group block number */
uint lock) /* lock flags for get_buf */
{
- xfs_buf_t *bp; /* buffer pointer (return value) */
xfs_daddr_t d; /* real disk block address */
ASSERT(agno != NULLAGNUMBER);
ASSERT(agbno != NULLAGBLOCK);
d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
- ASSERT(!xfs_buf_geterror(bp));
- return bp;
+ return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
}
/*
@@ -633,7 +718,6 @@ xfs_btree_read_bufl(
mp->m_bsize, lock, &bp, ops);
if (error)
return error;
- ASSERT(!xfs_buf_geterror(bp));
if (bp)
xfs_buf_set_ref(bp, refval);
*bpp = bp;
@@ -762,6 +846,41 @@ xfs_btree_readahead(
return xfs_btree_readahead_sblock(cur, lr, block);
}
+STATIC xfs_daddr_t
+xfs_btree_ptr_to_daddr(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
+
+ return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+ } else {
+ ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
+ ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
+
+ return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+ be32_to_cpu(ptr->s));
+ }
+}
+
+/*
+ * Readahead @count btree blocks at the given @ptr location.
+ *
+ * We don't need to care about long or short form btrees here as we have a
+ * method of converting the ptr directly to a daddr available to us.
+ */
+STATIC void
+xfs_btree_readahead_ptr(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ xfs_extlen_t count)
+{
+ xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
+ xfs_btree_ptr_to_daddr(cur, ptr),
+ cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
+}
+
/*
* Set the buffer for level "lev" in the cursor to bp, releasing
* any previous buffer.
@@ -863,43 +982,87 @@ xfs_btree_set_sibling(
}
void
+xfs_btree_init_block_int(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *buf,
+ xfs_daddr_t blkno,
+ __u32 magic,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner,
+ unsigned int flags)
+{
+ buf->bb_magic = cpu_to_be32(magic);
+ buf->bb_level = cpu_to_be16(level);
+ buf->bb_numrecs = cpu_to_be16(numrecs);
+
+ if (flags & XFS_BTREE_LONG_PTRS) {
+ buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+ buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+ if (flags & XFS_BTREE_CRC_BLOCKS) {
+ buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
+ buf->bb_u.l.bb_owner = cpu_to_be64(owner);
+ uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
+ buf->bb_u.l.bb_pad = 0;
+ buf->bb_u.l.bb_lsn = 0;
+ }
+ } else {
+ /* owner is a 32 bit value on short blocks */
+ __u32 __owner = (__u32)owner;
+
+ buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+ buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+ if (flags & XFS_BTREE_CRC_BLOCKS) {
+ buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
+ buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
+ uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
+ buf->bb_u.s.bb_lsn = 0;
+ }
+ }
+}
+
+void
xfs_btree_init_block(
struct xfs_mount *mp,
struct xfs_buf *bp,
__u32 magic,
__u16 level,
__u16 numrecs,
+ __u64 owner,
unsigned int flags)
{
- struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp);
-
- new->bb_magic = cpu_to_be32(magic);
- new->bb_level = cpu_to_be16(level);
- new->bb_numrecs = cpu_to_be16(numrecs);
-
- if (flags & XFS_BTREE_LONG_PTRS) {
- new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
- new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
- } else {
- new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
- new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
- }
+ xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
+ magic, level, numrecs, owner, flags);
}
STATIC void
xfs_btree_init_block_cur(
struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
int level,
- int numrecs,
- struct xfs_buf *bp)
+ int numrecs)
{
- xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum],
- level, numrecs, cur->bc_flags);
+ __u64 owner;
+
+ /*
+ * we can pull the owner from the cursor right now as the different
+ * owners align directly with the pointer size of the btree. This may
+ * change in future, but is safe for current users of the generic btree
+ * code.
+ */
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ owner = cur->bc_private.b.ip->i_ino;
+ else
+ owner = cur->bc_private.a.agno;
+
+ xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
+ xfs_btree_magic(cur), level, numrecs,
+ owner, cur->bc_flags);
}
/*
* Return true if ptr is the last record in the btree and
- * we need to track updateѕ to this record. The decision
+ * we need to track updates to this record. The decision
* will be further refined in the update_lastrec method.
*/
STATIC int
@@ -936,24 +1099,6 @@ xfs_btree_buf_to_ptr(
}
}
-STATIC xfs_daddr_t
-xfs_btree_ptr_to_daddr(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr)
-{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
-
- return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
- } else {
- ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
- ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
-
- return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
- be32_to_cpu(ptr->s));
- }
-}
-
STATIC void
xfs_btree_set_refs(
struct xfs_btree_cur *cur,
@@ -965,6 +1110,7 @@ xfs_btree_set_refs(
xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
break;
case XFS_BTNUM_INO:
+ case XFS_BTNUM_FINO:
xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
break;
case XFS_BTNUM_BMAP:
@@ -1009,7 +1155,6 @@ STATIC int
xfs_btree_read_buf_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr,
- int level,
int flags,
struct xfs_btree_block **block,
struct xfs_buf **bpp)
@@ -1028,7 +1173,6 @@ xfs_btree_read_buf_block(
if (error)
return error;
- ASSERT(!xfs_buf_geterror(*bpp));
xfs_btree_set_refs(cur, *bpp);
*block = XFS_BUF_TO_BLOCK(*bpp);
return 0;
@@ -1147,6 +1291,7 @@ xfs_btree_log_keys(
XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
if (bp) {
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
xfs_trans_log_buf(cur->bc_tp, bp,
xfs_btree_key_offset(cur, first),
xfs_btree_key_offset(cur, last + 1) - 1);
@@ -1171,6 +1316,7 @@ xfs_btree_log_recs(
XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
xfs_trans_log_buf(cur->bc_tp, bp,
xfs_btree_rec_offset(cur, first),
xfs_btree_rec_offset(cur, last + 1) - 1);
@@ -1195,6 +1341,7 @@ xfs_btree_log_ptrs(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
int level = xfs_btree_get_level(block);
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
xfs_trans_log_buf(cur->bc_tp, bp,
xfs_btree_ptr_offset(cur, first, level),
xfs_btree_ptr_offset(cur, last + 1, level) - 1);
@@ -1223,7 +1370,12 @@ xfs_btree_log_block(
offsetof(struct xfs_btree_block, bb_numrecs),
offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
- XFS_BTREE_SBLOCK_LEN
+ offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
+ XFS_BTREE_SBLOCK_CRC_LEN
};
static const short loffsets[] = { /* table of offsets (long) */
offsetof(struct xfs_btree_block, bb_magic),
@@ -1231,17 +1383,40 @@ xfs_btree_log_block(
offsetof(struct xfs_btree_block, bb_numrecs),
offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
- XFS_BTREE_LBLOCK_LEN
+ offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
+ XFS_BTREE_LBLOCK_CRC_LEN
};
XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
if (bp) {
+ int nbits;
+
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ /*
+ * We don't log the CRC when updating a btree
+ * block but instead recreate it during log
+ * recovery. As the log buffers have checksums
+ * of their own this is safe and avoids logging a crc
+ * update in a lot of places.
+ */
+ if (fields == XFS_BB_ALL_BITS)
+ fields = XFS_BB_ALL_BITS_CRC;
+ nbits = XFS_BB_NUM_BITS_CRC;
+ } else {
+ nbits = XFS_BB_NUM_BITS;
+ }
xfs_btree_offsets(fields,
(cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
loffsets : soffsets,
- XFS_BB_NUM_BITS, &first, &last);
+ nbits, &first, &last);
+ xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
xfs_trans_log_buf(cur->bc_tp, bp, first, last);
} else {
xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
@@ -1336,8 +1511,8 @@ xfs_btree_increment(
union xfs_btree_ptr *ptrp;
ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
- error = xfs_btree_read_buf_block(cur, ptrp, --lev,
- 0, &block, &bp);
+ --lev;
+ error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
if (error)
goto error0;
@@ -1435,8 +1610,8 @@ xfs_btree_decrement(
union xfs_btree_ptr *ptrp;
ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
- error = xfs_btree_read_buf_block(cur, ptrp, --lev,
- 0, &block, &bp);
+ --lev;
+ error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
if (error)
goto error0;
xfs_btree_setbuf(cur, lev, bp);
@@ -1486,7 +1661,7 @@ xfs_btree_lookup_get_block(
return 0;
}
- error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp);
+ error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
if (error)
return error;
@@ -1518,7 +1693,7 @@ xfs_lookup_get_search_key(
/*
* Lookup the record. The cursor is made to point to it, based on dir.
- * Return 0 if can't find any such record, 1 for success.
+ * stat is set to 0 if can't find any such record, 1 for success.
*/
int /* error */
xfs_btree_lookup(
@@ -1837,7 +2012,7 @@ xfs_btree_lshift(
goto out0;
/* Set up the left neighbor as "left". */
- error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp);
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
if (error)
goto error0;
@@ -2021,7 +2196,7 @@ xfs_btree_rshift(
goto out0;
/* Set up the right neighbor as "right". */
- error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp);
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
if (error)
goto error0;
@@ -2149,7 +2324,7 @@ error1:
* record (to be inserted into parent).
*/
STATIC int /* error */
-xfs_btree_split(
+__xfs_btree_split(
struct xfs_btree_cur *cur,
int level,
union xfs_btree_ptr *ptrp,
@@ -2191,7 +2366,7 @@ xfs_btree_split(
xfs_btree_buf_to_ptr(cur, lbp, &lptr);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat);
+ error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -2204,7 +2379,7 @@ xfs_btree_split(
goto error0;
/* Fill in the btree header for the new right block. */
- xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp);
+ xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
/*
* Split the entries between the old and the new block evenly.
@@ -2289,7 +2464,7 @@ xfs_btree_split(
* point back to right instead of to left.
*/
if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
- error = xfs_btree_read_buf_block(cur, &rrptr, level,
+ error = xfs_btree_read_buf_block(cur, &rrptr,
0, &rrblock, &rrbp);
if (error)
goto error0;
@@ -2329,6 +2504,85 @@ error0:
return error;
}
+struct xfs_btree_split_args {
+ struct xfs_btree_cur *cur;
+ int level;
+ union xfs_btree_ptr *ptrp;
+ union xfs_btree_key *key;
+ struct xfs_btree_cur **curp;
+ int *stat; /* success/failure */
+ int result;
+ bool kswapd; /* allocation in kswapd context */
+ struct completion *done;
+ struct work_struct work;
+};
+
+/*
+ * Stack switching interfaces for allocation
+ */
+static void
+xfs_btree_split_worker(
+ struct work_struct *work)
+{
+ struct xfs_btree_split_args *args = container_of(work,
+ struct xfs_btree_split_args, work);
+ unsigned long pflags;
+ unsigned long new_pflags = PF_FSTRANS;
+
+ /*
+ * we are in a transaction context here, but may also be doing work
+ * in kswapd context, and hence we may need to inherit that state
+ * temporarily to ensure that we don't block waiting for memory reclaim
+ * in any way.
+ */
+ if (args->kswapd)
+ new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+
+ current_set_flags_nested(&pflags, new_pflags);
+
+ args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
+ args->key, args->curp, args->stat);
+ complete(args->done);
+
+ current_restore_flags_nested(&pflags, new_pflags);
+}
+
+/*
+ * BMBT split requests often come in with little stack to work on. Push
+ * them off to a worker thread so there is lots of stack to use. For the other
+ * btree types, just call directly to avoid the context switch overhead here.
+ */
+STATIC int /* error */
+xfs_btree_split(
+ struct xfs_btree_cur *cur,
+ int level,
+ union xfs_btree_ptr *ptrp,
+ union xfs_btree_key *key,
+ struct xfs_btree_cur **curp,
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_split_args args;
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ if (cur->bc_btnum != XFS_BTNUM_BMAP)
+ return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
+
+ args.cur = cur;
+ args.level = level;
+ args.ptrp = ptrp;
+ args.key = key;
+ args.curp = curp;
+ args.stat = stat;
+ args.done = &done;
+ args.kswapd = current_is_kswapd();
+ INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
+ queue_work(xfs_alloc_wq, &args.work);
+ wait_for_completion(&done);
+ destroy_work_on_stack(&args.work);
+ return args.result;
+}
+
+
/*
* Copy the old inode root contents into a real block and make the
* broot point to it.
@@ -2364,7 +2618,7 @@ xfs_btree_new_iroot(
pp = xfs_btree_ptr_addr(cur, 1, block);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat);
+ error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
if (error)
goto error0;
if (*stat == 0) {
@@ -2378,7 +2632,17 @@ xfs_btree_new_iroot(
if (error)
goto error0;
+ /*
+ * we can't just memcpy() the root in for CRC enabled btree blocks.
+ * In that case have to also ensure the blkno remains correct
+ */
memcpy(cblock, block, xfs_btree_block_len(cur));
+ if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
+ else
+ cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
+ }
be16_add_cpu(&block->bb_level, 1);
xfs_btree_set_numrecs(block, 1);
@@ -2458,7 +2722,7 @@ xfs_btree_new_root(
cur->bc_ops->init_ptr_from_cur(cur, &rptr);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat);
+ error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -2493,8 +2757,7 @@ xfs_btree_new_root(
lbp = bp;
xfs_btree_buf_to_ptr(cur, lbp, &lptr);
left = block;
- error = xfs_btree_read_buf_block(cur, &rptr,
- cur->bc_nlevels - 1, 0, &right, &rbp);
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
if (error)
goto error0;
bp = rbp;
@@ -2505,15 +2768,14 @@ xfs_btree_new_root(
xfs_btree_buf_to_ptr(cur, rbp, &rptr);
right = block;
xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
- error = xfs_btree_read_buf_block(cur, &lptr,
- cur->bc_nlevels - 1, 0, &left, &lbp);
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
if (error)
goto error0;
bp = lbp;
nptr = 2;
}
/* Fill in the new block's btree header and log it. */
- xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp);
+ xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
!xfs_btree_ptr_is_null(cur, &rptr));
@@ -2580,7 +2842,6 @@ xfs_btree_make_block_unfull(
if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
/* A root block that can be made bigger. */
-
xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
} else {
/* A root block that needs replacing */
@@ -3459,8 +3720,7 @@ xfs_btree_delrec(
rptr = cptr;
right = block;
rbp = bp;
- error = xfs_btree_read_buf_block(cur, &lptr, level,
- 0, &left, &lbp);
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
if (error)
goto error0;
@@ -3477,8 +3737,7 @@ xfs_btree_delrec(
lptr = cptr;
left = block;
lbp = bp;
- error = xfs_btree_read_buf_block(cur, &rptr, level,
- 0, &right, &rbp);
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
if (error)
goto error0;
@@ -3550,8 +3809,7 @@ xfs_btree_delrec(
/* If there is a right sibling, point it to the remaining block. */
xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
if (!xfs_btree_ptr_is_null(cur, &cptr)) {
- error = xfs_btree_read_buf_block(cur, &cptr, level,
- 0, &rrblock, &rrbp);
+ error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
if (error)
goto error0;
xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
@@ -3692,3 +3950,120 @@ xfs_btree_get_rec(
*stat = 1;
return 0;
}
+
+/*
+ * Change the owner of a btree.
+ *
+ * The mechanism we use here is ordered buffer logging. Because we don't know
+ * how many buffers were are going to need to modify, we don't really want to
+ * have to make transaction reservations for the worst case of every buffer in a
+ * full size btree as that may be more space that we can fit in the log....
+ *
+ * We do the btree walk in the most optimal manner possible - we have sibling
+ * pointers so we can just walk all the blocks on each level from left to right
+ * in a single pass, and then move to the next level and do the same. We can
+ * also do readahead on the sibling pointers to get IO moving more quickly,
+ * though for slow disks this is unlikely to make much difference to performance
+ * as the amount of CPU work we have to do before moving to the next block is
+ * relatively small.
+ *
+ * For each btree block that we load, modify the owner appropriately, set the
+ * buffer as an ordered buffer and log it appropriately. We need to ensure that
+ * we mark the region we change dirty so that if the buffer is relogged in
+ * a subsequent transaction the changes we make here as an ordered buffer are
+ * correctly relogged in that transaction. If we are in recovery context, then
+ * just queue the modified buffer as delayed write buffer so the transaction
+ * recovery completion writes the changes to disk.
+ */
+static int
+xfs_btree_block_change_owner(
+ struct xfs_btree_cur *cur,
+ int level,
+ __uint64_t new_owner,
+ struct list_head *buffer_list)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ union xfs_btree_ptr rptr;
+
+ /* do right sibling readahead */
+ xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+ /* modify the owner */
+ block = xfs_btree_get_block(cur, level, &bp);
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
+ else
+ block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
+
+ /*
+ * If the block is a root block hosted in an inode, we might not have a
+ * buffer pointer here and we shouldn't attempt to log the change as the
+ * information is already held in the inode and discarded when the root
+ * block is formatted into the on-disk inode fork. We still change it,
+ * though, so everything is consistent in memory.
+ */
+ if (bp) {
+ if (cur->bc_tp) {
+ xfs_trans_ordered_buf(cur->bc_tp, bp);
+ xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
+ } else {
+ xfs_buf_delwri_queue(bp, buffer_list);
+ }
+ } else {
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(level == cur->bc_nlevels - 1);
+ }
+
+ /* now read rh sibling block for next iteration */
+ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+ if (xfs_btree_ptr_is_null(cur, &rptr))
+ return ENOENT;
+
+ return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
+}
+
+int
+xfs_btree_change_owner(
+ struct xfs_btree_cur *cur,
+ __uint64_t new_owner,
+ struct list_head *buffer_list)
+{
+ union xfs_btree_ptr lptr;
+ int level;
+ struct xfs_btree_block *block = NULL;
+ int error = 0;
+
+ cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+
+ /* for each level */
+ for (level = cur->bc_nlevels - 1; level >= 0; level--) {
+ /* grab the left hand block */
+ error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
+ if (error)
+ return error;
+
+ /* readahead the left most block for the next level down */
+ if (level > 0) {
+ union xfs_btree_ptr *ptr;
+
+ ptr = xfs_btree_ptr_addr(cur, 1, block);
+ xfs_btree_readahead_ptr(cur, ptr, 1);
+
+ /* save for the next iteration of the loop */
+ lptr = *ptr;
+ }
+
+ /* for each buffer in the level */
+ do {
+ error = xfs_btree_block_change_owner(cur, level,
+ new_owner,
+ buffer_list);
+ } while (!error);
+
+ if (error != ENOENT)
+ return error;
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index f932897194e..a04b69422f6 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -27,48 +27,6 @@ struct xfs_trans;
extern kmem_zone_t *xfs_btree_cur_zone;
/*
- * This nonsense is to make -wlint happy.
- */
-#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi)
-#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi)
-#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi)
-
-#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi)
-#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi)
-#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi)
-#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
-
-/*
- * Generic btree header.
- *
- * This is a combination of the actual format used on disk for short and long
- * format btrees. The first three fields are shared by both format, but
- * the pointers are different and should be used with care.
- *
- * To get the size of the actual short or long form headers please use
- * the size macros below. Never use sizeof(xfs_btree_block).
- */
-struct xfs_btree_block {
- __be32 bb_magic; /* magic number for block type */
- __be16 bb_level; /* 0 is a leaf */
- __be16 bb_numrecs; /* current # of data records */
- union {
- struct {
- __be32 bb_leftsib;
- __be32 bb_rightsib;
- } s; /* short form pointers */
- struct {
- __be64 bb_leftsib;
- __be64 bb_rightsib;
- } l; /* long form pointers */
- } bb_u; /* rest */
-};
-
-#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */
-#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */
-
-
-/*
* Generic key, ptr and record wrapper structures.
*
* These are disk format structures, and are converted where necessary
@@ -94,20 +52,34 @@ union xfs_btree_rec {
};
/*
- * For logging record fields.
+ * This nonsense is to make -wlint happy.
*/
-#define XFS_BB_MAGIC 0x01
-#define XFS_BB_LEVEL 0x02
-#define XFS_BB_NUMRECS 0x04
-#define XFS_BB_LEFTSIB 0x08
-#define XFS_BB_RIGHTSIB 0x10
-#define XFS_BB_NUM_BITS 5
-#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
+#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi)
+#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi)
+#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi)
+
+#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi)
+#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi)
+#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi)
+#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
+#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi)
/*
- * Magic numbers for btree blocks.
+ * For logging record fields.
*/
-extern const __uint32_t xfs_magics[];
+#define XFS_BB_MAGIC (1 << 0)
+#define XFS_BB_LEVEL (1 << 1)
+#define XFS_BB_NUMRECS (1 << 2)
+#define XFS_BB_LEFTSIB (1 << 3)
+#define XFS_BB_RIGHTSIB (1 << 4)
+#define XFS_BB_BLKNO (1 << 5)
+#define XFS_BB_LSN (1 << 6)
+#define XFS_BB_UUID (1 << 7)
+#define XFS_BB_OWNER (1 << 8)
+#define XFS_BB_NUM_BITS 5
+#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
+#define XFS_BB_NUM_BITS_CRC 9
+#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1)
/*
* Generic stats interface
@@ -121,6 +93,7 @@ do { \
case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \
case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \
case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \
+ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
@@ -134,6 +107,7 @@ do { \
case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
+ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
@@ -158,7 +132,7 @@ struct xfs_btree_ops {
int (*alloc_block)(struct xfs_btree_cur *cur,
union xfs_btree_ptr *start_bno,
union xfs_btree_ptr *new_bno,
- int length, int *stat);
+ int *stat);
int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
/* update last record information */
@@ -190,7 +164,7 @@ struct xfs_btree_ops {
const struct xfs_buf_ops *buf_ops;
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
/* check that k1 is lower than k2 */
int (*keys_inorder)(struct xfs_btree_cur *cur,
union xfs_btree_key *k1,
@@ -256,6 +230,7 @@ typedef struct xfs_btree_cur
#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
+#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */
#define XFS_BTREE_NOERROR 0
@@ -393,8 +368,20 @@ xfs_btree_init_block(
__u32 magic,
__u16 level,
__u16 numrecs,
+ __u64 owner,
unsigned int flags);
+void
+xfs_btree_init_block_int(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *buf,
+ xfs_daddr_t blkno,
+ __u32 magic,
+ __u16 level,
+ __u16 numrecs,
+ __u64 owner,
+ unsigned int flags);
+
/*
* Common btree core entry points.
*/
@@ -406,6 +393,16 @@ int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
int xfs_btree_insert(struct xfs_btree_cur *, int *);
int xfs_btree_delete(struct xfs_btree_cur *, int *);
int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
+int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner,
+ struct list_head *buffer_list);
+
+/*
+ * btree block CRC helpers
+ */
+void xfs_btree_lblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_lblock_verify_crc(struct xfs_buf *);
+void xfs_btree_sblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
/*
* Internal btree helpers also used by xfs_bmap.c.
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 56d1614760c..7a34a1ae655 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -34,11 +34,13 @@
#include <linux/backing-dev.h>
#include <linux/freezer.h>
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
-#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
static kmem_zone_t *xfs_buf_zone;
@@ -80,54 +82,6 @@ xfs_buf_vmap_len(
}
/*
- * xfs_buf_lru_add - add a buffer to the LRU.
- *
- * The LRU takes a new reference to the buffer so that it will only be freed
- * once the shrinker takes the buffer off the LRU.
- */
-STATIC void
-xfs_buf_lru_add(
- struct xfs_buf *bp)
-{
- struct xfs_buftarg *btp = bp->b_target;
-
- spin_lock(&btp->bt_lru_lock);
- if (list_empty(&bp->b_lru)) {
- atomic_inc(&bp->b_hold);
- list_add_tail(&bp->b_lru, &btp->bt_lru);
- btp->bt_lru_nr++;
- bp->b_lru_flags &= ~_XBF_LRU_DISPOSE;
- }
- spin_unlock(&btp->bt_lru_lock);
-}
-
-/*
- * xfs_buf_lru_del - remove a buffer from the LRU
- *
- * The unlocked check is safe here because it only occurs when there are not
- * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
- * to optimise the shrinker removing the buffer from the LRU and calling
- * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
- * bt_lru_lock.
- */
-STATIC void
-xfs_buf_lru_del(
- struct xfs_buf *bp)
-{
- struct xfs_buftarg *btp = bp->b_target;
-
- if (list_empty(&bp->b_lru))
- return;
-
- spin_lock(&btp->bt_lru_lock);
- if (!list_empty(&bp->b_lru)) {
- list_del_init(&bp->b_lru);
- btp->bt_lru_nr--;
- }
- spin_unlock(&btp->bt_lru_lock);
-}
-
-/*
* When we mark a buffer stale, we remove the buffer from the LRU and clear the
* b_lru_ref count so that the buffer is freed immediately when the buffer
* reference count falls to zero. If the buffer is already on the LRU, we need
@@ -150,20 +104,14 @@ xfs_buf_stale(
*/
bp->b_flags &= ~_XBF_DELWRI_Q;
- atomic_set(&(bp)->b_lru_ref, 0);
- if (!list_empty(&bp->b_lru)) {
- struct xfs_buftarg *btp = bp->b_target;
+ spin_lock(&bp->b_lock);
+ atomic_set(&bp->b_lru_ref, 0);
+ if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
+ (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
+ atomic_dec(&bp->b_hold);
- spin_lock(&btp->bt_lru_lock);
- if (!list_empty(&bp->b_lru) &&
- !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) {
- list_del_init(&bp->b_lru);
- btp->bt_lru_nr--;
- atomic_dec(&bp->b_hold);
- }
- spin_unlock(&btp->bt_lru_lock);
- }
ASSERT(atomic_read(&bp->b_hold) >= 1);
+ spin_unlock(&bp->b_lock);
}
static int
@@ -227,6 +175,7 @@ _xfs_buf_alloc(
INIT_LIST_HEAD(&bp->b_list);
RB_CLEAR_NODE(&bp->b_rbnode);
sema_init(&bp->b_sema, 0); /* held, no waiters */
+ spin_lock_init(&bp->b_lock);
XB_SET_OWNER(bp);
bp->b_target = target;
bp->b_flags = flags;
@@ -267,8 +216,7 @@ _xfs_buf_alloc(
STATIC int
_xfs_buf_get_pages(
xfs_buf_t *bp,
- int page_count,
- xfs_buf_flags_t flags)
+ int page_count)
{
/* Make sure that we have a page list */
if (bp->b_pages == NULL) {
@@ -303,7 +251,7 @@ _xfs_buf_free_pages(
* Releases the specified buffer.
*
* The modification state of any associated pages is left unchanged.
- * The buffer most not be on any hash - use xfs_buf_rele instead for
+ * The buffer must not be on any hash - use xfs_buf_rele instead for
* hashed and refcounted buffers
*/
void
@@ -381,7 +329,7 @@ use_alloc_page:
end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
>> PAGE_SHIFT;
page_count = end - start;
- error = _xfs_buf_get_pages(bp, page_count, flags);
+ error = _xfs_buf_get_pages(bp, page_count);
if (unlikely(error))
return error;
@@ -447,7 +395,17 @@ _xfs_buf_map_pages(
bp->b_addr = NULL;
} else {
int retried = 0;
+ unsigned noio_flag;
+ /*
+ * vm_map_ram() will allocate auxillary structures (e.g.
+ * pagetables) with GFP_KERNEL, yet we are likely to be under
+ * GFP_NOFS context here. Hence we need to tell memory reclaim
+ * that we are in such a context via PF_MEMALLOC_NOIO to prevent
+ * memory reclaim re-entering the filesystem here and
+ * potentially deadlocking.
+ */
+ noio_flag = memalloc_noio_save();
do {
bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-1, PAGE_KERNEL);
@@ -455,6 +413,7 @@ _xfs_buf_map_pages(
break;
vm_unmap_aliases();
} while (retried++ <= 1);
+ memalloc_noio_restore(noio_flag);
if (!bp->b_addr)
return -ENOMEM;
@@ -487,6 +446,7 @@ _xfs_buf_find(
struct rb_node *parent;
xfs_buf_t *bp;
xfs_daddr_t blkno = map[0].bm_bn;
+ xfs_daddr_t eofs;
int numblks = 0;
int i;
@@ -495,8 +455,26 @@ _xfs_buf_find(
numbytes = BBTOB(numblks);
/* Check for IOs smaller than the sector size / not sector aligned */
- ASSERT(!(numbytes < (1 << btp->bt_sshift)));
- ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));
+ ASSERT(!(numbytes < btp->bt_meta_sectorsize));
+ ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
+
+ /*
+ * Corrupted block numbers can get through to here, unfortunately, so we
+ * have to check that the buffer falls within the filesystem bounds.
+ */
+ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
+ if (blkno >= eofs) {
+ /*
+ * XXX (dgc): we should really be returning EFSCORRUPTED here,
+ * but none of the higher level infrastructure supports
+ * returning a specific error on buffer lookup failures.
+ */
+ xfs_alert(btp->bt_mount,
+ "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
+ __func__, blkno, eofs);
+ WARN_ON(1);
+ return NULL;
+ }
/* get tree root */
pag = xfs_perag_get(btp->bt_mount,
@@ -623,7 +601,7 @@ found:
error = _xfs_buf_map_pages(bp, flags);
if (unlikely(error)) {
xfs_warn(target->bt_mount,
- "%s: failed to map pages\n", __func__);
+ "%s: failed to map pagesn", __func__);
xfs_buf_relse(bp);
return NULL;
}
@@ -730,7 +708,11 @@ xfs_buf_read_uncached(
bp->b_flags |= XBF_READ;
bp->b_ops = ops;
- xfsbdstrat(target->bt_mount, bp);
+ if (XFS_FORCED_SHUTDOWN(target->bt_mount)) {
+ xfs_buf_relse(bp);
+ return NULL;
+ }
+ xfs_buf_iorequest(bp);
xfs_buf_iowait(bp);
return bp;
}
@@ -795,7 +777,7 @@ xfs_buf_associate_memory(
bp->b_pages = NULL;
bp->b_addr = mem;
- rval = _xfs_buf_get_pages(bp, page_count, 0);
+ rval = _xfs_buf_get_pages(bp, page_count);
if (rval)
return rval;
@@ -828,7 +810,7 @@ xfs_buf_get_uncached(
goto fail;
page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
- error = _xfs_buf_get_pages(bp, page_count, 0);
+ error = _xfs_buf_get_pages(bp, page_count);
if (error)
goto fail_free_buf;
@@ -842,7 +824,7 @@ xfs_buf_get_uncached(
error = _xfs_buf_map_pages(bp, 0);
if (unlikely(error)) {
xfs_warn(target->bt_mount,
- "%s: failed to map pages\n", __func__);
+ "%s: failed to map pages", __func__);
goto fail_free_mem;
}
@@ -897,12 +879,33 @@ xfs_buf_rele(
ASSERT(atomic_read(&bp->b_hold) > 0);
if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
- if (!(bp->b_flags & XBF_STALE) &&
- atomic_read(&bp->b_lru_ref)) {
- xfs_buf_lru_add(bp);
+ spin_lock(&bp->b_lock);
+ if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
+ /*
+ * If the buffer is added to the LRU take a new
+ * reference to the buffer for the LRU and clear the
+ * (now stale) dispose list state flag
+ */
+ if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+ bp->b_state &= ~XFS_BSTATE_DISPOSE;
+ atomic_inc(&bp->b_hold);
+ }
+ spin_unlock(&bp->b_lock);
spin_unlock(&pag->pag_buf_lock);
} else {
- xfs_buf_lru_del(bp);
+ /*
+ * most of the time buffers will already be removed from
+ * the LRU, so optimise that case by checking for the
+ * XFS_BSTATE_DISPOSE flag indicating the last list the
+ * buffer was on was the disposal list
+ */
+ if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
+ list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+ } else {
+ ASSERT(list_empty(&bp->b_lru));
+ }
+ spin_unlock(&bp->b_lock);
+
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
spin_unlock(&pag->pag_buf_lock);
@@ -933,8 +936,6 @@ xfs_buf_trylock(
locked = down_trylock(&bp->b_sema) == 0;
if (locked)
XB_SET_OWNER(bp);
- else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
- xfs_log_force(bp->b_target->bt_mount, 0);
trace_xfs_buf_trylock(bp, _RET_IP_);
return locked;
@@ -1006,7 +1007,9 @@ xfs_buf_iodone_work(
bool read = !!(bp->b_flags & XBF_READ);
bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
- if (read && bp->b_ops)
+
+ /* only validate buffers that were read without errors */
+ if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE))
bp->b_ops->verify_read(bp);
if (bp->b_iodone)
@@ -1100,7 +1103,7 @@ xfs_bioerror(
* This is meant for userdata errors; metadata bufs come with
* iodone functions attached, so that we can track down errors.
*/
-STATIC int
+int
xfs_bioerror_relse(
struct xfs_buf *bp)
{
@@ -1163,7 +1166,7 @@ xfs_bwrite(
ASSERT(xfs_buf_islocked(bp));
bp->b_flags |= XBF_WRITE;
- bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);
+ bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL);
xfs_bdstrat_cb(bp);
@@ -1175,25 +1178,6 @@ xfs_bwrite(
return error;
}
-/*
- * Wrapper around bdstrat so that we can stop data from going to disk in case
- * we are shutting down the filesystem. Typically user data goes thru this
- * path; one of the exceptions is the superblock.
- */
-void
-xfsbdstrat(
- struct xfs_mount *mp,
- struct xfs_buf *bp)
-{
- if (XFS_FORCED_SHUTDOWN(mp)) {
- trace_xfs_bdstrat_shut(bp, _RET_IP_);
- xfs_bioerror_relse(bp);
- return;
- }
-
- xfs_buf_iorequest(bp);
-}
-
STATIC void
_xfs_buf_ioend(
xfs_buf_t *bp,
@@ -1266,7 +1250,7 @@ next_chunk:
bio = bio_alloc(GFP_NOIO, nr_pages);
bio->bi_bdev = bp->b_target->bt_bdev;
- bio->bi_sector = sector;
+ bio->bi_iter.bi_sector = sector;
bio->bi_end_io = xfs_buf_bio_end_io;
bio->bi_private = bp;
@@ -1288,7 +1272,7 @@ next_chunk:
total_nr_pages--;
}
- if (likely(bio->bi_size)) {
+ if (likely(bio->bi_iter.bi_size)) {
if (xfs_buf_is_vmapped(bp)) {
flush_kernel_vmap_range(bp->b_addr,
xfs_buf_vmap_len(bp));
@@ -1318,6 +1302,12 @@ _xfs_buf_ioapply(
int size;
int i;
+ /*
+ * Make sure we capture only current IO errors rather than stale errors
+ * left over from previous use of the buffer (e.g. failed readahead).
+ */
+ bp->b_error = 0;
+
if (bp->b_flags & XBF_WRITE) {
if (bp->b_flags & XBF_SYNCIO)
rw = WRITE_SYNC;
@@ -1381,21 +1371,29 @@ xfs_buf_iorequest(
xfs_buf_wait_unpin(bp);
xfs_buf_hold(bp);
- /* Set the count to 1 initially, this will stop an I/O
+ /*
+ * Set the count to 1 initially, this will stop an I/O
* completion callout which happens before we have started
* all the I/O from calling xfs_buf_ioend too early.
*/
atomic_set(&bp->b_io_remaining, 1);
_xfs_buf_ioapply(bp);
- _xfs_buf_ioend(bp, 1);
+ /*
+ * If _xfs_buf_ioapply failed, we'll get back here with
+ * only the reference we took above. _xfs_buf_ioend will
+ * drop it to zero, so we'd better not queue it for later,
+ * or we'll free it before it's done.
+ */
+ _xfs_buf_ioend(bp, bp->b_error ? 0 : 1);
xfs_buf_rele(bp);
}
/*
* Waits for I/O to complete on the buffer supplied. It returns immediately if
- * no I/O is pending or there is already a pending error on the buffer. It
- * returns the I/O error code, if any, or 0 if there was no error.
+ * no I/O is pending or there is already a pending error on the buffer, in which
+ * case nothing will ever complete. It returns the I/O error code, if any, or
+ * 0 if there was no error.
*/
int
xfs_buf_iowait(
@@ -1476,81 +1474,127 @@ xfs_buf_iomove(
* returned. These buffers will have an elevated hold count, so wait on those
* while freeing all the buffers only held by the LRU.
*/
+static enum lru_status
+xfs_buftarg_wait_rele(
+ struct list_head *item,
+ spinlock_t *lru_lock,
+ void *arg)
+
+{
+ struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
+ struct list_head *dispose = arg;
+
+ if (atomic_read(&bp->b_hold) > 1) {
+ /* need to wait, so skip it this pass */
+ trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
+ return LRU_SKIP;
+ }
+ if (!spin_trylock(&bp->b_lock))
+ return LRU_SKIP;
+
+ /*
+ * clear the LRU reference count so the buffer doesn't get
+ * ignored in xfs_buf_rele().
+ */
+ atomic_set(&bp->b_lru_ref, 0);
+ bp->b_state |= XFS_BSTATE_DISPOSE;
+ list_move(item, dispose);
+ spin_unlock(&bp->b_lock);
+ return LRU_REMOVED;
+}
+
void
xfs_wait_buftarg(
struct xfs_buftarg *btp)
{
- struct xfs_buf *bp;
+ LIST_HEAD(dispose);
+ int loop = 0;
-restart:
- spin_lock(&btp->bt_lru_lock);
- while (!list_empty(&btp->bt_lru)) {
- bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
- if (atomic_read(&bp->b_hold) > 1) {
- spin_unlock(&btp->bt_lru_lock);
- delay(100);
- goto restart;
+ /* loop until there is nothing left on the lru list. */
+ while (list_lru_count(&btp->bt_lru)) {
+ list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
+ &dispose, LONG_MAX);
+
+ while (!list_empty(&dispose)) {
+ struct xfs_buf *bp;
+ bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+ list_del_init(&bp->b_lru);
+ if (bp->b_flags & XBF_WRITE_FAIL) {
+ xfs_alert(btp->bt_mount,
+"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n"
+"Please run xfs_repair to determine the extent of the problem.",
+ (long long)bp->b_bn);
+ }
+ xfs_buf_rele(bp);
}
- /*
- * clear the LRU reference count so the buffer doesn't get
- * ignored in xfs_buf_rele().
- */
- atomic_set(&bp->b_lru_ref, 0);
- spin_unlock(&btp->bt_lru_lock);
- xfs_buf_rele(bp);
- spin_lock(&btp->bt_lru_lock);
+ if (loop++ != 0)
+ delay(100);
}
- spin_unlock(&btp->bt_lru_lock);
}
-int
-xfs_buftarg_shrink(
+static enum lru_status
+xfs_buftarg_isolate(
+ struct list_head *item,
+ spinlock_t *lru_lock,
+ void *arg)
+{
+ struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
+ struct list_head *dispose = arg;
+
+ /*
+ * we are inverting the lru lock/bp->b_lock here, so use a trylock.
+ * If we fail to get the lock, just skip it.
+ */
+ if (!spin_trylock(&bp->b_lock))
+ return LRU_SKIP;
+ /*
+ * Decrement the b_lru_ref count unless the value is already
+ * zero. If the value is already zero, we need to reclaim the
+ * buffer, otherwise it gets another trip through the LRU.
+ */
+ if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+ spin_unlock(&bp->b_lock);
+ return LRU_ROTATE;
+ }
+
+ bp->b_state |= XFS_BSTATE_DISPOSE;
+ list_move(item, dispose);
+ spin_unlock(&bp->b_lock);
+ return LRU_REMOVED;
+}
+
+static unsigned long
+xfs_buftarg_shrink_scan(
struct shrinker *shrink,
struct shrink_control *sc)
{
struct xfs_buftarg *btp = container_of(shrink,
struct xfs_buftarg, bt_shrinker);
- struct xfs_buf *bp;
- int nr_to_scan = sc->nr_to_scan;
LIST_HEAD(dispose);
+ unsigned long freed;
+ unsigned long nr_to_scan = sc->nr_to_scan;
- if (!nr_to_scan)
- return btp->bt_lru_nr;
-
- spin_lock(&btp->bt_lru_lock);
- while (!list_empty(&btp->bt_lru)) {
- if (nr_to_scan-- <= 0)
- break;
-
- bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
-
- /*
- * Decrement the b_lru_ref count unless the value is already
- * zero. If the value is already zero, we need to reclaim the
- * buffer, otherwise it gets another trip through the LRU.
- */
- if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
- list_move_tail(&bp->b_lru, &btp->bt_lru);
- continue;
- }
-
- /*
- * remove the buffer from the LRU now to avoid needing another
- * lock round trip inside xfs_buf_rele().
- */
- list_move(&bp->b_lru, &dispose);
- btp->bt_lru_nr--;
- bp->b_lru_flags |= _XBF_LRU_DISPOSE;
- }
- spin_unlock(&btp->bt_lru_lock);
+ freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate,
+ &dispose, &nr_to_scan);
while (!list_empty(&dispose)) {
+ struct xfs_buf *bp;
bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
list_del_init(&bp->b_lru);
xfs_buf_rele(bp);
}
- return btp->bt_lru_nr;
+ return freed;
+}
+
+static unsigned long
+xfs_buftarg_shrink_count(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_buftarg *btp = container_of(shrink,
+ struct xfs_buftarg, bt_shrinker);
+ return list_lru_count_node(&btp->bt_lru, sc->nid);
}
void
@@ -1559,6 +1603,7 @@ xfs_free_buftarg(
struct xfs_buftarg *btp)
{
unregister_shrinker(&btp->bt_shrinker);
+ list_lru_destroy(&btp->bt_lru);
if (mp->m_flags & XFS_MOUNT_BARRIER)
xfs_blkdev_issue_flush(btp);
@@ -1566,16 +1611,14 @@ xfs_free_buftarg(
kmem_free(btp);
}
-STATIC int
-xfs_setsize_buftarg_flags(
+int
+xfs_setsize_buftarg(
xfs_buftarg_t *btp,
- unsigned int blocksize,
- unsigned int sectorsize,
- int verbose)
+ unsigned int sectorsize)
{
- btp->bt_bsize = blocksize;
- btp->bt_sshift = ffs(sectorsize) - 1;
- btp->bt_smask = sectorsize - 1;
+ /* Set up metadata sector size info */
+ btp->bt_meta_sectorsize = sectorsize;
+ btp->bt_meta_sectormask = sectorsize - 1;
if (set_blocksize(btp->bt_bdev, sectorsize)) {
char name[BDEVNAME_SIZE];
@@ -1583,47 +1626,39 @@ xfs_setsize_buftarg_flags(
bdevname(btp->bt_bdev, name);
xfs_warn(btp->bt_mount,
- "Cannot set_blocksize to %u on device %s\n",
+ "Cannot set_blocksize to %u on device %s",
sectorsize, name);
return EINVAL;
}
+ /* Set up device logical sector size mask */
+ btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
+ btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
+
return 0;
}
/*
- * When allocating the initial buffer target we have not yet
- * read in the superblock, so don't know what sized sectors
- * are being used is at this early stage. Play safe.
+ * When allocating the initial buffer target we have not yet
+ * read in the superblock, so don't know what sized sectors
+ * are being used at this early stage. Play safe.
*/
STATIC int
xfs_setsize_buftarg_early(
xfs_buftarg_t *btp,
struct block_device *bdev)
{
- return xfs_setsize_buftarg_flags(btp,
- PAGE_SIZE, bdev_logical_block_size(bdev), 0);
-}
-
-int
-xfs_setsize_buftarg(
- xfs_buftarg_t *btp,
- unsigned int blocksize,
- unsigned int sectorsize)
-{
- return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
+ return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
}
xfs_buftarg_t *
xfs_alloc_buftarg(
struct xfs_mount *mp,
- struct block_device *bdev,
- int external,
- const char *fsname)
+ struct block_device *bdev)
{
xfs_buftarg_t *btp;
- btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
+ btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
@@ -1632,12 +1667,16 @@ xfs_alloc_buftarg(
if (!btp->bt_bdi)
goto error;
- INIT_LIST_HEAD(&btp->bt_lru);
- spin_lock_init(&btp->bt_lru_lock);
if (xfs_setsize_buftarg_early(btp, bdev))
goto error;
- btp->bt_shrinker.shrink = xfs_buftarg_shrink;
+
+ if (list_lru_init(&btp->bt_lru))
+ goto error;
+
+ btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
+ btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
btp->bt_shrinker.seeks = DEFAULT_SEEKS;
+ btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
register_shrinker(&btp->bt_shrinker);
return btp;
@@ -1759,7 +1798,7 @@ __xfs_buf_delwri_submit(
blk_start_plug(&plug);
list_for_each_entry_safe(bp, n, io_list, b_list) {
- bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
+ bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
bp->b_flags |= XBF_WRITE;
if (!wait) {
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 433a12ed7b1..3a7a5523d3d 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -25,6 +25,7 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/uio.h>
+#include <linux/list_lru.h>
/*
* Base types
@@ -44,6 +45,7 @@ typedef enum {
#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
+#define XBF_WRITE_FAIL (1 << 24)/* async writes have failed on this buffer */
/* I/O hints for the BIO layer */
#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */
@@ -59,7 +61,6 @@ typedef enum {
#define _XBF_KMEM (1 << 21)/* backed by heap memory */
#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
#define _XBF_COMPOUND (1 << 23)/* compound buffer */
-#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */
typedef unsigned int xfs_buf_flags_t;
@@ -70,6 +71,7 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_ASYNC, "ASYNC" }, \
{ XBF_DONE, "DONE" }, \
{ XBF_STALE, "STALE" }, \
+ { XBF_WRITE_FAIL, "WRITE_FAIL" }, \
{ XBF_SYNCIO, "SYNCIO" }, \
{ XBF_FUA, "FUA" }, \
{ XBF_FLUSH, "FLUSH" }, \
@@ -78,23 +80,40 @@ typedef unsigned int xfs_buf_flags_t;
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
- { _XBF_COMPOUND, "COMPOUND" }, \
- { _XBF_LRU_DISPOSE, "LRU_DISPOSE" }
+ { _XBF_COMPOUND, "COMPOUND" }
+
+/*
+ * Internal state flags.
+ */
+#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
+
+/*
+ * The xfs_buftarg contains 2 notions of "sector size" -
+ *
+ * 1) The metadata sector size, which is the minimum unit and
+ * alignment of IO which will be performed by metadata operations.
+ * 2) The device logical sector size
+ *
+ * The first is specified at mkfs time, and is stored on-disk in the
+ * superblock's sb_sectsize.
+ *
+ * The latter is derived from the underlying device, and controls direct IO
+ * alignment constraints.
+ */
typedef struct xfs_buftarg {
dev_t bt_dev;
struct block_device *bt_bdev;
struct backing_dev_info *bt_bdi;
struct xfs_mount *bt_mount;
- unsigned int bt_bsize;
- unsigned int bt_sshift;
- size_t bt_smask;
+ unsigned int bt_meta_sectorsize;
+ size_t bt_meta_sectormask;
+ size_t bt_logical_sectorsize;
+ size_t bt_logical_sectormask;
/* LRU control structures */
struct shrinker bt_shrinker;
- struct list_head bt_lru;
- spinlock_t bt_lru_lock;
- unsigned int bt_lru_nr;
+ struct list_lru bt_lru;
} xfs_buftarg_t;
struct xfs_buf;
@@ -137,7 +156,8 @@ typedef struct xfs_buf {
* bt_lru_lock and not by b_sema
*/
struct list_head b_lru; /* lru list */
- xfs_buf_flags_t b_lru_flags; /* internal lru status flags */
+ spinlock_t b_lock; /* internal state lock */
+ unsigned int b_state; /* internal state flags */
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
struct xfs_perag *b_pag; /* contains rbtree root */
@@ -266,9 +286,6 @@ extern void xfs_buf_unlock(xfs_buf_t *);
/* Buffer Read and Write Routines */
extern int xfs_bwrite(struct xfs_buf *bp);
-
-extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
-
extern void xfs_buf_ioend(xfs_buf_t *, int);
extern void xfs_buf_ioerror(xfs_buf_t *, int);
extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
@@ -279,10 +296,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
#define xfs_buf_zero(bp, off, len) \
xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
-static inline int xfs_buf_geterror(xfs_buf_t *bp)
-{
- return bp ? bp->b_error : ENOMEM;
-}
+extern int xfs_bioerror_relse(struct xfs_buf *);
/* Buffer Utility Routines */
extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
@@ -298,7 +312,8 @@ extern void xfs_buf_terminate(void);
#define XFS_BUF_ZEROFLAGS(bp) \
((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \
- XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
+ XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \
+ XBF_WRITE_FAIL))
void xfs_buf_stale(struct xfs_buf *bp);
#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
@@ -349,14 +364,28 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
xfs_buf_rele(bp);
}
+static inline int
+xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
+{
+ return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+ cksum_offset);
+}
+
+static inline void
+xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
+{
+ xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+ cksum_offset);
+}
+
/*
* Handling of buftargs.
*/
extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
- struct block_device *, int, const char *);
+ struct block_device *);
extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
extern void xfs_wait_buftarg(xfs_buftarg_t *);
-extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
+extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 77b09750e92..4654338b03f 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -17,17 +17,18 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
kmem_zone_t *xfs_buf_item_zone;
@@ -37,110 +38,15 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
return container_of(lip, struct xfs_buf_log_item, bli_item);
}
+STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
-#ifdef XFS_TRANS_DEBUG
-/*
- * This function uses an alternate strategy for tracking the bytes
- * that the user requests to be logged. This can then be used
- * in conjunction with the bli_orig array in the buf log item to
- * catch bugs in our callers' code.
- *
- * We also double check the bits set in xfs_buf_item_log using a
- * simple algorithm to check that every byte is accounted for.
- */
-STATIC void
-xfs_buf_item_log_debug(
- xfs_buf_log_item_t *bip,
- uint first,
- uint last)
-{
- uint x;
- uint byte;
- uint nbytes;
- uint chunk_num;
- uint word_num;
- uint bit_num;
- uint bit_set;
- uint *wordp;
-
- ASSERT(bip->bli_logged != NULL);
- byte = first;
- nbytes = last - first + 1;
- bfset(bip->bli_logged, first, nbytes);
- for (x = 0; x < nbytes; x++) {
- chunk_num = byte >> XFS_BLF_SHIFT;
- word_num = chunk_num >> BIT_TO_WORD_SHIFT;
- bit_num = chunk_num & (NBWORD - 1);
- wordp = &(bip->__bli_format.blf_data_map[word_num]);
- bit_set = *wordp & (1 << bit_num);
- ASSERT(bit_set);
- byte++;
- }
-}
-
-/*
- * This function is called when we flush something into a buffer without
- * logging it. This happens for things like inodes which are logged
- * separately from the buffer.
- */
-void
-xfs_buf_item_flush_log_debug(
- xfs_buf_t *bp,
- uint first,
- uint last)
-{
- xfs_buf_log_item_t *bip = bp->b_fspriv;
- uint nbytes;
-
- if (bip == NULL || (bip->bli_item.li_type != XFS_LI_BUF))
- return;
-
- ASSERT(bip->bli_logged != NULL);
- nbytes = last - first + 1;
- bfset(bip->bli_logged, first, nbytes);
-}
-
-/*
- * This function is called to verify that our callers have logged
- * all the bytes that they changed.
- *
- * It does this by comparing the original copy of the buffer stored in
- * the buf log item's bli_orig array to the current copy of the buffer
- * and ensuring that all bytes which mismatch are set in the bli_logged
- * array of the buf log item.
- */
-STATIC void
-xfs_buf_item_log_check(
- xfs_buf_log_item_t *bip)
+static inline int
+xfs_buf_log_format_size(
+ struct xfs_buf_log_format *blfp)
{
- char *orig;
- char *buffer;
- int x;
- xfs_buf_t *bp;
-
- ASSERT(bip->bli_orig != NULL);
- ASSERT(bip->bli_logged != NULL);
-
- bp = bip->bli_buf;
- ASSERT(bp->b_length > 0);
- ASSERT(bp->b_addr != NULL);
- orig = bip->bli_orig;
- buffer = bp->b_addr;
- for (x = 0; x < BBTOB(bp->b_length); x++) {
- if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) {
- xfs_emerg(bp->b_mount,
- "%s: bip %x buffer %x orig %x index %d",
- __func__, bip, bp, orig, x);
- ASSERT(0);
- }
- }
+ return offsetof(struct xfs_buf_log_format, blf_data_map) +
+ (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
}
-#else
-#define xfs_buf_item_log_debug(x,y,z)
-#define xfs_buf_item_log_check(x)
-#endif
-
-STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
/*
* This returns the number of log iovecs needed to log the
@@ -152,25 +58,27 @@ STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
*
* If the XFS_BLI_STALE flag has been set, then log nothing.
*/
-STATIC uint
+STATIC void
xfs_buf_item_size_segment(
struct xfs_buf_log_item *bip,
- struct xfs_buf_log_format *blfp)
+ struct xfs_buf_log_format *blfp,
+ int *nvecs,
+ int *nbytes)
{
struct xfs_buf *bp = bip->bli_buf;
- uint nvecs;
int next_bit;
int last_bit;
last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
if (last_bit == -1)
- return 0;
+ return;
/*
* initial count for a dirty buffer is 2 vectors - the format structure
* and the first dirty region.
*/
- nvecs = 2;
+ *nvecs += 2;
+ *nbytes += xfs_buf_log_format_size(blfp) + XFS_BLF_CHUNK;
while (last_bit != -1) {
/*
@@ -190,18 +98,17 @@ xfs_buf_item_size_segment(
break;
} else if (next_bit != last_bit + 1) {
last_bit = next_bit;
- nvecs++;
+ (*nvecs)++;
} else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
(xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
XFS_BLF_CHUNK)) {
last_bit = next_bit;
- nvecs++;
+ (*nvecs)++;
} else {
last_bit++;
}
+ *nbytes += XFS_BLF_CHUNK;
}
-
- return nvecs;
}
/*
@@ -221,12 +128,13 @@ xfs_buf_item_size_segment(
* If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
* format structures.
*/
-STATIC uint
+STATIC void
xfs_buf_item_size(
- struct xfs_log_item *lip)
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
- uint nvecs;
int i;
ASSERT(atomic_read(&bip->bli_refcount) > 0);
@@ -238,11 +146,26 @@ xfs_buf_item_size(
*/
trace_xfs_buf_item_size_stale(bip);
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
- return bip->bli_format_count;
+ *nvecs += bip->bli_format_count;
+ for (i = 0; i < bip->bli_format_count; i++) {
+ *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
+ }
+ return;
}
ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
+ if (bip->bli_flags & XFS_BLI_ORDERED) {
+ /*
+ * The buffer has been logged just to order it.
+ * It is not being included in the transaction
+ * commit, so no vectors are used at all.
+ */
+ trace_xfs_buf_item_size_ordered(bip);
+ *nvecs = XFS_LOG_VEC_ORDERED;
+ return;
+ }
+
/*
* the vector count is based on the number of buffer vectors we have
* dirty bits in. This will only be greater than one when we have a
@@ -252,30 +175,54 @@ xfs_buf_item_size(
* count for the extra buf log format structure that will need to be
* written.
*/
- nvecs = 0;
for (i = 0; i < bip->bli_format_count; i++) {
- nvecs += xfs_buf_item_size_segment(bip, &bip->bli_formats[i]);
+ xfs_buf_item_size_segment(bip, &bip->bli_formats[i],
+ nvecs, nbytes);
}
-
trace_xfs_buf_item_size(bip);
- return nvecs;
}
-static struct xfs_log_iovec *
+static inline void
+xfs_buf_item_copy_iovec(
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp,
+ struct xfs_buf *bp,
+ uint offset,
+ int first_bit,
+ uint nbits)
+{
+ offset += first_bit * XFS_BLF_CHUNK;
+ xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
+ xfs_buf_offset(bp, offset),
+ nbits * XFS_BLF_CHUNK);
+}
+
+static inline bool
+xfs_buf_item_straddle(
+ struct xfs_buf *bp,
+ uint offset,
+ int next_bit,
+ int last_bit)
+{
+ return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) !=
+ (xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) +
+ XFS_BLF_CHUNK);
+}
+
+static void
xfs_buf_item_format_segment(
struct xfs_buf_log_item *bip,
- struct xfs_log_iovec *vecp,
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp,
uint offset,
struct xfs_buf_log_format *blfp)
{
struct xfs_buf *bp = bip->bli_buf;
uint base_size;
- uint nvecs;
int first_bit;
int last_bit;
int next_bit;
uint nbits;
- uint buffer_offset;
/* copy the flags across from the base format item */
blfp->blf_flags = bip->__bli_format.blf_flags;
@@ -285,24 +232,19 @@ xfs_buf_item_format_segment(
* the actual size of the dirty bitmap rather than the size of the in
* memory structure.
*/
- base_size = offsetof(struct xfs_buf_log_format, blf_data_map) +
- (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
+ base_size = xfs_buf_log_format_size(blfp);
- nvecs = 0;
first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
/*
* If the map is not be dirty in the transaction, mark
* the size as zero and do not advance the vector pointer.
*/
- goto out;
+ return;
}
- vecp->i_addr = blfp;
- vecp->i_len = base_size;
- vecp->i_type = XLOG_REG_TYPE_BFORMAT;
- vecp++;
- nvecs = 1;
+ blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
+ blfp->blf_size = 1;
if (bip->bli_flags & XFS_BLI_STALE) {
/*
@@ -312,13 +254,13 @@ xfs_buf_item_format_segment(
*/
trace_xfs_buf_item_format_stale(bip);
ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
- goto out;
+ return;
}
+
/*
* Fill in an iovec for each set of contiguous chunks.
*/
-
last_bit = first_bit;
nbits = 1;
for (;;) {
@@ -331,47 +273,22 @@ xfs_buf_item_format_segment(
next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
(uint)last_bit + 1);
/*
- * If we run out of bits fill in the last iovec and get
- * out of the loop.
- * Else if we start a new set of bits then fill in the
- * iovec for the series we were looking at and start
- * counting the bits in the new one.
- * Else we're still in the same set of bits so just
- * keep counting and scanning.
+ * If we run out of bits fill in the last iovec and get out of
+ * the loop. Else if we start a new set of bits then fill in
+ * the iovec for the series we were looking at and start
+ * counting the bits in the new one. Else we're still in the
+ * same set of bits so just keep counting and scanning.
*/
if (next_bit == -1) {
- buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
- vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
- vecp->i_len = nbits * XFS_BLF_CHUNK;
- vecp->i_type = XLOG_REG_TYPE_BCHUNK;
- nvecs++;
+ xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
+ first_bit, nbits);
+ blfp->blf_size++;
break;
- } else if (next_bit != last_bit + 1) {
- buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
- vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
- vecp->i_len = nbits * XFS_BLF_CHUNK;
- vecp->i_type = XLOG_REG_TYPE_BCHUNK;
- nvecs++;
- vecp++;
- first_bit = next_bit;
- last_bit = next_bit;
- nbits = 1;
- } else if (xfs_buf_offset(bp, offset +
- (next_bit << XFS_BLF_SHIFT)) !=
- (xfs_buf_offset(bp, offset +
- (last_bit << XFS_BLF_SHIFT)) +
- XFS_BLF_CHUNK)) {
- buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
- vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
- vecp->i_len = nbits * XFS_BLF_CHUNK;
- vecp->i_type = XLOG_REG_TYPE_BCHUNK;
-/*
- * You would think we need to bump the nvecs here too, but we do not
- * this number is used by recovery, and it gets confused by the boundary
- * split here
- * nvecs++;
- */
- vecp++;
+ } else if (next_bit != last_bit + 1 ||
+ xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) {
+ xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
+ first_bit, nbits);
+ blfp->blf_size++;
first_bit = next_bit;
last_bit = next_bit;
nbits = 1;
@@ -380,9 +297,6 @@ xfs_buf_item_format_segment(
nbits++;
}
}
-out:
- blfp->blf_size = nvecs;
- return vecp;
}
/*
@@ -394,10 +308,11 @@ out:
STATIC void
xfs_buf_item_format(
struct xfs_log_item *lip,
- struct xfs_log_iovec *vecp)
+ struct xfs_log_vec *lv)
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
+ struct xfs_log_iovec *vecp = NULL;
uint offset = 0;
int i;
@@ -407,21 +322,39 @@ xfs_buf_item_format(
/*
* If it is an inode buffer, transfer the in-memory state to the
- * format flags and clear the in-memory state. We do not transfer
+ * format flags and clear the in-memory state.
+ *
+ * For buffer based inode allocation, we do not transfer
* this state if the inode buffer allocation has not yet been committed
* to the log as setting the XFS_BLI_INODE_BUF flag will prevent
* correct replay of the inode allocation.
+ *
+ * For icreate item based inode allocation, the buffers aren't written
+ * to the journal during allocation, and hence we should always tag the
+ * buffer as an inode buffer so that the correct unlinked list replay
+ * occurs during recovery.
*/
if (bip->bli_flags & XFS_BLI_INODE_BUF) {
- if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
+ if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) ||
+ !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
xfs_log_item_in_current_chkpt(lip)))
bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
bip->bli_flags &= ~XFS_BLI_INODE_BUF;
}
+ if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
+ XFS_BLI_ORDERED) {
+ /*
+ * The buffer has been logged just to order it. It is not being
+ * included in the transaction commit, so don't format it.
+ */
+ trace_xfs_buf_item_format_ordered(bip);
+ return;
+ }
+
for (i = 0; i < bip->bli_format_count; i++) {
- vecp = xfs_buf_item_format_segment(bip, vecp, offset,
- &bip->bli_formats[i]);
+ xfs_buf_item_format_segment(bip, lv, &vecp, offset,
+ &bip->bli_formats[i]);
offset += bp->b_maps[i].bm_len;
}
@@ -429,7 +362,6 @@ xfs_buf_item_format(
* Check to make sure everything is consistent.
*/
trace_xfs_buf_item_format(bip);
- xfs_buf_item_log_check(bip);
}
/*
@@ -449,6 +381,7 @@ xfs_buf_item_pin(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+ (bip->bli_flags & XFS_BLI_ORDERED) ||
(bip->bli_flags & XFS_BLI_STALE));
trace_xfs_buf_item_pin(bip);
@@ -562,6 +495,14 @@ xfs_buf_item_unpin(
}
}
+/*
+ * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30
+ * seconds so as to not spam logs too much on repeated detection of the same
+ * buffer being bad..
+ */
+
+DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
+
STATIC uint
xfs_buf_item_push(
struct xfs_log_item *lip,
@@ -573,13 +514,31 @@ xfs_buf_item_push(
if (xfs_buf_ispinned(bp))
return XFS_ITEM_PINNED;
- if (!xfs_buf_trylock(bp))
+ if (!xfs_buf_trylock(bp)) {
+ /*
+ * If we have just raced with a buffer being pinned and it has
+ * been marked stale, we could end up stalling until someone else
+ * issues a log force to unpin the stale buffer. Check for the
+ * race condition here so xfsaild recognizes the buffer is pinned
+ * and queues a log force to move it along.
+ */
+ if (xfs_buf_ispinned(bp))
+ return XFS_ITEM_PINNED;
return XFS_ITEM_LOCKED;
+ }
ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
trace_xfs_buf_item_push(bip);
+ /* has a previous flush failed due to IO errors? */
+ if ((bp->b_flags & XBF_WRITE_FAIL) &&
+ ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) {
+ xfs_warn(bp->b_target->bt_mount,
+"Detected failing async write on buffer block 0x%llx. Retrying async write.\n",
+ (long long)bp->b_bn);
+ }
+
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_unlock(bp);
@@ -611,8 +570,9 @@ xfs_buf_item_unlock(
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
- int aborted, clean, i;
- uint hold;
+ bool clean;
+ bool aborted;
+ int flags;
/* Clear the buffer's association with this transaction. */
bp->b_transp = NULL;
@@ -623,23 +583,21 @@ xfs_buf_item_unlock(
* (cancelled) buffers at unpin time, but we'll never go through the
* pin/unpin cycle if we abort inside commit.
*/
- aborted = (lip->li_flags & XFS_LI_ABORTED) != 0;
-
+ aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
/*
- * Before possibly freeing the buf item, determine if we should
- * release the buffer at the end of this routine.
+ * Before possibly freeing the buf item, copy the per-transaction state
+ * so we can reference it safely later after clearing it from the
+ * buffer log item.
*/
- hold = bip->bli_flags & XFS_BLI_HOLD;
-
- /* Clear the per transaction state. */
- bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
+ flags = bip->bli_flags;
+ bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
/*
* If the buf item is marked stale, then don't do anything. We'll
* unlock the buffer and free the buf item when the buffer is unpinned
* for the last time.
*/
- if (bip->bli_flags & XFS_BLI_STALE) {
+ if (flags & XFS_BLI_STALE) {
trace_xfs_buf_item_unlock_stale(bip);
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
if (!aborted) {
@@ -652,22 +610,49 @@ xfs_buf_item_unlock(
/*
* If the buf item isn't tracking any data, free it, otherwise drop the
- * reference we hold to it.
+ * reference we hold to it. If we are aborting the transaction, this may
+ * be the only reference to the buf item, so we free it anyway
+ * regardless of whether it is dirty or not. A dirty abort implies a
+ * shutdown, anyway.
+ *
+ * Ordered buffers are dirty but may have no recorded changes, so ensure
+ * we only release clean items here.
*/
- clean = 1;
- for (i = 0; i < bip->bli_format_count; i++) {
- if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
- bip->bli_formats[i].blf_map_size)) {
- clean = 0;
- break;
+ clean = (flags & XFS_BLI_DIRTY) ? false : true;
+ if (clean) {
+ int i;
+ for (i = 0; i < bip->bli_format_count; i++) {
+ if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
+ bip->bli_formats[i].blf_map_size)) {
+ clean = false;
+ break;
+ }
}
}
- if (clean)
- xfs_buf_item_relse(bp);
- else
- atomic_dec(&bip->bli_refcount);
- if (!hold)
+ /*
+ * Clean buffers, by definition, cannot be in the AIL. However, aborted
+ * buffers may be dirty and hence in the AIL. Therefore if we are
+ * aborting a buffer and we've just taken the last refernce away, we
+ * have to check if it is in the AIL before freeing it. We need to free
+ * it in this case, because an aborted transaction has already shut the
+ * filesystem down and this is the last chance we will have to do so.
+ */
+ if (atomic_dec_and_test(&bip->bli_refcount)) {
+ if (clean)
+ xfs_buf_item_relse(bp);
+ else if (aborted) {
+ ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
+ if (lip->li_flags & XFS_LI_IN_AIL) {
+ spin_lock(&lip->li_ailp->xa_lock);
+ xfs_trans_ail_delete(lip->li_ailp, lip,
+ SHUTDOWN_LOG_IO_ERROR);
+ }
+ xfs_buf_item_relse(bp);
+ }
+ }
+
+ if (!(flags & XFS_BLI_HOLD))
xfs_buf_relse(bp);
}
@@ -811,20 +796,6 @@ xfs_buf_item_init(
bip->bli_formats[i].blf_map_size = map_size;
}
-#ifdef XFS_TRANS_DEBUG
- /*
- * Allocate the arrays for tracking what needs to be logged
- * and what our callers request to be logged. bli_orig
- * holds a copy of the original, clean buffer for comparison
- * against, and bli_logged keeps a 1 bit flag per byte in
- * the buffer to indicate which bytes the callers have asked
- * to have logged.
- */
- bip->bli_orig = kmem_alloc(BBTOB(bp->b_length), KM_SLEEP);
- memcpy(bip->bli_orig, bp->b_addr, BBTOB(bp->b_length));
- bip->bli_logged = kmem_zalloc(BBTOB(bp->b_length) / NBBY, KM_SLEEP);
-#endif
-
/*
* Put the buf item into the list of items attached to the
* buffer at the front.
@@ -839,9 +810,8 @@ xfs_buf_item_init(
* Mark bytes first through last inclusive as dirty in the buf
* item's bitmap.
*/
-void
+static void
xfs_buf_item_log_segment(
- struct xfs_buf_log_item *bip,
uint first,
uint last,
uint *map)
@@ -915,8 +885,6 @@ xfs_buf_item_log_segment(
mask = (1 << end_bit) - 1;
*wordp |= mask;
}
-
- xfs_buf_item_log_debug(bip, first, last);
}
/*
@@ -935,12 +903,6 @@ xfs_buf_item_log(
struct xfs_buf *bp = bip->bli_buf;
/*
- * Mark the item as having some dirty data for
- * quick reference in xfs_buf_item_dirty.
- */
- bip->bli_flags |= XFS_BLI_DIRTY;
-
- /*
* walk each buffer segment and mark them dirty appropriately.
*/
start = 0;
@@ -957,7 +919,7 @@ xfs_buf_item_log(
if (end > last)
end = last;
- xfs_buf_item_log_segment(bip, first, end,
+ xfs_buf_item_log_segment(first, end,
&bip->bli_formats[i].blf_data_map[0]);
start += bp->b_maps[i].bm_len;
@@ -966,7 +928,7 @@ xfs_buf_item_log(
/*
- * Return 1 if the buffer has some data that has been logged (at any
+ * Return 1 if the buffer has been logged or ordered in a transaction (at any
* point, not just the current transaction) and 0 if not.
*/
uint
@@ -980,11 +942,6 @@ STATIC void
xfs_buf_item_free(
xfs_buf_log_item_t *bip)
{
-#ifdef XFS_TRANS_DEBUG
- kmem_free(bip->bli_orig);
- kmem_free(bip->bli_logged);
-#endif /* XFS_TRANS_DEBUG */
-
xfs_buf_item_free_format(bip);
kmem_zone_free(xfs_buf_item_zone, bip);
}
@@ -1000,11 +957,11 @@ void
xfs_buf_item_relse(
xfs_buf_t *bp)
{
- xfs_buf_log_item_t *bip;
+ xfs_buf_log_item_t *bip = bp->b_fspriv;
trace_xfs_buf_item_relse(bp, _RET_IP_);
+ ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
- bip = bp->b_fspriv;
bp->b_fspriv = bip->bli_item.li_bio_list;
if (bp->b_fspriv == NULL)
bp->b_iodone = NULL;
@@ -1095,7 +1052,7 @@ xfs_buf_iodone_callbacks(
static ulong lasttime;
static xfs_buftarg_t *lasttarg;
- if (likely(!xfs_buf_geterror(bp)))
+ if (likely(!bp->b_error))
goto do_callbacks;
/*
@@ -1134,8 +1091,9 @@ xfs_buf_iodone_callbacks(
xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
- if (!XFS_BUF_ISSTALE(bp)) {
- bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE;
+ if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
+ bp->b_flags |= XBF_WRITE | XBF_ASYNC |
+ XBF_DONE | XBF_WRITE_FAIL;
xfs_buf_iorequest(bp);
} else {
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 16def435944..3f3455a4151 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -18,51 +18,9 @@
#ifndef __XFS_BUF_ITEM_H__
#define __XFS_BUF_ITEM_H__
-extern kmem_zone_t *xfs_buf_item_zone;
-
-/*
- * This flag indicates that the buffer contains on disk inodes
- * and requires special recovery handling.
- */
-#define XFS_BLF_INODE_BUF 0x1
-/*
- * This flag indicates that the buffer should not be replayed
- * during recovery because its blocks are being freed.
- */
-#define XFS_BLF_CANCEL 0x2
-/*
- * This flag indicates that the buffer contains on disk
- * user or group dquots and may require special recovery handling.
- */
-#define XFS_BLF_UDQUOT_BUF 0x4
-#define XFS_BLF_PDQUOT_BUF 0x8
-#define XFS_BLF_GDQUOT_BUF 0x10
-
-#define XFS_BLF_CHUNK 128
-#define XFS_BLF_SHIFT 7
-#define BIT_TO_WORD_SHIFT 5
-#define NBWORD (NBBY * sizeof(unsigned int))
+/* kernel only definitions */
-/*
- * This is the structure used to lay out a buf log item in the
- * log. The data map describes which 128 byte chunks of the buffer
- * have been logged.
- */
-#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
-
-typedef struct xfs_buf_log_format {
- unsigned short blf_type; /* buf log item type indicator */
- unsigned short blf_size; /* size of this item */
- ushort blf_flags; /* misc state */
- ushort blf_len; /* number of blocks in this buf */
- __int64_t blf_blkno; /* starting blkno of this buf */
- unsigned int blf_map_size; /* used size of data bitmap in words */
- unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
-} xfs_buf_log_format_t;
-
-/*
- * buf log item flags
- */
+/* buf log item flags */
#define XFS_BLI_HOLD 0x01
#define XFS_BLI_DIRTY 0x02
#define XFS_BLI_STALE 0x04
@@ -70,6 +28,7 @@ typedef struct xfs_buf_log_format {
#define XFS_BLI_INODE_ALLOC_BUF 0x10
#define XFS_BLI_STALE_INODE 0x20
#define XFS_BLI_INODE_BUF 0x40
+#define XFS_BLI_ORDERED 0x80
#define XFS_BLI_FLAGS \
{ XFS_BLI_HOLD, "HOLD" }, \
@@ -78,10 +37,9 @@ typedef struct xfs_buf_log_format {
{ XFS_BLI_LOGGED, "LOGGED" }, \
{ XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
{ XFS_BLI_STALE_INODE, "STALE_INODE" }, \
- { XFS_BLI_INODE_BUF, "INODE_BUF" }
-
+ { XFS_BLI_INODE_BUF, "INODE_BUF" }, \
+ { XFS_BLI_ORDERED, "ORDERED" }
-#ifdef __KERNEL__
struct xfs_buf;
struct xfs_mount;
@@ -98,10 +56,6 @@ typedef struct xfs_buf_log_item {
unsigned int bli_flags; /* misc flags */
unsigned int bli_recur; /* lock recursion count */
atomic_t bli_refcount; /* cnt of tp refs */
-#ifdef XFS_TRANS_DEBUG
- char *bli_orig; /* original buffer copy */
- char *bli_logged; /* bytes logged (bitmap) */
-#endif
int bli_format_count; /* count of headers */
struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */
struct xfs_buf_log_format __bli_format; /* embedded in-log header */
@@ -117,16 +71,6 @@ void xfs_buf_attach_iodone(struct xfs_buf *,
void xfs_buf_iodone_callbacks(struct xfs_buf *);
void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
-#ifdef XFS_TRANS_DEBUG
-void
-xfs_buf_item_flush_log_debug(
- struct xfs_buf *bp,
- uint first,
- uint last);
-#else
-#define xfs_buf_item_flush_log_debug(bp, first, last)
-#endif
-
-#endif /* __KERNEL__ */
+extern kmem_zone_t *xfs_buf_item_zone;
#endif /* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 4d7696a0241..a514ab61665 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -17,20 +18,20 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
#include "xfs_dir2.h"
-#include "xfs_dir2_format.h"
#include "xfs_dir2_priv.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
@@ -38,6 +39,8 @@
#include "xfs_attr_leaf.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
/*
* xfs_da_btree.c
@@ -52,69 +55,148 @@
/*
* Routines used for growing the Btree.
*/
-STATIC int xfs_da_root_split(xfs_da_state_t *state,
+STATIC int xfs_da3_root_split(xfs_da_state_t *state,
xfs_da_state_blk_t *existing_root,
xfs_da_state_blk_t *new_child);
-STATIC int xfs_da_node_split(xfs_da_state_t *state,
+STATIC int xfs_da3_node_split(xfs_da_state_t *state,
xfs_da_state_blk_t *existing_blk,
xfs_da_state_blk_t *split_blk,
xfs_da_state_blk_t *blk_to_add,
int treelevel,
int *result);
-STATIC void xfs_da_node_rebalance(xfs_da_state_t *state,
+STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state,
xfs_da_state_blk_t *node_blk_1,
xfs_da_state_blk_t *node_blk_2);
-STATIC void xfs_da_node_add(xfs_da_state_t *state,
+STATIC void xfs_da3_node_add(xfs_da_state_t *state,
xfs_da_state_blk_t *old_node_blk,
xfs_da_state_blk_t *new_node_blk);
/*
* Routines used for shrinking the Btree.
*/
-STATIC int xfs_da_root_join(xfs_da_state_t *state,
+STATIC int xfs_da3_root_join(xfs_da_state_t *state,
xfs_da_state_blk_t *root_blk);
-STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval);
-STATIC void xfs_da_node_remove(xfs_da_state_t *state,
+STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval);
+STATIC void xfs_da3_node_remove(xfs_da_state_t *state,
xfs_da_state_blk_t *drop_blk);
-STATIC void xfs_da_node_unbalance(xfs_da_state_t *state,
+STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state,
xfs_da_state_blk_t *src_node_blk,
xfs_da_state_blk_t *dst_node_blk);
/*
* Utility routines.
*/
-STATIC uint xfs_da_node_lasthash(struct xfs_buf *bp, int *count);
-STATIC int xfs_da_node_order(struct xfs_buf *node1_bp,
- struct xfs_buf *node2_bp);
-STATIC int xfs_da_blk_unlink(xfs_da_state_t *state,
+STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state,
xfs_da_state_blk_t *drop_blk,
xfs_da_state_blk_t *save_blk);
-STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state);
-static void
-xfs_da_node_verify(
+
+kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
+
+/*
+ * Allocate a dir-state structure.
+ * We don't put them on the stack since they're large.
+ */
+xfs_da_state_t *
+xfs_da_state_alloc(void)
+{
+ return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
+}
+
+/*
+ * Kill the altpath contents of a da-state structure.
+ */
+STATIC void
+xfs_da_state_kill_altpath(xfs_da_state_t *state)
+{
+ int i;
+
+ for (i = 0; i < state->altpath.active; i++)
+ state->altpath.blk[i].bp = NULL;
+ state->altpath.active = 0;
+}
+
+/*
+ * Free a da-state structure.
+ */
+void
+xfs_da_state_free(xfs_da_state_t *state)
+{
+ xfs_da_state_kill_altpath(state);
+#ifdef DEBUG
+ memset((char *)state, 0, sizeof(*state));
+#endif /* DEBUG */
+ kmem_zone_free(xfs_da_state_zone, state);
+}
+
+static bool
+xfs_da3_node_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- struct xfs_da_node_hdr *hdr = bp->b_addr;
- int block_ok = 0;
-
- block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC);
- block_ok = block_ok &&
- be16_to_cpu(hdr->level) > 0 &&
- be16_to_cpu(hdr->count) > 0 ;
- if (!block_ok) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ struct xfs_da_intnode *hdr = bp->b_addr;
+ struct xfs_da3_icnode_hdr ichdr;
+ const struct xfs_dir_ops *ops;
+
+ ops = xfs_dir_get_ops(mp, NULL);
+
+ ops->node_hdr_from_disk(&ichdr, hdr);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+ if (ichdr.magic != XFS_DA3_NODE_MAGIC)
+ return false;
+
+ if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
+ return false;
+ } else {
+ if (ichdr.magic != XFS_DA_NODE_MAGIC)
+ return false;
}
+ if (ichdr.level == 0)
+ return false;
+ if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
+ return false;
+ if (ichdr.count == 0)
+ return false;
+
+ /*
+ * we don't know if the node is for and attribute or directory tree,
+ * so only fail if the count is outside both bounds
+ */
+ if (ichdr.count > mp->m_dir_geo->node_ents &&
+ ichdr.count > mp->m_attr_geo->node_ents)
+ return false;
+ /* XXX: hash order check? */
+
+ return true;
}
static void
-xfs_da_node_write_verify(
+xfs_da3_node_write_verify(
struct xfs_buf *bp)
{
- xfs_da_node_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_da3_node_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF);
}
/*
@@ -124,40 +206,49 @@ xfs_da_node_write_verify(
* format of the block being read.
*/
static void
-xfs_da_node_read_verify(
+xfs_da3_node_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_da_blkinfo *info = bp->b_addr;
switch (be16_to_cpu(info->magic)) {
+ case XFS_DA3_NODE_MAGIC:
+ if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ break;
+ }
+ /* fall through */
case XFS_DA_NODE_MAGIC:
- xfs_da_node_verify(bp);
- break;
+ if (!xfs_da3_node_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ break;
+ }
+ return;
case XFS_ATTR_LEAF_MAGIC:
- bp->b_ops = &xfs_attr_leaf_buf_ops;
+ case XFS_ATTR3_LEAF_MAGIC:
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
bp->b_ops->verify_read(bp);
return;
case XFS_DIR2_LEAFN_MAGIC:
- bp->b_ops = &xfs_dir2_leafn_buf_ops;
+ case XFS_DIR3_LEAFN_MAGIC:
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
bp->b_ops->verify_read(bp);
return;
default:
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
- mp, info);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
break;
}
+
+ /* corrupt block */
+ xfs_verifier_error(bp);
}
-const struct xfs_buf_ops xfs_da_node_buf_ops = {
- .verify_read = xfs_da_node_read_verify,
- .verify_write = xfs_da_node_write_verify,
+const struct xfs_buf_ops xfs_da3_node_buf_ops = {
+ .verify_read = xfs_da3_node_read_verify,
+ .verify_write = xfs_da3_node_write_verify,
};
-
int
-xfs_da_node_read(
+xfs_da3_node_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
@@ -165,8 +256,35 @@ xfs_da_node_read(
struct xfs_buf **bpp,
int which_fork)
{
- return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
- which_fork, &xfs_da_node_buf_ops);
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+ which_fork, &xfs_da3_node_buf_ops);
+ if (!err && tp) {
+ struct xfs_da_blkinfo *info = (*bpp)->b_addr;
+ int type;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ type = XFS_BLFT_DA_NODE_BUF;
+ break;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ type = XFS_BLFT_ATTR_LEAF_BUF;
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ type = XFS_BLFT_DIR_LEAFN_BUF;
+ break;
+ default:
+ type = 0;
+ ASSERT(0);
+ break;
+ }
+ xfs_trans_buf_set_type(tp, *bpp, type);
+ }
+ return err;
}
/*========================================================================
@@ -177,33 +295,47 @@ xfs_da_node_read(
* Create the initial contents of an intermediate node.
*/
int
-xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
- struct xfs_buf **bpp, int whichfork)
+xfs_da3_node_create(
+ struct xfs_da_args *args,
+ xfs_dablk_t blkno,
+ int level,
+ struct xfs_buf **bpp,
+ int whichfork)
{
- xfs_da_intnode_t *node;
- struct xfs_buf *bp;
- int error;
- xfs_trans_t *tp;
+ struct xfs_da_intnode *node;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_da3_icnode_hdr ichdr = {0};
+ struct xfs_buf *bp;
+ int error;
+ struct xfs_inode *dp = args->dp;
trace_xfs_da_node_create(args);
+ ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
- tp = args->trans;
- error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
+ error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
if (error)
return(error);
- ASSERT(bp != NULL);
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
node = bp->b_addr;
- node->hdr.info.forw = 0;
- node->hdr.info.back = 0;
- node->hdr.info.magic = cpu_to_be16(XFS_DA_NODE_MAGIC);
- node->hdr.info.pad = 0;
- node->hdr.count = 0;
- node->hdr.level = cpu_to_be16(level);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+ ichdr.magic = XFS_DA3_NODE_MAGIC;
+ hdr3->info.blkno = cpu_to_be64(bp->b_bn);
+ hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
+ uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid);
+ } else {
+ ichdr.magic = XFS_DA_NODE_MAGIC;
+ }
+ ichdr.level = level;
+
+ dp->d_ops->node_hdr_to_disk(node, &ichdr);
xfs_trans_log_buf(tp, bp,
- XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+ XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
- bp->b_ops = &xfs_da_node_buf_ops;
*bpp = bp;
return(0);
}
@@ -213,12 +345,18 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
* intermediate nodes, rebalance, etc.
*/
int /* error */
-xfs_da_split(xfs_da_state_t *state)
+xfs_da3_split(
+ struct xfs_da_state *state)
{
- xfs_da_state_blk_t *oldblk, *newblk, *addblk;
- xfs_da_intnode_t *node;
- struct xfs_buf *bp;
- int max, action, error, i;
+ struct xfs_da_state_blk *oldblk;
+ struct xfs_da_state_blk *newblk;
+ struct xfs_da_state_blk *addblk;
+ struct xfs_da_intnode *node;
+ struct xfs_buf *bp;
+ int max;
+ int action = 0;
+ int error;
+ int i;
trace_xfs_da_split(state->args);
@@ -246,7 +384,7 @@ xfs_da_split(xfs_da_state_t *state)
*/
switch (oldblk->magic) {
case XFS_ATTR_LEAF_MAGIC:
- error = xfs_attr_leaf_split(state, oldblk, newblk);
+ error = xfs_attr3_leaf_split(state, oldblk, newblk);
if ((error != 0) && (error != ENOSPC)) {
return(error); /* GROT: attr is inconsistent */
}
@@ -261,12 +399,12 @@ xfs_da_split(xfs_da_state_t *state)
if (state->inleaf) {
state->extraafter = 0; /* before newblk */
trace_xfs_attr_leaf_split_before(state->args);
- error = xfs_attr_leaf_split(state, oldblk,
+ error = xfs_attr3_leaf_split(state, oldblk,
&state->extrablk);
} else {
state->extraafter = 1; /* after newblk */
trace_xfs_attr_leaf_split_after(state->args);
- error = xfs_attr_leaf_split(state, newblk,
+ error = xfs_attr3_leaf_split(state, newblk,
&state->extrablk);
}
if (error)
@@ -280,7 +418,7 @@ xfs_da_split(xfs_da_state_t *state)
addblk = newblk;
break;
case XFS_DA_NODE_MAGIC:
- error = xfs_da_node_split(state, oldblk, newblk, addblk,
+ error = xfs_da3_node_split(state, oldblk, newblk, addblk,
max - i, &action);
addblk->bp = NULL;
if (error)
@@ -298,7 +436,7 @@ xfs_da_split(xfs_da_state_t *state)
/*
* Update the btree to show the new hashval for this child.
*/
- xfs_da_fixhashpath(state, &state->path);
+ xfs_da3_fixhashpath(state, &state->path);
}
if (!addblk)
return(0);
@@ -308,7 +446,7 @@ xfs_da_split(xfs_da_state_t *state)
*/
ASSERT(state->path.active == 0);
oldblk = &state->path.blk[0];
- error = xfs_da_root_split(state, oldblk, addblk);
+ error = xfs_da3_root_split(state, oldblk, addblk);
if (error) {
addblk->bp = NULL;
return(error); /* GROT: dir is inconsistent */
@@ -319,8 +457,12 @@ xfs_da_split(xfs_da_state_t *state)
* just got bumped because of the addition of a new root node.
* There might be three blocks involved if a double split occurred,
* and the original block 0 could be at any position in the list.
+ *
+ * Note: the magic numbers and sibling pointers are in the same
+ * physical place for both v2 and v3 headers (by design). Hence it
+ * doesn't matter which version of the xfs_da_intnode structure we use
+ * here as the result will be the same using either structure.
*/
-
node = oldblk->bp->b_addr;
if (node->hdr.info.forw) {
if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
@@ -359,18 +501,25 @@ xfs_da_split(xfs_da_state_t *state)
* the EOF, extending the inode in process.
*/
STATIC int /* error */
-xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
- xfs_da_state_blk_t *blk2)
+xfs_da3_root_split(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_da_state_blk *blk2)
{
- xfs_da_intnode_t *node, *oldroot;
- xfs_da_args_t *args;
- xfs_dablk_t blkno;
- struct xfs_buf *bp;
- int error, size;
- xfs_inode_t *dp;
- xfs_trans_t *tp;
- xfs_mount_t *mp;
- xfs_dir2_leaf_t *leaf;
+ struct xfs_da_intnode *node;
+ struct xfs_da_intnode *oldroot;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_args *args;
+ struct xfs_buf *bp;
+ struct xfs_inode *dp;
+ struct xfs_trans *tp;
+ struct xfs_mount *mp;
+ struct xfs_dir2_leaf *leaf;
+ xfs_dablk_t blkno;
+ int level;
+ int error;
+ int size;
trace_xfs_da_root_split(state->args);
@@ -379,85 +528,132 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* to a free space somewhere.
*/
args = state->args;
- ASSERT(args != NULL);
error = xfs_da_grow_inode(args, &blkno);
if (error)
- return(error);
+ return error;
+
dp = args->dp;
tp = args->trans;
mp = state->mp;
error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
if (error)
- return(error);
- ASSERT(bp != NULL);
+ return error;
node = bp->b_addr;
oldroot = blk1->bp->b_addr;
- if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) {
- size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] -
- (char *)oldroot);
+ if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+ oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
+ struct xfs_da3_icnode_hdr nodehdr;
+
+ dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot);
+ btree = dp->d_ops->node_tree_p(oldroot);
+ size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot);
+ level = nodehdr.level;
+
+ /*
+ * we are about to copy oldroot to bp, so set up the type
+ * of bp while we know exactly what it will be.
+ */
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
} else {
- ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
+
leaf = (xfs_dir2_leaf_t *)oldroot;
- size = (int)((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] -
- (char *)leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+ size = (int)((char *)&ents[leafhdr.count] - (char *)leaf);
+ level = 0;
+
+ /*
+ * we are about to copy oldroot to bp, so set up the type
+ * of bp while we know exactly what it will be.
+ */
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
}
+
+ /*
+ * we can copy most of the information in the node from one block to
+ * another, but for CRC enabled headers we have to make sure that the
+ * block specific identifiers are kept intact. We update the buffer
+ * directly for this.
+ */
memcpy(node, oldroot, size);
+ if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
+ oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+ struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
+
+ node3->hdr.info.blkno = cpu_to_be64(bp->b_bn);
+ }
xfs_trans_log_buf(tp, bp, 0, size - 1);
bp->b_ops = blk1->bp->b_ops;
+ xfs_trans_buf_copy_type(bp, blk1->bp);
blk1->bp = bp;
blk1->blkno = blkno;
/*
* Set up the new root node.
*/
- error = xfs_da_node_create(args,
- (args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0,
- be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork);
+ error = xfs_da3_node_create(args,
+ (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0,
+ level + 1, &bp, args->whichfork);
if (error)
- return(error);
+ return error;
+
node = bp->b_addr;
- node->btree[0].hashval = cpu_to_be32(blk1->hashval);
- node->btree[0].before = cpu_to_be32(blk1->blkno);
- node->btree[1].hashval = cpu_to_be32(blk2->hashval);
- node->btree[1].before = cpu_to_be32(blk2->blkno);
- node->hdr.count = cpu_to_be16(2);
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
+ btree[0].hashval = cpu_to_be32(blk1->hashval);
+ btree[0].before = cpu_to_be32(blk1->blkno);
+ btree[1].hashval = cpu_to_be32(blk2->hashval);
+ btree[1].before = cpu_to_be32(blk2->blkno);
+ nodehdr.count = 2;
+ dp->d_ops->node_hdr_to_disk(node, &nodehdr);
#ifdef DEBUG
- if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) {
- ASSERT(blk1->blkno >= mp->m_dirleafblk &&
- blk1->blkno < mp->m_dirfreeblk);
- ASSERT(blk2->blkno >= mp->m_dirleafblk &&
- blk2->blkno < mp->m_dirfreeblk);
+ if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+ ASSERT(blk1->blkno >= args->geo->leafblk &&
+ blk1->blkno < args->geo->freeblk);
+ ASSERT(blk2->blkno >= args->geo->leafblk &&
+ blk2->blkno < args->geo->freeblk);
}
#endif
/* Header is already logged by xfs_da_node_create */
xfs_trans_log_buf(tp, bp,
- XFS_DA_LOGRANGE(node, node->btree,
- sizeof(xfs_da_node_entry_t) * 2));
+ XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2));
- return(0);
+ return 0;
}
/*
* Split the node, rebalance, then add the new entry.
*/
STATIC int /* error */
-xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
- xfs_da_state_blk_t *newblk,
- xfs_da_state_blk_t *addblk,
- int treelevel, int *result)
+xfs_da3_node_split(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk,
+ struct xfs_da_state_blk *addblk,
+ int treelevel,
+ int *result)
{
- xfs_da_intnode_t *node;
- xfs_dablk_t blkno;
- int newcount, error;
- int useextra;
+ struct xfs_da_intnode *node;
+ struct xfs_da3_icnode_hdr nodehdr;
+ xfs_dablk_t blkno;
+ int newcount;
+ int error;
+ int useextra;
+ struct xfs_inode *dp = state->args->dp;
trace_xfs_da_node_split(state->args);
node = oldblk->bp->b_addr;
- ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
/*
* With V2 dirs the extra block is data or freespace.
@@ -467,7 +663,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
/*
* Do we have to split the node?
*/
- if ((be16_to_cpu(node->hdr.count) + newcount) > state->node_ents) {
+ if (nodehdr.count + newcount > state->args->geo->node_ents) {
/*
* Allocate a new node, add to the doubly linked chain of
* nodes, then move some of our excess entries into it.
@@ -476,14 +672,14 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
if (error)
return(error); /* GROT: dir is inconsistent */
- error = xfs_da_node_create(state->args, blkno, treelevel,
+ error = xfs_da3_node_create(state->args, blkno, treelevel,
&newblk->bp, state->args->whichfork);
if (error)
return(error); /* GROT: dir is inconsistent */
newblk->blkno = blkno;
newblk->magic = XFS_DA_NODE_MAGIC;
- xfs_da_node_rebalance(state, oldblk, newblk);
- error = xfs_da_blk_link(state, oldblk, newblk);
+ xfs_da3_node_rebalance(state, oldblk, newblk);
+ error = xfs_da3_blk_link(state, oldblk, newblk);
if (error)
return(error);
*result = 1;
@@ -495,7 +691,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* Insert the new entry(s) into the correct block
* (updating last hashval in the process).
*
- * xfs_da_node_add() inserts BEFORE the given index,
+ * xfs_da3_node_add() inserts BEFORE the given index,
* and as a result of using node_lookup_int() we always
* point to a valid entry (not after one), but a split
* operation always results in a new block whose hashvals
@@ -504,22 +700,23 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* If we had double-split op below us, then add the extra block too.
*/
node = oldblk->bp->b_addr;
- if (oldblk->index <= be16_to_cpu(node->hdr.count)) {
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ if (oldblk->index <= nodehdr.count) {
oldblk->index++;
- xfs_da_node_add(state, oldblk, addblk);
+ xfs_da3_node_add(state, oldblk, addblk);
if (useextra) {
if (state->extraafter)
oldblk->index++;
- xfs_da_node_add(state, oldblk, &state->extrablk);
+ xfs_da3_node_add(state, oldblk, &state->extrablk);
state->extravalid = 0;
}
} else {
newblk->index++;
- xfs_da_node_add(state, newblk, addblk);
+ xfs_da3_node_add(state, newblk, addblk);
if (useextra) {
if (state->extraafter)
newblk->index++;
- xfs_da_node_add(state, newblk, &state->extrablk);
+ xfs_da3_node_add(state, newblk, &state->extrablk);
state->extravalid = 0;
}
}
@@ -534,33 +731,54 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* NOTE: if blk2 is empty, then it will get the upper half of blk1.
*/
STATIC void
-xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
- xfs_da_state_blk_t *blk2)
+xfs_da3_node_rebalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *blk1,
+ struct xfs_da_state_blk *blk2)
{
- xfs_da_intnode_t *node1, *node2, *tmpnode;
- xfs_da_node_entry_t *btree_s, *btree_d;
- int count, tmp;
- xfs_trans_t *tp;
+ struct xfs_da_intnode *node1;
+ struct xfs_da_intnode *node2;
+ struct xfs_da_intnode *tmpnode;
+ struct xfs_da_node_entry *btree1;
+ struct xfs_da_node_entry *btree2;
+ struct xfs_da_node_entry *btree_s;
+ struct xfs_da_node_entry *btree_d;
+ struct xfs_da3_icnode_hdr nodehdr1;
+ struct xfs_da3_icnode_hdr nodehdr2;
+ struct xfs_trans *tp;
+ int count;
+ int tmp;
+ int swap = 0;
+ struct xfs_inode *dp = state->args->dp;
trace_xfs_da_node_rebalance(state->args);
node1 = blk1->bp->b_addr;
node2 = blk2->bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+ dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+ btree1 = dp->d_ops->node_tree_p(node1);
+ btree2 = dp->d_ops->node_tree_p(node2);
+
/*
* Figure out how many entries need to move, and in which direction.
* Swap the nodes around if that makes it simpler.
*/
- if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) &&
- ((be32_to_cpu(node2->btree[0].hashval) < be32_to_cpu(node1->btree[0].hashval)) ||
- (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) <
- be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) {
+ if (nodehdr1.count > 0 && nodehdr2.count > 0 &&
+ ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+ (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) <
+ be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) {
tmpnode = node1;
node1 = node2;
node2 = tmpnode;
+ dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+ dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+ btree1 = dp->d_ops->node_tree_p(node1);
+ btree2 = dp->d_ops->node_tree_p(node2);
+ swap = 1;
}
- ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- ASSERT(node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- count = (be16_to_cpu(node1->hdr.count) - be16_to_cpu(node2->hdr.count)) / 2;
+
+ count = (nodehdr1.count - nodehdr2.count) / 2;
if (count == 0)
return;
tp = state->args->trans;
@@ -571,10 +789,11 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
/*
* Move elements in node2 up to make a hole.
*/
- if ((tmp = be16_to_cpu(node2->hdr.count)) > 0) {
+ tmp = nodehdr2.count;
+ if (tmp > 0) {
tmp *= (uint)sizeof(xfs_da_node_entry_t);
- btree_s = &node2->btree[0];
- btree_d = &node2->btree[count];
+ btree_s = &btree2[0];
+ btree_d = &btree2[count];
memmove(btree_d, btree_s, tmp);
}
@@ -582,12 +801,12 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* Move the req'd B-tree elements from high in node1 to
* low in node2.
*/
- be16_add_cpu(&node2->hdr.count, count);
+ nodehdr2.count += count;
tmp = count * (uint)sizeof(xfs_da_node_entry_t);
- btree_s = &node1->btree[be16_to_cpu(node1->hdr.count) - count];
- btree_d = &node2->btree[0];
+ btree_s = &btree1[nodehdr1.count - count];
+ btree_d = &btree2[0];
memcpy(btree_d, btree_s, tmp);
- be16_add_cpu(&node1->hdr.count, -count);
+ nodehdr1.count -= count;
} else {
/*
* Move the req'd B-tree elements from low in node2 to
@@ -595,49 +814,59 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
*/
count = -count;
tmp = count * (uint)sizeof(xfs_da_node_entry_t);
- btree_s = &node2->btree[0];
- btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)];
+ btree_s = &btree2[0];
+ btree_d = &btree1[nodehdr1.count];
memcpy(btree_d, btree_s, tmp);
- be16_add_cpu(&node1->hdr.count, count);
+ nodehdr1.count += count;
+
xfs_trans_log_buf(tp, blk1->bp,
XFS_DA_LOGRANGE(node1, btree_d, tmp));
/*
* Move elements in node2 down to fill the hole.
*/
- tmp = be16_to_cpu(node2->hdr.count) - count;
+ tmp = nodehdr2.count - count;
tmp *= (uint)sizeof(xfs_da_node_entry_t);
- btree_s = &node2->btree[count];
- btree_d = &node2->btree[0];
+ btree_s = &btree2[count];
+ btree_d = &btree2[0];
memmove(btree_d, btree_s, tmp);
- be16_add_cpu(&node2->hdr.count, -count);
+ nodehdr2.count -= count;
}
/*
* Log header of node 1 and all current bits of node 2.
*/
+ dp->d_ops->node_hdr_to_disk(node1, &nodehdr1);
xfs_trans_log_buf(tp, blk1->bp,
- XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr)));
+ XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size));
+
+ dp->d_ops->node_hdr_to_disk(node2, &nodehdr2);
xfs_trans_log_buf(tp, blk2->bp,
XFS_DA_LOGRANGE(node2, &node2->hdr,
- sizeof(node2->hdr) +
- sizeof(node2->btree[0]) * be16_to_cpu(node2->hdr.count)));
+ dp->d_ops->node_hdr_size +
+ (sizeof(btree2[0]) * nodehdr2.count)));
/*
* Record the last hashval from each block for upward propagation.
* (note: don't use the swapped node pointers)
*/
- node1 = blk1->bp->b_addr;
- node2 = blk2->bp->b_addr;
- blk1->hashval = be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval);
- blk2->hashval = be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval);
+ if (swap) {
+ node1 = blk1->bp->b_addr;
+ node2 = blk2->bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+ dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+ btree1 = dp->d_ops->node_tree_p(node1);
+ btree2 = dp->d_ops->node_tree_p(node2);
+ }
+ blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
+ blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
/*
* Adjust the expected index for insertion.
*/
- if (blk1->index >= be16_to_cpu(node1->hdr.count)) {
- blk2->index = blk1->index - be16_to_cpu(node1->hdr.count);
- blk1->index = be16_to_cpu(node1->hdr.count) + 1; /* make it invalid */
+ if (blk1->index >= nodehdr1.count) {
+ blk2->index = blk1->index - nodehdr1.count;
+ blk1->index = nodehdr1.count + 1; /* make it invalid */
}
}
@@ -645,44 +874,52 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* Add a new entry to an intermediate node.
*/
STATIC void
-xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
- xfs_da_state_blk_t *newblk)
+xfs_da3_node_add(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk)
{
- xfs_da_intnode_t *node;
- xfs_da_node_entry_t *btree;
- int tmp;
+ struct xfs_da_intnode *node;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_node_entry *btree;
+ int tmp;
+ struct xfs_inode *dp = state->args->dp;
trace_xfs_da_node_add(state->args);
node = oldblk->bp->b_addr;
- ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
+
+ ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
ASSERT(newblk->blkno != 0);
if (state->args->whichfork == XFS_DATA_FORK)
- ASSERT(newblk->blkno >= state->mp->m_dirleafblk &&
- newblk->blkno < state->mp->m_dirfreeblk);
+ ASSERT(newblk->blkno >= state->args->geo->leafblk &&
+ newblk->blkno < state->args->geo->freeblk);
/*
* We may need to make some room before we insert the new node.
*/
tmp = 0;
- btree = &node->btree[ oldblk->index ];
- if (oldblk->index < be16_to_cpu(node->hdr.count)) {
- tmp = (be16_to_cpu(node->hdr.count) - oldblk->index) * (uint)sizeof(*btree);
- memmove(btree + 1, btree, tmp);
+ if (oldblk->index < nodehdr.count) {
+ tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree);
+ memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp);
}
- btree->hashval = cpu_to_be32(newblk->hashval);
- btree->before = cpu_to_be32(newblk->blkno);
+ btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval);
+ btree[oldblk->index].before = cpu_to_be32(newblk->blkno);
xfs_trans_log_buf(state->args->trans, oldblk->bp,
- XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree)));
- be16_add_cpu(&node->hdr.count, 1);
+ XFS_DA_LOGRANGE(node, &btree[oldblk->index],
+ tmp + sizeof(*btree)));
+
+ nodehdr.count += 1;
+ dp->d_ops->node_hdr_to_disk(node, &nodehdr);
xfs_trans_log_buf(state->args->trans, oldblk->bp,
- XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+ XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
/*
* Copy the last hash value from the oldblk to propagate upwards.
*/
- oldblk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1 ].hashval);
+ oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
}
/*========================================================================
@@ -694,14 +931,16 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
* possibly deallocating that block, etc...
*/
int
-xfs_da_join(xfs_da_state_t *state)
+xfs_da3_join(
+ struct xfs_da_state *state)
{
- xfs_da_state_blk_t *drop_blk, *save_blk;
- int action, error;
+ struct xfs_da_state_blk *drop_blk;
+ struct xfs_da_state_blk *save_blk;
+ int action = 0;
+ int error;
trace_xfs_da_join(state->args);
- action = 0;
drop_blk = &state->path.blk[ state->path.active-1 ];
save_blk = &state->altpath.blk[ state->path.active-1 ];
ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
@@ -722,12 +961,12 @@ xfs_da_join(xfs_da_state_t *state)
*/
switch (drop_blk->magic) {
case XFS_ATTR_LEAF_MAGIC:
- error = xfs_attr_leaf_toosmall(state, &action);
+ error = xfs_attr3_leaf_toosmall(state, &action);
if (error)
return(error);
if (action == 0)
return(0);
- xfs_attr_leaf_unbalance(state, drop_blk, save_blk);
+ xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
break;
case XFS_DIR2_LEAFN_MAGIC:
error = xfs_dir2_leafn_toosmall(state, &action);
@@ -742,18 +981,18 @@ xfs_da_join(xfs_da_state_t *state)
* Remove the offending node, fixup hashvals,
* check for a toosmall neighbor.
*/
- xfs_da_node_remove(state, drop_blk);
- xfs_da_fixhashpath(state, &state->path);
- error = xfs_da_node_toosmall(state, &action);
+ xfs_da3_node_remove(state, drop_blk);
+ xfs_da3_fixhashpath(state, &state->path);
+ error = xfs_da3_node_toosmall(state, &action);
if (error)
return(error);
if (action == 0)
return 0;
- xfs_da_node_unbalance(state, drop_blk, save_blk);
+ xfs_da3_node_unbalance(state, drop_blk, save_blk);
break;
}
- xfs_da_fixhashpath(state, &state->altpath);
- error = xfs_da_blk_unlink(state, drop_blk, save_blk);
+ xfs_da3_fixhashpath(state, &state->altpath);
+ error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
xfs_da_state_kill_altpath(state);
if (error)
return(error);
@@ -768,9 +1007,9 @@ xfs_da_join(xfs_da_state_t *state)
* we only have one entry in the root, make the child block
* the new root.
*/
- xfs_da_node_remove(state, drop_blk);
- xfs_da_fixhashpath(state, &state->path);
- error = xfs_da_root_join(state, &state->path.blk[0]);
+ xfs_da3_node_remove(state, drop_blk);
+ xfs_da3_fixhashpath(state, &state->path);
+ error = xfs_da3_root_join(state, &state->path.blk[0]);
return(error);
}
@@ -782,9 +1021,13 @@ xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
if (level == 1) {
ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
- magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- } else
- ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+ magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+ magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+ magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+ } else {
+ ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+ magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+ }
ASSERT(!blkinfo->forw);
ASSERT(!blkinfo->back);
}
@@ -797,53 +1040,64 @@ xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
* the old root to block 0 as the new root node.
*/
STATIC int
-xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
+xfs_da3_root_join(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *root_blk)
{
- xfs_da_intnode_t *oldroot;
- xfs_da_args_t *args;
- xfs_dablk_t child;
- struct xfs_buf *bp;
- int error;
+ struct xfs_da_intnode *oldroot;
+ struct xfs_da_args *args;
+ xfs_dablk_t child;
+ struct xfs_buf *bp;
+ struct xfs_da3_icnode_hdr oldroothdr;
+ struct xfs_da_node_entry *btree;
+ int error;
+ struct xfs_inode *dp = state->args->dp;
trace_xfs_da_root_join(state->args);
- args = state->args;
- ASSERT(args != NULL);
ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
+
+ args = state->args;
oldroot = root_blk->bp->b_addr;
- ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- ASSERT(!oldroot->hdr.info.forw);
- ASSERT(!oldroot->hdr.info.back);
+ dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot);
+ ASSERT(oldroothdr.forw == 0);
+ ASSERT(oldroothdr.back == 0);
/*
* If the root has more than one child, then don't do anything.
*/
- if (be16_to_cpu(oldroot->hdr.count) > 1)
- return(0);
+ if (oldroothdr.count > 1)
+ return 0;
/*
* Read in the (only) child block, then copy those bytes into
* the root block's buffer and free the original child block.
*/
- child = be32_to_cpu(oldroot->btree[0].before);
+ btree = dp->d_ops->node_tree_p(oldroot);
+ child = be32_to_cpu(btree[0].before);
ASSERT(child != 0);
- error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp,
+ error = xfs_da3_node_read(args->trans, dp, child, -1, &bp,
args->whichfork);
if (error)
- return(error);
- ASSERT(bp != NULL);
- xfs_da_blkinfo_onlychild_validate(bp->b_addr,
- be16_to_cpu(oldroot->hdr.level));
+ return error;
+ xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
/*
* This could be copying a leaf back into the root block in the case of
* there only being a single leaf block left in the tree. Hence we have
* to update the b_ops pointer as well to match the buffer type change
- * that could occur.
+ * that could occur. For dir3 blocks we also need to update the block
+ * number in the buffer header.
*/
- memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize);
+ memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize);
root_blk->bp->b_ops = bp->b_ops;
- xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
+ xfs_trans_buf_copy_type(root_blk->bp, bp);
+ if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
+ struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
+ da3->blkno = cpu_to_be64(root_blk->bp->b_bn);
+ }
+ xfs_trans_log_buf(args->trans, root_blk->bp, 0,
+ args->geo->blksize - 1);
error = xfs_da_shrink_inode(args, child, bp);
return(error);
}
@@ -858,14 +1112,22 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
* If nothing can be done, return 0.
*/
STATIC int
-xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
+xfs_da3_node_toosmall(
+ struct xfs_da_state *state,
+ int *action)
{
- xfs_da_intnode_t *node;
- xfs_da_state_blk_t *blk;
- xfs_da_blkinfo_t *info;
- int count, forward, error, retval, i;
- xfs_dablk_t blkno;
- struct xfs_buf *bp;
+ struct xfs_da_intnode *node;
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_blkinfo *info;
+ xfs_dablk_t blkno;
+ struct xfs_buf *bp;
+ struct xfs_da3_icnode_hdr nodehdr;
+ int count;
+ int forward;
+ int error;
+ int retval;
+ int i;
+ struct xfs_inode *dp = state->args->dp;
trace_xfs_da_node_toosmall(state->args);
@@ -876,10 +1138,9 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
*/
blk = &state->path.blk[ state->path.active-1 ];
info = blk->bp->b_addr;
- ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
node = (xfs_da_intnode_t *)info;
- count = be16_to_cpu(node->hdr.count);
- if (count > (state->node_ents >> 1)) {
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
*action = 0; /* blk over 50%, don't try to join */
return(0); /* blk over 50%, don't try to join */
}
@@ -890,14 +1151,14 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
* coalesce it with a sibling block. We choose (arbitrarily)
* to merge with the forward block unless it is NULL.
*/
- if (count == 0) {
+ if (nodehdr.count == 0) {
/*
* Make altpath point to the block we want to keep and
* path point to the block we want to drop (this one).
*/
forward = (info->forw != 0);
memcpy(&state->altpath, &state->path, sizeof(state->path));
- error = xfs_da_path_shift(state, &state->altpath, forward,
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
0, &retval);
if (error)
return(error);
@@ -916,35 +1177,35 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
* We prefer coalescing with the lower numbered sibling so as
* to shrink a directory over time.
*/
+ count = state->args->geo->node_ents;
+ count -= state->args->geo->node_ents >> 2;
+ count -= nodehdr.count;
+
/* start with smaller blk num */
- forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back));
+ forward = nodehdr.forw < nodehdr.back;
for (i = 0; i < 2; forward = !forward, i++) {
+ struct xfs_da3_icnode_hdr thdr;
if (forward)
- blkno = be32_to_cpu(info->forw);
+ blkno = nodehdr.forw;
else
- blkno = be32_to_cpu(info->back);
+ blkno = nodehdr.back;
if (blkno == 0)
continue;
- error = xfs_da_node_read(state->args->trans, state->args->dp,
+ error = xfs_da3_node_read(state->args->trans, dp,
blkno, -1, &bp, state->args->whichfork);
if (error)
return(error);
- ASSERT(bp != NULL);
- node = (xfs_da_intnode_t *)info;
- count = state->node_ents;
- count -= state->node_ents >> 2;
- count -= be16_to_cpu(node->hdr.count);
node = bp->b_addr;
- ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- count -= be16_to_cpu(node->hdr.count);
+ dp->d_ops->node_hdr_from_disk(&thdr, node);
xfs_trans_brelse(state->args->trans, bp);
- if (count >= 0)
+
+ if (count - thdr.count >= 0)
break; /* fits with at least 25% to spare */
}
if (i >= 2) {
*action = 0;
- return(0);
+ return 0;
}
/*
@@ -953,28 +1214,43 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
*/
memcpy(&state->altpath, &state->path, sizeof(state->path));
if (blkno < blk->blkno) {
- error = xfs_da_path_shift(state, &state->altpath, forward,
+ error = xfs_da3_path_shift(state, &state->altpath, forward,
0, &retval);
- if (error) {
- return(error);
- }
- if (retval) {
- *action = 0;
- return(0);
- }
} else {
- error = xfs_da_path_shift(state, &state->path, forward,
+ error = xfs_da3_path_shift(state, &state->path, forward,
0, &retval);
- if (error) {
- return(error);
- }
- if (retval) {
- *action = 0;
- return(0);
- }
+ }
+ if (error)
+ return error;
+ if (retval) {
+ *action = 0;
+ return 0;
}
*action = 1;
- return(0);
+ return 0;
+}
+
+/*
+ * Pick up the last hashvalue from an intermediate node.
+ */
+STATIC uint
+xfs_da3_node_lasthash(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp,
+ int *count)
+{
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr nodehdr;
+
+ node = bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ if (count)
+ *count = nodehdr.count;
+ if (!nodehdr.count)
+ return 0;
+ btree = dp->d_ops->node_tree_p(node);
+ return be32_to_cpu(btree[nodehdr.count - 1].hashval);
}
/*
@@ -982,13 +1258,17 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
* when we stop making changes, return.
*/
void
-xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
+xfs_da3_fixhashpath(
+ struct xfs_da_state *state,
+ struct xfs_da_state_path *path)
{
- xfs_da_state_blk_t *blk;
- xfs_da_intnode_t *node;
- xfs_da_node_entry_t *btree;
- xfs_dahash_t lasthash=0;
- int level, count;
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ xfs_dahash_t lasthash=0;
+ int level;
+ int count;
+ struct xfs_inode *dp = state->args->dp;
trace_xfs_da_fixhashpath(state->args);
@@ -1001,28 +1281,31 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
return;
break;
case XFS_DIR2_LEAFN_MAGIC:
- lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count);
+ lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count);
if (count == 0)
return;
break;
case XFS_DA_NODE_MAGIC:
- lasthash = xfs_da_node_lasthash(blk->bp, &count);
+ lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count);
if (count == 0)
return;
break;
}
for (blk--, level--; level >= 0; blk--, level--) {
+ struct xfs_da3_icnode_hdr nodehdr;
+
node = blk->bp->b_addr;
- ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- btree = &node->btree[ blk->index ];
- if (be32_to_cpu(btree->hashval) == lasthash)
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
+ if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
break;
blk->hashval = lasthash;
- btree->hashval = cpu_to_be32(lasthash);
+ btree[blk->index].hashval = cpu_to_be32(lasthash);
xfs_trans_log_buf(state->args->trans, blk->bp,
- XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
+ XFS_DA_LOGRANGE(node, &btree[blk->index],
+ sizeof(*btree)));
- lasthash = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
+ lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval);
}
}
@@ -1030,104 +1313,122 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
* Remove an entry from an intermediate node.
*/
STATIC void
-xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk)
+xfs_da3_node_remove(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk)
{
- xfs_da_intnode_t *node;
- xfs_da_node_entry_t *btree;
- int tmp;
+ struct xfs_da_intnode *node;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_node_entry *btree;
+ int index;
+ int tmp;
+ struct xfs_inode *dp = state->args->dp;
trace_xfs_da_node_remove(state->args);
node = drop_blk->bp->b_addr;
- ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count));
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ ASSERT(drop_blk->index < nodehdr.count);
ASSERT(drop_blk->index >= 0);
/*
* Copy over the offending entry, or just zero it out.
*/
- btree = &node->btree[drop_blk->index];
- if (drop_blk->index < (be16_to_cpu(node->hdr.count)-1)) {
- tmp = be16_to_cpu(node->hdr.count) - drop_blk->index - 1;
+ index = drop_blk->index;
+ btree = dp->d_ops->node_tree_p(node);
+ if (index < nodehdr.count - 1) {
+ tmp = nodehdr.count - index - 1;
tmp *= (uint)sizeof(xfs_da_node_entry_t);
- memmove(btree, btree + 1, tmp);
+ memmove(&btree[index], &btree[index + 1], tmp);
xfs_trans_log_buf(state->args->trans, drop_blk->bp,
- XFS_DA_LOGRANGE(node, btree, tmp));
- btree = &node->btree[be16_to_cpu(node->hdr.count)-1];
+ XFS_DA_LOGRANGE(node, &btree[index], tmp));
+ index = nodehdr.count - 1;
}
- memset((char *)btree, 0, sizeof(xfs_da_node_entry_t));
+ memset(&btree[index], 0, sizeof(xfs_da_node_entry_t));
xfs_trans_log_buf(state->args->trans, drop_blk->bp,
- XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
- be16_add_cpu(&node->hdr.count, -1);
+ XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
+ nodehdr.count -= 1;
+ dp->d_ops->node_hdr_to_disk(node, &nodehdr);
xfs_trans_log_buf(state->args->trans, drop_blk->bp,
- XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+ XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
/*
* Copy the last hash value from the block to propagate upwards.
*/
- btree--;
- drop_blk->hashval = be32_to_cpu(btree->hashval);
+ drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval);
}
/*
- * Unbalance the btree elements between two intermediate nodes,
+ * Unbalance the elements between two intermediate nodes,
* move all Btree elements from one node into another.
*/
STATIC void
-xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
- xfs_da_state_blk_t *save_blk)
+xfs_da3_node_unbalance(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk)
{
- xfs_da_intnode_t *drop_node, *save_node;
- xfs_da_node_entry_t *btree;
- int tmp;
- xfs_trans_t *tp;
+ struct xfs_da_intnode *drop_node;
+ struct xfs_da_intnode *save_node;
+ struct xfs_da_node_entry *drop_btree;
+ struct xfs_da_node_entry *save_btree;
+ struct xfs_da3_icnode_hdr drop_hdr;
+ struct xfs_da3_icnode_hdr save_hdr;
+ struct xfs_trans *tp;
+ int sindex;
+ int tmp;
+ struct xfs_inode *dp = state->args->dp;
trace_xfs_da_node_unbalance(state->args);
drop_node = drop_blk->bp->b_addr;
save_node = save_blk->bp->b_addr;
- ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- ASSERT(save_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+ dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node);
+ dp->d_ops->node_hdr_from_disk(&save_hdr, save_node);
+ drop_btree = dp->d_ops->node_tree_p(drop_node);
+ save_btree = dp->d_ops->node_tree_p(save_node);
tp = state->args->trans;
/*
* If the dying block has lower hashvals, then move all the
* elements in the remaining block up to make a hole.
*/
- if ((be32_to_cpu(drop_node->btree[0].hashval) < be32_to_cpu(save_node->btree[ 0 ].hashval)) ||
- (be32_to_cpu(drop_node->btree[be16_to_cpu(drop_node->hdr.count)-1].hashval) <
- be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval)))
- {
- btree = &save_node->btree[be16_to_cpu(drop_node->hdr.count)];
- tmp = be16_to_cpu(save_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t);
- memmove(btree, &save_node->btree[0], tmp);
- btree = &save_node->btree[0];
+ if ((be32_to_cpu(drop_btree[0].hashval) <
+ be32_to_cpu(save_btree[0].hashval)) ||
+ (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) <
+ be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) {
+ /* XXX: check this - is memmove dst correct? */
+ tmp = save_hdr.count * sizeof(xfs_da_node_entry_t);
+ memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp);
+
+ sindex = 0;
xfs_trans_log_buf(tp, save_blk->bp,
- XFS_DA_LOGRANGE(save_node, btree,
- (be16_to_cpu(save_node->hdr.count) + be16_to_cpu(drop_node->hdr.count)) *
- sizeof(xfs_da_node_entry_t)));
+ XFS_DA_LOGRANGE(save_node, &save_btree[0],
+ (save_hdr.count + drop_hdr.count) *
+ sizeof(xfs_da_node_entry_t)));
} else {
- btree = &save_node->btree[be16_to_cpu(save_node->hdr.count)];
+ sindex = save_hdr.count;
xfs_trans_log_buf(tp, save_blk->bp,
- XFS_DA_LOGRANGE(save_node, btree,
- be16_to_cpu(drop_node->hdr.count) *
- sizeof(xfs_da_node_entry_t)));
+ XFS_DA_LOGRANGE(save_node, &save_btree[sindex],
+ drop_hdr.count * sizeof(xfs_da_node_entry_t)));
}
/*
* Move all the B-tree elements from drop_blk to save_blk.
*/
- tmp = be16_to_cpu(drop_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t);
- memcpy(btree, &drop_node->btree[0], tmp);
- be16_add_cpu(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count));
+ tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t);
+ memcpy(&save_btree[sindex], &drop_btree[0], tmp);
+ save_hdr.count += drop_hdr.count;
+ dp->d_ops->node_hdr_to_disk(save_node, &save_hdr);
xfs_trans_log_buf(tp, save_blk->bp,
XFS_DA_LOGRANGE(save_node, &save_node->hdr,
- sizeof(save_node->hdr)));
+ dp->d_ops->node_hdr_size));
/*
* Save the last hashval in the remaining block for upward propagation.
*/
- save_blk->hashval = be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval);
+ save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval);
}
/*========================================================================
@@ -1146,16 +1447,25 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
* pruned depth-first tree search.
*/
int /* error */
-xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
+xfs_da3_node_lookup_int(
+ struct xfs_da_state *state,
+ int *result)
{
- xfs_da_state_blk_t *blk;
- xfs_da_blkinfo_t *curr;
- xfs_da_intnode_t *node;
- xfs_da_node_entry_t *btree;
- xfs_dablk_t blkno;
- int probe, span, max, error, retval;
- xfs_dahash_t hashval, btreehashval;
- xfs_da_args_t *args;
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_blkinfo *curr;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_args *args;
+ xfs_dablk_t blkno;
+ xfs_dahash_t hashval;
+ xfs_dahash_t btreehashval;
+ int probe;
+ int span;
+ int max;
+ int error;
+ int retval;
+ struct xfs_inode *dp = state->args->dp;
args = state->args;
@@ -1163,7 +1473,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
* Descend thru the B-tree searching each level for the right
* node to use, until the right hashval is found.
*/
- blkno = (args->whichfork == XFS_DATA_FORK)? state->mp->m_dirleafblk : 0;
+ blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0;
for (blk = &state->path.blk[0], state->path.active = 1;
state->path.active <= XFS_DA_NODE_MAXDEPTH;
blk++, state->path.active++) {
@@ -1171,7 +1481,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
* Read the next node down in the tree.
*/
blk->blkno = blkno;
- error = xfs_da_node_read(args->trans, args->dp, blkno,
+ error = xfs_da3_node_read(args->trans, args->dp, blkno,
-1, &blk->bp, args->whichfork);
if (error) {
blk->blkno = 0;
@@ -1180,66 +1490,76 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
}
curr = blk->bp->b_addr;
blk->magic = be16_to_cpu(curr->magic);
- ASSERT(blk->magic == XFS_DA_NODE_MAGIC ||
- blk->magic == XFS_DIR2_LEAFN_MAGIC ||
- blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+ if (blk->magic == XFS_ATTR_LEAF_MAGIC ||
+ blk->magic == XFS_ATTR3_LEAF_MAGIC) {
+ blk->magic = XFS_ATTR_LEAF_MAGIC;
+ blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+ break;
+ }
+
+ if (blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+ blk->magic == XFS_DIR3_LEAFN_MAGIC) {
+ blk->magic = XFS_DIR2_LEAFN_MAGIC;
+ blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
+ blk->bp, NULL);
+ break;
+ }
+
+ blk->magic = XFS_DA_NODE_MAGIC;
+
/*
* Search an intermediate node for a match.
*/
- if (blk->magic == XFS_DA_NODE_MAGIC) {
- node = blk->bp->b_addr;
- max = be16_to_cpu(node->hdr.count);
- blk->hashval = be32_to_cpu(node->btree[max-1].hashval);
+ node = blk->bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
- /*
- * Binary search. (note: small blocks will skip loop)
- */
- probe = span = max / 2;
- hashval = args->hashval;
- for (btree = &node->btree[probe]; span > 4;
- btree = &node->btree[probe]) {
- span /= 2;
- btreehashval = be32_to_cpu(btree->hashval);
- if (btreehashval < hashval)
- probe += span;
- else if (btreehashval > hashval)
- probe -= span;
- else
- break;
- }
- ASSERT((probe >= 0) && (probe < max));
- ASSERT((span <= 4) || (be32_to_cpu(btree->hashval) == hashval));
+ max = nodehdr.count;
+ blk->hashval = be32_to_cpu(btree[max - 1].hashval);
- /*
- * Since we may have duplicate hashval's, find the first
- * matching hashval in the node.
- */
- while ((probe > 0) && (be32_to_cpu(btree->hashval) >= hashval)) {
- btree--;
- probe--;
- }
- while ((probe < max) && (be32_to_cpu(btree->hashval) < hashval)) {
- btree++;
- probe++;
- }
+ /*
+ * Binary search. (note: small blocks will skip loop)
+ */
+ probe = span = max / 2;
+ hashval = args->hashval;
+ while (span > 4) {
+ span /= 2;
+ btreehashval = be32_to_cpu(btree[probe].hashval);
+ if (btreehashval < hashval)
+ probe += span;
+ else if (btreehashval > hashval)
+ probe -= span;
+ else
+ break;
+ }
+ ASSERT((probe >= 0) && (probe < max));
+ ASSERT((span <= 4) ||
+ (be32_to_cpu(btree[probe].hashval) == hashval));
- /*
- * Pick the right block to descend on.
- */
- if (probe == max) {
- blk->index = max-1;
- blkno = be32_to_cpu(node->btree[max-1].before);
- } else {
- blk->index = probe;
- blkno = be32_to_cpu(btree->before);
- }
- } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
- blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
- break;
- } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
- blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
- break;
+ /*
+ * Since we may have duplicate hashval's, find the first
+ * matching hashval in the node.
+ */
+ while (probe > 0 &&
+ be32_to_cpu(btree[probe].hashval) >= hashval) {
+ probe--;
+ }
+ while (probe < max &&
+ be32_to_cpu(btree[probe].hashval) < hashval) {
+ probe++;
+ }
+
+ /*
+ * Pick the right block to descend on.
+ */
+ if (probe == max) {
+ blk->index = max - 1;
+ blkno = be32_to_cpu(btree[max - 1].before);
+ } else {
+ blk->index = probe;
+ blkno = be32_to_cpu(btree[probe].before);
}
}
@@ -1254,7 +1574,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
&blk->index, state);
} else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
- retval = xfs_attr_leaf_lookup_int(blk->bp, args);
+ retval = xfs_attr3_leaf_lookup_int(blk->bp, args);
blk->index = args->index;
args->blkno = blk->blkno;
} else {
@@ -1263,7 +1583,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
}
if (((retval == ENOENT) || (retval == ENOATTR)) &&
(blk->hashval == args->hashval)) {
- error = xfs_da_path_shift(state, &state->path, 1, 1,
+ error = xfs_da3_path_shift(state, &state->path, 1, 1,
&retval);
if (error)
return(error);
@@ -1285,16 +1605,54 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
*========================================================================*/
/*
+ * Compare two intermediate nodes for "order".
+ */
+STATIC int
+xfs_da3_node_order(
+ struct xfs_inode *dp,
+ struct xfs_buf *node1_bp,
+ struct xfs_buf *node2_bp)
+{
+ struct xfs_da_intnode *node1;
+ struct xfs_da_intnode *node2;
+ struct xfs_da_node_entry *btree1;
+ struct xfs_da_node_entry *btree2;
+ struct xfs_da3_icnode_hdr node1hdr;
+ struct xfs_da3_icnode_hdr node2hdr;
+
+ node1 = node1_bp->b_addr;
+ node2 = node2_bp->b_addr;
+ dp->d_ops->node_hdr_from_disk(&node1hdr, node1);
+ dp->d_ops->node_hdr_from_disk(&node2hdr, node2);
+ btree1 = dp->d_ops->node_tree_p(node1);
+ btree2 = dp->d_ops->node_tree_p(node2);
+
+ if (node1hdr.count > 0 && node2hdr.count > 0 &&
+ ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+ (be32_to_cpu(btree2[node2hdr.count - 1].hashval) <
+ be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) {
+ return 1;
+ }
+ return 0;
+}
+
+/*
* Link a new block into a doubly linked list of blocks (of whatever type).
*/
int /* error */
-xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
- xfs_da_state_blk_t *new_blk)
+xfs_da3_blk_link(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *old_blk,
+ struct xfs_da_state_blk *new_blk)
{
- xfs_da_blkinfo_t *old_info, *new_info, *tmp_info;
- xfs_da_args_t *args;
- int before=0, error;
- struct xfs_buf *bp;
+ struct xfs_da_blkinfo *old_info;
+ struct xfs_da_blkinfo *new_info;
+ struct xfs_da_blkinfo *tmp_info;
+ struct xfs_da_args *args;
+ struct xfs_buf *bp;
+ int before = 0;
+ int error;
+ struct xfs_inode *dp = state->args->dp;
/*
* Set up environment.
@@ -1306,19 +1664,16 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
old_blk->magic == XFS_ATTR_LEAF_MAGIC);
- ASSERT(old_blk->magic == be16_to_cpu(old_info->magic));
- ASSERT(new_blk->magic == be16_to_cpu(new_info->magic));
- ASSERT(old_blk->magic == new_blk->magic);
switch (old_blk->magic) {
case XFS_ATTR_LEAF_MAGIC:
before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
break;
case XFS_DIR2_LEAFN_MAGIC:
- before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp);
+ before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp);
break;
case XFS_DA_NODE_MAGIC:
- before = xfs_da_node_order(old_blk->bp, new_blk->bp);
+ before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp);
break;
}
@@ -1333,14 +1688,14 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
new_info->forw = cpu_to_be32(old_blk->blkno);
new_info->back = old_info->back;
if (old_info->back) {
- error = xfs_da_node_read(args->trans, args->dp,
+ error = xfs_da3_node_read(args->trans, dp,
be32_to_cpu(old_info->back),
-1, &bp, args->whichfork);
if (error)
return(error);
ASSERT(bp != NULL);
tmp_info = bp->b_addr;
- ASSERT(be16_to_cpu(tmp_info->magic) == be16_to_cpu(old_info->magic));
+ ASSERT(tmp_info->magic == old_info->magic);
ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
tmp_info->forw = cpu_to_be32(new_blk->blkno);
xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
@@ -1354,7 +1709,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
new_info->forw = old_info->forw;
new_info->back = cpu_to_be32(old_blk->blkno);
if (old_info->forw) {
- error = xfs_da_node_read(args->trans, args->dp,
+ error = xfs_da3_node_read(args->trans, dp,
be32_to_cpu(old_info->forw),
-1, &bp, args->whichfork);
if (error)
@@ -1375,59 +1730,20 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
}
/*
- * Compare two intermediate nodes for "order".
- */
-STATIC int
-xfs_da_node_order(
- struct xfs_buf *node1_bp,
- struct xfs_buf *node2_bp)
-{
- xfs_da_intnode_t *node1, *node2;
-
- node1 = node1_bp->b_addr;
- node2 = node2_bp->b_addr;
- ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) &&
- node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) &&
- ((be32_to_cpu(node2->btree[0].hashval) <
- be32_to_cpu(node1->btree[0].hashval)) ||
- (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) <
- be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) {
- return(1);
- }
- return(0);
-}
-
-/*
- * Pick up the last hashvalue from an intermediate node.
- */
-STATIC uint
-xfs_da_node_lasthash(
- struct xfs_buf *bp,
- int *count)
-{
- xfs_da_intnode_t *node;
-
- node = bp->b_addr;
- ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- if (count)
- *count = be16_to_cpu(node->hdr.count);
- if (!node->hdr.count)
- return(0);
- return be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
-}
-
-/*
* Unlink a block from a doubly linked list of blocks.
*/
STATIC int /* error */
-xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
- xfs_da_state_blk_t *save_blk)
+xfs_da3_blk_unlink(
+ struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk)
{
- xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info;
- xfs_da_args_t *args;
- struct xfs_buf *bp;
- int error;
+ struct xfs_da_blkinfo *drop_info;
+ struct xfs_da_blkinfo *save_info;
+ struct xfs_da_blkinfo *tmp_info;
+ struct xfs_da_args *args;
+ struct xfs_buf *bp;
+ int error;
/*
* Set up environment.
@@ -1439,8 +1755,6 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
save_blk->magic == XFS_ATTR_LEAF_MAGIC);
- ASSERT(save_blk->magic == be16_to_cpu(save_info->magic));
- ASSERT(drop_blk->magic == be16_to_cpu(drop_info->magic));
ASSERT(save_blk->magic == drop_blk->magic);
ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) ||
(be32_to_cpu(save_info->back) == drop_blk->blkno));
@@ -1454,7 +1768,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
trace_xfs_da_unlink_back(args);
save_info->back = drop_info->back;
if (drop_info->back) {
- error = xfs_da_node_read(args->trans, args->dp,
+ error = xfs_da3_node_read(args->trans, args->dp,
be32_to_cpu(drop_info->back),
-1, &bp, args->whichfork);
if (error)
@@ -1471,7 +1785,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
trace_xfs_da_unlink_forward(args);
save_info->forw = drop_info->forw;
if (drop_info->forw) {
- error = xfs_da_node_read(args->trans, args->dp,
+ error = xfs_da3_node_read(args->trans, args->dp,
be32_to_cpu(drop_info->forw),
-1, &bp, args->whichfork);
if (error)
@@ -1499,15 +1813,23 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
* the new bottom and the root.
*/
int /* error */
-xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
- int forward, int release, int *result)
+xfs_da3_path_shift(
+ struct xfs_da_state *state,
+ struct xfs_da_state_path *path,
+ int forward,
+ int release,
+ int *result)
{
- xfs_da_state_blk_t *blk;
- xfs_da_blkinfo_t *info;
- xfs_da_intnode_t *node;
- xfs_da_args_t *args;
- xfs_dablk_t blkno=0;
- int level, error;
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_blkinfo *info;
+ struct xfs_da_intnode *node;
+ struct xfs_da_args *args;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr nodehdr;
+ xfs_dablk_t blkno = 0;
+ int level;
+ int error;
+ struct xfs_inode *dp = state->args->dp;
trace_xfs_da_path_shift(state->args);
@@ -1522,16 +1844,17 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
level = (path->active-1) - 1; /* skip bottom layer in path */
for (blk = &path->blk[level]; level >= 0; blk--, level--) {
- ASSERT(blk->bp != NULL);
node = blk->bp->b_addr;
- ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- if (forward && (blk->index < be16_to_cpu(node->hdr.count)-1)) {
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
+
+ if (forward && (blk->index < nodehdr.count - 1)) {
blk->index++;
- blkno = be32_to_cpu(node->btree[blk->index].before);
+ blkno = be32_to_cpu(btree[blk->index].before);
break;
} else if (!forward && (blk->index > 0)) {
blk->index--;
- blkno = be32_to_cpu(node->btree[blk->index].before);
+ blkno = be32_to_cpu(btree[blk->index].before);
break;
}
}
@@ -1557,45 +1880,59 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
* Read the next child block.
*/
blk->blkno = blkno;
- error = xfs_da_node_read(args->trans, args->dp, blkno, -1,
+ error = xfs_da3_node_read(args->trans, dp, blkno, -1,
&blk->bp, args->whichfork);
if (error)
return(error);
- ASSERT(blk->bp != NULL);
info = blk->bp->b_addr;
ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
- info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
- blk->magic = be16_to_cpu(info->magic);
- if (blk->magic == XFS_DA_NODE_MAGIC) {
+ info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+ info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+
+ /*
+ * Note: we flatten the magic number to a single type so we
+ * don't have to compare against crc/non-crc types elsewhere.
+ */
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ blk->magic = XFS_DA_NODE_MAGIC;
node = (xfs_da_intnode_t *)info;
- blk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
+ blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
if (forward)
blk->index = 0;
else
- blk->index = be16_to_cpu(node->hdr.count)-1;
- blkno = be32_to_cpu(node->btree[blk->index].before);
- } else {
+ blk->index = nodehdr.count - 1;
+ blkno = be32_to_cpu(btree[blk->index].before);
+ break;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ blk->magic = XFS_ATTR_LEAF_MAGIC;
ASSERT(level == path->active-1);
blk->index = 0;
- switch(blk->magic) {
- case XFS_ATTR_LEAF_MAGIC:
- blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
- NULL);
- break;
- case XFS_DIR2_LEAFN_MAGIC:
- blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
- NULL);
- break;
- default:
- ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC ||
- blk->magic == XFS_DIR2_LEAFN_MAGIC);
- break;
- }
+ blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ blk->magic = XFS_DIR2_LEAFN_MAGIC;
+ ASSERT(level == path->active-1);
+ blk->index = 0;
+ blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
+ blk->bp, NULL);
+ break;
+ default:
+ ASSERT(0);
+ break;
}
}
*result = 0;
- return(0);
+ return 0;
}
@@ -1754,20 +2091,12 @@ xfs_da_grow_inode(
xfs_dablk_t *new_blkno)
{
xfs_fileoff_t bno;
- int count;
int error;
trace_xfs_da_grow_inode(args);
- if (args->whichfork == XFS_DATA_FORK) {
- bno = args->dp->i_mount->m_dirleafblk;
- count = args->dp->i_mount->m_dirblkfsbs;
- } else {
- bno = 0;
- count = 1;
- }
-
- error = xfs_da_grow_inode_int(args, &bno, count);
+ bno = args->geo->leafblk;
+ error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount);
if (!error)
*new_blkno = (xfs_dablk_t)bno;
return error;
@@ -1782,34 +2111,48 @@ xfs_da_grow_inode(
* a bmap btree split to do that.
*/
STATIC int
-xfs_da_swap_lastblock(
- xfs_da_args_t *args,
- xfs_dablk_t *dead_blknop,
- struct xfs_buf **dead_bufp)
+xfs_da3_swap_lastblock(
+ struct xfs_da_args *args,
+ xfs_dablk_t *dead_blknop,
+ struct xfs_buf **dead_bufp)
{
- xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno;
- struct xfs_buf *dead_buf, *last_buf, *sib_buf, *par_buf;
- xfs_fileoff_t lastoff;
- xfs_inode_t *ip;
- xfs_trans_t *tp;
- xfs_mount_t *mp;
- int error, w, entno, level, dead_level;
- xfs_da_blkinfo_t *dead_info, *sib_info;
- xfs_da_intnode_t *par_node, *dead_node;
- xfs_dir2_leaf_t *dead_leaf2;
- xfs_dahash_t dead_hash;
+ struct xfs_da_blkinfo *dead_info;
+ struct xfs_da_blkinfo *sib_info;
+ struct xfs_da_intnode *par_node;
+ struct xfs_da_intnode *dead_node;
+ struct xfs_dir2_leaf *dead_leaf2;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_icnode_hdr par_hdr;
+ struct xfs_inode *dp;
+ struct xfs_trans *tp;
+ struct xfs_mount *mp;
+ struct xfs_buf *dead_buf;
+ struct xfs_buf *last_buf;
+ struct xfs_buf *sib_buf;
+ struct xfs_buf *par_buf;
+ xfs_dahash_t dead_hash;
+ xfs_fileoff_t lastoff;
+ xfs_dablk_t dead_blkno;
+ xfs_dablk_t last_blkno;
+ xfs_dablk_t sib_blkno;
+ xfs_dablk_t par_blkno;
+ int error;
+ int w;
+ int entno;
+ int level;
+ int dead_level;
trace_xfs_da_swap_lastblock(args);
dead_buf = *dead_bufp;
dead_blkno = *dead_blknop;
tp = args->trans;
- ip = args->dp;
+ dp = args->dp;
w = args->whichfork;
ASSERT(w == XFS_DATA_FORK);
- mp = ip->i_mount;
- lastoff = mp->m_dirfreeblk;
- error = xfs_bmap_last_before(tp, ip, &lastoff, w);
+ mp = dp->i_mount;
+ lastoff = args->geo->freeblk;
+ error = xfs_bmap_last_before(tp, dp, &lastoff, w);
if (error)
return error;
if (unlikely(lastoff == 0)) {
@@ -1820,35 +2163,44 @@ xfs_da_swap_lastblock(
/*
* Read the last block in the btree space.
*/
- last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
- error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w);
+ last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount;
+ error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w);
if (error)
return error;
/*
* Copy the last block into the dead buffer and log it.
*/
- memcpy(dead_buf->b_addr, last_buf->b_addr, mp->m_dirblksize);
- xfs_trans_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1);
+ memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
+ xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
dead_info = dead_buf->b_addr;
/*
* Get values from the moved block.
*/
- if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) {
+ if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
+
dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2);
+ ents = dp->d_ops->leaf_ents_p(dead_leaf2);
dead_level = 0;
- dead_hash = be32_to_cpu(dead_leaf2->ents[be16_to_cpu(dead_leaf2->hdr.count) - 1].hashval);
+ dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
} else {
- ASSERT(dead_info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+ struct xfs_da3_icnode_hdr deadhdr;
+
dead_node = (xfs_da_intnode_t *)dead_info;
- dead_level = be16_to_cpu(dead_node->hdr.level);
- dead_hash = be32_to_cpu(dead_node->btree[be16_to_cpu(dead_node->hdr.count) - 1].hashval);
+ dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node);
+ btree = dp->d_ops->node_tree_p(dead_node);
+ dead_level = deadhdr.level;
+ dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
}
sib_buf = par_buf = NULL;
/*
* If the moved block has a left sibling, fix up the pointers.
*/
if ((sib_blkno = be32_to_cpu(dead_info->back))) {
- error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
+ error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
if (error)
goto done;
sib_info = sib_buf->b_addr;
@@ -1870,7 +2222,7 @@ xfs_da_swap_lastblock(
* If the moved block has a right sibling, fix up the pointers.
*/
if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
- error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
+ error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
if (error)
goto done;
sib_info = sib_buf->b_addr;
@@ -1888,37 +2240,37 @@ xfs_da_swap_lastblock(
sizeof(sib_info->back)));
sib_buf = NULL;
}
- par_blkno = mp->m_dirleafblk;
+ par_blkno = args->geo->leafblk;
level = -1;
/*
* Walk down the tree looking for the parent of the moved block.
*/
for (;;) {
- error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
+ error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
if (error)
goto done;
par_node = par_buf->b_addr;
- if (unlikely(par_node->hdr.info.magic !=
- cpu_to_be16(XFS_DA_NODE_MAGIC) ||
- (level >= 0 && level != be16_to_cpu(par_node->hdr.level) + 1))) {
+ dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
+ if (level >= 0 && level != par_hdr.level + 1) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
XFS_ERRLEVEL_LOW, mp);
error = XFS_ERROR(EFSCORRUPTED);
goto done;
}
- level = be16_to_cpu(par_node->hdr.level);
+ level = par_hdr.level;
+ btree = dp->d_ops->node_tree_p(par_node);
for (entno = 0;
- entno < be16_to_cpu(par_node->hdr.count) &&
- be32_to_cpu(par_node->btree[entno].hashval) < dead_hash;
+ entno < par_hdr.count &&
+ be32_to_cpu(btree[entno].hashval) < dead_hash;
entno++)
continue;
- if (unlikely(entno == be16_to_cpu(par_node->hdr.count))) {
+ if (entno == par_hdr.count) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
XFS_ERRLEVEL_LOW, mp);
error = XFS_ERROR(EFSCORRUPTED);
goto done;
}
- par_blkno = be32_to_cpu(par_node->btree[entno].before);
+ par_blkno = be32_to_cpu(btree[entno].before);
if (level == dead_level + 1)
break;
xfs_trans_brelse(tp, par_buf);
@@ -1930,13 +2282,13 @@ xfs_da_swap_lastblock(
*/
for (;;) {
for (;
- entno < be16_to_cpu(par_node->hdr.count) &&
- be32_to_cpu(par_node->btree[entno].before) != last_blkno;
+ entno < par_hdr.count &&
+ be32_to_cpu(btree[entno].before) != last_blkno;
entno++)
continue;
- if (entno < be16_to_cpu(par_node->hdr.count))
+ if (entno < par_hdr.count)
break;
- par_blkno = be32_to_cpu(par_node->hdr.info.forw);
+ par_blkno = par_hdr.forw;
xfs_trans_brelse(tp, par_buf);
par_buf = NULL;
if (unlikely(par_blkno == 0)) {
@@ -1945,27 +2297,27 @@ xfs_da_swap_lastblock(
error = XFS_ERROR(EFSCORRUPTED);
goto done;
}
- error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
+ error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
if (error)
goto done;
par_node = par_buf->b_addr;
- if (unlikely(
- be16_to_cpu(par_node->hdr.level) != level ||
- par_node->hdr.info.magic != cpu_to_be16(XFS_DA_NODE_MAGIC))) {
+ dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
+ if (par_hdr.level != level) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
XFS_ERRLEVEL_LOW, mp);
error = XFS_ERROR(EFSCORRUPTED);
goto done;
}
+ btree = dp->d_ops->node_tree_p(par_node);
entno = 0;
}
/*
* Update the parent entry pointing to the moved block.
*/
- par_node->btree[entno].before = cpu_to_be32(dead_blkno);
+ btree[entno].before = cpu_to_be32(dead_blkno);
xfs_trans_log_buf(tp, par_buf,
- XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before,
- sizeof(par_node->btree[entno].before)));
+ XFS_DA_LOGRANGE(par_node, &btree[entno].before,
+ sizeof(btree[entno].before)));
*dead_blknop = last_blkno;
*dead_bufp = last_buf;
return 0;
@@ -1998,23 +2350,21 @@ xfs_da_shrink_inode(
w = args->whichfork;
tp = args->trans;
mp = dp->i_mount;
- if (w == XFS_DATA_FORK)
- count = mp->m_dirblkfsbs;
- else
- count = 1;
+ count = args->geo->fsbcount;
for (;;) {
/*
* Remove extents. If we get ENOSPC for a dir we have to move
* the last block to the place we want to kill.
*/
- if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
- xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
- 0, args->firstblock, args->flist,
- &done)) == ENOSPC) {
+ error = xfs_bunmapi(tp, dp, dead_blkno, count,
+ xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
+ 0, args->firstblock, args->flist, &done);
+ if (error == ENOSPC) {
if (w != XFS_DATA_FORK)
break;
- if ((error = xfs_da_swap_lastblock(args, &dead_blkno,
- &dead_buf)))
+ error = xfs_da3_swap_lastblock(args, &dead_blkno,
+ &dead_buf);
+ if (error)
break;
} else {
break;
@@ -2063,9 +2413,9 @@ static int
xfs_buf_map_from_irec(
struct xfs_mount *mp,
struct xfs_buf_map **mapp,
- unsigned int *nmaps,
+ int *nmaps,
struct xfs_bmbt_irec *irecs,
- unsigned int nirecs)
+ int nirecs)
{
struct xfs_buf_map *map;
int i;
@@ -2074,7 +2424,8 @@ xfs_buf_map_from_irec(
ASSERT(nirecs >= 1);
if (nirecs > 1) {
- map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_SLEEP);
+ map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
+ KM_SLEEP | KM_NOFS);
if (!map)
return ENOMEM;
*mapp = map;
@@ -2101,7 +2452,6 @@ xfs_buf_map_from_irec(
*/
static int
xfs_dabuf_map(
- struct xfs_trans *trans,
struct xfs_inode *dp,
xfs_dablk_t bno,
xfs_daddr_t mappedbno,
@@ -2119,7 +2469,10 @@ xfs_dabuf_map(
ASSERT(map && *map);
ASSERT(*nmaps == 1);
- nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1;
+ if (whichfork == XFS_DATA_FORK)
+ nfsb = mp->m_dir_geo->fsbcount;
+ else
+ nfsb = mp->m_attr_geo->fsbcount;
/*
* Caller doesn't have a mapping. -2 means don't complain
@@ -2130,7 +2483,8 @@ xfs_dabuf_map(
* Optimize the one-block case.
*/
if (nfsb != 1)
- irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP);
+ irecs = kmem_zalloc(sizeof(irec) * nfsb,
+ KM_SLEEP | KM_NOFS);
nirecs = nfsb;
error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
@@ -2196,7 +2550,7 @@ xfs_da_get_buf(
*bpp = NULL;
mapp = &map;
nmap = 1;
- error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
+ error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
&mapp, &nmap);
if (error) {
/* mapping a hole is not an error, but we don't continue */
@@ -2244,7 +2598,7 @@ xfs_da_read_buf(
*bpp = NULL;
mapp = &map;
nmap = 1;
- error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
+ error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
&mapp, &nmap);
if (error) {
/* mapping a hole is not an error, but we don't continue */
@@ -2263,38 +2617,6 @@ xfs_da_read_buf(
xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
else
xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
-
- /*
- * This verification code will be moved to a CRC verification callback
- * function so just leave it here unchanged until then.
- */
- {
- xfs_dir2_data_hdr_t *hdr = bp->b_addr;
- xfs_dir2_free_t *free = bp->b_addr;
- xfs_da_blkinfo_t *info = bp->b_addr;
- uint magic, magic1;
- struct xfs_mount *mp = dp->i_mount;
-
- magic = be16_to_cpu(info->magic);
- magic1 = be32_to_cpu(hdr->magic);
- if (unlikely(
- XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) &&
- (magic != XFS_ATTR_LEAF_MAGIC) &&
- (magic != XFS_DIR2_LEAF1_MAGIC) &&
- (magic != XFS_DIR2_LEAFN_MAGIC) &&
- (magic1 != XFS_DIR2_BLOCK_MAGIC) &&
- (magic1 != XFS_DIR2_DATA_MAGIC) &&
- (free->hdr.magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)),
- mp, XFS_ERRTAG_DA_READ_BUF,
- XFS_RANDOM_DA_READ_BUF))) {
- trace_xfs_da_btree_corrupt(bp, _RET_IP_);
- XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)",
- XFS_ERRLEVEL_LOW, mp, info);
- error = XFS_ERROR(EFSCORRUPTED);
- xfs_trans_brelse(trans, bp);
- goto out_free;
- }
- }
*bpp = bp;
out_free:
if (mapp != &map)
@@ -2308,7 +2630,6 @@ out_free:
*/
xfs_daddr_t
xfs_da_reada_buf(
- struct xfs_trans *trans,
struct xfs_inode *dp,
xfs_dablk_t bno,
xfs_daddr_t mappedbno,
@@ -2322,7 +2643,7 @@ xfs_da_reada_buf(
mapp = &map;
nmap = 1;
- error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
+ error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
&mapp, &nmap);
if (error) {
/* mapping a hole is not an error, but we don't continue */
@@ -2342,41 +2663,3 @@ out_free:
return -1;
return mappedbno;
}
-
-kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
-
-/*
- * Allocate a dir-state structure.
- * We don't put them on the stack since they're large.
- */
-xfs_da_state_t *
-xfs_da_state_alloc(void)
-{
- return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
-}
-
-/*
- * Kill the altpath contents of a da-state structure.
- */
-STATIC void
-xfs_da_state_kill_altpath(xfs_da_state_t *state)
-{
- int i;
-
- for (i = 0; i < state->altpath.active; i++)
- state->altpath.blk[i].bp = NULL;
- state->altpath.active = 0;
-}
-
-/*
- * Free a da-state structure.
- */
-void
-xfs_da_state_free(xfs_da_state_t *state)
-{
- xfs_da_state_kill_altpath(state);
-#ifdef DEBUG
- memset((char *)state, 0, sizeof(*state));
-#endif /* DEBUG */
- kmem_zone_free(xfs_da_state_zone, state);
-}
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index ee5170c46ae..6e153e399a7 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -20,58 +21,26 @@
struct xfs_bmap_free;
struct xfs_inode;
-struct xfs_mount;
struct xfs_trans;
struct zone;
-
-/*========================================================================
- * Directory Structure when greater than XFS_LBSIZE(mp) bytes.
- *========================================================================*/
-
-/*
- * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
- *
- * It is used to manage a doubly linked list of all blocks at the same
- * level in the Btree, and to identify which type of block this is.
- */
-#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */
-#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */
-#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
-#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
-
-typedef struct xfs_da_blkinfo {
- __be32 forw; /* previous block in list */
- __be32 back; /* following block in list */
- __be16 magic; /* validity check on block */
- __be16 pad; /* unused */
-} xfs_da_blkinfo_t;
+struct xfs_dir_ops;
/*
- * This is the structure of the root and intermediate nodes in the Btree.
- * The leaf nodes are defined above.
- *
- * Entries are not packed.
- *
- * Since we have duplicate keys, use a binary search but always follow
- * all match in the block, not just the first match found.
+ * Directory/attribute geometry information. There will be one of these for each
+ * data fork type, and it will be passed around via the xfs_da_args. Global
+ * structures will be attached to the xfs_mount.
*/
-#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
-
-typedef struct xfs_da_intnode {
- struct xfs_da_node_hdr { /* constant-structure header block */
- xfs_da_blkinfo_t info; /* block type, links, etc. */
- __be16 count; /* count of active entries */
- __be16 level; /* level above leaves (leaf == 0) */
- } hdr;
- struct xfs_da_node_entry {
- __be32 hashval; /* hash value for this descendant */
- __be32 before; /* Btree block before this key */
- } btree[1]; /* variable sized array of keys */
-} xfs_da_intnode_t;
-typedef struct xfs_da_node_hdr xfs_da_node_hdr_t;
-typedef struct xfs_da_node_entry xfs_da_node_entry_t;
-
-#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize
+struct xfs_da_geometry {
+ int blksize; /* da block size in bytes */
+ int fsbcount; /* da block size in filesystem blocks */
+ uint8_t fsblog; /* log2 of _filesystem_ block size */
+ uint8_t blklog; /* log2 of da block size */
+ uint node_ents; /* # of entries in a danode */
+ int magicpct; /* 37% of block size in bytes */
+ xfs_dablk_t datablk; /* blockno of dir data v2 */
+ xfs_dablk_t leafblk; /* blockno of leaf data v2 */
+ xfs_dablk_t freeblk; /* blockno of free data v2 */
+};
/*========================================================================
* Btree searching and modification structure definitions.
@@ -90,8 +59,10 @@ enum xfs_dacmp {
* Structure to ease passing around component names.
*/
typedef struct xfs_da_args {
+ struct xfs_da_geometry *geo; /* da block geometry */
const __uint8_t *name; /* string (maybe not NULL terminated) */
int namelen; /* length of string (maybe no NULL) */
+ __uint8_t filetype; /* filetype of inode for directories */
__uint8_t *value; /* set of bytes (maybe contain NULLs) */
int valuelen; /* length of value */
int flags; /* argument flags (eg: ATTR_NOCREATE) */
@@ -107,10 +78,12 @@ typedef struct xfs_da_args {
int index; /* index of attr of interest in blk */
xfs_dablk_t rmtblkno; /* remote attr value starting blkno */
int rmtblkcnt; /* remote attr value block count */
+ int rmtvaluelen; /* remote attr value length in bytes */
xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */
int index2; /* index of 2nd attr in blk */
xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
int rmtblkcnt2; /* remote attr value block count */
+ int rmtvaluelen2; /* remote attr value length in bytes */
int op_flags; /* operation flags */
enum xfs_dacmp cmpresult; /* name compare result for lookups */
} xfs_da_args_t;
@@ -155,8 +128,6 @@ typedef struct xfs_da_state_path {
typedef struct xfs_da_state {
xfs_da_args_t *args; /* filename arguments */
struct xfs_mount *mp; /* filesystem mount point */
- unsigned int blocksize; /* logical block size */
- unsigned int node_ents; /* how many entries in danode */
xfs_da_state_path_t path; /* search/split paths */
xfs_da_state_path_t altpath; /* alternate path for join */
unsigned char inleaf; /* insert into 1->lf, 0->splf */
@@ -191,29 +162,29 @@ struct xfs_nameops {
/*
* Routines used for growing the Btree.
*/
-int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
- struct xfs_buf **bpp, int whichfork);
-int xfs_da_split(xfs_da_state_t *state);
+int xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno,
+ int level, struct xfs_buf **bpp, int whichfork);
+int xfs_da3_split(xfs_da_state_t *state);
/*
* Routines used for shrinking the Btree.
*/
-int xfs_da_join(xfs_da_state_t *state);
-void xfs_da_fixhashpath(xfs_da_state_t *state,
- xfs_da_state_path_t *path_to_to_fix);
+int xfs_da3_join(xfs_da_state_t *state);
+void xfs_da3_fixhashpath(struct xfs_da_state *state,
+ struct xfs_da_state_path *path_to_to_fix);
/*
* Routines used for finding things in the Btree.
*/
-int xfs_da_node_lookup_int(xfs_da_state_t *state, int *result);
-int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
+int xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result);
+int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
int forward, int release, int *result);
/*
* Utility routines.
*/
-int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
+int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
xfs_da_state_blk_t *new_blk);
-int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
+int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mappedbno,
struct xfs_buf **bpp, int which_fork);
@@ -230,9 +201,9 @@ int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mappedbno,
struct xfs_buf **bpp, int whichfork,
const struct xfs_buf_ops *ops);
-xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
- xfs_dablk_t bno, xfs_daddr_t mapped_bno,
- int whichfork, const struct xfs_buf_ops *ops);
+xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
+ xfs_daddr_t mapped_bno, int whichfork,
+ const struct xfs_buf_ops *ops);
int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
struct xfs_buf *dead_buf);
diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c
new file mode 100644
index 00000000000..c9aee52a37e
--- /dev/null
+++ b/fs/xfs/xfs_da_format.c
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+
+/*
+ * Shortform directory ops
+ */
+static int
+xfs_dir2_sf_entsize(
+ struct xfs_dir2_sf_hdr *hdr,
+ int len)
+{
+ int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
+
+ count += len; /* name */
+ count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
+ sizeof(xfs_dir2_ino4_t); /* ino # */
+ return count;
+}
+
+static int
+xfs_dir3_sf_entsize(
+ struct xfs_dir2_sf_hdr *hdr,
+ int len)
+{
+ return xfs_dir2_sf_entsize(hdr, len) + sizeof(__uint8_t);
+}
+
+static struct xfs_dir2_sf_entry *
+xfs_dir2_sf_nextentry(
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return (struct xfs_dir2_sf_entry *)
+ ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen));
+}
+
+static struct xfs_dir2_sf_entry *
+xfs_dir3_sf_nextentry(
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return (struct xfs_dir2_sf_entry *)
+ ((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen));
+}
+
+
+/*
+ * For filetype enabled shortform directories, the file type field is stored at
+ * the end of the name. Because it's only a single byte, endian conversion is
+ * not necessary. For non-filetype enable directories, the type is always
+ * unknown and we never store the value.
+ */
+static __uint8_t
+xfs_dir2_sfe_get_ftype(
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return XFS_DIR3_FT_UNKNOWN;
+}
+
+static void
+xfs_dir2_sfe_put_ftype(
+ struct xfs_dir2_sf_entry *sfep,
+ __uint8_t ftype)
+{
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+}
+
+static __uint8_t
+xfs_dir3_sfe_get_ftype(
+ struct xfs_dir2_sf_entry *sfep)
+{
+ __uint8_t ftype;
+
+ ftype = sfep->name[sfep->namelen];
+ if (ftype >= XFS_DIR3_FT_MAX)
+ return XFS_DIR3_FT_UNKNOWN;
+ return ftype;
+}
+
+static void
+xfs_dir3_sfe_put_ftype(
+ struct xfs_dir2_sf_entry *sfep,
+ __uint8_t ftype)
+{
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+
+ sfep->name[sfep->namelen] = ftype;
+}
+
+/*
+ * Inode numbers in short-form directories can come in two versions,
+ * either 4 bytes or 8 bytes wide. These helpers deal with the
+ * two forms transparently by looking at the headers i8count field.
+ *
+ * For 64-bit inode number the most significant byte must be zero.
+ */
+static xfs_ino_t
+xfs_dir2_sf_get_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ xfs_dir2_inou_t *from)
+{
+ if (hdr->i8count)
+ return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
+ else
+ return get_unaligned_be32(&from->i4.i);
+}
+
+static void
+xfs_dir2_sf_put_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ xfs_dir2_inou_t *to,
+ xfs_ino_t ino)
+{
+ ASSERT((ino & 0xff00000000000000ULL) == 0);
+
+ if (hdr->i8count)
+ put_unaligned_be64(ino, &to->i8.i);
+ else
+ put_unaligned_be32(ino, &to->i4.i);
+}
+
+static xfs_ino_t
+xfs_dir2_sf_get_parent_ino(
+ struct xfs_dir2_sf_hdr *hdr)
+{
+ return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
+}
+
+static void
+xfs_dir2_sf_put_parent_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ xfs_ino_t ino)
+{
+ xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino);
+}
+
+/*
+ * In short-form directory entries the inode numbers are stored at variable
+ * offset behind the entry name. If the entry stores a filetype value, then it
+ * sits between the name and the inode number. Hence the inode numbers may only
+ * be accessed through the helpers below.
+ */
+static xfs_ino_t
+xfs_dir2_sfe_get_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return xfs_dir2_sf_get_ino(hdr,
+ (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]);
+}
+
+static void
+xfs_dir2_sfe_put_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep,
+ xfs_ino_t ino)
+{
+ xfs_dir2_sf_put_ino(hdr,
+ (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino);
+}
+
+static xfs_ino_t
+xfs_dir3_sfe_get_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return xfs_dir2_sf_get_ino(hdr,
+ (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]);
+}
+
+static void
+xfs_dir3_sfe_put_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep,
+ xfs_ino_t ino)
+{
+ xfs_dir2_sf_put_ino(hdr,
+ (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
+}
+
+
+/*
+ * Directory data block operations
+ */
+
+/*
+ * For special situations, the dirent size ends up fixed because we always know
+ * what the size of the entry is. That's true for the "." and "..", and
+ * therefore we know that they are a fixed size and hence their offsets are
+ * constant, as is the first entry.
+ *
+ * Hence, this calculation is written as a macro to be able to be calculated at
+ * compile time and so certain offsets can be calculated directly in the
+ * structure initaliser via the macro. There are two macros - one for dirents
+ * with ftype and without so there are no unresolvable conditionals in the
+ * calculations. We also use round_up() as XFS_DIR2_DATA_ALIGN is always a power
+ * of 2 and the compiler doesn't reject it (unlike roundup()).
+ */
+#define XFS_DIR2_DATA_ENTSIZE(n) \
+ round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \
+ sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN)
+
+#define XFS_DIR3_DATA_ENTSIZE(n) \
+ round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \
+ sizeof(xfs_dir2_data_off_t) + sizeof(__uint8_t)), \
+ XFS_DIR2_DATA_ALIGN)
+
+static int
+xfs_dir2_data_entsize(
+ int n)
+{
+ return XFS_DIR2_DATA_ENTSIZE(n);
+}
+
+static int
+xfs_dir3_data_entsize(
+ int n)
+{
+ return XFS_DIR3_DATA_ENTSIZE(n);
+}
+
+static __uint8_t
+xfs_dir2_data_get_ftype(
+ struct xfs_dir2_data_entry *dep)
+{
+ return XFS_DIR3_FT_UNKNOWN;
+}
+
+static void
+xfs_dir2_data_put_ftype(
+ struct xfs_dir2_data_entry *dep,
+ __uint8_t ftype)
+{
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+}
+
+static __uint8_t
+xfs_dir3_data_get_ftype(
+ struct xfs_dir2_data_entry *dep)
+{
+ __uint8_t ftype = dep->name[dep->namelen];
+
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+ if (ftype >= XFS_DIR3_FT_MAX)
+ return XFS_DIR3_FT_UNKNOWN;
+ return ftype;
+}
+
+static void
+xfs_dir3_data_put_ftype(
+ struct xfs_dir2_data_entry *dep,
+ __uint8_t type)
+{
+ ASSERT(type < XFS_DIR3_FT_MAX);
+ ASSERT(dep->namelen != 0);
+
+ dep->name[dep->namelen] = type;
+}
+
+/*
+ * Pointer to an entry's tag word.
+ */
+static __be16 *
+xfs_dir2_data_entry_tag_p(
+ struct xfs_dir2_data_entry *dep)
+{
+ return (__be16 *)((char *)dep +
+ xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16));
+}
+
+static __be16 *
+xfs_dir3_data_entry_tag_p(
+ struct xfs_dir2_data_entry *dep)
+{
+ return (__be16 *)((char *)dep +
+ xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16));
+}
+
+/*
+ * location of . and .. in data space (always block 0)
+ */
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_dot_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_dotdot_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR2_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_first_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR2_DATA_ENTSIZE(1) +
+ XFS_DIR2_DATA_ENTSIZE(2));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_ftype_data_dotdot_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_ftype_data_first_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1) +
+ XFS_DIR3_DATA_ENTSIZE(2));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_dot_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_dotdot_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_first_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1) +
+ XFS_DIR3_DATA_ENTSIZE(2));
+}
+
+static struct xfs_dir2_data_free *
+xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return hdr->bestfree;
+}
+
+static struct xfs_dir2_data_free *
+xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
+}
+
+static struct xfs_dir2_data_unused *
+xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_unused *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
+}
+
+static struct xfs_dir2_data_unused *
+xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_unused *)
+ ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
+}
+
+
+/*
+ * Directory Leaf block operations
+ */
+static int
+xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo)
+{
+ return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) /
+ (uint)sizeof(struct xfs_dir2_leaf_entry);
+}
+
+static struct xfs_dir2_leaf_entry *
+xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+ return lp->__ents;
+}
+
+static int
+xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo)
+{
+ return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) /
+ (uint)sizeof(struct xfs_dir2_leaf_entry);
+}
+
+static struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+ return ((struct xfs_dir3_leaf *)lp)->__ents;
+}
+
+static void
+xfs_dir2_leaf_hdr_from_disk(
+ struct xfs_dir3_icleaf_hdr *to,
+ struct xfs_dir2_leaf *from)
+{
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.count);
+ to->stale = be16_to_cpu(from->hdr.stale);
+
+ ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
+ to->magic == XFS_DIR2_LEAFN_MAGIC);
+}
+
+static void
+xfs_dir2_leaf_hdr_to_disk(
+ struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from)
+{
+ ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
+ from->magic == XFS_DIR2_LEAFN_MAGIC);
+
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.count = cpu_to_be16(from->count);
+ to->hdr.stale = cpu_to_be16(from->stale);
+}
+
+static void
+xfs_dir3_leaf_hdr_from_disk(
+ struct xfs_dir3_icleaf_hdr *to,
+ struct xfs_dir2_leaf *from)
+{
+ struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from;
+
+ to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+ to->back = be32_to_cpu(hdr3->info.hdr.back);
+ to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+ to->count = be16_to_cpu(hdr3->count);
+ to->stale = be16_to_cpu(hdr3->stale);
+
+ ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC ||
+ to->magic == XFS_DIR3_LEAFN_MAGIC);
+}
+
+static void
+xfs_dir3_leaf_hdr_to_disk(
+ struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from)
+{
+ struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to;
+
+ ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
+ from->magic == XFS_DIR3_LEAFN_MAGIC);
+
+ hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+ hdr3->info.hdr.back = cpu_to_be32(from->back);
+ hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+ hdr3->count = cpu_to_be16(from->count);
+ hdr3->stale = cpu_to_be16(from->stale);
+}
+
+
+/*
+ * Directory/Attribute Node block operations
+ */
+static struct xfs_da_node_entry *
+xfs_da2_node_tree_p(struct xfs_da_intnode *dap)
+{
+ return dap->__btree;
+}
+
+static struct xfs_da_node_entry *
+xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
+{
+ return ((struct xfs_da3_intnode *)dap)->__btree;
+}
+
+static void
+xfs_da2_node_hdr_from_disk(
+ struct xfs_da3_icnode_hdr *to,
+ struct xfs_da_intnode *from)
+{
+ ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.__count);
+ to->level = be16_to_cpu(from->hdr.__level);
+}
+
+static void
+xfs_da2_node_hdr_to_disk(
+ struct xfs_da_intnode *to,
+ struct xfs_da3_icnode_hdr *from)
+{
+ ASSERT(from->magic == XFS_DA_NODE_MAGIC);
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.__count = cpu_to_be16(from->count);
+ to->hdr.__level = cpu_to_be16(from->level);
+}
+
+static void
+xfs_da3_node_hdr_from_disk(
+ struct xfs_da3_icnode_hdr *to,
+ struct xfs_da_intnode *from)
+{
+ struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from;
+
+ ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+ to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+ to->back = be32_to_cpu(hdr3->info.hdr.back);
+ to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+ to->count = be16_to_cpu(hdr3->__count);
+ to->level = be16_to_cpu(hdr3->__level);
+}
+
+static void
+xfs_da3_node_hdr_to_disk(
+ struct xfs_da_intnode *to,
+ struct xfs_da3_icnode_hdr *from)
+{
+ struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to;
+
+ ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
+ hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+ hdr3->info.hdr.back = cpu_to_be32(from->back);
+ hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+ hdr3->__count = cpu_to_be16(from->count);
+ hdr3->__level = cpu_to_be16(from->level);
+}
+
+
+/*
+ * Directory free space block operations
+ */
+static int
+xfs_dir2_free_max_bests(struct xfs_da_geometry *geo)
+{
+ return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) /
+ sizeof(xfs_dir2_data_off_t);
+}
+
+static __be16 *
+xfs_dir2_free_bests_p(struct xfs_dir2_free *free)
+{
+ return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr));
+}
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+static xfs_dir2_db_t
+xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
+ (db / xfs_dir2_free_max_bests(geo));
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static int
+xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return db % xfs_dir2_free_max_bests(geo);
+}
+
+static int
+xfs_dir3_free_max_bests(struct xfs_da_geometry *geo)
+{
+ return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) /
+ sizeof(xfs_dir2_data_off_t);
+}
+
+static __be16 *
+xfs_dir3_free_bests_p(struct xfs_dir2_free *free)
+{
+ return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr));
+}
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+static xfs_dir2_db_t
+xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
+ (db / xfs_dir3_free_max_bests(geo));
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static int
+xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return db % xfs_dir3_free_max_bests(geo);
+}
+
+static void
+xfs_dir2_free_hdr_from_disk(
+ struct xfs_dir3_icfree_hdr *to,
+ struct xfs_dir2_free *from)
+{
+ to->magic = be32_to_cpu(from->hdr.magic);
+ to->firstdb = be32_to_cpu(from->hdr.firstdb);
+ to->nvalid = be32_to_cpu(from->hdr.nvalid);
+ to->nused = be32_to_cpu(from->hdr.nused);
+ ASSERT(to->magic == XFS_DIR2_FREE_MAGIC);
+}
+
+static void
+xfs_dir2_free_hdr_to_disk(
+ struct xfs_dir2_free *to,
+ struct xfs_dir3_icfree_hdr *from)
+{
+ ASSERT(from->magic == XFS_DIR2_FREE_MAGIC);
+
+ to->hdr.magic = cpu_to_be32(from->magic);
+ to->hdr.firstdb = cpu_to_be32(from->firstdb);
+ to->hdr.nvalid = cpu_to_be32(from->nvalid);
+ to->hdr.nused = cpu_to_be32(from->nused);
+}
+
+static void
+xfs_dir3_free_hdr_from_disk(
+ struct xfs_dir3_icfree_hdr *to,
+ struct xfs_dir2_free *from)
+{
+ struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from;
+
+ to->magic = be32_to_cpu(hdr3->hdr.magic);
+ to->firstdb = be32_to_cpu(hdr3->firstdb);
+ to->nvalid = be32_to_cpu(hdr3->nvalid);
+ to->nused = be32_to_cpu(hdr3->nused);
+
+ ASSERT(to->magic == XFS_DIR3_FREE_MAGIC);
+}
+
+static void
+xfs_dir3_free_hdr_to_disk(
+ struct xfs_dir2_free *to,
+ struct xfs_dir3_icfree_hdr *from)
+{
+ struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to;
+
+ ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
+
+ hdr3->hdr.magic = cpu_to_be32(from->magic);
+ hdr3->firstdb = cpu_to_be32(from->firstdb);
+ hdr3->nvalid = cpu_to_be32(from->nvalid);
+ hdr3->nused = cpu_to_be32(from->nused);
+}
+
+static const struct xfs_dir_ops xfs_dir2_ops = {
+ .sf_entsize = xfs_dir2_sf_entsize,
+ .sf_nextentry = xfs_dir2_sf_nextentry,
+ .sf_get_ftype = xfs_dir2_sfe_get_ftype,
+ .sf_put_ftype = xfs_dir2_sfe_put_ftype,
+ .sf_get_ino = xfs_dir2_sfe_get_ino,
+ .sf_put_ino = xfs_dir2_sfe_put_ino,
+ .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+ .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+ .data_entsize = xfs_dir2_data_entsize,
+ .data_get_ftype = xfs_dir2_data_get_ftype,
+ .data_put_ftype = xfs_dir2_data_put_ftype,
+ .data_entry_tag_p = xfs_dir2_data_entry_tag_p,
+ .data_bestfree_p = xfs_dir2_data_bestfree_p,
+
+ .data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
+ .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR2_DATA_ENTSIZE(1),
+ .data_first_offset = sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR2_DATA_ENTSIZE(1) +
+ XFS_DIR2_DATA_ENTSIZE(2),
+ .data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
+
+ .data_dot_entry_p = xfs_dir2_data_dot_entry_p,
+ .data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
+ .data_first_entry_p = xfs_dir2_data_first_entry_p,
+ .data_entry_p = xfs_dir2_data_entry_p,
+ .data_unused_p = xfs_dir2_data_unused_p,
+
+ .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
+ .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
+ .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
+ .leaf_max_ents = xfs_dir2_max_leaf_ents,
+ .leaf_ents_p = xfs_dir2_leaf_ents_p,
+
+ .node_hdr_size = sizeof(struct xfs_da_node_hdr),
+ .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+ .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+ .node_tree_p = xfs_da2_node_tree_p,
+
+ .free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
+ .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
+ .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
+ .free_max_bests = xfs_dir2_free_max_bests,
+ .free_bests_p = xfs_dir2_free_bests_p,
+ .db_to_fdb = xfs_dir2_db_to_fdb,
+ .db_to_fdindex = xfs_dir2_db_to_fdindex,
+};
+
+static const struct xfs_dir_ops xfs_dir2_ftype_ops = {
+ .sf_entsize = xfs_dir3_sf_entsize,
+ .sf_nextentry = xfs_dir3_sf_nextentry,
+ .sf_get_ftype = xfs_dir3_sfe_get_ftype,
+ .sf_put_ftype = xfs_dir3_sfe_put_ftype,
+ .sf_get_ino = xfs_dir3_sfe_get_ino,
+ .sf_put_ino = xfs_dir3_sfe_put_ino,
+ .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+ .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+ .data_entsize = xfs_dir3_data_entsize,
+ .data_get_ftype = xfs_dir3_data_get_ftype,
+ .data_put_ftype = xfs_dir3_data_put_ftype,
+ .data_entry_tag_p = xfs_dir3_data_entry_tag_p,
+ .data_bestfree_p = xfs_dir2_data_bestfree_p,
+
+ .data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
+ .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1),
+ .data_first_offset = sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1) +
+ XFS_DIR3_DATA_ENTSIZE(2),
+ .data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
+
+ .data_dot_entry_p = xfs_dir2_data_dot_entry_p,
+ .data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p,
+ .data_first_entry_p = xfs_dir2_ftype_data_first_entry_p,
+ .data_entry_p = xfs_dir2_data_entry_p,
+ .data_unused_p = xfs_dir2_data_unused_p,
+
+ .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
+ .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
+ .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
+ .leaf_max_ents = xfs_dir2_max_leaf_ents,
+ .leaf_ents_p = xfs_dir2_leaf_ents_p,
+
+ .node_hdr_size = sizeof(struct xfs_da_node_hdr),
+ .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+ .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+ .node_tree_p = xfs_da2_node_tree_p,
+
+ .free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
+ .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
+ .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
+ .free_max_bests = xfs_dir2_free_max_bests,
+ .free_bests_p = xfs_dir2_free_bests_p,
+ .db_to_fdb = xfs_dir2_db_to_fdb,
+ .db_to_fdindex = xfs_dir2_db_to_fdindex,
+};
+
+static const struct xfs_dir_ops xfs_dir3_ops = {
+ .sf_entsize = xfs_dir3_sf_entsize,
+ .sf_nextentry = xfs_dir3_sf_nextentry,
+ .sf_get_ftype = xfs_dir3_sfe_get_ftype,
+ .sf_put_ftype = xfs_dir3_sfe_put_ftype,
+ .sf_get_ino = xfs_dir3_sfe_get_ino,
+ .sf_put_ino = xfs_dir3_sfe_put_ino,
+ .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+ .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+ .data_entsize = xfs_dir3_data_entsize,
+ .data_get_ftype = xfs_dir3_data_get_ftype,
+ .data_put_ftype = xfs_dir3_data_put_ftype,
+ .data_entry_tag_p = xfs_dir3_data_entry_tag_p,
+ .data_bestfree_p = xfs_dir3_data_bestfree_p,
+
+ .data_dot_offset = sizeof(struct xfs_dir3_data_hdr),
+ .data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1),
+ .data_first_offset = sizeof(struct xfs_dir3_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1) +
+ XFS_DIR3_DATA_ENTSIZE(2),
+ .data_entry_offset = sizeof(struct xfs_dir3_data_hdr),
+
+ .data_dot_entry_p = xfs_dir3_data_dot_entry_p,
+ .data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p,
+ .data_first_entry_p = xfs_dir3_data_first_entry_p,
+ .data_entry_p = xfs_dir3_data_entry_p,
+ .data_unused_p = xfs_dir3_data_unused_p,
+
+ .leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr),
+ .leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk,
+ .leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk,
+ .leaf_max_ents = xfs_dir3_max_leaf_ents,
+ .leaf_ents_p = xfs_dir3_leaf_ents_p,
+
+ .node_hdr_size = sizeof(struct xfs_da3_node_hdr),
+ .node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
+ .node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
+ .node_tree_p = xfs_da3_node_tree_p,
+
+ .free_hdr_size = sizeof(struct xfs_dir3_free_hdr),
+ .free_hdr_to_disk = xfs_dir3_free_hdr_to_disk,
+ .free_hdr_from_disk = xfs_dir3_free_hdr_from_disk,
+ .free_max_bests = xfs_dir3_free_max_bests,
+ .free_bests_p = xfs_dir3_free_bests_p,
+ .db_to_fdb = xfs_dir3_db_to_fdb,
+ .db_to_fdindex = xfs_dir3_db_to_fdindex,
+};
+
+static const struct xfs_dir_ops xfs_dir2_nondir_ops = {
+ .node_hdr_size = sizeof(struct xfs_da_node_hdr),
+ .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+ .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+ .node_tree_p = xfs_da2_node_tree_p,
+};
+
+static const struct xfs_dir_ops xfs_dir3_nondir_ops = {
+ .node_hdr_size = sizeof(struct xfs_da3_node_hdr),
+ .node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
+ .node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
+ .node_tree_p = xfs_da3_node_tree_p,
+};
+
+/*
+ * Return the ops structure according to the current config. If we are passed
+ * an inode, then that overrides the default config we use which is based on
+ * feature bits.
+ */
+const struct xfs_dir_ops *
+xfs_dir_get_ops(
+ struct xfs_mount *mp,
+ struct xfs_inode *dp)
+{
+ if (dp)
+ return dp->d_ops;
+ if (mp->m_dir_inode_ops)
+ return mp->m_dir_inode_ops;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ return &xfs_dir3_ops;
+ if (xfs_sb_version_hasftype(&mp->m_sb))
+ return &xfs_dir2_ftype_ops;
+ return &xfs_dir2_ops;
+}
+
+const struct xfs_dir_ops *
+xfs_nondir_get_ops(
+ struct xfs_mount *mp,
+ struct xfs_inode *dp)
+{
+ if (dp)
+ return dp->d_ops;
+ if (mp->m_nondir_inode_ops)
+ return mp->m_nondir_inode_ops;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ return &xfs_dir3_nondir_ops;
+ return &xfs_dir2_nondir_ops;
+}
diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/xfs_da_format.h
new file mode 100644
index 00000000000..0a49b028637
--- /dev/null
+++ b/fs/xfs/xfs_da_format.h
@@ -0,0 +1,861 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_DA_FORMAT_H__
+#define __XFS_DA_FORMAT_H__
+
+/*
+ * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
+ *
+ * It is used to manage a doubly linked list of all blocks at the same
+ * level in the Btree, and to identify which type of block this is.
+ */
+#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */
+#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */
+#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
+#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
+
+typedef struct xfs_da_blkinfo {
+ __be32 forw; /* previous block in list */
+ __be32 back; /* following block in list */
+ __be16 magic; /* validity check on block */
+ __be16 pad; /* unused */
+} xfs_da_blkinfo_t;
+
+/*
+ * CRC enabled directory structure types
+ *
+ * The headers change size for the additional verification information, but
+ * otherwise the tree layouts and contents are unchanged. Hence the da btree
+ * code can use the struct xfs_da_blkinfo for manipulating the tree links and
+ * magic numbers without modification for both v2 and v3 nodes.
+ */
+#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */
+#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */
+#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */
+#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */
+
+struct xfs_da3_blkinfo {
+ /*
+ * the node link manipulation code relies on the fact that the first
+ * element of this structure is the struct xfs_da_blkinfo so it can
+ * ignore the differences in the rest of the structures.
+ */
+ struct xfs_da_blkinfo hdr;
+ __be32 crc; /* CRC of block */
+ __be64 blkno; /* first block of the buffer */
+ __be64 lsn; /* sequence number of last write */
+ uuid_t uuid; /* filesystem we belong to */
+ __be64 owner; /* inode that owns the block */
+};
+
+/*
+ * This is the structure of the root and intermediate nodes in the Btree.
+ * The leaf nodes are defined above.
+ *
+ * Entries are not packed.
+ *
+ * Since we have duplicate keys, use a binary search but always follow
+ * all match in the block, not just the first match found.
+ */
+#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
+
+typedef struct xfs_da_node_hdr {
+ struct xfs_da_blkinfo info; /* block type, links, etc. */
+ __be16 __count; /* count of active entries */
+ __be16 __level; /* level above leaves (leaf == 0) */
+} xfs_da_node_hdr_t;
+
+struct xfs_da3_node_hdr {
+ struct xfs_da3_blkinfo info; /* block type, links, etc. */
+ __be16 __count; /* count of active entries */
+ __be16 __level; /* level above leaves (leaf == 0) */
+ __be32 __pad32;
+};
+
+#define XFS_DA3_NODE_CRC_OFF (offsetof(struct xfs_da3_node_hdr, info.crc))
+
+typedef struct xfs_da_node_entry {
+ __be32 hashval; /* hash value for this descendant */
+ __be32 before; /* Btree block before this key */
+} xfs_da_node_entry_t;
+
+typedef struct xfs_da_intnode {
+ struct xfs_da_node_hdr hdr;
+ struct xfs_da_node_entry __btree[];
+} xfs_da_intnode_t;
+
+struct xfs_da3_intnode {
+ struct xfs_da3_node_hdr hdr;
+ struct xfs_da_node_entry __btree[];
+};
+
+/*
+ * In-core version of the node header to abstract the differences in the v2 and
+ * v3 disk format of the headers. Callers need to convert to/from disk format as
+ * appropriate.
+ */
+struct xfs_da3_icnode_hdr {
+ __uint32_t forw;
+ __uint32_t back;
+ __uint16_t magic;
+ __uint16_t count;
+ __uint16_t level;
+};
+
+/*
+ * Directory version 2.
+ *
+ * There are 4 possible formats:
+ * - shortform - embedded into the inode
+ * - single block - data with embedded leaf at the end
+ * - multiple data blocks, single leaf+freeindex block
+ * - data blocks, node and leaf blocks (btree), freeindex blocks
+ *
+ * Note: many node blocks structures and constants are shared with the attr
+ * code and defined in xfs_da_btree.h.
+ */
+
+#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: single block dirs */
+#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: multiblock dirs */
+#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F: free index blocks */
+
+/*
+ * Directory Version 3 With CRCs.
+ *
+ * The tree formats are the same as for version 2 directories. The difference
+ * is in the block header and dirent formats. In many cases the v3 structures
+ * use v2 definitions as they are no different and this makes code sharing much
+ * easier.
+ *
+ * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the
+ * format is v2 then they switch to the existing v2 code, or the format is v3
+ * they implement the v3 functionality. This means the existing dir2 is a mix of
+ * xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3 functions are called
+ * where there is a difference in the formats, otherwise the code is unchanged.
+ *
+ * Where it is possible, the code decides what to do based on the magic numbers
+ * in the blocks rather than feature bits in the superblock. This means the code
+ * is as independent of the external XFS code as possible as doesn't require
+ * passing struct xfs_mount pointers into places where it isn't really
+ * necessary.
+ *
+ * Version 3 includes:
+ *
+ * - a larger block header for CRC and identification purposes and so the
+ * offsets of all the structures inside the blocks are different.
+ *
+ * - new magic numbers to be able to detect the v2/v3 types on the fly.
+ */
+
+#define XFS_DIR3_BLOCK_MAGIC 0x58444233 /* XDB3: single block dirs */
+#define XFS_DIR3_DATA_MAGIC 0x58444433 /* XDD3: multiblock dirs */
+#define XFS_DIR3_FREE_MAGIC 0x58444633 /* XDF3: free index blocks */
+
+/*
+ * Dirents in version 3 directories have a file type field. Additions to this
+ * list are an on-disk format change, requiring feature bits. Valid values
+ * are as follows:
+ */
+#define XFS_DIR3_FT_UNKNOWN 0
+#define XFS_DIR3_FT_REG_FILE 1
+#define XFS_DIR3_FT_DIR 2
+#define XFS_DIR3_FT_CHRDEV 3
+#define XFS_DIR3_FT_BLKDEV 4
+#define XFS_DIR3_FT_FIFO 5
+#define XFS_DIR3_FT_SOCK 6
+#define XFS_DIR3_FT_SYMLINK 7
+#define XFS_DIR3_FT_WHT 8
+
+#define XFS_DIR3_FT_MAX 9
+
+/*
+ * Byte offset in data block and shortform entry.
+ */
+typedef __uint16_t xfs_dir2_data_off_t;
+#define NULLDATAOFF 0xffffU
+typedef uint xfs_dir2_data_aoff_t; /* argument form */
+
+/*
+ * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
+ * Only need 16 bits, this is the byte offset into the single block form.
+ */
+typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
+
+/*
+ * Offset in data space of a data entry.
+ */
+typedef __uint32_t xfs_dir2_dataptr_t;
+#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff)
+#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0)
+
+/*
+ * Byte offset in a directory.
+ */
+typedef xfs_off_t xfs_dir2_off_t;
+
+/*
+ * Directory block number (logical dirblk in file)
+ */
+typedef __uint32_t xfs_dir2_db_t;
+
+/*
+ * Inode number stored as 8 8-bit values.
+ */
+typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
+
+/*
+ * Inode number stored as 4 8-bit values.
+ * Works a lot of the time, when all the inode numbers in a directory
+ * fit in 32 bits.
+ */
+typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
+
+typedef union {
+ xfs_dir2_ino8_t i8;
+ xfs_dir2_ino4_t i4;
+} xfs_dir2_inou_t;
+#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
+
+/*
+ * Directory layout when stored internal to an inode.
+ *
+ * Small directories are packed as tightly as possible so as to fit into the
+ * literal area of the inode. These "shortform" directories consist of a
+ * single xfs_dir2_sf_hdr header followed by zero or more xfs_dir2_sf_entry
+ * structures. Due the different inode number storage size and the variable
+ * length name field in the xfs_dir2_sf_entry all these structure are
+ * variable length, and the accessors in this file should be used to iterate
+ * over them.
+ */
+typedef struct xfs_dir2_sf_hdr {
+ __uint8_t count; /* count of entries */
+ __uint8_t i8count; /* count of 8-byte inode #s */
+ xfs_dir2_inou_t parent; /* parent dir inode number */
+} __arch_pack xfs_dir2_sf_hdr_t;
+
+typedef struct xfs_dir2_sf_entry {
+ __u8 namelen; /* actual name length */
+ xfs_dir2_sf_off_t offset; /* saved offset */
+ __u8 name[]; /* name, variable size */
+ /*
+ * A single byte containing the file type field follows the inode
+ * number for version 3 directory entries.
+ *
+ * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
+ * variable offset after the name.
+ */
+} __arch_pack xfs_dir2_sf_entry_t;
+
+static inline int xfs_dir2_sf_hdr_size(int i8count)
+{
+ return sizeof(struct xfs_dir2_sf_hdr) -
+ (i8count == 0) *
+ (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t));
+}
+
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
+{
+ return get_unaligned_be16(&sfep->offset.i);
+}
+
+static inline void
+xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
+{
+ put_unaligned_be16(off, &sfep->offset.i);
+}
+
+static inline struct xfs_dir2_sf_entry *
+xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
+{
+ return (struct xfs_dir2_sf_entry *)
+ ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count));
+}
+
+/*
+ * Data block structures.
+ *
+ * A pure data block looks like the following drawing on disk:
+ *
+ * +-------------------------------------------------+
+ * | xfs_dir2_data_hdr_t |
+ * +-------------------------------------------------+
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | ... |
+ * +-------------------------------------------------+
+ * | unused space |
+ * +-------------------------------------------------+
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ *
+ * In addition to the pure data blocks for the data and node formats,
+ * most structures are also used for the combined data/freespace "block"
+ * format below.
+ */
+
+#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */
+#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG)
+#define XFS_DIR2_DATA_FREE_TAG 0xffff
+#define XFS_DIR2_DATA_FD_COUNT 3
+
+/*
+ * Directory address space divided into sections,
+ * spaces separated by 32GB.
+ */
+#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
+#define XFS_DIR2_DATA_SPACE 0
+#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
+
+/*
+ * Describe a free area in the data block.
+ *
+ * The freespace will be formatted as a xfs_dir2_data_unused_t.
+ */
+typedef struct xfs_dir2_data_free {
+ __be16 offset; /* start of freespace */
+ __be16 length; /* length of freespace */
+} xfs_dir2_data_free_t;
+
+/*
+ * Header for the data blocks.
+ *
+ * The code knows that XFS_DIR2_DATA_FD_COUNT is 3.
+ */
+typedef struct xfs_dir2_data_hdr {
+ __be32 magic; /* XFS_DIR2_DATA_MAGIC or */
+ /* XFS_DIR2_BLOCK_MAGIC */
+ xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT];
+} xfs_dir2_data_hdr_t;
+
+/*
+ * define a structure for all the verification fields we are adding to the
+ * directory block structures. This will be used in several structures.
+ * The magic number must be the first entry to align with all the dir2
+ * structures so we determine how to decode them just by the magic number.
+ */
+struct xfs_dir3_blk_hdr {
+ __be32 magic; /* magic number */
+ __be32 crc; /* CRC of block */
+ __be64 blkno; /* first block of the buffer */
+ __be64 lsn; /* sequence number of last write */
+ uuid_t uuid; /* filesystem we belong to */
+ __be64 owner; /* inode that owns the block */
+};
+
+struct xfs_dir3_data_hdr {
+ struct xfs_dir3_blk_hdr hdr;
+ xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT];
+ __be32 pad; /* 64 bit alignment */
+};
+
+#define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc)
+
+/*
+ * Active entry in a data block.
+ *
+ * Aligned to 8 bytes. After the variable length name field there is a
+ * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p.
+ *
+ * For dir3 structures, there is file type field between the name and the tag.
+ * This can only be manipulated by helper functions. It is packed hard against
+ * the end of the name so any padding for rounding is between the file type and
+ * the tag.
+ */
+typedef struct xfs_dir2_data_entry {
+ __be64 inumber; /* inode number */
+ __u8 namelen; /* name length */
+ __u8 name[]; /* name bytes, no null */
+ /* __u8 filetype; */ /* type of inode we point to */
+ /* __be16 tag; */ /* starting offset of us */
+} xfs_dir2_data_entry_t;
+
+/*
+ * Unused entry in a data block.
+ *
+ * Aligned to 8 bytes. Tag appears as the last 2 bytes and must be accessed
+ * using xfs_dir2_data_unused_tag_p.
+ */
+typedef struct xfs_dir2_data_unused {
+ __be16 freetag; /* XFS_DIR2_DATA_FREE_TAG */
+ __be16 length; /* total free length */
+ /* variable offset */
+ __be16 tag; /* starting offset of us */
+} xfs_dir2_data_unused_t;
+
+/*
+ * Pointer to a freespace's tag word.
+ */
+static inline __be16 *
+xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup)
+{
+ return (__be16 *)((char *)dup +
+ be16_to_cpu(dup->length) - sizeof(__be16));
+}
+
+/*
+ * Leaf block structures.
+ *
+ * A pure leaf block looks like the following drawing on disk:
+ *
+ * +---------------------------+
+ * | xfs_dir2_leaf_hdr_t |
+ * +---------------------------+
+ * | xfs_dir2_leaf_entry_t |
+ * | xfs_dir2_leaf_entry_t |
+ * | xfs_dir2_leaf_entry_t |
+ * | xfs_dir2_leaf_entry_t |
+ * | ... |
+ * +---------------------------+
+ * | xfs_dir2_data_off_t |
+ * | xfs_dir2_data_off_t |
+ * | xfs_dir2_data_off_t |
+ * | ... |
+ * +---------------------------+
+ * | xfs_dir2_leaf_tail_t |
+ * +---------------------------+
+ *
+ * The xfs_dir2_data_off_t members (bests) and tail are at the end of the block
+ * for single-leaf (magic = XFS_DIR2_LEAF1_MAGIC) blocks only, but not present
+ * for directories with separate leaf nodes and free space blocks
+ * (magic = XFS_DIR2_LEAFN_MAGIC).
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ */
+
+/*
+ * Offset of the leaf/node space. First block in this space
+ * is the btree root.
+ */
+#define XFS_DIR2_LEAF_SPACE 1
+#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
+
+/*
+ * Leaf block header.
+ */
+typedef struct xfs_dir2_leaf_hdr {
+ xfs_da_blkinfo_t info; /* header for da routines */
+ __be16 count; /* count of entries */
+ __be16 stale; /* count of stale entries */
+} xfs_dir2_leaf_hdr_t;
+
+struct xfs_dir3_leaf_hdr {
+ struct xfs_da3_blkinfo info; /* header for da routines */
+ __be16 count; /* count of entries */
+ __be16 stale; /* count of stale entries */
+ __be32 pad; /* 64 bit alignment */
+};
+
+struct xfs_dir3_icleaf_hdr {
+ __uint32_t forw;
+ __uint32_t back;
+ __uint16_t magic;
+ __uint16_t count;
+ __uint16_t stale;
+};
+
+/*
+ * Leaf block entry.
+ */
+typedef struct xfs_dir2_leaf_entry {
+ __be32 hashval; /* hash value of name */
+ __be32 address; /* address of data entry */
+} xfs_dir2_leaf_entry_t;
+
+/*
+ * Leaf block tail.
+ */
+typedef struct xfs_dir2_leaf_tail {
+ __be32 bestcount;
+} xfs_dir2_leaf_tail_t;
+
+/*
+ * Leaf block.
+ */
+typedef struct xfs_dir2_leaf {
+ xfs_dir2_leaf_hdr_t hdr; /* leaf header */
+ xfs_dir2_leaf_entry_t __ents[]; /* entries */
+} xfs_dir2_leaf_t;
+
+struct xfs_dir3_leaf {
+ struct xfs_dir3_leaf_hdr hdr; /* leaf header */
+ struct xfs_dir2_leaf_entry __ents[]; /* entries */
+};
+
+#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc)
+
+/*
+ * Get address of the bests array in the single-leaf block.
+ */
+static inline __be16 *
+xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp)
+{
+ return (__be16 *)ltp - be32_to_cpu(ltp->bestcount);
+}
+
+/*
+ * Free space block defintions for the node format.
+ */
+
+/*
+ * Offset of the freespace index.
+ */
+#define XFS_DIR2_FREE_SPACE 2
+#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
+
+typedef struct xfs_dir2_free_hdr {
+ __be32 magic; /* XFS_DIR2_FREE_MAGIC */
+ __be32 firstdb; /* db of first entry */
+ __be32 nvalid; /* count of valid entries */
+ __be32 nused; /* count of used entries */
+} xfs_dir2_free_hdr_t;
+
+typedef struct xfs_dir2_free {
+ xfs_dir2_free_hdr_t hdr; /* block header */
+ __be16 bests[]; /* best free counts */
+ /* unused entries are -1 */
+} xfs_dir2_free_t;
+
+struct xfs_dir3_free_hdr {
+ struct xfs_dir3_blk_hdr hdr;
+ __be32 firstdb; /* db of first entry */
+ __be32 nvalid; /* count of valid entries */
+ __be32 nused; /* count of used entries */
+ __be32 pad; /* 64 bit alignment */
+};
+
+struct xfs_dir3_free {
+ struct xfs_dir3_free_hdr hdr;
+ __be16 bests[]; /* best free counts */
+ /* unused entries are -1 */
+};
+
+#define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc)
+
+/*
+ * In core version of the free block header, abstracted away from on-disk format
+ * differences. Use this in the code, and convert to/from the disk version using
+ * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk.
+ */
+struct xfs_dir3_icfree_hdr {
+ __uint32_t magic;
+ __uint32_t firstdb;
+ __uint32_t nvalid;
+ __uint32_t nused;
+
+};
+
+/*
+ * Single block format.
+ *
+ * The single block format looks like the following drawing on disk:
+ *
+ * +-------------------------------------------------+
+ * | xfs_dir2_data_hdr_t |
+ * +-------------------------------------------------+
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t :
+ * | ... |
+ * +-------------------------------------------------+
+ * | unused space |
+ * +-------------------------------------------------+
+ * | ... |
+ * | xfs_dir2_leaf_entry_t |
+ * | xfs_dir2_leaf_entry_t |
+ * +-------------------------------------------------+
+ * | xfs_dir2_block_tail_t |
+ * +-------------------------------------------------+
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ */
+
+typedef struct xfs_dir2_block_tail {
+ __be32 count; /* count of leaf entries */
+ __be32 stale; /* count of stale lf entries */
+} xfs_dir2_block_tail_t;
+
+/*
+ * Pointer to the leaf entries embedded in a data block (1-block format)
+ */
+static inline struct xfs_dir2_leaf_entry *
+xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
+{
+ return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count);
+}
+
+
+/*
+ * Attribute storage layout
+ *
+ * Attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes. Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree. Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys. The
+ * internal links in the Btree are logical block offsets into the file.
+ *
+ * Struct leaf_entry's are packed from the top. Name/values grow from the
+ * bottom but are not packed. The freemap contains run-length-encoded entries
+ * for the free bytes after the leaf_entry's, but only the N largest such,
+ * smaller runs are dropped. When the freemap doesn't show enough space
+ * for an allocation, we compact the name/value area and try again. If we
+ * still don't have enough space, then we have to split the block. The
+ * name/value structs (both local and remote versions) must be 32bit aligned.
+ *
+ * Since we have duplicate hash keys, for each key that matches, compare
+ * the actual name string. The root and intermediate node search always
+ * takes the first-in-the-block key match found, so we should only have
+ * to work "forw"ard. If none matches, continue with the "forw"ard leaf
+ * nodes until the hash key changes or the attribute name is found.
+ *
+ * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
+ * the leaf_entry. The namespaces are independent only because we also look
+ * at the namespace bit when we are looking for a matching attribute name.
+ *
+ * We also store an "incomplete" bit in the leaf_entry. It shows that an
+ * attribute is in the middle of being created and should not be shown to
+ * the user if we crash during the time that the bit is set. We clear the
+ * bit when we have finished setting up the attribute. We do this because
+ * we cannot create some large attributes inside a single transaction, and we
+ * need some indication that we weren't finished if we crash in the middle.
+ */
+#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
+
+typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
+ __be16 base; /* base of free region */
+ __be16 size; /* length of free region */
+} xfs_attr_leaf_map_t;
+
+typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */
+ xfs_da_blkinfo_t info; /* block type, links, etc. */
+ __be16 count; /* count of active leaf_entry's */
+ __be16 usedbytes; /* num bytes of names/values stored */
+ __be16 firstused; /* first used byte in name area */
+ __u8 holes; /* != 0 if blk needs compaction */
+ __u8 pad1;
+ xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
+ /* N largest free regions */
+} xfs_attr_leaf_hdr_t;
+
+typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */
+ __be32 hashval; /* hash value of name */
+ __be16 nameidx; /* index into buffer of name/value */
+ __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
+ __u8 pad2; /* unused pad byte */
+} xfs_attr_leaf_entry_t;
+
+typedef struct xfs_attr_leaf_name_local {
+ __be16 valuelen; /* number of bytes in value */
+ __u8 namelen; /* length of name bytes */
+ __u8 nameval[1]; /* name/value bytes */
+} xfs_attr_leaf_name_local_t;
+
+typedef struct xfs_attr_leaf_name_remote {
+ __be32 valueblk; /* block number of value bytes */
+ __be32 valuelen; /* number of bytes in value */
+ __u8 namelen; /* length of name bytes */
+ __u8 name[1]; /* name bytes */
+} xfs_attr_leaf_name_remote_t;
+
+typedef struct xfs_attr_leafblock {
+ xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */
+ xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */
+ xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */
+ xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */
+} xfs_attr_leafblock_t;
+
+/*
+ * CRC enabled leaf structures. Called "version 3" structures to match the
+ * version number of the directory and dablk structures for this feature, and
+ * attr2 is already taken by the variable inode attribute fork size feature.
+ */
+struct xfs_attr3_leaf_hdr {
+ struct xfs_da3_blkinfo info;
+ __be16 count;
+ __be16 usedbytes;
+ __be16 firstused;
+ __u8 holes;
+ __u8 pad1;
+ struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
+ __be32 pad2; /* 64 bit alignment */
+};
+
+#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc))
+
+struct xfs_attr3_leafblock {
+ struct xfs_attr3_leaf_hdr hdr;
+ struct xfs_attr_leaf_entry entries[1];
+
+ /*
+ * The rest of the block contains the following structures after the
+ * leaf entries, growing from the bottom up. The variables are never
+ * referenced, the locations accessed purely from helper functions.
+ *
+ * struct xfs_attr_leaf_name_local
+ * struct xfs_attr_leaf_name_remote
+ */
+};
+
+/*
+ * incore, neutral version of the attribute leaf header
+ */
+struct xfs_attr3_icleaf_hdr {
+ __uint32_t forw;
+ __uint32_t back;
+ __uint16_t magic;
+ __uint16_t count;
+ __uint16_t usedbytes;
+ __uint16_t firstused;
+ __u8 holes;
+ struct {
+ __uint16_t base;
+ __uint16_t size;
+ } freemap[XFS_ATTR_LEAF_MAPSIZE];
+};
+
+/*
+ * Flags used in the leaf_entry[i].flags field.
+ * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
+ * on the system call, they are "or"ed together for various operations.
+ */
+#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
+#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
+#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
+#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
+#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
+
+/*
+ * Conversion macros for converting namespace bits from argument flags
+ * to ondisk flags.
+ */
+#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK)
+#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK)
+#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
+ ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
+#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
+ ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
+
+/*
+ * Alignment for namelist and valuelist entries (since they are mixed
+ * there can be only one alignment value)
+ */
+#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t))
+
+static inline int
+xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp)
+{
+ if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+ return sizeof(struct xfs_attr3_leaf_hdr);
+ return sizeof(struct xfs_attr_leaf_hdr);
+}
+
+static inline struct xfs_attr_leaf_entry *
+xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp)
+{
+ if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+ return &((struct xfs_attr3_leafblock *)leafp)->entries[0];
+ return &leafp->entries[0];
+}
+
+/*
+ * Cast typed pointers for "local" and "remote" name/value structs.
+ */
+static inline char *
+xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
+{
+ struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp);
+
+ return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)];
+}
+
+static inline xfs_attr_leaf_name_remote_t *
+xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
+{
+ return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx);
+}
+
+static inline xfs_attr_leaf_name_local_t *
+xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
+{
+ return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx);
+}
+
+/*
+ * Calculate total bytes used (including trailing pad for alignment) for
+ * a "local" name/value structure, a "remote" name/value structure, and
+ * a pointer which might be either.
+ */
+static inline int xfs_attr_leaf_entsize_remote(int nlen)
+{
+ return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
+ XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+}
+
+static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
+{
+ return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
+ XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+}
+
+static inline int xfs_attr_leaf_entsize_local_max(int bsize)
+{
+ return (((bsize) >> 1) + ((bsize) >> 2));
+}
+
+
+
+/*
+ * Remote attribute block format definition
+ *
+ * There is one of these headers per filesystem block in a remote attribute.
+ * This is done to ensure there is a 1:1 mapping between the attribute value
+ * length and the number of blocks needed to store the attribute. This makes the
+ * verification of a buffer a little more complex, but greatly simplifies the
+ * allocation, reading and writing of these attributes as we don't have to guess
+ * the number of blocks needed to store the attribute data.
+ */
+#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */
+
+struct xfs_attr3_rmt_hdr {
+ __be32 rm_magic;
+ __be32 rm_offset;
+ __be32 rm_bytes;
+ __be32 rm_crc;
+ uuid_t rm_uuid;
+ __be64 rm_owner;
+ __be64 rm_blkno;
+ __be64 rm_lsn;
+};
+
+#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
+
+#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \
+ ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+ sizeof(struct xfs_attr3_rmt_hdr) : 0))
+
+#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
deleted file mode 100644
index d0e9c74d3d9..00000000000
--- a/fs/xfs/xfs_dfrag.c
+++ /dev/null
@@ -1,449 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_itable.h"
-#include "xfs_dfrag.h"
-#include "xfs_error.h"
-#include "xfs_vnodeops.h"
-#include "xfs_trace.h"
-
-
-static int xfs_swap_extents(
- xfs_inode_t *ip, /* target inode */
- xfs_inode_t *tip, /* tmp inode */
- xfs_swapext_t *sxp);
-
-/*
- * ioctl interface for swapext
- */
-int
-xfs_swapext(
- xfs_swapext_t *sxp)
-{
- xfs_inode_t *ip, *tip;
- struct fd f, tmp;
- int error = 0;
-
- /* Pull information for the target fd */
- f = fdget((int)sxp->sx_fdtarget);
- if (!f.file) {
- error = XFS_ERROR(EINVAL);
- goto out;
- }
-
- if (!(f.file->f_mode & FMODE_WRITE) ||
- !(f.file->f_mode & FMODE_READ) ||
- (f.file->f_flags & O_APPEND)) {
- error = XFS_ERROR(EBADF);
- goto out_put_file;
- }
-
- tmp = fdget((int)sxp->sx_fdtmp);
- if (!tmp.file) {
- error = XFS_ERROR(EINVAL);
- goto out_put_file;
- }
-
- if (!(tmp.file->f_mode & FMODE_WRITE) ||
- !(tmp.file->f_mode & FMODE_READ) ||
- (tmp.file->f_flags & O_APPEND)) {
- error = XFS_ERROR(EBADF);
- goto out_put_tmp_file;
- }
-
- if (IS_SWAPFILE(f.file->f_path.dentry->d_inode) ||
- IS_SWAPFILE(tmp.file->f_path.dentry->d_inode)) {
- error = XFS_ERROR(EINVAL);
- goto out_put_tmp_file;
- }
-
- ip = XFS_I(f.file->f_path.dentry->d_inode);
- tip = XFS_I(tmp.file->f_path.dentry->d_inode);
-
- if (ip->i_mount != tip->i_mount) {
- error = XFS_ERROR(EINVAL);
- goto out_put_tmp_file;
- }
-
- if (ip->i_ino == tip->i_ino) {
- error = XFS_ERROR(EINVAL);
- goto out_put_tmp_file;
- }
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- error = XFS_ERROR(EIO);
- goto out_put_tmp_file;
- }
-
- error = xfs_swap_extents(ip, tip, sxp);
-
- out_put_tmp_file:
- fdput(tmp);
- out_put_file:
- fdput(f);
- out:
- return error;
-}
-
-/*
- * We need to check that the format of the data fork in the temporary inode is
- * valid for the target inode before doing the swap. This is not a problem with
- * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
- * data fork depending on the space the attribute fork is taking so we can get
- * invalid formats on the target inode.
- *
- * E.g. target has space for 7 extents in extent format, temp inode only has
- * space for 6. If we defragment down to 7 extents, then the tmp format is a
- * btree, but when swapped it needs to be in extent format. Hence we can't just
- * blindly swap data forks on attr2 filesystems.
- *
- * Note that we check the swap in both directions so that we don't end up with
- * a corrupt temporary inode, either.
- *
- * Note that fixing the way xfs_fsr sets up the attribute fork in the source
- * inode will prevent this situation from occurring, so all we do here is
- * reject and log the attempt. basically we are putting the responsibility on
- * userspace to get this right.
- */
-static int
-xfs_swap_extents_check_format(
- xfs_inode_t *ip, /* target inode */
- xfs_inode_t *tip) /* tmp inode */
-{
-
- /* Should never get a local format */
- if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
- tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- return EINVAL;
-
- /*
- * if the target inode has less extents that then temporary inode then
- * why did userspace call us?
- */
- if (ip->i_d.di_nextents < tip->i_d.di_nextents)
- return EINVAL;
-
- /*
- * if the target inode is in extent form and the temp inode is in btree
- * form then we will end up with the target inode in the wrong format
- * as we already know there are less extents in the temp inode.
- */
- if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
- return EINVAL;
-
- /* Check temp in extent form to max in target */
- if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
- XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
- return EINVAL;
-
- /* Check target in extent form to max in temp */
- if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
- XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
- return EINVAL;
-
- /*
- * If we are in a btree format, check that the temp root block will fit
- * in the target and that it has enough extents to be in btree format
- * in the target.
- *
- * Note that we have to be careful to allow btree->extent conversions
- * (a common defrag case) which will occur when the temp inode is in
- * extent format...
- */
- if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
- if (XFS_IFORK_BOFF(ip) &&
- tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
- return EINVAL;
- if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
- XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
- return EINVAL;
- }
-
- /* Reciprocal target->temp btree format checks */
- if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
- if (XFS_IFORK_BOFF(tip) &&
- ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
- return EINVAL;
-
- if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
- XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
- return EINVAL;
- }
-
- return 0;
-}
-
-static int
-xfs_swap_extents(
- xfs_inode_t *ip, /* target inode */
- xfs_inode_t *tip, /* tmp inode */
- xfs_swapext_t *sxp)
-{
- xfs_mount_t *mp = ip->i_mount;
- xfs_trans_t *tp;
- xfs_bstat_t *sbp = &sxp->sx_stat;
- xfs_ifork_t *tempifp, *ifp, *tifp;
- int src_log_flags, target_log_flags;
- int error = 0;
- int aforkblks = 0;
- int taforkblks = 0;
- __uint64_t tmp;
-
- tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
- if (!tempifp) {
- error = XFS_ERROR(ENOMEM);
- goto out;
- }
-
- /*
- * we have to do two separate lock calls here to keep lockdep
- * happy. If we try to get all the locks in one call, lock will
- * report false positives when we drop the ILOCK and regain them
- * below.
- */
- xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
- xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
-
- /* Verify that both files have the same format */
- if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
- error = XFS_ERROR(EINVAL);
- goto out_unlock;
- }
-
- /* Verify both files are either real-time or non-realtime */
- if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
- error = XFS_ERROR(EINVAL);
- goto out_unlock;
- }
-
- error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
- if (error)
- goto out_unlock;
- truncate_pagecache_range(VFS_I(ip), 0, -1);
-
- /* Verify O_DIRECT for ftmp */
- if (VN_CACHED(VFS_I(tip)) != 0) {
- error = XFS_ERROR(EINVAL);
- goto out_unlock;
- }
-
- /* Verify all data are being swapped */
- if (sxp->sx_offset != 0 ||
- sxp->sx_length != ip->i_d.di_size ||
- sxp->sx_length != tip->i_d.di_size) {
- error = XFS_ERROR(EFAULT);
- goto out_unlock;
- }
-
- trace_xfs_swap_extent_before(ip, 0);
- trace_xfs_swap_extent_before(tip, 1);
-
- /* check inode formats now that data is flushed */
- error = xfs_swap_extents_check_format(ip, tip);
- if (error) {
- xfs_notice(mp,
- "%s: inode 0x%llx format is incompatible for exchanging.",
- __func__, ip->i_ino);
- goto out_unlock;
- }
-
- /*
- * Compare the current change & modify times with that
- * passed in. If they differ, we abort this swap.
- * This is the mechanism used to ensure the calling
- * process that the file was not changed out from
- * under it.
- */
- if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
- (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
- (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
- (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
- error = XFS_ERROR(EBUSY);
- goto out_unlock;
- }
-
- /* We need to fail if the file is memory mapped. Once we have tossed
- * all existing pages, the page fault will have no option
- * but to go to the filesystem for pages. By making the page fault call
- * vop_read (or write in the case of autogrow) they block on the iolock
- * until we have switched the extents.
- */
- if (VN_MAPPED(VFS_I(ip))) {
- error = XFS_ERROR(EBUSY);
- goto out_unlock;
- }
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_iunlock(tip, XFS_ILOCK_EXCL);
-
- /*
- * There is a race condition here since we gave up the
- * ilock. However, the data fork will not change since
- * we have the iolock (locked for truncation too) so we
- * are safe. We don't really care if non-io related
- * fields change.
- */
- truncate_pagecache_range(VFS_I(ip), 0, -1);
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
- if ((error = xfs_trans_reserve(tp, 0,
- XFS_ICHANGE_LOG_RES(mp), 0,
- 0, 0))) {
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- xfs_iunlock(tip, XFS_IOLOCK_EXCL);
- xfs_trans_cancel(tp, 0);
- goto out;
- }
- xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
-
- /*
- * Count the number of extended attribute blocks
- */
- if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
- (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
- error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
- if (error)
- goto out_trans_cancel;
- }
- if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
- (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
- error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
- &taforkblks);
- if (error)
- goto out_trans_cancel;
- }
-
- /*
- * Swap the data forks of the inodes
- */
- ifp = &ip->i_df;
- tifp = &tip->i_df;
- *tempifp = *ifp; /* struct copy */
- *ifp = *tifp; /* struct copy */
- *tifp = *tempifp; /* struct copy */
-
- /*
- * Fix the on-disk inode values
- */
- tmp = (__uint64_t)ip->i_d.di_nblocks;
- ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
- tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
-
- tmp = (__uint64_t) ip->i_d.di_nextents;
- ip->i_d.di_nextents = tip->i_d.di_nextents;
- tip->i_d.di_nextents = tmp;
-
- tmp = (__uint64_t) ip->i_d.di_format;
- ip->i_d.di_format = tip->i_d.di_format;
- tip->i_d.di_format = tmp;
-
- /*
- * The extents in the source inode could still contain speculative
- * preallocation beyond EOF (e.g. the file is open but not modified
- * while defrag is in progress). In that case, we need to copy over the
- * number of delalloc blocks the data fork in the source inode is
- * tracking beyond EOF so that when the fork is truncated away when the
- * temporary inode is unlinked we don't underrun the i_delayed_blks
- * counter on that inode.
- */
- ASSERT(tip->i_delayed_blks == 0);
- tip->i_delayed_blks = ip->i_delayed_blks;
- ip->i_delayed_blks = 0;
-
- src_log_flags = XFS_ILOG_CORE;
- switch (ip->i_d.di_format) {
- case XFS_DINODE_FMT_EXTENTS:
- /* If the extents fit in the inode, fix the
- * pointer. Otherwise it's already NULL or
- * pointing to the extent.
- */
- if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
- ifp->if_u1.if_extents =
- ifp->if_u2.if_inline_ext;
- }
- src_log_flags |= XFS_ILOG_DEXT;
- break;
- case XFS_DINODE_FMT_BTREE:
- src_log_flags |= XFS_ILOG_DBROOT;
- break;
- }
-
- target_log_flags = XFS_ILOG_CORE;
- switch (tip->i_d.di_format) {
- case XFS_DINODE_FMT_EXTENTS:
- /* If the extents fit in the inode, fix the
- * pointer. Otherwise it's already NULL or
- * pointing to the extent.
- */
- if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
- tifp->if_u1.if_extents =
- tifp->if_u2.if_inline_ext;
- }
- target_log_flags |= XFS_ILOG_DEXT;
- break;
- case XFS_DINODE_FMT_BTREE:
- target_log_flags |= XFS_ILOG_DBROOT;
- break;
- }
-
-
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
- xfs_trans_log_inode(tp, ip, src_log_flags);
- xfs_trans_log_inode(tp, tip, target_log_flags);
-
- /*
- * If this is a synchronous mount, make sure that the
- * transaction goes to disk before returning to the user.
- */
- if (mp->m_flags & XFS_MOUNT_WSYNC)
- xfs_trans_set_sync(tp);
-
- error = xfs_trans_commit(tp, 0);
-
- trace_xfs_swap_extent_after(ip, 0);
- trace_xfs_swap_extent_after(tip, 1);
-out:
- kmem_free(tempifp);
- return error;
-
-out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- goto out;
-
-out_trans_cancel:
- xfs_trans_cancel(tp, 0);
- goto out_unlock;
-}
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
deleted file mode 100644
index 20bdd935c12..00000000000
--- a/fs/xfs/xfs_dfrag.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_DFRAG_H__
-#define __XFS_DFRAG_H__
-
-/*
- * Structure passed to xfs_swapext
- */
-
-typedef struct xfs_swapext
-{
- __int64_t sx_version; /* version */
- __int64_t sx_fdtarget; /* fd of target file */
- __int64_t sx_fdtmp; /* fd of tmp file */
- xfs_off_t sx_offset; /* offset into file */
- xfs_off_t sx_length; /* leng from offset */
- char sx_pad[16]; /* pad space, unused */
- xfs_bstat_t sx_stat; /* stat of target b4 copy */
-} xfs_swapext_t;
-
-/*
- * Version flag
- */
-#define XFS_SX_VERSION 0
-
-#ifdef __KERNEL__
-/*
- * Prototypes for visible xfs_dfrag.c routines.
- */
-
-/*
- * Syscall interface for xfs_swapext
- */
-int xfs_swapext(struct xfs_swapext *sx);
-
-#endif /* __KERNEL__ */
-
-#endif /* __XFS_DFRAG_H__ */
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index 1d9643b3dce..623bbe8fd92 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -19,7 +19,7 @@
#define __XFS_DINODE_H__
#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
-#define XFS_DINODE_GOOD_VERSION(v) (((v) == 1 || (v) == 2))
+#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3)
typedef struct xfs_timestamp {
__be32 t_sec; /* timestamp seconds */
@@ -39,6 +39,9 @@ typedef struct xfs_timestamp {
* There is a very similar struct icdinode in xfs_inode which matches the
* layout of the first 96 bytes of this structure, but is kept in native
* format instead of big endian.
+ *
+ * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
+ * padding field for v3 inodes.
*/
typedef struct xfs_dinode {
__be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
@@ -70,11 +73,38 @@ typedef struct xfs_dinode {
/* di_next_unlinked is the only non-core field in the old dinode */
__be32 di_next_unlinked;/* agi unlinked list ptr */
-} __attribute__((packed)) xfs_dinode_t;
+
+ /* start of the extended dinode, writable fields */
+ __le32 di_crc; /* CRC of the inode */
+ __be64 di_changecount; /* number of attribute changes */
+ __be64 di_lsn; /* flush sequence */
+ __be64 di_flags2; /* more random flags */
+ __u8 di_pad2[16]; /* more padding for future expansion */
+
+ /* fields only written to during inode creation */
+ xfs_timestamp_t di_crtime; /* time created */
+ __be64 di_ino; /* inode number */
+ uuid_t di_uuid; /* UUID of the filesystem */
+
+ /* structure must be padded to 64 bit alignment */
+} xfs_dinode_t;
+
+#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc)
#define DI_MAX_FLUSH 0xffff
/*
+ * Size of the core inode on disk. Version 1 and 2 inodes have
+ * the same size, but version 3 has grown a few additional fields.
+ */
+static inline uint xfs_dinode_size(int version)
+{
+ if (version == 3)
+ return sizeof(struct xfs_dinode);
+ return offsetof(struct xfs_dinode, di_crc);
+}
+
+/*
* The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
* Since the pathconf interface is signed, we use 2^31 - 1 instead.
* The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
@@ -104,11 +134,8 @@ typedef enum xfs_dinode_fmt {
/*
* Inode size for given fs.
*/
-#define XFS_LITINO(mp) \
- ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode)))
-
-#define XFS_BROOT_SIZE_ADJ \
- (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t))
+#define XFS_LITINO(mp, version) \
+ ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
/*
* Inode data & attribute fork sizes, per inode.
@@ -119,10 +146,10 @@ typedef enum xfs_dinode_fmt {
#define XFS_DFORK_DSIZE(dip,mp) \
(XFS_DFORK_Q(dip) ? \
XFS_DFORK_BOFF(dip) : \
- XFS_LITINO(mp))
+ XFS_LITINO(mp, (dip)->di_version))
#define XFS_DFORK_ASIZE(dip,mp) \
(XFS_DFORK_Q(dip) ? \
- XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : \
+ XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
0)
#define XFS_DFORK_SIZE(dip,mp,w) \
((w) == XFS_DATA_FORK ? \
@@ -133,7 +160,7 @@ typedef enum xfs_dinode_fmt {
* Return pointers to the data or attribute forks.
*/
#define XFS_DFORK_DPTR(dip) \
- ((char *)(dip) + sizeof(struct xfs_dinode))
+ ((char *)dip + xfs_dinode_size(dip->di_version))
#define XFS_DFORK_APTR(dip) \
(XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
#define XFS_DFORK_PTR(dip,w) \
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index b26a50f9921..79670cda48a 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -17,28 +17,27 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_dir2.h"
-#include "xfs_dir2_format.h"
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
-#include "xfs_vnodeops.h"
#include "xfs_trace.h"
+#include "xfs_dinode.h"
+
+struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
-struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2};
/*
* ASCII case-insensitive (ie. A-Z) support for directories that was
@@ -86,29 +85,74 @@ static struct xfs_nameops xfs_ascii_ci_nameops = {
.compname = xfs_ascii_ci_compname,
};
-void
-xfs_dir_mount(
- xfs_mount_t *mp)
+int
+xfs_da_mount(
+ struct xfs_mount *mp)
{
- ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb));
+ struct xfs_da_geometry *dageo;
+ int nodehdr_size;
+
+
+ ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
XFS_MAX_BLOCKSIZE);
- mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog);
- mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog;
- mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp));
- mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp));
- mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp));
- mp->m_attr_node_ents =
- (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) /
- (uint)sizeof(xfs_da_node_entry_t);
- mp->m_dir_node_ents =
- (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) /
- (uint)sizeof(xfs_da_node_entry_t);
- mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100;
+
+ mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL);
+ mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL);
+
+ nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
+ mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+ KM_SLEEP | KM_MAYFAIL);
+ mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+ KM_SLEEP | KM_MAYFAIL);
+ if (!mp->m_dir_geo || !mp->m_attr_geo) {
+ kmem_free(mp->m_dir_geo);
+ kmem_free(mp->m_attr_geo);
+ return ENOMEM;
+ }
+
+ /* set up directory geometry */
+ dageo = mp->m_dir_geo;
+ dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog;
+ dageo->fsblog = mp->m_sb.sb_blocklog;
+ dageo->blksize = 1 << dageo->blklog;
+ dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog;
+
+ /*
+ * Now we've set up the block conversion variables, we can calculate the
+ * segment block constants using the geometry structure.
+ */
+ dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET);
+ dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET);
+ dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
+ dageo->node_ents = (dageo->blksize - nodehdr_size) /
+ (uint)sizeof(xfs_da_node_entry_t);
+ dageo->magicpct = (dageo->blksize * 37) / 100;
+
+ /* set up attribute geometry - single fsb only */
+ dageo = mp->m_attr_geo;
+ dageo->blklog = mp->m_sb.sb_blocklog;
+ dageo->fsblog = mp->m_sb.sb_blocklog;
+ dageo->blksize = 1 << dageo->blklog;
+ dageo->fsbcount = 1;
+ dageo->node_ents = (dageo->blksize - nodehdr_size) /
+ (uint)sizeof(xfs_da_node_entry_t);
+ dageo->magicpct = (dageo->blksize * 37) / 100;
+
if (xfs_sb_version_hasasciici(&mp->m_sb))
mp->m_dirnameops = &xfs_ascii_ci_nameops;
else
mp->m_dirnameops = &xfs_default_nameops;
+
+ return 0;
+}
+
+void
+xfs_da_unmount(
+ struct xfs_mount *mp)
+{
+ kmem_free(mp->m_dir_geo);
+ kmem_free(mp->m_attr_geo);
}
/*
@@ -172,16 +216,24 @@ xfs_dir_init(
xfs_inode_t *dp,
xfs_inode_t *pdp)
{
- xfs_da_args_t args;
+ struct xfs_da_args *args;
int error;
- memset((char *)&args, 0, sizeof(args));
- args.dp = dp;
- args.trans = tp;
ASSERT(S_ISDIR(dp->i_d.di_mode));
- if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino)))
+ error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
+ if (error)
return error;
- return xfs_dir2_sf_create(&args, pdp->i_ino);
+
+ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ if (!args)
+ return ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->dp = dp;
+ args->trans = tp;
+ error = xfs_dir2_sf_create(args, pdp->i_ino);
+ kmem_free(args);
+ return error;
}
/*
@@ -197,40 +249,57 @@ xfs_dir_createname(
xfs_bmap_free_t *flist, /* bmap's freeblock list */
xfs_extlen_t total) /* bmap's total block count */
{
- xfs_da_args_t args;
+ struct xfs_da_args *args;
int rval;
int v; /* type-checking value */
ASSERT(S_ISDIR(dp->i_d.di_mode));
- if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
+ rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+ if (rval)
return rval;
XFS_STATS_INC(xs_dir_create);
- memset(&args, 0, sizeof(xfs_da_args_t));
- args.name = name->name;
- args.namelen = name->len;
- args.hashval = dp->i_mount->m_dirnameops->hashname(name);
- args.inumber = inum;
- args.dp = dp;
- args.firstblock = first;
- args.flist = flist;
- args.total = total;
- args.whichfork = XFS_DATA_FORK;
- args.trans = tp;
- args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
-
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_addname(&args);
- else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_block_addname(&args);
- else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_leaf_addname(&args);
+ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ if (!args)
+ return ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->inumber = inum;
+ args->dp = dp;
+ args->firstblock = first;
+ args->flist = flist;
+ args->total = total;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+ args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_addname(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_addname(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_addname(args);
else
- rval = xfs_dir2_node_addname(&args);
+ rval = xfs_dir2_node_addname(args);
+
+out_free:
+ kmem_free(args);
return rval;
}
@@ -273,45 +342,67 @@ xfs_dir_lookup(
xfs_ino_t *inum, /* out: inode number */
struct xfs_name *ci_name) /* out: actual name if CI match */
{
- xfs_da_args_t args;
+ struct xfs_da_args *args;
int rval;
int v; /* type-checking value */
ASSERT(S_ISDIR(dp->i_d.di_mode));
XFS_STATS_INC(xs_dir_lookup);
- memset(&args, 0, sizeof(xfs_da_args_t));
- args.name = name->name;
- args.namelen = name->len;
- args.hashval = dp->i_mount->m_dirnameops->hashname(name);
- args.dp = dp;
- args.whichfork = XFS_DATA_FORK;
- args.trans = tp;
- args.op_flags = XFS_DA_OP_OKNOENT;
+ /*
+ * We need to use KM_NOFS here so that lockdep will not throw false
+ * positive deadlock warnings on a non-transactional lookup path. It is
+ * safe to recurse into inode recalim in that case, but lockdep can't
+ * easily be taught about it. Hence KM_NOFS avoids having to add more
+ * lockdep Doing this avoids having to add a bunch of lockdep class
+ * annotations into the reclaim path for the ilock.
+ */
+ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->dp = dp;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+ args->op_flags = XFS_DA_OP_OKNOENT;
if (ci_name)
- args.op_flags |= XFS_DA_OP_CILOOKUP;
+ args->op_flags |= XFS_DA_OP_CILOOKUP;
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_lookup(&args);
- else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_block_lookup(&args);
- else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_leaf_lookup(&args);
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_lookup(args);
+ goto out_check_rval;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_lookup(args);
+ goto out_check_rval;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_lookup(args);
else
- rval = xfs_dir2_node_lookup(&args);
+ rval = xfs_dir2_node_lookup(args);
+
+out_check_rval:
if (rval == EEXIST)
rval = 0;
if (!rval) {
- *inum = args.inumber;
+ *inum = args->inumber;
if (ci_name) {
- ci_name->name = args.value;
- ci_name->len = args.valuelen;
+ ci_name->name = args->value;
+ ci_name->len = args->valuelen;
}
}
+out_free:
+ kmem_free(args);
return rval;
}
@@ -328,71 +419,52 @@ xfs_dir_removename(
xfs_bmap_free_t *flist, /* bmap's freeblock list */
xfs_extlen_t total) /* bmap's total block count */
{
- xfs_da_args_t args;
+ struct xfs_da_args *args;
int rval;
int v; /* type-checking value */
ASSERT(S_ISDIR(dp->i_d.di_mode));
XFS_STATS_INC(xs_dir_remove);
- memset(&args, 0, sizeof(xfs_da_args_t));
- args.name = name->name;
- args.namelen = name->len;
- args.hashval = dp->i_mount->m_dirnameops->hashname(name);
- args.inumber = ino;
- args.dp = dp;
- args.firstblock = first;
- args.flist = flist;
- args.total = total;
- args.whichfork = XFS_DATA_FORK;
- args.trans = tp;
-
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_removename(&args);
- else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_block_removename(&args);
- else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_leaf_removename(&args);
- else
- rval = xfs_dir2_node_removename(&args);
- return rval;
-}
-
-/*
- * Read a directory.
- */
-int
-xfs_readdir(
- xfs_inode_t *dp,
- void *dirent,
- size_t bufsize,
- xfs_off_t *offset,
- filldir_t filldir)
-{
- int rval; /* return value */
- int v; /* type-checking value */
+ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ if (!args)
+ return ENOMEM;
- trace_xfs_readdir(dp);
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->inumber = ino;
+ args->dp = dp;
+ args->firstblock = first;
+ args->flist = flist;
+ args->total = total;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_removename(args);
+ goto out_free;
+ }
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return XFS_ERROR(EIO);
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_removename(args);
+ goto out_free;
+ }
- ASSERT(S_ISDIR(dp->i_d.di_mode));
- XFS_STATS_INC(xs_dir_getdents);
-
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_getdents(dp, dirent, offset, filldir);
- else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
- ;
- else if (v)
- rval = xfs_dir2_block_getdents(dp, dirent, offset, filldir);
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_removename(args);
else
- rval = xfs_dir2_leaf_getdents(dp, dirent, bufsize, offset,
- filldir);
+ rval = xfs_dir2_node_removename(args);
+out_free:
+ kmem_free(args);
return rval;
}
@@ -409,39 +481,55 @@ xfs_dir_replace(
xfs_bmap_free_t *flist, /* bmap's freeblock list */
xfs_extlen_t total) /* bmap's total block count */
{
- xfs_da_args_t args;
+ struct xfs_da_args *args;
int rval;
int v; /* type-checking value */
ASSERT(S_ISDIR(dp->i_d.di_mode));
- if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
+ rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+ if (rval)
return rval;
- memset(&args, 0, sizeof(xfs_da_args_t));
- args.name = name->name;
- args.namelen = name->len;
- args.hashval = dp->i_mount->m_dirnameops->hashname(name);
- args.inumber = inum;
- args.dp = dp;
- args.firstblock = first;
- args.flist = flist;
- args.total = total;
- args.whichfork = XFS_DATA_FORK;
- args.trans = tp;
-
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_replace(&args);
- else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_block_replace(&args);
- else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_leaf_replace(&args);
+ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ if (!args)
+ return ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->inumber = inum;
+ args->dp = dp;
+ args->firstblock = first;
+ args->flist = flist;
+ args->total = total;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_replace(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_replace(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_replace(args);
else
- rval = xfs_dir2_node_replace(&args);
+ rval = xfs_dir2_node_replace(args);
+out_free:
+ kmem_free(args);
return rval;
}
@@ -456,7 +544,7 @@ xfs_dir_canenter(
struct xfs_name *name, /* name of entry to add */
uint resblks)
{
- xfs_da_args_t args;
+ struct xfs_da_args *args;
int rval;
int v; /* type-checking value */
@@ -465,28 +553,43 @@ xfs_dir_canenter(
ASSERT(S_ISDIR(dp->i_d.di_mode));
- memset(&args, 0, sizeof(xfs_da_args_t));
- args.name = name->name;
- args.namelen = name->len;
- args.hashval = dp->i_mount->m_dirnameops->hashname(name);
- args.dp = dp;
- args.whichfork = XFS_DATA_FORK;
- args.trans = tp;
- args.op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
+ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ if (!args)
+ return ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->dp = dp;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+ args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
XFS_DA_OP_OKNOENT;
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_addname(&args);
- else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_block_addname(&args);
- else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_leaf_addname(&args);
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_addname(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_addname(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_addname(args);
else
- rval = xfs_dir2_node_addname(&args);
+ rval = xfs_dir2_node_addname(args);
+out_free:
+ kmem_free(args);
return rval;
}
@@ -518,13 +621,13 @@ xfs_dir2_grow_inode(
* Set lowest possible block in the space requested.
*/
bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
- count = mp->m_dirblkfsbs;
+ count = args->geo->fsbcount;
error = xfs_da_grow_inode_int(args, &bno, count);
if (error)
return error;
- *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno);
+ *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno);
/*
* Update file's size if this is the data space and it grew.
@@ -546,19 +649,16 @@ xfs_dir2_grow_inode(
*/
int
xfs_dir2_isblock(
- xfs_trans_t *tp,
- xfs_inode_t *dp,
- int *vp) /* out: 1 is block, 0 is not block */
+ struct xfs_da_args *args,
+ int *vp) /* out: 1 is block, 0 is not block */
{
- xfs_fileoff_t last; /* last file offset */
- xfs_mount_t *mp;
- int rval;
+ xfs_fileoff_t last; /* last file offset */
+ int rval;
- mp = dp->i_mount;
- if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK)))
+ if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
return rval;
- rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize;
- ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize);
+ rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize;
+ ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize);
*vp = rval;
return 0;
}
@@ -568,18 +668,15 @@ xfs_dir2_isblock(
*/
int
xfs_dir2_isleaf(
- xfs_trans_t *tp,
- xfs_inode_t *dp,
- int *vp) /* out: 1 is leaf, 0 is not leaf */
+ struct xfs_da_args *args,
+ int *vp) /* out: 1 is block, 0 is not block */
{
- xfs_fileoff_t last; /* last file offset */
- xfs_mount_t *mp;
- int rval;
+ xfs_fileoff_t last; /* last file offset */
+ int rval;
- mp = dp->i_mount;
- if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK)))
+ if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
return rval;
- *vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog);
+ *vp = last == args->geo->leafblk + args->geo->fsbcount;
return 0;
}
@@ -607,11 +704,11 @@ xfs_dir2_shrink_inode(
dp = args->dp;
mp = dp->i_mount;
tp = args->trans;
- da = xfs_dir2_db_to_da(mp, db);
+ da = xfs_dir2_db_to_da(args->geo, db);
/*
* Unmap the fsblock(s).
*/
- if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
+ if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount,
XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
&done))) {
/*
@@ -638,12 +735,12 @@ xfs_dir2_shrink_inode(
/*
* If it's not a data block, we're done.
*/
- if (db >= XFS_DIR2_LEAF_FIRSTDB(mp))
+ if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET))
return 0;
/*
* If the block isn't the last one in the directory, we're done.
*/
- if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(mp, db + 1, 0))
+ if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0))
return 0;
bno = da;
if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
@@ -652,7 +749,7 @@ xfs_dir2_shrink_inode(
*/
return error;
}
- if (db == mp->m_dirdatablk)
+ if (db == args->geo->datablk)
ASSERT(bno == 0);
else
ASSERT(bno > 0);
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index e937d9991c1..c8e86b0b5e9 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -23,14 +23,100 @@ struct xfs_da_args;
struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
+struct xfs_dir2_sf_hdr;
+struct xfs_dir2_sf_entry;
+struct xfs_dir2_data_hdr;
+struct xfs_dir2_data_entry;
+struct xfs_dir2_data_unused;
extern struct xfs_name xfs_name_dotdot;
/*
+ * directory operations vector for encode/decode routines
+ */
+struct xfs_dir_ops {
+ int (*sf_entsize)(struct xfs_dir2_sf_hdr *hdr, int len);
+ struct xfs_dir2_sf_entry *
+ (*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep);
+ __uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep);
+ void (*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep,
+ __uint8_t ftype);
+ xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep);
+ void (*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep,
+ xfs_ino_t ino);
+ xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr);
+ void (*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr,
+ xfs_ino_t ino);
+
+ int (*data_entsize)(int len);
+ __uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep);
+ void (*data_put_ftype)(struct xfs_dir2_data_entry *dep,
+ __uint8_t ftype);
+ __be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep);
+ struct xfs_dir2_data_free *
+ (*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr);
+
+ xfs_dir2_data_aoff_t data_dot_offset;
+ xfs_dir2_data_aoff_t data_dotdot_offset;
+ xfs_dir2_data_aoff_t data_first_offset;
+ size_t data_entry_offset;
+
+ struct xfs_dir2_data_entry *
+ (*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr);
+ struct xfs_dir2_data_entry *
+ (*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr);
+ struct xfs_dir2_data_entry *
+ (*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr);
+ struct xfs_dir2_data_entry *
+ (*data_entry_p)(struct xfs_dir2_data_hdr *hdr);
+ struct xfs_dir2_data_unused *
+ (*data_unused_p)(struct xfs_dir2_data_hdr *hdr);
+
+ int leaf_hdr_size;
+ void (*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from);
+ void (*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to,
+ struct xfs_dir2_leaf *from);
+ int (*leaf_max_ents)(struct xfs_da_geometry *geo);
+ struct xfs_dir2_leaf_entry *
+ (*leaf_ents_p)(struct xfs_dir2_leaf *lp);
+
+ int node_hdr_size;
+ void (*node_hdr_to_disk)(struct xfs_da_intnode *to,
+ struct xfs_da3_icnode_hdr *from);
+ void (*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to,
+ struct xfs_da_intnode *from);
+ struct xfs_da_node_entry *
+ (*node_tree_p)(struct xfs_da_intnode *dap);
+
+ int free_hdr_size;
+ void (*free_hdr_to_disk)(struct xfs_dir2_free *to,
+ struct xfs_dir3_icfree_hdr *from);
+ void (*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to,
+ struct xfs_dir2_free *from);
+ int (*free_max_bests)(struct xfs_da_geometry *geo);
+ __be16 * (*free_bests_p)(struct xfs_dir2_free *free);
+ xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo,
+ xfs_dir2_db_t db);
+ int (*db_to_fdindex)(struct xfs_da_geometry *geo,
+ xfs_dir2_db_t db);
+};
+
+extern const struct xfs_dir_ops *
+ xfs_dir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
+extern const struct xfs_dir_ops *
+ xfs_nondir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
+
+/*
* Generic directory interface routines
*/
extern void xfs_dir_startup(void);
-extern void xfs_dir_mount(struct xfs_mount *mp);
+extern int xfs_da_mount(struct xfs_mount *mp);
+extern void xfs_da_unmount(struct xfs_mount *mp);
+
extern int xfs_dir_isempty(struct xfs_inode *dp);
extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_inode *pdp);
@@ -57,4 +143,38 @@ extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
*/
extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
+/*
+ * Interface routines used by userspace utilities
+ */
+extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r);
+extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r);
+extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
+ struct xfs_buf *bp);
+
+extern void xfs_dir2_data_freescan(struct xfs_inode *dp,
+ struct xfs_dir2_data_hdr *hdr, int *loghead);
+extern void xfs_dir2_data_log_entry(struct xfs_da_args *args,
+ struct xfs_buf *bp, struct xfs_dir2_data_entry *dep);
+extern void xfs_dir2_data_log_header(struct xfs_da_args *args,
+ struct xfs_buf *bp);
+extern void xfs_dir2_data_log_unused(struct xfs_da_args *args,
+ struct xfs_buf *bp, struct xfs_dir2_data_unused *dup);
+extern void xfs_dir2_data_make_free(struct xfs_da_args *args,
+ struct xfs_buf *bp, xfs_dir2_data_aoff_t offset,
+ xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
+extern void xfs_dir2_data_use_free(struct xfs_da_args *args,
+ struct xfs_buf *bp, struct xfs_dir2_data_unused *dup,
+ xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
+ int *needlogp, int *needscanp);
+
+extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
+ struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
+ struct xfs_dir2_data_unused *dup);
+
+extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
+
#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 12afe07a91d..c7cd3154026 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -17,22 +18,25 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_buf_item.h"
#include "xfs_dir2.h"
-#include "xfs_dir2_format.h"
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_dinode.h"
/*
* Local function prototypes.
@@ -56,56 +60,116 @@ xfs_dir_startup(void)
xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
}
-static void
-xfs_dir2_block_verify(
+static bool
+xfs_dir3_block_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- struct xfs_dir2_data_hdr *hdr = bp->b_addr;
- int block_ok = 0;
-
- block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
- block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
-
- if (!block_ok) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
+ return false;
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ return false;
+ } else {
+ if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+ return false;
}
+ if (__xfs_dir3_data_check(NULL, bp))
+ return false;
+ return true;
}
static void
-xfs_dir2_block_read_verify(
+xfs_dir3_block_read_verify(
struct xfs_buf *bp)
{
- xfs_dir2_block_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_dir3_block_verify(bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
}
static void
-xfs_dir2_block_write_verify(
+xfs_dir3_block_write_verify(
struct xfs_buf *bp)
{
- xfs_dir2_block_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_dir3_block_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
}
-const struct xfs_buf_ops xfs_dir2_block_buf_ops = {
- .verify_read = xfs_dir2_block_read_verify,
- .verify_write = xfs_dir2_block_write_verify,
+const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
+ .verify_read = xfs_dir3_block_read_verify,
+ .verify_write = xfs_dir3_block_write_verify,
};
-static int
-xfs_dir2_block_read(
+int
+xfs_dir3_block_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
struct xfs_buf **bpp)
{
struct xfs_mount *mp = dp->i_mount;
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
+ XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
+ if (!err && tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
+ return err;
+}
+
+static void
+xfs_dir3_block_init(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ struct xfs_inode *dp)
+{
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
- return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
- XFS_DATA_FORK, &xfs_dir2_block_buf_ops);
+ bp->b_ops = &xfs_dir3_block_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ memset(hdr3, 0, sizeof(*hdr3));
+ hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+ hdr3->blkno = cpu_to_be64(bp->b_bn);
+ hdr3->owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+ return;
+
+ }
+ hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
}
static void
xfs_dir2_block_need_space(
+ struct xfs_inode *dp,
struct xfs_dir2_data_hdr *hdr,
struct xfs_dir2_block_tail *btp,
struct xfs_dir2_leaf_entry *blp,
@@ -121,7 +185,7 @@ xfs_dir2_block_need_space(
struct xfs_dir2_data_unused *enddup = NULL;
*compact = 0;
- bf = hdr->bestfree;
+ bf = dp->d_ops->data_bestfree_p(hdr);
/*
* If there are stale entries we'll use one for the leaf.
@@ -217,7 +281,7 @@ out:
*/
static void
xfs_dir2_block_compact(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
struct xfs_dir2_data_hdr *hdr,
struct xfs_dir2_block_tail *btp,
@@ -250,18 +314,17 @@ xfs_dir2_block_compact(
*lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
*lfloghigh -= be32_to_cpu(btp->stale) - 1;
be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
- xfs_dir2_data_make_free(tp, bp,
+ xfs_dir2_data_make_free(args, bp,
(xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
(xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
needlog, &needscan);
- blp += be32_to_cpu(btp->stale) - 1;
btp->stale = cpu_to_be32(1);
/*
* If we now need to rebuild the bestfree map, do so.
* This needs to happen before the next call to use_free.
*/
if (needscan)
- xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog);
+ xfs_dir2_data_freescan(args->dp, hdr, needlog);
}
/*
@@ -303,24 +366,24 @@ xfs_dir2_block_addname(
mp = dp->i_mount;
/* Read the (one and only) directory block into bp. */
- error = xfs_dir2_block_read(tp, dp, &bp);
+ error = xfs_dir3_block_read(tp, dp, &bp);
if (error)
return error;
- len = xfs_dir2_data_entsize(args->namelen);
+ len = dp->d_ops->data_entsize(args->namelen);
/*
* Set up pointers to parts of the block.
*/
hdr = bp->b_addr;
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
* Find out if we can reuse stale entries or whether we need extra
* space for entry and new leaf.
*/
- xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup,
+ xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup,
&enddup, &compact, len);
/*
@@ -356,7 +419,7 @@ xfs_dir2_block_addname(
* If need to compact the leaf entries, do it now.
*/
if (compact) {
- xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog,
+ xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog,
&lfloghigh, &lfloglow);
/* recalculate blp post-compaction */
blp = xfs_dir2_block_leaf_p(btp);
@@ -391,7 +454,7 @@ xfs_dir2_block_addname(
/*
* Mark the space needed for the new leaf entry, now in use.
*/
- xfs_dir2_data_use_free(tp, bp, enddup,
+ xfs_dir2_data_use_free(args, bp, enddup,
(xfs_dir2_data_aoff_t)
((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) -
sizeof(*blp)),
@@ -406,7 +469,7 @@ xfs_dir2_block_addname(
* This needs to happen before the next call to use_free.
*/
if (needscan) {
- xfs_dir2_data_freescan(mp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
needscan = 0;
}
/*
@@ -472,13 +535,13 @@ xfs_dir2_block_addname(
* Fill in the leaf entry.
*/
blp[mid].hashval = cpu_to_be32(args->hashval);
- blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
+ blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
(char *)dep - (char *)hdr));
xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
/*
* Mark space for the data entry used.
*/
- xfs_dir2_data_use_free(tp, bp, dup,
+ xfs_dir2_data_use_free(args, bp, dup,
(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
(xfs_dir2_data_aoff_t)len, &needlog, &needscan);
/*
@@ -487,116 +550,19 @@ xfs_dir2_block_addname(
dep->inumber = cpu_to_be64(args->inumber);
dep->namelen = args->namelen;
memcpy(dep->name, args->name, args->namelen);
- tagp = xfs_dir2_data_entry_tag_p(dep);
+ dp->d_ops->data_put_ftype(dep, args->filetype);
+ tagp = dp->d_ops->data_entry_tag_p(dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
/*
* Clean up the bestfree array and log the header, tail, and entry.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, bp);
+ xfs_dir2_data_log_header(args, bp);
xfs_dir2_block_log_tail(tp, bp);
- xfs_dir2_data_log_entry(tp, bp, dep);
- xfs_dir2_data_check(dp, bp);
- return 0;
-}
-
-/*
- * Readdir for block directories.
- */
-int /* error */
-xfs_dir2_block_getdents(
- xfs_inode_t *dp, /* incore inode */
- void *dirent,
- xfs_off_t *offset,
- filldir_t filldir)
-{
- xfs_dir2_data_hdr_t *hdr; /* block header */
- struct xfs_buf *bp; /* buffer for block */
- xfs_dir2_block_tail_t *btp; /* block tail */
- xfs_dir2_data_entry_t *dep; /* block data entry */
- xfs_dir2_data_unused_t *dup; /* block unused entry */
- char *endptr; /* end of the data entries */
- int error; /* error return value */
- xfs_mount_t *mp; /* filesystem mount point */
- char *ptr; /* current data entry */
- int wantoff; /* starting block offset */
- xfs_off_t cook;
-
- mp = dp->i_mount;
- /*
- * If the block number in the offset is out of range, we're done.
- */
- if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
- return 0;
-
- error = xfs_dir2_block_read(NULL, dp, &bp);
- if (error)
- return error;
-
- /*
- * Extract the byte offset we start at from the seek pointer.
- * We'll skip entries before this.
- */
- wantoff = xfs_dir2_dataptr_to_off(mp, *offset);
- hdr = bp->b_addr;
- xfs_dir2_data_check(dp, bp);
- /*
- * Set up values for the loop.
- */
- btp = xfs_dir2_block_tail_p(mp, hdr);
- ptr = (char *)(hdr + 1);
- endptr = (char *)xfs_dir2_block_leaf_p(btp);
-
- /*
- * Loop over the data portion of the block.
- * Each object is a real entry (dep) or an unused one (dup).
- */
- while (ptr < endptr) {
- dup = (xfs_dir2_data_unused_t *)ptr;
- /*
- * Unused, skip it.
- */
- if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
- ptr += be16_to_cpu(dup->length);
- continue;
- }
-
- dep = (xfs_dir2_data_entry_t *)ptr;
-
- /*
- * Bump pointer for the next iteration.
- */
- ptr += xfs_dir2_data_entsize(dep->namelen);
- /*
- * The entry is before the desired starting point, skip it.
- */
- if ((char *)dep - (char *)hdr < wantoff)
- continue;
-
- cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
- (char *)dep - (char *)hdr);
-
- /*
- * If it didn't fit, set the final offset to here & return.
- */
- if (filldir(dirent, (char *)dep->name, dep->namelen,
- cook & 0x7fffffff, be64_to_cpu(dep->inumber),
- DT_UNKNOWN)) {
- *offset = cook & 0x7fffffff;
- xfs_trans_brelse(NULL, bp);
- return 0;
- }
- }
-
- /*
- * Reached the end of the block.
- * Set the offset to a non-existent block 1 and return.
- */
- *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
- 0x7fffffff;
- xfs_trans_brelse(NULL, bp);
+ xfs_dir2_data_log_entry(args, bp, dep);
+ xfs_dir3_data_check(dp, bp);
return 0;
}
@@ -614,7 +580,7 @@ xfs_dir2_block_log_leaf(
xfs_dir2_leaf_entry_t *blp;
xfs_dir2_block_tail_t *btp;
- btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr);
+ btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr),
(uint)((char *)&blp[last + 1] - (char *)hdr - 1));
@@ -631,7 +597,7 @@ xfs_dir2_block_log_tail(
xfs_dir2_data_hdr_t *hdr = bp->b_addr;
xfs_dir2_block_tail_t *btp;
- btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr);
+ btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr),
(uint)((char *)(btp + 1) - (char *)hdr - 1));
}
@@ -665,18 +631,20 @@ xfs_dir2_block_lookup(
dp = args->dp;
mp = dp->i_mount;
hdr = bp->b_addr;
- xfs_dir2_data_check(dp, bp);
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ xfs_dir3_data_check(dp, bp);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
* Get the offset from the leaf entry, to point to the data.
*/
dep = (xfs_dir2_data_entry_t *)((char *)hdr +
- xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(blp[ent].address)));
/*
* Fill in inode number, CI name if appropriate, release the block.
*/
args->inumber = be64_to_cpu(dep->inumber);
+ args->filetype = dp->d_ops->data_get_ftype(dep);
error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
xfs_trans_brelse(args->trans, bp);
return XFS_ERROR(error);
@@ -711,13 +679,13 @@ xfs_dir2_block_lookup_int(
tp = args->trans;
mp = dp->i_mount;
- error = xfs_dir2_block_read(tp, dp, &bp);
+ error = xfs_dir3_block_read(tp, dp, &bp);
if (error)
return error;
hdr = bp->b_addr;
- xfs_dir2_data_check(dp, bp);
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ xfs_dir3_data_check(dp, bp);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
* Loop doing a binary search for our hash value.
@@ -755,7 +723,7 @@ xfs_dir2_block_lookup_int(
* Get pointer to the entry from the leaf.
*/
dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + xfs_dir2_dataptr_to_off(mp, addr));
+ ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr));
/*
* Compare name and if it's an exact match, return the index
* and buffer. If it's the first case-insensitive match, store
@@ -822,20 +790,21 @@ xfs_dir2_block_removename(
tp = args->trans;
mp = dp->i_mount;
hdr = bp->b_addr;
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
* Point to the data entry using the leaf entry.
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(blp[ent].address)));
/*
* Mark the data entry's space free.
*/
needlog = needscan = 0;
- xfs_dir2_data_make_free(tp, bp,
+ xfs_dir2_data_make_free(args, bp,
(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
- xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
+ dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
/*
* Fix up the block tail.
*/
@@ -850,10 +819,10 @@ xfs_dir2_block_removename(
* Fix up bestfree, log the header if necessary.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, bp);
- xfs_dir2_data_check(dp, bp);
+ xfs_dir2_data_log_header(args, bp);
+ xfs_dir3_data_check(dp, bp);
/*
* See if the size as a shortform is good enough.
*/
@@ -897,20 +866,22 @@ xfs_dir2_block_replace(
dp = args->dp;
mp = dp->i_mount;
hdr = bp->b_addr;
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
* Point to the data entry we need to change.
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(blp[ent].address)));
ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
/*
* Change the inode number to the new value.
*/
dep->inumber = cpu_to_be64(args->inumber);
- xfs_dir2_data_log_entry(args->trans, bp, dep);
- xfs_dir2_data_check(dp, bp);
+ dp->d_ops->data_put_ftype(dep, args->filetype);
+ xfs_dir2_data_log_entry(args, bp, dep);
+ xfs_dir3_data_check(dp, bp);
return 0;
}
@@ -958,6 +929,8 @@ xfs_dir2_leaf_to_block(
__be16 *tagp; /* end of entry (tag) */
int to; /* block/leaf to index */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_to_block(args);
@@ -965,18 +938,25 @@ xfs_dir2_leaf_to_block(
tp = args->trans;
mp = dp->i_mount;
leaf = lbp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
/*
* If there are data blocks other than the first one, take this
* opportunity to remove trailing empty data blocks that may have
* been left behind during no-space-reservation operations.
* These will show up in the leaf bests table.
*/
- while (dp->i_d.di_size > mp->m_dirblksize) {
+ while (dp->i_d.di_size > args->geo->blksize) {
+ int hdrsz;
+
+ hdrsz = dp->d_ops->data_entry_offset;
bestsp = xfs_dir2_leaf_bests_p(ltp);
if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
- mp->m_dirblksize - (uint)sizeof(*hdr)) {
+ args->geo->blksize - hdrsz) {
if ((error =
xfs_dir2_leaf_trim_data(args, lbp,
(xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1))))
@@ -988,21 +968,23 @@ xfs_dir2_leaf_to_block(
* Read the data block if we don't already have it, give up if it fails.
*/
if (!dbp) {
- error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp);
+ error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp);
if (error)
return error;
}
hdr = dbp->b_addr;
- ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+
/*
* Size of the "leaf" area in the block.
*/
size = (uint)sizeof(xfs_dir2_block_tail_t) +
- (uint)sizeof(*lep) * (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale));
+ (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale);
/*
* Look at the last data entry.
*/
- tagp = (__be16 *)((char *)hdr + mp->m_dirblksize) - 1;
+ tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1;
dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
/*
* If it's not free or is too short we can't do it.
@@ -1014,31 +996,30 @@ xfs_dir2_leaf_to_block(
/*
* Start converting it to block form.
*/
- dbp->b_ops = &xfs_dir2_block_buf_ops;
- hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+ xfs_dir3_block_init(mp, tp, dbp, dp);
+
needlog = 1;
needscan = 0;
/*
* Use up the space at the end of the block (blp/btp).
*/
- xfs_dir2_data_use_free(tp, dbp, dup, mp->m_dirblksize - size, size,
+ xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size,
&needlog, &needscan);
/*
* Initialize the block tail.
*/
- btp = xfs_dir2_block_tail_p(mp, hdr);
- btp->count = cpu_to_be32(be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale));
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale);
btp->stale = 0;
xfs_dir2_block_log_tail(tp, dbp);
/*
* Initialize the block leaf area. We compact out stale entries.
*/
lep = xfs_dir2_block_leaf_p(btp);
- for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) {
- if (leaf->ents[from].address ==
- cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ for (from = to = 0; from < leafhdr.count; from++) {
+ if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
continue;
- lep[to++] = leaf->ents[from];
+ lep[to++] = ents[from];
}
ASSERT(to == be32_to_cpu(btp->count));
xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1);
@@ -1046,13 +1027,13 @@ xfs_dir2_leaf_to_block(
* Scan the bestfree if we need it and log the data block header.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, dbp);
+ xfs_dir2_data_log_header(args, dbp);
/*
* Pitch the old leaf block.
*/
- error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp);
+ error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp);
if (error)
return error;
@@ -1096,13 +1077,15 @@ xfs_dir2_sf_to_block(
__be16 *tagp; /* end of data entry */
xfs_trans_t *tp; /* transaction pointer */
struct xfs_name name;
+ struct xfs_ifork *ifp;
trace_xfs_dir2_sf_to_block(args);
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+ ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+ ASSERT(ifp->if_flags & XFS_IFINLINE);
/*
* Bomb out if the shortform directory is way too short.
*/
@@ -1111,22 +1094,23 @@ xfs_dir2_sf_to_block(
return XFS_ERROR(EIO);
}
- oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
- ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
+ ASSERT(ifp->if_bytes == dp->i_d.di_size);
+ ASSERT(ifp->if_u1.if_data != NULL);
ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
+ ASSERT(dp->i_d.di_nextents == 0);
/*
* Copy the directory into a temporary buffer.
* Then pitch the incore inode data so we can make extents.
*/
- sfp = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP);
- memcpy(sfp, oldsfp, dp->i_df.if_bytes);
+ sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
+ memcpy(sfp, oldsfp, ifp->if_bytes);
- xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK);
+ xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
+ xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
dp->i_d.di_size = 0;
- xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
/*
* Add block 0 to the inode.
@@ -1137,16 +1121,16 @@ xfs_dir2_sf_to_block(
return error;
}
/*
- * Initialize the data block.
+ * Initialize the data block, then convert it to block format.
*/
- error = xfs_dir2_data_init(args, blkno, &bp);
+ error = xfs_dir3_data_init(args, blkno, &bp);
if (error) {
kmem_free(sfp);
return error;
}
- bp->b_ops = &xfs_dir2_block_buf_ops;
+ xfs_dir3_block_init(mp, tp, bp, dp);
hdr = bp->b_addr;
- hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+
/*
* Compute size of block "tail" area.
*/
@@ -1156,15 +1140,15 @@ xfs_dir2_sf_to_block(
* The whole thing is initialized to free by the init routine.
* Say we're using the leaf and tail area.
*/
- dup = (xfs_dir2_data_unused_t *)(hdr + 1);
+ dup = dp->d_ops->data_unused_p(hdr);
needlog = needscan = 0;
- xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog,
- &needscan);
+ xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
+ i, &needlog, &needscan);
ASSERT(needscan == 0);
/*
* Fill in the tail.
*/
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
btp->count = cpu_to_be32(sfp->count + 2); /* ., .. */
btp->stale = 0;
blp = xfs_dir2_block_leaf_p(btp);
@@ -1172,38 +1156,38 @@ xfs_dir2_sf_to_block(
/*
* Remove the freespace, we'll manage it.
*/
- xfs_dir2_data_use_free(tp, bp, dup,
+ xfs_dir2_data_use_free(args, bp, dup,
(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
be16_to_cpu(dup->length), &needlog, &needscan);
/*
* Create entry for .
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + XFS_DIR2_DATA_DOT_OFFSET);
+ dep = dp->d_ops->data_dot_entry_p(hdr);
dep->inumber = cpu_to_be64(dp->i_ino);
dep->namelen = 1;
dep->name[0] = '.';
- tagp = xfs_dir2_data_entry_tag_p(dep);
+ dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
+ tagp = dp->d_ops->data_entry_tag_p(dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
- xfs_dir2_data_log_entry(tp, bp, dep);
+ xfs_dir2_data_log_entry(args, bp, dep);
blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
- blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
+ blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
(char *)dep - (char *)hdr));
/*
* Create entry for ..
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + XFS_DIR2_DATA_DOTDOT_OFFSET);
- dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp));
+ dep = dp->d_ops->data_dotdot_entry_p(hdr);
+ dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp));
dep->namelen = 2;
dep->name[0] = dep->name[1] = '.';
- tagp = xfs_dir2_data_entry_tag_p(dep);
+ dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
+ tagp = dp->d_ops->data_entry_tag_p(dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
- xfs_dir2_data_log_entry(tp, bp, dep);
+ xfs_dir2_data_log_entry(args, bp, dep);
blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
- blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
+ blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
(char *)dep - (char *)hdr));
- offset = XFS_DIR2_DATA_FIRST_OFFSET;
+ offset = dp->d_ops->data_first_offset;
/*
* Loop over existing entries, stuff them in.
*/
@@ -1233,8 +1217,10 @@ xfs_dir2_sf_to_block(
dup->length = cpu_to_be16(newoffset - offset);
*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(
((char *)dup - (char *)hdr));
- xfs_dir2_data_log_unused(tp, bp, dup);
- xfs_dir2_data_freeinsert(hdr, dup, &dummy);
+ xfs_dir2_data_log_unused(args, bp, dup);
+ xfs_dir2_data_freeinsert(hdr,
+ dp->d_ops->data_bestfree_p(hdr),
+ dup, &dummy);
offset += be16_to_cpu(dup->length);
continue;
}
@@ -1242,23 +1228,24 @@ xfs_dir2_sf_to_block(
* Copy a real entry.
*/
dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
- dep->inumber = cpu_to_be64(xfs_dir2_sfe_get_ino(sfp, sfep));
+ dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep));
dep->namelen = sfep->namelen;
+ dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep));
memcpy(dep->name, sfep->name, dep->namelen);
- tagp = xfs_dir2_data_entry_tag_p(dep);
+ tagp = dp->d_ops->data_entry_tag_p(dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
- xfs_dir2_data_log_entry(tp, bp, dep);
+ xfs_dir2_data_log_entry(args, bp, dep);
name.name = sfep->name;
name.len = sfep->namelen;
blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops->
hashname(&name));
- blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
+ blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
(char *)dep - (char *)hdr));
offset = (int)((char *)(tagp + 1) - (char *)hdr);
if (++i == sfp->count)
sfep = NULL;
else
- sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+ sfep = dp->d_ops->sf_nextentry(sfp, sfep);
}
/* Done with the temporary buffer */
kmem_free(sfp);
@@ -1273,6 +1260,6 @@ xfs_dir2_sf_to_block(
ASSERT(needscan == 0);
xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1);
xfs_dir2_block_log_tail(tp, bp);
- xfs_dir2_data_check(dp, bp);
+ xfs_dir3_data_check(dp, bp);
return 0;
}
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index ffcf1774152..8c2f6422648 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -17,22 +18,21 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
-
-STATIC xfs_dir2_data_free_t *
-xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
/*
* Check the consistency of the data block.
@@ -40,7 +40,7 @@ xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
* Return 0 is the buffer is good, otherwise an error.
*/
int
-__xfs_dir2_data_check(
+__xfs_dir3_data_check(
struct xfs_inode *dp, /* incore inode pointer */
struct xfs_buf *bp) /* data block's buffer */
{
@@ -62,30 +62,52 @@ __xfs_dir2_data_check(
char *p; /* current data position */
int stale; /* count of stale leaves */
struct xfs_name name;
+ const struct xfs_dir_ops *ops;
+ struct xfs_da_geometry *geo;
mp = bp->b_target->bt_mount;
+ geo = mp->m_dir_geo;
+
+ /*
+ * We can be passed a null dp here from a verifier, so we need to go the
+ * hard way to get them.
+ */
+ ops = xfs_dir_get_ops(mp, dp);
+
hdr = bp->b_addr;
- bf = hdr->bestfree;
- p = (char *)(hdr + 1);
+ p = (char *)ops->data_entry_p(hdr);
switch (hdr->magic) {
+ case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(geo, hdr);
lep = xfs_dir2_block_leaf_p(btp);
endp = (char *)lep;
+
+ /*
+ * The number of leaf entries is limited by the size of the
+ * block and the amount of space used by the data entries.
+ * We don't know how much space is used by the data entries yet,
+ * so just ensure that the count falls somewhere inside the
+ * block right now.
+ */
+ XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) <
+ ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry));
break;
+ case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
- endp = (char *)hdr + mp->m_dirblksize;
+ endp = (char *)hdr + geo->blksize;
break;
default:
XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
return EFSCORRUPTED;
}
- count = lastfree = freeseen = 0;
/*
* Account for zero bestfree entries.
*/
+ bf = ops->data_bestfree_p(hdr);
+ count = lastfree = freeseen = 0;
if (!bf[0].length) {
XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
freeseen |= 1 << 0;
@@ -118,7 +140,7 @@ __xfs_dir2_data_check(
XFS_WANT_CORRUPTED_RETURN(
be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
(char *)dup - (char *)hdr);
- dfp = xfs_dir2_data_freefind(hdr, dup);
+ dfp = xfs_dir2_data_freefind(hdr, bf, dup);
if (dfp) {
i = (int)(dfp - bf);
XFS_WANT_CORRUPTED_RETURN(
@@ -144,14 +166,17 @@ __xfs_dir2_data_check(
XFS_WANT_CORRUPTED_RETURN(
!xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
XFS_WANT_CORRUPTED_RETURN(
- be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) ==
+ be16_to_cpu(*ops->data_entry_tag_p(dep)) ==
(char *)dep - (char *)hdr);
+ XFS_WANT_CORRUPTED_RETURN(
+ ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX);
count++;
lastfree = 0;
- if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
- addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
- (xfs_dir2_data_aoff_t)
- ((char *)dep - (char *)hdr));
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+ addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ (xfs_dir2_data_aoff_t)
+ ((char *)dep - (char *)hdr));
name.name = dep->name;
name.len = dep->namelen;
hash = mp->m_dirnameops->hashname(&name);
@@ -162,13 +187,14 @@ __xfs_dir2_data_check(
}
XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
}
- p += xfs_dir2_data_entsize(dep->namelen);
+ p += ops->data_entsize(dep->namelen);
}
/*
* Need to have seen all the entries and all the bestfree slots.
*/
XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
- if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
if (lep[i].address ==
cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
@@ -185,21 +211,27 @@ __xfs_dir2_data_check(
return 0;
}
-static void
-xfs_dir2_data_verify(
+static bool
+xfs_dir3_data_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- struct xfs_dir2_data_hdr *hdr = bp->b_addr;
- int block_ok = 0;
-
- block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC);
- block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
- if (!block_ok) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
+ return false;
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ return false;
+ } else {
+ if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
+ return false;
}
+ if (__xfs_dir3_data_check(NULL, bp))
+ return false;
+ return true;
}
/*
@@ -208,102 +240,136 @@ xfs_dir2_data_verify(
* format buffer or a data format buffer on readahead.
*/
static void
-xfs_dir2_data_reada_verify(
+xfs_dir3_data_reada_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir2_data_hdr *hdr = bp->b_addr;
switch (hdr->magic) {
case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
- bp->b_ops = &xfs_dir2_block_buf_ops;
+ case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+ bp->b_ops = &xfs_dir3_block_buf_ops;
bp->b_ops->verify_read(bp);
return;
case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
- xfs_dir2_data_verify(bp);
+ case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+ xfs_dir3_data_verify(bp);
return;
default:
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
break;
}
}
static void
-xfs_dir2_data_read_verify(
+xfs_dir3_data_read_verify(
struct xfs_buf *bp)
{
- xfs_dir2_data_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_dir3_data_verify(bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
}
static void
-xfs_dir2_data_write_verify(
+xfs_dir3_data_write_verify(
struct xfs_buf *bp)
{
- xfs_dir2_data_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_dir3_data_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
}
-const struct xfs_buf_ops xfs_dir2_data_buf_ops = {
- .verify_read = xfs_dir2_data_read_verify,
- .verify_write = xfs_dir2_data_write_verify,
+const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
+ .verify_read = xfs_dir3_data_read_verify,
+ .verify_write = xfs_dir3_data_write_verify,
};
-static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = {
- .verify_read = xfs_dir2_data_reada_verify,
- .verify_write = xfs_dir2_data_write_verify,
+static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
+ .verify_read = xfs_dir3_data_reada_verify,
+ .verify_write = xfs_dir3_data_write_verify,
};
int
-xfs_dir2_data_read(
+xfs_dir3_data_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
xfs_daddr_t mapped_bno,
struct xfs_buf **bpp)
{
- return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
- XFS_DATA_FORK, &xfs_dir2_data_buf_ops);
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
+ XFS_DATA_FORK, &xfs_dir3_data_buf_ops);
+ if (!err && tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
+ return err;
}
int
-xfs_dir2_data_readahead(
- struct xfs_trans *tp,
+xfs_dir3_data_readahead(
struct xfs_inode *dp,
xfs_dablk_t bno,
xfs_daddr_t mapped_bno)
{
- return xfs_da_reada_buf(tp, dp, bno, mapped_bno,
- XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops);
+ return xfs_da_reada_buf(dp, bno, mapped_bno,
+ XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops);
}
/*
* Given a data block and an unused entry from that block,
* return the bestfree entry if any that corresponds to it.
*/
-STATIC xfs_dir2_data_free_t *
+xfs_dir2_data_free_t *
xfs_dir2_data_freefind(
- xfs_dir2_data_hdr_t *hdr, /* data block */
- xfs_dir2_data_unused_t *dup) /* data unused entry */
+ struct xfs_dir2_data_hdr *hdr, /* data block header */
+ struct xfs_dir2_data_free *bf, /* bestfree table pointer */
+ struct xfs_dir2_data_unused *dup) /* unused space */
{
xfs_dir2_data_free_t *dfp; /* bestfree entry */
xfs_dir2_data_aoff_t off; /* offset value needed */
-#if defined(DEBUG) && defined(__KERNEL__)
+#ifdef DEBUG
int matched; /* matched the value */
int seenzero; /* saw a 0 bestfree entry */
#endif
off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
-#if defined(DEBUG) && defined(__KERNEL__)
+
+#ifdef DEBUG
/*
* Validate some consistency in the bestfree table.
* Check order, non-overlapping entries, and if we find the
* one we're looking for it has to be exact.
*/
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
- for (dfp = &hdr->bestfree[0], seenzero = matched = 0;
- dfp < &hdr->bestfree[XFS_DIR2_DATA_FD_COUNT];
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+ for (dfp = &bf[0], seenzero = matched = 0;
+ dfp < &bf[XFS_DIR2_DATA_FD_COUNT];
dfp++) {
if (!dfp->offset) {
ASSERT(!dfp->length);
@@ -319,7 +385,7 @@ xfs_dir2_data_freefind(
else
ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off);
ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length));
- if (dfp > &hdr->bestfree[0])
+ if (dfp > &bf[0])
ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length));
}
#endif
@@ -328,14 +394,12 @@ xfs_dir2_data_freefind(
* it can't be there since they're sorted.
*/
if (be16_to_cpu(dup->length) <
- be16_to_cpu(hdr->bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length))
+ be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length))
return NULL;
/*
* Look at the three bestfree entries for our guy.
*/
- for (dfp = &hdr->bestfree[0];
- dfp < &hdr->bestfree[XFS_DIR2_DATA_FD_COUNT];
- dfp++) {
+ for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
if (!dfp->offset)
return NULL;
if (be16_to_cpu(dfp->offset) == off)
@@ -352,18 +416,18 @@ xfs_dir2_data_freefind(
*/
xfs_dir2_data_free_t * /* entry inserted */
xfs_dir2_data_freeinsert(
- xfs_dir2_data_hdr_t *hdr, /* data block pointer */
- xfs_dir2_data_unused_t *dup, /* unused space */
+ struct xfs_dir2_data_hdr *hdr, /* data block pointer */
+ struct xfs_dir2_data_free *dfp, /* bestfree table pointer */
+ struct xfs_dir2_data_unused *dup, /* unused space */
int *loghead) /* log the data header (out) */
{
- xfs_dir2_data_free_t *dfp; /* bestfree table pointer */
xfs_dir2_data_free_t new; /* new bestfree entry */
-#ifdef __KERNEL__
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
-#endif
- dfp = hdr->bestfree;
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
new.length = dup->length;
new.offset = cpu_to_be16((char *)dup - (char *)hdr);
@@ -396,36 +460,39 @@ xfs_dir2_data_freeinsert(
*/
STATIC void
xfs_dir2_data_freeremove(
- xfs_dir2_data_hdr_t *hdr, /* data block header */
- xfs_dir2_data_free_t *dfp, /* bestfree entry pointer */
+ struct xfs_dir2_data_hdr *hdr, /* data block header */
+ struct xfs_dir2_data_free *bf, /* bestfree table pointer */
+ struct xfs_dir2_data_free *dfp, /* bestfree entry pointer */
int *loghead) /* out: log data header */
{
-#ifdef __KERNEL__
+
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
-#endif
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
/*
* It's the first entry, slide the next 2 up.
*/
- if (dfp == &hdr->bestfree[0]) {
- hdr->bestfree[0] = hdr->bestfree[1];
- hdr->bestfree[1] = hdr->bestfree[2];
+ if (dfp == &bf[0]) {
+ bf[0] = bf[1];
+ bf[1] = bf[2];
}
/*
* It's the second entry, slide the 3rd entry up.
*/
- else if (dfp == &hdr->bestfree[1])
- hdr->bestfree[1] = hdr->bestfree[2];
+ else if (dfp == &bf[1])
+ bf[1] = bf[2];
/*
* Must be the last entry.
*/
else
- ASSERT(dfp == &hdr->bestfree[2]);
+ ASSERT(dfp == &bf[2]);
/*
* Clear the 3rd entry, must be zero now.
*/
- hdr->bestfree[2].length = 0;
- hdr->bestfree[2].offset = 0;
+ bf[2].length = 0;
+ bf[2].offset = 0;
*loghead = 1;
}
@@ -434,34 +501,39 @@ xfs_dir2_data_freeremove(
*/
void
xfs_dir2_data_freescan(
- xfs_mount_t *mp, /* filesystem mount point */
- xfs_dir2_data_hdr_t *hdr, /* data block header */
- int *loghead) /* out: log data header */
+ struct xfs_inode *dp,
+ struct xfs_dir2_data_hdr *hdr,
+ int *loghead)
{
xfs_dir2_block_tail_t *btp; /* block tail */
xfs_dir2_data_entry_t *dep; /* active data entry */
xfs_dir2_data_unused_t *dup; /* unused data entry */
+ struct xfs_dir2_data_free *bf;
char *endp; /* end of block's data */
char *p; /* current entry pointer */
+ struct xfs_da_geometry *geo = dp->i_mount->m_dir_geo;
-#ifdef __KERNEL__
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
-#endif
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
/*
* Start by clearing the table.
*/
- memset(hdr->bestfree, 0, sizeof(hdr->bestfree));
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
*loghead = 1;
/*
* Set up pointers.
*/
- p = (char *)(hdr + 1);
- if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ p = (char *)dp->d_ops->data_entry_p(hdr);
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+ btp = xfs_dir2_block_tail_p(geo, hdr);
endp = (char *)xfs_dir2_block_leaf_p(btp);
} else
- endp = (char *)hdr + mp->m_dirblksize;
+ endp = (char *)hdr + geo->blksize;
/*
* Loop over the block's entries.
*/
@@ -473,7 +545,7 @@ xfs_dir2_data_freescan(
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
ASSERT((char *)dup - (char *)hdr ==
be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
- xfs_dir2_data_freeinsert(hdr, dup, loghead);
+ xfs_dir2_data_freeinsert(hdr, bf, dup, loghead);
p += be16_to_cpu(dup->length);
}
/*
@@ -482,8 +554,8 @@ xfs_dir2_data_freescan(
else {
dep = (xfs_dir2_data_entry_t *)p;
ASSERT((char *)dep - (char *)hdr ==
- be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)));
- p += xfs_dir2_data_entsize(dep->namelen);
+ be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep)));
+ p += dp->d_ops->data_entsize(dep->namelen);
}
}
}
@@ -493,7 +565,7 @@ xfs_dir2_data_freescan(
* Give back the buffer for the created block.
*/
int /* error */
-xfs_dir2_data_init(
+xfs_dir3_data_init(
xfs_da_args_t *args, /* directory operation args */
xfs_dir2_db_t blkno, /* logical dir block number */
struct xfs_buf **bpp) /* output block buffer */
@@ -502,6 +574,7 @@ xfs_dir2_data_init(
xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_inode_t *dp; /* incore directory inode */
xfs_dir2_data_unused_t *dup; /* unused entry pointer */
+ struct xfs_dir2_data_free *bf;
int error; /* error return value */
int i; /* bestfree index */
xfs_mount_t *mp; /* filesystem mount point */
@@ -514,38 +587,51 @@ xfs_dir2_data_init(
/*
* Get the buffer set up for the block.
*/
- error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp,
- XFS_DATA_FORK);
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno),
+ -1, &bp, XFS_DATA_FORK);
if (error)
return error;
- bp->b_ops = &xfs_dir2_data_buf_ops;
+ bp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF);
/*
* Initialize the header.
*/
hdr = bp->b_addr;
- hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
- hdr->bestfree[0].offset = cpu_to_be16(sizeof(*hdr));
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ memset(hdr3, 0, sizeof(*hdr3));
+ hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+ hdr3->blkno = cpu_to_be64(bp->b_bn);
+ hdr3->owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+
+ } else
+ hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset);
for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
- hdr->bestfree[i].length = 0;
- hdr->bestfree[i].offset = 0;
+ bf[i].length = 0;
+ bf[i].offset = 0;
}
/*
* Set up an unused entry for the block's body.
*/
- dup = (xfs_dir2_data_unused_t *)(hdr + 1);
+ dup = dp->d_ops->data_unused_p(hdr);
dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
- t = mp->m_dirblksize - (uint)sizeof(*hdr);
- hdr->bestfree[0].length = cpu_to_be16(t);
+ t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset;
+ bf[0].length = cpu_to_be16(t);
dup->length = cpu_to_be16(t);
*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
/*
* Log it and return it.
*/
- xfs_dir2_data_log_header(tp, bp);
- xfs_dir2_data_log_unused(tp, bp, dup);
+ xfs_dir2_data_log_header(args, bp);
+ xfs_dir2_data_log_unused(args, bp, dup);
*bpp = bp;
return 0;
}
@@ -555,17 +641,19 @@ xfs_dir2_data_init(
*/
void
xfs_dir2_data_log_entry(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
xfs_dir2_data_entry_t *dep) /* data entry pointer */
{
- xfs_dir2_data_hdr_t *hdr = bp->b_addr;
+ struct xfs_dir2_data_hdr *hdr = bp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
- xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr),
- (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) -
+ xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr),
+ (uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) -
(char *)hdr - 1));
}
@@ -574,15 +662,20 @@ xfs_dir2_data_log_entry(
*/
void
xfs_dir2_data_log_header(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp)
{
- xfs_dir2_data_hdr_t *hdr = bp->b_addr;
+#ifdef DEBUG
+ struct xfs_dir2_data_hdr *hdr = bp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+#endif
- xfs_trans_log_buf(tp, bp, 0, sizeof(*hdr) - 1);
+ xfs_trans_log_buf(args->trans, bp, 0,
+ args->dp->d_ops->data_entry_offset - 1);
}
/*
@@ -590,25 +683,27 @@ xfs_dir2_data_log_header(
*/
void
xfs_dir2_data_log_unused(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
xfs_dir2_data_unused_t *dup) /* data unused pointer */
{
xfs_dir2_data_hdr_t *hdr = bp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
/*
* Log the first part of the unused entry.
*/
- xfs_trans_log_buf(tp, bp, (uint)((char *)dup - (char *)hdr),
+ xfs_trans_log_buf(args->trans, bp, (uint)((char *)dup - (char *)hdr),
(uint)((char *)&dup->length + sizeof(dup->length) -
1 - (char *)hdr));
/*
* Log the end (tag) of the unused entry.
*/
- xfs_trans_log_buf(tp, bp,
+ xfs_trans_log_buf(args->trans, bp,
(uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr),
(uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr +
sizeof(xfs_dir2_data_off_t) - 1));
@@ -620,7 +715,7 @@ xfs_dir2_data_log_unused(
*/
void
xfs_dir2_data_make_free(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
xfs_dir2_data_aoff_t offset, /* starting byte offset */
xfs_dir2_data_aoff_t len, /* length in bytes */
@@ -630,32 +725,33 @@ xfs_dir2_data_make_free(
xfs_dir2_data_hdr_t *hdr; /* data block pointer */
xfs_dir2_data_free_t *dfp; /* bestfree pointer */
char *endptr; /* end of data area */
- xfs_mount_t *mp; /* filesystem mount point */
int needscan; /* need to regen bestfree */
xfs_dir2_data_unused_t *newdup; /* new unused entry */
xfs_dir2_data_unused_t *postdup; /* unused entry after us */
xfs_dir2_data_unused_t *prevdup; /* unused entry before us */
+ struct xfs_dir2_data_free *bf;
- mp = tp->t_mountp;
hdr = bp->b_addr;
/*
* Figure out where the end of the data area is.
*/
- if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC))
- endptr = (char *)hdr + mp->m_dirblksize;
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC))
+ endptr = (char *)hdr + args->geo->blksize;
else {
xfs_dir2_block_tail_t *btp; /* block tail */
- ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
endptr = (char *)xfs_dir2_block_leaf_p(btp);
}
/*
* If this isn't the start of the block, then back up to
* the previous entry and see if it's free.
*/
- if (offset > sizeof(*hdr)) {
+ if (offset > args->dp->d_ops->data_entry_offset) {
__be16 *tagp; /* tag just before us */
tagp = (__be16 *)((char *)hdr + offset) - 1;
@@ -681,28 +777,29 @@ xfs_dir2_data_make_free(
* Previous and following entries are both free,
* merge everything into a single free entry.
*/
+ bf = args->dp->d_ops->data_bestfree_p(hdr);
if (prevdup && postdup) {
xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */
/*
* See if prevdup and/or postdup are in bestfree table.
*/
- dfp = xfs_dir2_data_freefind(hdr, prevdup);
- dfp2 = xfs_dir2_data_freefind(hdr, postdup);
+ dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+ dfp2 = xfs_dir2_data_freefind(hdr, bf, postdup);
/*
* We need a rescan unless there are exactly 2 free entries
* namely our two. Then we know what's happening, otherwise
* since the third bestfree is there, there might be more
* entries.
*/
- needscan = (hdr->bestfree[2].length != 0);
+ needscan = (bf[2].length != 0);
/*
* Fix up the new big freespace.
*/
be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length));
*xfs_dir2_data_unused_tag_p(prevdup) =
cpu_to_be16((char *)prevdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, prevdup);
+ xfs_dir2_data_log_unused(args, bp, prevdup);
if (!needscan) {
/*
* Has to be the case that entries 0 and 1 are
@@ -711,18 +808,19 @@ xfs_dir2_data_make_free(
* Remove entry 1 first then entry 0.
*/
ASSERT(dfp && dfp2);
- if (dfp == &hdr->bestfree[1]) {
- dfp = &hdr->bestfree[0];
+ if (dfp == &bf[1]) {
+ dfp = &bf[0];
ASSERT(dfp2 == dfp);
- dfp2 = &hdr->bestfree[1];
+ dfp2 = &bf[1];
}
- xfs_dir2_data_freeremove(hdr, dfp2, needlogp);
- xfs_dir2_data_freeremove(hdr, dfp, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp2, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
/*
* Now insert the new entry.
*/
- dfp = xfs_dir2_data_freeinsert(hdr, prevdup, needlogp);
- ASSERT(dfp == &hdr->bestfree[0]);
+ dfp = xfs_dir2_data_freeinsert(hdr, bf, prevdup,
+ needlogp);
+ ASSERT(dfp == &bf[0]);
ASSERT(dfp->length == prevdup->length);
ASSERT(!dfp[1].length);
ASSERT(!dfp[2].length);
@@ -732,54 +830,54 @@ xfs_dir2_data_make_free(
* The entry before us is free, merge with it.
*/
else if (prevdup) {
- dfp = xfs_dir2_data_freefind(hdr, prevdup);
+ dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
be16_add_cpu(&prevdup->length, len);
*xfs_dir2_data_unused_tag_p(prevdup) =
cpu_to_be16((char *)prevdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, prevdup);
+ xfs_dir2_data_log_unused(args, bp, prevdup);
/*
* If the previous entry was in the table, the new entry
* is longer, so it will be in the table too. Remove
* the old one and add the new one.
*/
if (dfp) {
- xfs_dir2_data_freeremove(hdr, dfp, needlogp);
- xfs_dir2_data_freeinsert(hdr, prevdup, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, prevdup, needlogp);
}
/*
* Otherwise we need a scan if the new entry is big enough.
*/
else {
needscan = be16_to_cpu(prevdup->length) >
- be16_to_cpu(hdr->bestfree[2].length);
+ be16_to_cpu(bf[2].length);
}
}
/*
* The following entry is free, merge with it.
*/
else if (postdup) {
- dfp = xfs_dir2_data_freefind(hdr, postdup);
+ dfp = xfs_dir2_data_freefind(hdr, bf, postdup);
newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length));
*xfs_dir2_data_unused_tag_p(newdup) =
cpu_to_be16((char *)newdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, newdup);
+ xfs_dir2_data_log_unused(args, bp, newdup);
/*
* If the following entry was in the table, the new entry
* is longer, so it will be in the table too. Remove
* the old one and add the new one.
*/
if (dfp) {
- xfs_dir2_data_freeremove(hdr, dfp, needlogp);
- xfs_dir2_data_freeinsert(hdr, newdup, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
}
/*
* Otherwise we need a scan if the new entry is big enough.
*/
else {
needscan = be16_to_cpu(newdup->length) >
- be16_to_cpu(hdr->bestfree[2].length);
+ be16_to_cpu(bf[2].length);
}
}
/*
@@ -791,8 +889,8 @@ xfs_dir2_data_make_free(
newdup->length = cpu_to_be16(len);
*xfs_dir2_data_unused_tag_p(newdup) =
cpu_to_be16((char *)newdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, newdup);
- xfs_dir2_data_freeinsert(hdr, newdup, needlogp);
+ xfs_dir2_data_log_unused(args, bp, newdup);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
}
*needscanp = needscan;
}
@@ -802,7 +900,7 @@ xfs_dir2_data_make_free(
*/
void
xfs_dir2_data_use_free(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
xfs_dir2_data_unused_t *dup, /* unused entry */
xfs_dir2_data_aoff_t offset, /* starting offset to use */
@@ -818,10 +916,13 @@ xfs_dir2_data_use_free(
xfs_dir2_data_unused_t *newdup; /* new unused entry */
xfs_dir2_data_unused_t *newdup2; /* another new unused entry */
int oldlen; /* old unused entry's length */
+ struct xfs_dir2_data_free *bf;
hdr = bp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG);
ASSERT(offset >= (char *)dup - (char *)hdr);
ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr);
@@ -829,9 +930,10 @@ xfs_dir2_data_use_free(
/*
* Look up the entry in the bestfree table.
*/
- dfp = xfs_dir2_data_freefind(hdr, dup);
oldlen = be16_to_cpu(dup->length);
- ASSERT(dfp || oldlen <= be16_to_cpu(hdr->bestfree[2].length));
+ bf = args->dp->d_ops->data_bestfree_p(hdr);
+ dfp = xfs_dir2_data_freefind(hdr, bf, dup);
+ ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
/*
* Check for alignment with front and back of the entry.
*/
@@ -845,9 +947,10 @@ xfs_dir2_data_use_free(
*/
if (matchfront && matchback) {
if (dfp) {
- needscan = (hdr->bestfree[2].offset != 0);
+ needscan = (bf[2].offset != 0);
if (!needscan)
- xfs_dir2_data_freeremove(hdr, dfp, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp,
+ needlogp);
}
}
/*
@@ -860,13 +963,14 @@ xfs_dir2_data_use_free(
newdup->length = cpu_to_be16(oldlen - len);
*xfs_dir2_data_unused_tag_p(newdup) =
cpu_to_be16((char *)newdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, newdup);
+ xfs_dir2_data_log_unused(args, bp, newdup);
/*
* If it was in the table, remove it and add the new one.
*/
if (dfp) {
- xfs_dir2_data_freeremove(hdr, dfp, needlogp);
- dfp = xfs_dir2_data_freeinsert(hdr, newdup, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+ needlogp);
ASSERT(dfp != NULL);
ASSERT(dfp->length == newdup->length);
ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
@@ -875,7 +979,7 @@ xfs_dir2_data_use_free(
* that means we don't know if there was a better
* choice for the last slot, or not. Rescan.
*/
- needscan = dfp == &hdr->bestfree[2];
+ needscan = dfp == &bf[2];
}
}
/*
@@ -887,13 +991,14 @@ xfs_dir2_data_use_free(
newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
*xfs_dir2_data_unused_tag_p(newdup) =
cpu_to_be16((char *)newdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, newdup);
+ xfs_dir2_data_log_unused(args, bp, newdup);
/*
* If it was in the table, remove it and add the new one.
*/
if (dfp) {
- xfs_dir2_data_freeremove(hdr, dfp, needlogp);
- dfp = xfs_dir2_data_freeinsert(hdr, newdup, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+ needlogp);
ASSERT(dfp != NULL);
ASSERT(dfp->length == newdup->length);
ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
@@ -902,7 +1007,7 @@ xfs_dir2_data_use_free(
* that means we don't know if there was a better
* choice for the last slot, or not. Rescan.
*/
- needscan = dfp == &hdr->bestfree[2];
+ needscan = dfp == &bf[2];
}
}
/*
@@ -914,13 +1019,13 @@ xfs_dir2_data_use_free(
newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
*xfs_dir2_data_unused_tag_p(newdup) =
cpu_to_be16((char *)newdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, newdup);
+ xfs_dir2_data_log_unused(args, bp, newdup);
newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length));
*xfs_dir2_data_unused_tag_p(newdup2) =
cpu_to_be16((char *)newdup2 - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, newdup2);
+ xfs_dir2_data_log_unused(args, bp, newdup2);
/*
* If the old entry was in the table, we need to scan
* if the 3rd entry was valid, since these entries
@@ -930,11 +1035,13 @@ xfs_dir2_data_use_free(
* the 2 new will work.
*/
if (dfp) {
- needscan = (hdr->bestfree[2].length != 0);
+ needscan = (bf[2].length != 0);
if (!needscan) {
- xfs_dir2_data_freeremove(hdr, dfp, needlogp);
- xfs_dir2_data_freeinsert(hdr, newdup, needlogp);
- xfs_dir2_data_freeinsert(hdr, newdup2,
+ xfs_dir2_data_freeremove(hdr, bf, dfp,
+ needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup,
+ needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup2,
needlogp);
}
}
diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h
deleted file mode 100644
index 07270981f48..00000000000
--- a/fs/xfs/xfs_dir2_format.h
+++ /dev/null
@@ -1,597 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_DIR2_FORMAT_H__
-#define __XFS_DIR2_FORMAT_H__
-
-/*
- * Directory version 2.
- *
- * There are 4 possible formats:
- * - shortform - embedded into the inode
- * - single block - data with embedded leaf at the end
- * - multiple data blocks, single leaf+freeindex block
- * - data blocks, node and leaf blocks (btree), freeindex blocks
- *
- * Note: many node blocks structures and constants are shared with the attr
- * code and defined in xfs_da_btree.h.
- */
-
-#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: single block dirs */
-#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: multiblock dirs */
-#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F: free index blocks */
-
-/*
- * Byte offset in data block and shortform entry.
- */
-typedef __uint16_t xfs_dir2_data_off_t;
-#define NULLDATAOFF 0xffffU
-typedef uint xfs_dir2_data_aoff_t; /* argument form */
-
-/*
- * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
- * Only need 16 bits, this is the byte offset into the single block form.
- */
-typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
-
-/*
- * Offset in data space of a data entry.
- */
-typedef __uint32_t xfs_dir2_dataptr_t;
-#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff)
-#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0)
-
-/*
- * Byte offset in a directory.
- */
-typedef xfs_off_t xfs_dir2_off_t;
-
-/*
- * Directory block number (logical dirblk in file)
- */
-typedef __uint32_t xfs_dir2_db_t;
-
-/*
- * Inode number stored as 8 8-bit values.
- */
-typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
-
-/*
- * Inode number stored as 4 8-bit values.
- * Works a lot of the time, when all the inode numbers in a directory
- * fit in 32 bits.
- */
-typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
-
-typedef union {
- xfs_dir2_ino8_t i8;
- xfs_dir2_ino4_t i4;
-} xfs_dir2_inou_t;
-#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
-
-/*
- * Directory layout when stored internal to an inode.
- *
- * Small directories are packed as tightly as possible so as to fit into the
- * literal area of the inode. These "shortform" directories consist of a
- * single xfs_dir2_sf_hdr header followed by zero or more xfs_dir2_sf_entry
- * structures. Due the different inode number storage size and the variable
- * length name field in the xfs_dir2_sf_entry all these structure are
- * variable length, and the accessors in this file should be used to iterate
- * over them.
- */
-typedef struct xfs_dir2_sf_hdr {
- __uint8_t count; /* count of entries */
- __uint8_t i8count; /* count of 8-byte inode #s */
- xfs_dir2_inou_t parent; /* parent dir inode number */
-} __arch_pack xfs_dir2_sf_hdr_t;
-
-typedef struct xfs_dir2_sf_entry {
- __u8 namelen; /* actual name length */
- xfs_dir2_sf_off_t offset; /* saved offset */
- __u8 name[]; /* name, variable size */
- /*
- * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
- * variable offset after the name.
- */
-} __arch_pack xfs_dir2_sf_entry_t;
-
-static inline int xfs_dir2_sf_hdr_size(int i8count)
-{
- return sizeof(struct xfs_dir2_sf_hdr) -
- (i8count == 0) *
- (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t));
-}
-
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
-{
- return get_unaligned_be16(&sfep->offset.i);
-}
-
-static inline void
-xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
-{
- put_unaligned_be16(off, &sfep->offset.i);
-}
-
-static inline int
-xfs_dir2_sf_entsize(struct xfs_dir2_sf_hdr *hdr, int len)
-{
- return sizeof(struct xfs_dir2_sf_entry) + /* namelen + offset */
- len + /* name */
- (hdr->i8count ? /* ino */
- sizeof(xfs_dir2_ino8_t) :
- sizeof(xfs_dir2_ino4_t));
-}
-
-static inline struct xfs_dir2_sf_entry *
-xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
-{
- return (struct xfs_dir2_sf_entry *)
- ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count));
-}
-
-static inline struct xfs_dir2_sf_entry *
-xfs_dir2_sf_nextentry(struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep)
-{
- return (struct xfs_dir2_sf_entry *)
- ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen));
-}
-
-
-/*
- * Data block structures.
- *
- * A pure data block looks like the following drawing on disk:
- *
- * +-------------------------------------------------+
- * | xfs_dir2_data_hdr_t |
- * +-------------------------------------------------+
- * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
- * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
- * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
- * | ... |
- * +-------------------------------------------------+
- * | unused space |
- * +-------------------------------------------------+
- *
- * As all the entries are variable size structures the accessors below should
- * be used to iterate over them.
- *
- * In addition to the pure data blocks for the data and node formats,
- * most structures are also used for the combined data/freespace "block"
- * format below.
- */
-
-#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */
-#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG)
-#define XFS_DIR2_DATA_FREE_TAG 0xffff
-#define XFS_DIR2_DATA_FD_COUNT 3
-
-/*
- * Directory address space divided into sections,
- * spaces separated by 32GB.
- */
-#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
-#define XFS_DIR2_DATA_SPACE 0
-#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
-#define XFS_DIR2_DATA_FIRSTDB(mp) \
- xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET)
-
-/*
- * Offsets of . and .. in data space (always block 0)
- */
-#define XFS_DIR2_DATA_DOT_OFFSET \
- ((xfs_dir2_data_aoff_t)sizeof(struct xfs_dir2_data_hdr))
-#define XFS_DIR2_DATA_DOTDOT_OFFSET \
- (XFS_DIR2_DATA_DOT_OFFSET + xfs_dir2_data_entsize(1))
-#define XFS_DIR2_DATA_FIRST_OFFSET \
- (XFS_DIR2_DATA_DOTDOT_OFFSET + xfs_dir2_data_entsize(2))
-
-/*
- * Describe a free area in the data block.
- *
- * The freespace will be formatted as a xfs_dir2_data_unused_t.
- */
-typedef struct xfs_dir2_data_free {
- __be16 offset; /* start of freespace */
- __be16 length; /* length of freespace */
-} xfs_dir2_data_free_t;
-
-/*
- * Header for the data blocks.
- *
- * The code knows that XFS_DIR2_DATA_FD_COUNT is 3.
- */
-typedef struct xfs_dir2_data_hdr {
- __be32 magic; /* XFS_DIR2_DATA_MAGIC or */
- /* XFS_DIR2_BLOCK_MAGIC */
- xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT];
-} xfs_dir2_data_hdr_t;
-
-/*
- * Active entry in a data block.
- *
- * Aligned to 8 bytes. After the variable length name field there is a
- * 2 byte tag field, which can be accessed using xfs_dir2_data_entry_tag_p.
- */
-typedef struct xfs_dir2_data_entry {
- __be64 inumber; /* inode number */
- __u8 namelen; /* name length */
- __u8 name[]; /* name bytes, no null */
- /* __be16 tag; */ /* starting offset of us */
-} xfs_dir2_data_entry_t;
-
-/*
- * Unused entry in a data block.
- *
- * Aligned to 8 bytes. Tag appears as the last 2 bytes and must be accessed
- * using xfs_dir2_data_unused_tag_p.
- */
-typedef struct xfs_dir2_data_unused {
- __be16 freetag; /* XFS_DIR2_DATA_FREE_TAG */
- __be16 length; /* total free length */
- /* variable offset */
- __be16 tag; /* starting offset of us */
-} xfs_dir2_data_unused_t;
-
-/*
- * Size of a data entry.
- */
-static inline int xfs_dir2_data_entsize(int n)
-{
- return (int)roundup(offsetof(struct xfs_dir2_data_entry, name[0]) + n +
- (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN);
-}
-
-/*
- * Pointer to an entry's tag word.
- */
-static inline __be16 *
-xfs_dir2_data_entry_tag_p(struct xfs_dir2_data_entry *dep)
-{
- return (__be16 *)((char *)dep +
- xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16));
-}
-
-/*
- * Pointer to a freespace's tag word.
- */
-static inline __be16 *
-xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup)
-{
- return (__be16 *)((char *)dup +
- be16_to_cpu(dup->length) - sizeof(__be16));
-}
-
-/*
- * Leaf block structures.
- *
- * A pure leaf block looks like the following drawing on disk:
- *
- * +---------------------------+
- * | xfs_dir2_leaf_hdr_t |
- * +---------------------------+
- * | xfs_dir2_leaf_entry_t |
- * | xfs_dir2_leaf_entry_t |
- * | xfs_dir2_leaf_entry_t |
- * | xfs_dir2_leaf_entry_t |
- * | ... |
- * +---------------------------+
- * | xfs_dir2_data_off_t |
- * | xfs_dir2_data_off_t |
- * | xfs_dir2_data_off_t |
- * | ... |
- * +---------------------------+
- * | xfs_dir2_leaf_tail_t |
- * +---------------------------+
- *
- * The xfs_dir2_data_off_t members (bests) and tail are at the end of the block
- * for single-leaf (magic = XFS_DIR2_LEAF1_MAGIC) blocks only, but not present
- * for directories with separate leaf nodes and free space blocks
- * (magic = XFS_DIR2_LEAFN_MAGIC).
- *
- * As all the entries are variable size structures the accessors below should
- * be used to iterate over them.
- */
-
-/*
- * Offset of the leaf/node space. First block in this space
- * is the btree root.
- */
-#define XFS_DIR2_LEAF_SPACE 1
-#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
-#define XFS_DIR2_LEAF_FIRSTDB(mp) \
- xfs_dir2_byte_to_db(mp, XFS_DIR2_LEAF_OFFSET)
-
-/*
- * Leaf block header.
- */
-typedef struct xfs_dir2_leaf_hdr {
- xfs_da_blkinfo_t info; /* header for da routines */
- __be16 count; /* count of entries */
- __be16 stale; /* count of stale entries */
-} xfs_dir2_leaf_hdr_t;
-
-/*
- * Leaf block entry.
- */
-typedef struct xfs_dir2_leaf_entry {
- __be32 hashval; /* hash value of name */
- __be32 address; /* address of data entry */
-} xfs_dir2_leaf_entry_t;
-
-/*
- * Leaf block tail.
- */
-typedef struct xfs_dir2_leaf_tail {
- __be32 bestcount;
-} xfs_dir2_leaf_tail_t;
-
-/*
- * Leaf block.
- */
-typedef struct xfs_dir2_leaf {
- xfs_dir2_leaf_hdr_t hdr; /* leaf header */
- xfs_dir2_leaf_entry_t ents[]; /* entries */
-} xfs_dir2_leaf_t;
-
-/*
- * DB blocks here are logical directory block numbers, not filesystem blocks.
- */
-
-static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp)
-{
- return (mp->m_dirblksize - (uint)sizeof(struct xfs_dir2_leaf_hdr)) /
- (uint)sizeof(struct xfs_dir2_leaf_entry);
-}
-
-/*
- * Get address of the bestcount field in the single-leaf block.
- */
-static inline struct xfs_dir2_leaf_tail *
-xfs_dir2_leaf_tail_p(struct xfs_mount *mp, struct xfs_dir2_leaf *lp)
-{
- return (struct xfs_dir2_leaf_tail *)
- ((char *)lp + mp->m_dirblksize -
- sizeof(struct xfs_dir2_leaf_tail));
-}
-
-/*
- * Get address of the bests array in the single-leaf block.
- */
-static inline __be16 *
-xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp)
-{
- return (__be16 *)ltp - be32_to_cpu(ltp->bestcount);
-}
-
-/*
- * Convert dataptr to byte in file space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
-{
- return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
-}
-
-/*
- * Convert byte in file space to dataptr. It had better be aligned.
- */
-static inline xfs_dir2_dataptr_t
-xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
- return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
-}
-
-/*
- * Convert byte in space to (DB) block
- */
-static inline xfs_dir2_db_t
-xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
- return (xfs_dir2_db_t)
- (by >> (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog));
-}
-
-/*
- * Convert dataptr to a block number
- */
-static inline xfs_dir2_db_t
-xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
-{
- return xfs_dir2_byte_to_db(mp, xfs_dir2_dataptr_to_byte(mp, dp));
-}
-
-/*
- * Convert byte in space to offset in a block
- */
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
- return (xfs_dir2_data_aoff_t)(by &
- ((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) - 1));
-}
-
-/*
- * Convert dataptr to a byte offset in a block
- */
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
-{
- return xfs_dir2_byte_to_off(mp, xfs_dir2_dataptr_to_byte(mp, dp));
-}
-
-/*
- * Convert block and offset to byte in space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db,
- xfs_dir2_data_aoff_t o)
-{
- return ((xfs_dir2_off_t)db <<
- (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) + o;
-}
-
-/*
- * Convert block (DB) to block (dablk)
- */
-static inline xfs_dablk_t
-xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db)
-{
- return (xfs_dablk_t)(db << mp->m_sb.sb_dirblklog);
-}
-
-/*
- * Convert byte in space to (DA) block
- */
-static inline xfs_dablk_t
-xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
- return xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, by));
-}
-
-/*
- * Convert block and offset to dataptr
- */
-static inline xfs_dir2_dataptr_t
-xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db,
- xfs_dir2_data_aoff_t o)
-{
- return xfs_dir2_byte_to_dataptr(mp, xfs_dir2_db_off_to_byte(mp, db, o));
-}
-
-/*
- * Convert block (dablk) to block (DB)
- */
-static inline xfs_dir2_db_t
-xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da)
-{
- return (xfs_dir2_db_t)(da >> mp->m_sb.sb_dirblklog);
-}
-
-/*
- * Convert block (dablk) to byte offset in space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da)
-{
- return xfs_dir2_db_off_to_byte(mp, xfs_dir2_da_to_db(mp, da), 0);
-}
-
-/*
- * Free space block defintions for the node format.
- */
-
-/*
- * Offset of the freespace index.
- */
-#define XFS_DIR2_FREE_SPACE 2
-#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
-#define XFS_DIR2_FREE_FIRSTDB(mp) \
- xfs_dir2_byte_to_db(mp, XFS_DIR2_FREE_OFFSET)
-
-typedef struct xfs_dir2_free_hdr {
- __be32 magic; /* XFS_DIR2_FREE_MAGIC */
- __be32 firstdb; /* db of first entry */
- __be32 nvalid; /* count of valid entries */
- __be32 nused; /* count of used entries */
-} xfs_dir2_free_hdr_t;
-
-typedef struct xfs_dir2_free {
- xfs_dir2_free_hdr_t hdr; /* block header */
- __be16 bests[]; /* best free counts */
- /* unused entries are -1 */
-} xfs_dir2_free_t;
-
-static inline int xfs_dir2_free_max_bests(struct xfs_mount *mp)
-{
- return (mp->m_dirblksize - sizeof(struct xfs_dir2_free_hdr)) /
- sizeof(xfs_dir2_data_off_t);
-}
-
-/*
- * Convert data space db to the corresponding free db.
- */
-static inline xfs_dir2_db_t
-xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db)
-{
- return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir2_free_max_bests(mp);
-}
-
-/*
- * Convert data space db to the corresponding index in a free db.
- */
-static inline int
-xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
-{
- return db % xfs_dir2_free_max_bests(mp);
-}
-
-/*
- * Single block format.
- *
- * The single block format looks like the following drawing on disk:
- *
- * +-------------------------------------------------+
- * | xfs_dir2_data_hdr_t |
- * +-------------------------------------------------+
- * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
- * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
- * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t :
- * | ... |
- * +-------------------------------------------------+
- * | unused space |
- * +-------------------------------------------------+
- * | ... |
- * | xfs_dir2_leaf_entry_t |
- * | xfs_dir2_leaf_entry_t |
- * +-------------------------------------------------+
- * | xfs_dir2_block_tail_t |
- * +-------------------------------------------------+
- *
- * As all the entries are variable size structures the accessors below should
- * be used to iterate over them.
- */
-
-typedef struct xfs_dir2_block_tail {
- __be32 count; /* count of leaf entries */
- __be32 stale; /* count of stale lf entries */
-} xfs_dir2_block_tail_t;
-
-/*
- * Pointer to the leaf header embedded in a data block (1-block format)
- */
-static inline struct xfs_dir2_block_tail *
-xfs_dir2_block_tail_p(struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr)
-{
- return ((struct xfs_dir2_block_tail *)
- ((char *)hdr + mp->m_dirblksize)) - 1;
-}
-
-/*
- * Pointer to the leaf entries embedded in a data block (1-block format)
- */
-static inline struct xfs_dir2_leaf_entry *
-xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
-{
- return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count);
-}
-
-#endif /* __XFS_DIR2_FORMAT_H__ */
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 60cd2fa4e04..fb0aad4440c 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -17,113 +18,351 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
-#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
/*
* Local function declarations.
*/
+static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
+ int *indexp, struct xfs_buf **dbpp);
+static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args,
+ struct xfs_buf *bp, int first, int last);
+static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args,
+ struct xfs_buf *bp);
+
+/*
+ * Check the internal consistency of a leaf1 block.
+ * Pop an assert if something is wrong.
+ */
#ifdef DEBUG
-static void xfs_dir2_leaf_check(struct xfs_inode *dp, struct xfs_buf *bp);
+#define xfs_dir3_leaf_check(dp, bp) \
+do { \
+ if (!xfs_dir3_leaf1_check((dp), (bp))) \
+ ASSERT(0); \
+} while (0);
+
+STATIC bool
+xfs_dir3_leaf1_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+ if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
+ struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+ if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+ return false;
+ } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
+ return false;
+
+ return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
+}
#else
-#define xfs_dir2_leaf_check(dp, bp)
+#define xfs_dir3_leaf_check(dp, bp)
#endif
-static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
- int *indexp, struct xfs_buf **dbpp);
-static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp,
- int first, int last);
-static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp);
-static void
-xfs_dir2_leaf_verify(
+bool
+xfs_dir3_leaf_check_int(
+ struct xfs_mount *mp,
+ struct xfs_inode *dp,
+ struct xfs_dir3_icleaf_hdr *hdr,
+ struct xfs_dir2_leaf *leaf)
+{
+ struct xfs_dir2_leaf_entry *ents;
+ xfs_dir2_leaf_tail_t *ltp;
+ int stale;
+ int i;
+ const struct xfs_dir_ops *ops;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+
+ /*
+ * we can be passed a null dp here from a verifier, so we need to go the
+ * hard way to get them.
+ */
+ ops = xfs_dir_get_ops(mp, dp);
+
+ if (!hdr) {
+ ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ hdr = &leafhdr;
+ }
+
+ ents = ops->leaf_ents_p(leaf);
+ ltp = xfs_dir2_leaf_tail_p(geo, leaf);
+
+ /*
+ * XXX (dgc): This value is not restrictive enough.
+ * Should factor in the size of the bests table as well.
+ * We can deduce a value for that from di_size.
+ */
+ if (hdr->count > ops->leaf_max_ents(geo))
+ return false;
+
+ /* Leaves and bests don't overlap in leaf format. */
+ if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+ hdr->magic == XFS_DIR3_LEAF1_MAGIC) &&
+ (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
+ return false;
+
+ /* Check hash value order, count stale entries. */
+ for (i = stale = 0; i < hdr->count; i++) {
+ if (i + 1 < hdr->count) {
+ if (be32_to_cpu(ents[i].hashval) >
+ be32_to_cpu(ents[i + 1].hashval))
+ return false;
+ }
+ if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ stale++;
+ }
+ if (hdr->stale != stale)
+ return false;
+ return true;
+}
+
+/*
+ * We verify the magic numbers before decoding the leaf header so that on debug
+ * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due
+ * to incorrect magic numbers.
+ */
+static bool
+xfs_dir3_leaf_verify(
struct xfs_buf *bp,
- __be16 magic)
+ __uint16_t magic)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- struct xfs_dir2_leaf_hdr *hdr = bp->b_addr;
- int block_ok = 0;
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+
+ ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+ __uint16_t magic3;
+
+ magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
+ : XFS_DIR3_LEAFN_MAGIC;
+
+ if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
+ return false;
+ if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+ return false;
+ } else {
+ if (leaf->hdr.info.magic != cpu_to_be16(magic))
+ return false;
+ }
- block_ok = hdr->info.magic == magic;
- if (!block_ok) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+ return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
+}
+
+static void
+__read_verify(
+ struct xfs_buf *bp,
+ __uint16_t magic)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_dir3_leaf_verify(bp, magic))
xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
+}
+
+static void
+__write_verify(
+ struct xfs_buf *bp,
+ __uint16_t magic)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_dir3_leaf_verify(bp, magic)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
}
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
}
static void
-xfs_dir2_leaf1_read_verify(
+xfs_dir3_leaf1_read_verify(
struct xfs_buf *bp)
{
- xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+ __read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
}
static void
-xfs_dir2_leaf1_write_verify(
+xfs_dir3_leaf1_write_verify(
struct xfs_buf *bp)
{
- xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+ __write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
}
-void
-xfs_dir2_leafn_read_verify(
+static void
+xfs_dir3_leafn_read_verify(
struct xfs_buf *bp)
{
- xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+ __read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
}
-void
-xfs_dir2_leafn_write_verify(
+static void
+xfs_dir3_leafn_write_verify(
struct xfs_buf *bp)
{
- xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+ __write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
}
-static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = {
- .verify_read = xfs_dir2_leaf1_read_verify,
- .verify_write = xfs_dir2_leaf1_write_verify,
+const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
+ .verify_read = xfs_dir3_leaf1_read_verify,
+ .verify_write = xfs_dir3_leaf1_write_verify,
};
-const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = {
- .verify_read = xfs_dir2_leafn_read_verify,
- .verify_write = xfs_dir2_leafn_write_verify,
+const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
+ .verify_read = xfs_dir3_leafn_read_verify,
+ .verify_write = xfs_dir3_leafn_write_verify,
};
static int
-xfs_dir2_leaf_read(
+xfs_dir3_leaf_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t fbno,
xfs_daddr_t mappedbno,
struct xfs_buf **bpp)
{
- return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
- XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops);
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+ XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
+ if (!err && tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
+ return err;
}
int
-xfs_dir2_leafn_read(
+xfs_dir3_leafn_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t fbno,
xfs_daddr_t mappedbno,
struct xfs_buf **bpp)
{
- return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
- XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops);
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+ XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
+ if (!err && tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
+ return err;
+}
+
+/*
+ * Initialize a new leaf block, leaf1 or leafn magic accepted.
+ */
+static void
+xfs_dir3_leaf_init(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ xfs_ino_t owner,
+ __uint16_t type)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+
+ ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+
+ memset(leaf3, 0, sizeof(*leaf3));
+
+ leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC)
+ ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)
+ : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+ leaf3->info.blkno = cpu_to_be64(bp->b_bn);
+ leaf3->info.owner = cpu_to_be64(owner);
+ uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid);
+ } else {
+ memset(leaf, 0, sizeof(*leaf));
+ leaf->hdr.info.magic = cpu_to_be16(type);
+ }
+
+ /*
+ * If it's a leaf-format directory initialize the tail.
+ * Caller is responsible for initialising the bests table.
+ */
+ if (type == XFS_DIR2_LEAF1_MAGIC) {
+ struct xfs_dir2_leaf_tail *ltp;
+
+ ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf);
+ ltp->bestcount = 0;
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF);
+ } else {
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+ }
+}
+
+int
+xfs_dir3_leaf_get_buf(
+ xfs_da_args_t *args,
+ xfs_dir2_db_t bno,
+ struct xfs_buf **bpp,
+ __uint16_t magic)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+ ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) &&
+ bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
+
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno),
+ -1, &bp, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic);
+ xfs_dir3_leaf_log_header(args, bp);
+ if (magic == XFS_DIR2_LEAF1_MAGIC)
+ xfs_dir3_leaf_log_tail(args, bp);
+ *bpp = bp;
+ return 0;
}
/*
@@ -149,6 +388,9 @@ xfs_dir2_block_to_leaf(
int needlog; /* need to log block header */
int needscan; /* need to rescan bestfree */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_data_free *bf;
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_block_to_leaf(args);
@@ -163,70 +405,83 @@ xfs_dir2_block_to_leaf(
if ((error = xfs_da_grow_inode(args, &blkno))) {
return error;
}
- ldb = xfs_dir2_da_to_db(mp, blkno);
- ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp));
+ ldb = xfs_dir2_da_to_db(args->geo, blkno);
+ ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET));
/*
* Initialize the leaf block, get a buffer for it.
*/
- if ((error = xfs_dir2_leaf_init(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC))) {
+ error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC);
+ if (error)
return error;
- }
- ASSERT(lbp != NULL);
+
leaf = lbp->b_addr;
hdr = dbp->b_addr;
- xfs_dir2_data_check(dp, dbp);
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ xfs_dir3_data_check(dp, dbp);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+
/*
* Set the counts in the leaf header.
*/
- leaf->hdr.count = cpu_to_be16(be32_to_cpu(btp->count));
- leaf->hdr.stale = cpu_to_be16(be32_to_cpu(btp->stale));
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ leafhdr.count = be32_to_cpu(btp->count);
+ leafhdr.stale = be32_to_cpu(btp->stale);
+ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
+
/*
* Could compact these but I think we always do the conversion
* after squeezing out stale entries.
*/
- memcpy(leaf->ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir2_leaf_log_ents(tp, lbp, 0, be16_to_cpu(leaf->hdr.count) - 1);
+ memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
+ xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1);
needscan = 0;
needlog = 1;
/*
* Make the space formerly occupied by the leaf entries and block
* tail be free.
*/
- xfs_dir2_data_make_free(tp, dbp,
+ xfs_dir2_data_make_free(args, dbp,
(xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
- (xfs_dir2_data_aoff_t)((char *)hdr + mp->m_dirblksize -
+ (xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize -
(char *)blp),
&needlog, &needscan);
/*
* Fix up the block header, make it a data block.
*/
- dbp->b_ops = &xfs_dir2_data_buf_ops;
- hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+ dbp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF);
+ if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+ hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+ else
+ hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+
if (needscan)
- xfs_dir2_data_freescan(mp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
/*
* Set up leaf tail and bests table.
*/
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
ltp->bestcount = cpu_to_be32(1);
bestsp = xfs_dir2_leaf_bests_p(ltp);
- bestsp[0] = hdr->bestfree[0].length;
+ bestsp[0] = bf[0].length;
/*
* Log the data header and leaf bests table.
*/
if (needlog)
- xfs_dir2_data_log_header(tp, dbp);
- xfs_dir2_leaf_check(dp, lbp);
- xfs_dir2_data_check(dp, dbp);
- xfs_dir2_leaf_log_bests(tp, lbp, 0, 0);
+ xfs_dir2_data_log_header(args, dbp);
+ xfs_dir3_leaf_check(dp, lbp);
+ xfs_dir3_data_check(dp, dbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0, 0);
return 0;
}
STATIC void
-xfs_dir2_leaf_find_stale(
- struct xfs_dir2_leaf *leaf,
+xfs_dir3_leaf_find_stale(
+ struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents,
int index,
int *lowstale,
int *highstale)
@@ -235,7 +490,7 @@ xfs_dir2_leaf_find_stale(
* Find the first stale entry before our index, if any.
*/
for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) {
- if (leaf->ents[*lowstale].address ==
+ if (ents[*lowstale].address ==
cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
break;
}
@@ -245,10 +500,8 @@ xfs_dir2_leaf_find_stale(
* Stop if the result would require moving more entries than using
* lowstale.
*/
- for (*highstale = index;
- *highstale < be16_to_cpu(leaf->hdr.count);
- ++*highstale) {
- if (leaf->ents[*highstale].address ==
+ for (*highstale = index; *highstale < leafhdr->count; ++*highstale) {
+ if (ents[*highstale].address ==
cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
break;
if (*lowstale >= 0 && index - *lowstale <= *highstale - index)
@@ -257,8 +510,9 @@ xfs_dir2_leaf_find_stale(
}
struct xfs_dir2_leaf_entry *
-xfs_dir2_leaf_find_entry(
- xfs_dir2_leaf_t *leaf, /* leaf structure */
+xfs_dir3_leaf_find_entry(
+ struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents,
int index, /* leaf table position */
int compact, /* need to compact leaves */
int lowstale, /* index of prev stale leaf */
@@ -266,7 +520,7 @@ xfs_dir2_leaf_find_entry(
int *lfloglow, /* low leaf logging index */
int *lfloghigh) /* high leaf logging index */
{
- if (!leaf->hdr.stale) {
+ if (!leafhdr->stale) {
xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */
/*
@@ -274,18 +528,16 @@ xfs_dir2_leaf_find_entry(
*
* If there are no stale entries, just insert a hole at index.
*/
- lep = &leaf->ents[index];
- if (index < be16_to_cpu(leaf->hdr.count))
+ lep = &ents[index];
+ if (index < leafhdr->count)
memmove(lep + 1, lep,
- (be16_to_cpu(leaf->hdr.count) - index) *
- sizeof(*lep));
+ (leafhdr->count - index) * sizeof(*lep));
/*
* Record low and high logging indices for the leaf.
*/
*lfloglow = index;
- *lfloghigh = be16_to_cpu(leaf->hdr.count);
- be16_add_cpu(&leaf->hdr.count, 1);
+ *lfloghigh = leafhdr->count++;
return lep;
}
@@ -299,16 +551,17 @@ xfs_dir2_leaf_find_entry(
* entries before and after our insertion point.
*/
if (compact == 0)
- xfs_dir2_leaf_find_stale(leaf, index, &lowstale, &highstale);
+ xfs_dir3_leaf_find_stale(leafhdr, ents, index,
+ &lowstale, &highstale);
/*
* If the low one is better, use it.
*/
if (lowstale >= 0 &&
- (highstale == be16_to_cpu(leaf->hdr.count) ||
+ (highstale == leafhdr->count ||
index - lowstale - 1 < highstale - index)) {
ASSERT(index - lowstale - 1 >= 0);
- ASSERT(leaf->ents[lowstale].address ==
+ ASSERT(ents[lowstale].address ==
cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
/*
@@ -316,37 +569,34 @@ xfs_dir2_leaf_find_entry(
* for the new entry.
*/
if (index - lowstale - 1 > 0) {
- memmove(&leaf->ents[lowstale],
- &leaf->ents[lowstale + 1],
+ memmove(&ents[lowstale], &ents[lowstale + 1],
(index - lowstale - 1) *
- sizeof(xfs_dir2_leaf_entry_t));
+ sizeof(xfs_dir2_leaf_entry_t));
}
*lfloglow = MIN(lowstale, *lfloglow);
*lfloghigh = MAX(index - 1, *lfloghigh);
- be16_add_cpu(&leaf->hdr.stale, -1);
- return &leaf->ents[index - 1];
+ leafhdr->stale--;
+ return &ents[index - 1];
}
/*
* The high one is better, so use that one.
*/
ASSERT(highstale - index >= 0);
- ASSERT(leaf->ents[highstale].address ==
- cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+ ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
/*
* Copy entries down to cover the stale entry and make room for the
* new entry.
*/
if (highstale - index > 0) {
- memmove(&leaf->ents[index + 1],
- &leaf->ents[index],
+ memmove(&ents[index + 1], &ents[index],
(highstale - index) * sizeof(xfs_dir2_leaf_entry_t));
}
*lfloglow = MIN(index, *lfloglow);
*lfloghigh = MAX(highstale, *lfloghigh);
- be16_add_cpu(&leaf->hdr.stale, -1);
- return &leaf->ents[index];
+ leafhdr->stale--;
+ return &ents[index];
}
/*
@@ -383,6 +633,9 @@ xfs_dir2_leaf_addname(
__be16 *tagp; /* end of data entry */
xfs_trans_t *tp; /* transaction pointer */
xfs_dir2_db_t use_block; /* data block number */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_addname(args);
@@ -390,7 +643,7 @@ xfs_dir2_leaf_addname(
tp = args->trans;
mp = dp->i_mount;
- error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
+ error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
if (error)
return error;
@@ -402,21 +655,24 @@ xfs_dir2_leaf_addname(
*/
index = xfs_dir2_leaf_search_hash(args, lbp);
leaf = lbp->b_addr;
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
bestsp = xfs_dir2_leaf_bests_p(ltp);
- length = xfs_dir2_data_entsize(args->namelen);
+ length = dp->d_ops->data_entsize(args->namelen);
+
/*
* See if there are any entries with the same hash value
* and space in their block for the new entry.
* This is good because it puts multiple same-hash value entries
* in a data block, improving the lookup of those entries.
*/
- for (use_block = -1, lep = &leaf->ents[index];
- index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval;
+ for (use_block = -1, lep = &ents[index];
+ index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
index++, lep++) {
if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
continue;
- i = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
ASSERT(i < be32_to_cpu(ltp->bestcount));
ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF));
if (be16_to_cpu(bestsp[i]) >= length) {
@@ -445,7 +701,7 @@ xfs_dir2_leaf_addname(
* How many bytes do we need in the leaf block?
*/
needbytes = 0;
- if (!leaf->hdr.stale)
+ if (!leafhdr.stale)
needbytes += sizeof(xfs_dir2_leaf_entry_t);
if (use_block == -1)
needbytes += sizeof(xfs_dir2_data_off_t);
@@ -460,16 +716,15 @@ xfs_dir2_leaf_addname(
* If we don't have enough free bytes but we can make enough
* by compacting out stale entries, we'll do that.
*/
- if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <
- needbytes && be16_to_cpu(leaf->hdr.stale) > 1) {
+ if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes &&
+ leafhdr.stale > 1)
compact = 1;
- }
+
/*
* Otherwise if we don't have enough free bytes we need to
* convert to node form.
*/
- else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(
- leaf->hdr.count)] < needbytes) {
+ else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) {
/*
* Just checking or no space reservation, give up.
*/
@@ -517,15 +772,15 @@ xfs_dir2_leaf_addname(
* point later.
*/
if (compact) {
- xfs_dir2_leaf_compact_x1(lbp, &index, &lowstale, &highstale,
- &lfloglow, &lfloghigh);
+ xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+ &highstale, &lfloglow, &lfloghigh);
}
/*
* There are stale entries, so we'll need log-low and log-high
* impossibly bad values later.
*/
- else if (be16_to_cpu(leaf->hdr.stale)) {
- lfloglow = be16_to_cpu(leaf->hdr.count);
+ else if (leafhdr.stale) {
+ lfloglow = leafhdr.count;
lfloghigh = -1;
}
/*
@@ -544,7 +799,7 @@ xfs_dir2_leaf_addname(
/*
* Initialize the block.
*/
- if ((error = xfs_dir2_data_init(args, use_block, &dbp))) {
+ if ((error = xfs_dir3_data_init(args, use_block, &dbp))) {
xfs_trans_brelse(tp, lbp);
return error;
}
@@ -557,43 +812,46 @@ xfs_dir2_leaf_addname(
memmove(&bestsp[0], &bestsp[1],
be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
be32_add_cpu(&ltp->bestcount, 1);
- xfs_dir2_leaf_log_tail(tp, lbp);
- xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0,
+ be32_to_cpu(ltp->bestcount) - 1);
}
/*
* If we're filling in a previously empty block just log it.
*/
else
- xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
+ xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
hdr = dbp->b_addr;
- bestsp[use_block] = hdr->bestfree[0].length;
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ bestsp[use_block] = bf[0].length;
grown = 1;
} else {
/*
* Already had space in some data block.
* Just read that one in.
*/
- error = xfs_dir2_data_read(tp, dp,
- xfs_dir2_db_to_da(mp, use_block),
- -1, &dbp);
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, use_block),
+ -1, &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
}
hdr = dbp->b_addr;
+ bf = dp->d_ops->data_bestfree_p(hdr);
grown = 0;
}
/*
* Point to the biggest freespace in our data block.
*/
dup = (xfs_dir2_data_unused_t *)
- ((char *)hdr + be16_to_cpu(hdr->bestfree[0].offset));
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
ASSERT(be16_to_cpu(dup->length) >= length);
needscan = needlog = 0;
/*
* Mark the initial part of our freespace in use for the new entry.
*/
- xfs_dir2_data_use_free(tp, dbp, dup,
+ xfs_dir2_data_use_free(args, dbp, dup,
(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
&needlog, &needscan);
/*
@@ -603,117 +861,78 @@ xfs_dir2_leaf_addname(
dep->inumber = cpu_to_be64(args->inumber);
dep->namelen = args->namelen;
memcpy(dep->name, args->name, dep->namelen);
- tagp = xfs_dir2_data_entry_tag_p(dep);
+ dp->d_ops->data_put_ftype(dep, args->filetype);
+ tagp = dp->d_ops->data_entry_tag_p(dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
/*
* Need to scan fix up the bestfree table.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
/*
* Need to log the data block's header.
*/
if (needlog)
- xfs_dir2_data_log_header(tp, dbp);
- xfs_dir2_data_log_entry(tp, dbp, dep);
+ xfs_dir2_data_log_header(args, dbp);
+ xfs_dir2_data_log_entry(args, dbp, dep);
/*
* If the bests table needs to be changed, do it.
* Log the change unless we've already done that.
*/
- if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(hdr->bestfree[0].length)) {
- bestsp[use_block] = hdr->bestfree[0].length;
+ if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) {
+ bestsp[use_block] = bf[0].length;
if (!grown)
- xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
+ xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
}
- lep = xfs_dir2_leaf_find_entry(leaf, index, compact, lowstale,
+ lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
highstale, &lfloglow, &lfloghigh);
/*
* Fill in the new leaf entry.
*/
lep->hashval = cpu_to_be32(args->hashval);
- lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, use_block,
+ lep->address = cpu_to_be32(
+ xfs_dir2_db_off_to_dataptr(args->geo, use_block,
be16_to_cpu(*tagp)));
/*
* Log the leaf fields and give up the buffers.
*/
- xfs_dir2_leaf_log_header(tp, lbp);
- xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh);
- xfs_dir2_leaf_check(dp, lbp);
- xfs_dir2_data_check(dp, dbp);
+ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
+ xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh);
+ xfs_dir3_leaf_check(dp, lbp);
+ xfs_dir3_data_check(dp, dbp);
return 0;
}
-#ifdef DEBUG
-/*
- * Check the internal consistency of a leaf1 block.
- * Pop an assert if something is wrong.
- */
-STATIC void
-xfs_dir2_leaf_check(
- struct xfs_inode *dp, /* incore directory inode */
- struct xfs_buf *bp) /* leaf's buffer */
-{
- int i; /* leaf index */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
- xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */
- xfs_mount_t *mp; /* filesystem mount point */
- int stale; /* count of stale leaves */
-
- leaf = bp->b_addr;
- mp = dp->i_mount;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
- /*
- * This value is not restrictive enough.
- * Should factor in the size of the bests table as well.
- * We can deduce a value for that from di_size.
- */
- ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp));
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
- /*
- * Leaves and bests don't overlap.
- */
- ASSERT((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <=
- (char *)xfs_dir2_leaf_bests_p(ltp));
- /*
- * Check hash value order, count stale entries.
- */
- for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) {
- if (i + 1 < be16_to_cpu(leaf->hdr.count))
- ASSERT(be32_to_cpu(leaf->ents[i].hashval) <=
- be32_to_cpu(leaf->ents[i + 1].hashval));
- if (leaf->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
- stale++;
- }
- ASSERT(be16_to_cpu(leaf->hdr.stale) == stale);
-}
-#endif /* DEBUG */
-
/*
* Compact out any stale entries in the leaf.
* Log the header and changed leaf entries, if any.
*/
void
-xfs_dir2_leaf_compact(
+xfs_dir3_leaf_compact(
xfs_da_args_t *args, /* operation arguments */
+ struct xfs_dir3_icleaf_hdr *leafhdr,
struct xfs_buf *bp) /* leaf buffer */
{
int from; /* source leaf index */
xfs_dir2_leaf_t *leaf; /* leaf structure */
int loglow; /* first leaf entry to log */
int to; /* target leaf index */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_inode *dp = args->dp;
leaf = bp->b_addr;
- if (!leaf->hdr.stale) {
+ if (!leafhdr->stale)
return;
- }
+
/*
* Compress out the stale entries in place.
*/
- for (from = to = 0, loglow = -1; from < be16_to_cpu(leaf->hdr.count); from++) {
- if (leaf->ents[from].address ==
- cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ ents = dp->d_ops->leaf_ents_p(leaf);
+ for (from = to = 0, loglow = -1; from < leafhdr->count; from++) {
+ if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
continue;
/*
* Only actually copy the entries that are different.
@@ -721,19 +940,21 @@ xfs_dir2_leaf_compact(
if (from > to) {
if (loglow == -1)
loglow = to;
- leaf->ents[to] = leaf->ents[from];
+ ents[to] = ents[from];
}
to++;
}
/*
* Update and log the header, log the leaf entries.
*/
- ASSERT(be16_to_cpu(leaf->hdr.stale) == from - to);
- be16_add_cpu(&leaf->hdr.count, -(be16_to_cpu(leaf->hdr.stale)));
- leaf->hdr.stale = 0;
- xfs_dir2_leaf_log_header(args->trans, bp);
+ ASSERT(leafhdr->stale == from - to);
+ leafhdr->count -= leafhdr->stale;
+ leafhdr->stale = 0;
+
+ dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr);
+ xfs_dir3_leaf_log_header(args, bp);
if (loglow != -1)
- xfs_dir2_leaf_log_ents(args->trans, bp, loglow, to - 1);
+ xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1);
}
/*
@@ -745,8 +966,9 @@ xfs_dir2_leaf_compact(
* and leaf logging indices.
*/
void
-xfs_dir2_leaf_compact_x1(
- struct xfs_buf *bp, /* leaf buffer */
+xfs_dir3_leaf_compact_x1(
+ struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents,
int *indexp, /* insertion index */
int *lowstalep, /* out: stale entry before us */
int *highstalep, /* out: stale entry after us */
@@ -757,22 +979,20 @@ xfs_dir2_leaf_compact_x1(
int highstale; /* stale entry at/after index */
int index; /* insertion index */
int keepstale; /* source index of kept stale */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
int lowstale; /* stale entry before index */
int newindex=0; /* new insertion index */
int to; /* destination copy index */
- leaf = bp->b_addr;
- ASSERT(be16_to_cpu(leaf->hdr.stale) > 1);
+ ASSERT(leafhdr->stale > 1);
index = *indexp;
- xfs_dir2_leaf_find_stale(leaf, index, &lowstale, &highstale);
+ xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale);
/*
* Pick the better of lowstale and highstale.
*/
if (lowstale >= 0 &&
- (highstale == be16_to_cpu(leaf->hdr.count) ||
+ (highstale == leafhdr->count ||
index - lowstale <= highstale - index))
keepstale = lowstale;
else
@@ -781,15 +1001,14 @@ xfs_dir2_leaf_compact_x1(
* Copy the entries in place, removing all the stale entries
* except keepstale.
*/
- for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) {
+ for (from = to = 0; from < leafhdr->count; from++) {
/*
* Notice the new value of index.
*/
if (index == from)
newindex = to;
if (from != keepstale &&
- leaf->ents[from].address ==
- cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+ ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
if (from == to)
*lowlogp = to;
continue;
@@ -803,7 +1022,7 @@ xfs_dir2_leaf_compact_x1(
* Copy only the entries that have moved.
*/
if (from > to)
- leaf->ents[to] = leaf->ents[from];
+ ents[to] = ents[from];
to++;
}
ASSERT(from > to);
@@ -817,8 +1036,8 @@ xfs_dir2_leaf_compact_x1(
/*
* Adjust the leaf header values.
*/
- be16_add_cpu(&leaf->hdr.count, -(from - to));
- leaf->hdr.stale = cpu_to_be16(1);
+ leafhdr->count -= from - to;
+ leafhdr->stale = 1;
/*
* Remember the low/high stale value only in the "right"
* direction.
@@ -826,479 +1045,35 @@ xfs_dir2_leaf_compact_x1(
if (lowstale >= newindex)
lowstale = -1;
else
- highstale = be16_to_cpu(leaf->hdr.count);
- *highlogp = be16_to_cpu(leaf->hdr.count) - 1;
+ highstale = leafhdr->count;
+ *highlogp = leafhdr->count - 1;
*lowstalep = lowstale;
*highstalep = highstale;
}
-struct xfs_dir2_leaf_map_info {
- xfs_extlen_t map_blocks; /* number of fsbs in map */
- xfs_dablk_t map_off; /* last mapped file offset */
- int map_size; /* total entries in *map */
- int map_valid; /* valid entries in *map */
- int nmap; /* mappings to ask xfs_bmapi */
- xfs_dir2_db_t curdb; /* db for current block */
- int ra_current; /* number of read-ahead blks */
- int ra_index; /* *map index for read-ahead */
- int ra_offset; /* map entry offset for ra */
- int ra_want; /* readahead count wanted */
- struct xfs_bmbt_irec map[]; /* map vector for blocks */
-};
-
-STATIC int
-xfs_dir2_leaf_readbuf(
- struct xfs_inode *dp,
- size_t bufsize,
- struct xfs_dir2_leaf_map_info *mip,
- xfs_dir2_off_t *curoff,
- struct xfs_buf **bpp)
-{
- struct xfs_mount *mp = dp->i_mount;
- struct xfs_buf *bp = *bpp;
- struct xfs_bmbt_irec *map = mip->map;
- int error = 0;
- int length;
- int i;
- int j;
-
- /*
- * If we have a buffer, we need to release it and
- * take it out of the mapping.
- */
-
- if (bp) {
- xfs_trans_brelse(NULL, bp);
- bp = NULL;
- mip->map_blocks -= mp->m_dirblkfsbs;
- /*
- * Loop to get rid of the extents for the
- * directory block.
- */
- for (i = mp->m_dirblkfsbs; i > 0; ) {
- j = min_t(int, map->br_blockcount, i);
- map->br_blockcount -= j;
- map->br_startblock += j;
- map->br_startoff += j;
- /*
- * If mapping is done, pitch it from
- * the table.
- */
- if (!map->br_blockcount && --mip->map_valid)
- memmove(&map[0], &map[1],
- sizeof(map[0]) * mip->map_valid);
- i -= j;
- }
- }
-
- /*
- * Recalculate the readahead blocks wanted.
- */
- mip->ra_want = howmany(bufsize + mp->m_dirblksize,
- mp->m_sb.sb_blocksize) - 1;
- ASSERT(mip->ra_want >= 0);
-
- /*
- * If we don't have as many as we want, and we haven't
- * run out of data blocks, get some more mappings.
- */
- if (1 + mip->ra_want > mip->map_blocks &&
- mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) {
- /*
- * Get more bmaps, fill in after the ones
- * we already have in the table.
- */
- mip->nmap = mip->map_size - mip->map_valid;
- error = xfs_bmapi_read(dp, mip->map_off,
- xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) -
- mip->map_off,
- &map[mip->map_valid], &mip->nmap, 0);
-
- /*
- * Don't know if we should ignore this or try to return an
- * error. The trouble with returning errors is that readdir
- * will just stop without actually passing the error through.
- */
- if (error)
- goto out; /* XXX */
-
- /*
- * If we got all the mappings we asked for, set the final map
- * offset based on the last bmap value received. Otherwise,
- * we've reached the end.
- */
- if (mip->nmap == mip->map_size - mip->map_valid) {
- i = mip->map_valid + mip->nmap - 1;
- mip->map_off = map[i].br_startoff + map[i].br_blockcount;
- } else
- mip->map_off = xfs_dir2_byte_to_da(mp,
- XFS_DIR2_LEAF_OFFSET);
-
- /*
- * Look for holes in the mapping, and eliminate them. Count up
- * the valid blocks.
- */
- for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) {
- if (map[i].br_startblock == HOLESTARTBLOCK) {
- mip->nmap--;
- length = mip->map_valid + mip->nmap - i;
- if (length)
- memmove(&map[i], &map[i + 1],
- sizeof(map[i]) * length);
- } else {
- mip->map_blocks += map[i].br_blockcount;
- i++;
- }
- }
- mip->map_valid += mip->nmap;
- }
-
- /*
- * No valid mappings, so no more data blocks.
- */
- if (!mip->map_valid) {
- *curoff = xfs_dir2_da_to_byte(mp, mip->map_off);
- goto out;
- }
-
- /*
- * Read the directory block starting at the first mapping.
- */
- mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
- error = xfs_dir2_data_read(NULL, dp, map->br_startoff,
- map->br_blockcount >= mp->m_dirblkfsbs ?
- XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
-
- /*
- * Should just skip over the data block instead of giving up.
- */
- if (error)
- goto out; /* XXX */
-
- /*
- * Adjust the current amount of read-ahead: we just read a block that
- * was previously ra.
- */
- if (mip->ra_current)
- mip->ra_current -= mp->m_dirblkfsbs;
-
- /*
- * Do we need more readahead?
- */
- for (mip->ra_index = mip->ra_offset = i = 0;
- mip->ra_want > mip->ra_current && i < mip->map_blocks;
- i += mp->m_dirblkfsbs) {
- ASSERT(mip->ra_index < mip->map_valid);
- /*
- * Read-ahead a contiguous directory block.
- */
- if (i > mip->ra_current &&
- map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
- xfs_dir2_data_readahead(NULL, dp,
- map[mip->ra_index].br_startoff + mip->ra_offset,
- XFS_FSB_TO_DADDR(mp,
- map[mip->ra_index].br_startblock +
- mip->ra_offset));
- mip->ra_current = i;
- }
-
- /*
- * Read-ahead a non-contiguous directory block. This doesn't
- * use our mapping, but this is a very rare case.
- */
- else if (i > mip->ra_current) {
- xfs_dir2_data_readahead(NULL, dp,
- map[mip->ra_index].br_startoff +
- mip->ra_offset, -1);
- mip->ra_current = i;
- }
-
- /*
- * Advance offset through the mapping table.
- */
- for (j = 0; j < mp->m_dirblkfsbs; j++) {
- /*
- * The rest of this extent but not more than a dir
- * block.
- */
- length = min_t(int, mp->m_dirblkfsbs,
- map[mip->ra_index].br_blockcount -
- mip->ra_offset);
- j += length;
- mip->ra_offset += length;
-
- /*
- * Advance to the next mapping if this one is used up.
- */
- if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
- mip->ra_offset = 0;
- mip->ra_index++;
- }
- }
- }
-
-out:
- *bpp = bp;
- return error;
-}
-
-/*
- * Getdents (readdir) for leaf and node directories.
- * This reads the data blocks only, so is the same for both forms.
- */
-int /* error */
-xfs_dir2_leaf_getdents(
- xfs_inode_t *dp, /* incore directory inode */
- void *dirent,
- size_t bufsize,
- xfs_off_t *offset,
- filldir_t filldir)
-{
- struct xfs_buf *bp = NULL; /* data block buffer */
- xfs_dir2_data_hdr_t *hdr; /* data block header */
- xfs_dir2_data_entry_t *dep; /* data entry */
- xfs_dir2_data_unused_t *dup; /* unused entry */
- int error = 0; /* error return value */
- int length; /* temporary length value */
- xfs_mount_t *mp; /* filesystem mount point */
- int byteoff; /* offset in current block */
- xfs_dir2_off_t curoff; /* current overall offset */
- xfs_dir2_off_t newoff; /* new curoff after new blk */
- char *ptr = NULL; /* pointer to current data */
- struct xfs_dir2_leaf_map_info *map_info;
-
- /*
- * If the offset is at or past the largest allowed value,
- * give up right away.
- */
- if (*offset >= XFS_DIR2_MAX_DATAPTR)
- return 0;
-
- mp = dp->i_mount;
-
- /*
- * Set up to bmap a number of blocks based on the caller's
- * buffer size, the directory block size, and the filesystem
- * block size.
- */
- length = howmany(bufsize + mp->m_dirblksize,
- mp->m_sb.sb_blocksize);
- map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
- (length * sizeof(struct xfs_bmbt_irec)),
- KM_SLEEP);
- map_info->map_size = length;
-
- /*
- * Inside the loop we keep the main offset value as a byte offset
- * in the directory file.
- */
- curoff = xfs_dir2_dataptr_to_byte(mp, *offset);
-
- /*
- * Force this conversion through db so we truncate the offset
- * down to get the start of the data block.
- */
- map_info->map_off = xfs_dir2_db_to_da(mp,
- xfs_dir2_byte_to_db(mp, curoff));
-
- /*
- * Loop over directory entries until we reach the end offset.
- * Get more blocks and readahead as necessary.
- */
- while (curoff < XFS_DIR2_LEAF_OFFSET) {
- /*
- * If we have no buffer, or we're off the end of the
- * current buffer, need to get another one.
- */
- if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) {
-
- error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info,
- &curoff, &bp);
- if (error || !map_info->map_valid)
- break;
-
- /*
- * Having done a read, we need to set a new offset.
- */
- newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0);
- /*
- * Start of the current block.
- */
- if (curoff < newoff)
- curoff = newoff;
- /*
- * Make sure we're in the right block.
- */
- else if (curoff > newoff)
- ASSERT(xfs_dir2_byte_to_db(mp, curoff) ==
- map_info->curdb);
- hdr = bp->b_addr;
- xfs_dir2_data_check(dp, bp);
- /*
- * Find our position in the block.
- */
- ptr = (char *)(hdr + 1);
- byteoff = xfs_dir2_byte_to_off(mp, curoff);
- /*
- * Skip past the header.
- */
- if (byteoff == 0)
- curoff += (uint)sizeof(*hdr);
- /*
- * Skip past entries until we reach our offset.
- */
- else {
- while ((char *)ptr - (char *)hdr < byteoff) {
- dup = (xfs_dir2_data_unused_t *)ptr;
-
- if (be16_to_cpu(dup->freetag)
- == XFS_DIR2_DATA_FREE_TAG) {
-
- length = be16_to_cpu(dup->length);
- ptr += length;
- continue;
- }
- dep = (xfs_dir2_data_entry_t *)ptr;
- length =
- xfs_dir2_data_entsize(dep->namelen);
- ptr += length;
- }
- /*
- * Now set our real offset.
- */
- curoff =
- xfs_dir2_db_off_to_byte(mp,
- xfs_dir2_byte_to_db(mp, curoff),
- (char *)ptr - (char *)hdr);
- if (ptr >= (char *)hdr + mp->m_dirblksize) {
- continue;
- }
- }
- }
- /*
- * We have a pointer to an entry.
- * Is it a live one?
- */
- dup = (xfs_dir2_data_unused_t *)ptr;
- /*
- * No, it's unused, skip over it.
- */
- if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
- length = be16_to_cpu(dup->length);
- ptr += length;
- curoff += length;
- continue;
- }
-
- dep = (xfs_dir2_data_entry_t *)ptr;
- length = xfs_dir2_data_entsize(dep->namelen);
-
- if (filldir(dirent, (char *)dep->name, dep->namelen,
- xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
- be64_to_cpu(dep->inumber), DT_UNKNOWN))
- break;
-
- /*
- * Advance to next entry in the block.
- */
- ptr += length;
- curoff += length;
- /* bufsize may have just been a guess; don't go negative */
- bufsize = bufsize > length ? bufsize - length : 0;
- }
-
- /*
- * All done. Set output offset value to current offset.
- */
- if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
- *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
- else
- *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
- kmem_free(map_info);
- if (bp)
- xfs_trans_brelse(NULL, bp);
- return error;
-}
-
-/*
- * Initialize a new leaf block, leaf1 or leafn magic accepted.
- */
-int
-xfs_dir2_leaf_init(
- xfs_da_args_t *args, /* operation arguments */
- xfs_dir2_db_t bno, /* directory block number */
- struct xfs_buf **bpp, /* out: leaf buffer */
- int magic) /* magic number for block */
-{
- struct xfs_buf *bp; /* leaf buffer */
- xfs_inode_t *dp; /* incore directory inode */
- int error; /* error return code */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
- xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
- xfs_mount_t *mp; /* filesystem mount point */
- xfs_trans_t *tp; /* transaction pointer */
-
- dp = args->dp;
- ASSERT(dp != NULL);
- tp = args->trans;
- mp = dp->i_mount;
- ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) &&
- bno < XFS_DIR2_FREE_FIRSTDB(mp));
- /*
- * Get the buffer for the block.
- */
- error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp,
- XFS_DATA_FORK);
- if (error)
- return error;
-
- /*
- * Initialize the header.
- */
- leaf = bp->b_addr;
- leaf->hdr.info.magic = cpu_to_be16(magic);
- leaf->hdr.info.forw = 0;
- leaf->hdr.info.back = 0;
- leaf->hdr.count = 0;
- leaf->hdr.stale = 0;
- xfs_dir2_leaf_log_header(tp, bp);
- /*
- * If it's a leaf-format directory initialize the tail.
- * In this case our caller has the real bests table to copy into
- * the block.
- */
- if (magic == XFS_DIR2_LEAF1_MAGIC) {
- bp->b_ops = &xfs_dir2_leaf1_buf_ops;
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
- ltp->bestcount = 0;
- xfs_dir2_leaf_log_tail(tp, bp);
- } else
- bp->b_ops = &xfs_dir2_leafn_buf_ops;
- *bpp = bp;
- return 0;
-}
-
/*
* Log the bests entries indicated from a leaf1 block.
*/
static void
-xfs_dir2_leaf_log_bests(
- xfs_trans_t *tp, /* transaction pointer */
+xfs_dir3_leaf_log_bests(
+ struct xfs_da_args *args,
struct xfs_buf *bp, /* leaf buffer */
int first, /* first entry to log */
int last) /* last entry to log */
{
__be16 *firstb; /* pointer to first entry */
__be16 *lastb; /* pointer to last entry */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
- leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
- ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf);
+ ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC));
+
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
firstb = xfs_dir2_leaf_bests_p(ltp) + first;
lastb = xfs_dir2_leaf_bests_p(ltp) + last;
- xfs_trans_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf),
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)firstb - (char *)leaf),
(uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
}
@@ -1306,22 +1081,27 @@ xfs_dir2_leaf_log_bests(
* Log the leaf entries indicated from a leaf1 or leafn block.
*/
void
-xfs_dir2_leaf_log_ents(
- xfs_trans_t *tp, /* transaction pointer */
- struct xfs_buf *bp, /* leaf buffer */
- int first, /* first entry to log */
- int last) /* last entry to log */
+xfs_dir3_leaf_log_ents(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
+ int first,
+ int last)
{
xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */
xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir2_leaf_entry *ents;
- leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
- leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
- firstlep = &leaf->ents[first];
- lastlep = &leaf->ents[last];
- xfs_trans_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf),
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+ ents = args->dp->d_ops->leaf_ents_p(leaf);
+ firstlep = &ents[first];
+ lastlep = &ents[last];
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)firstlep - (char *)leaf),
(uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
}
@@ -1329,37 +1109,41 @@ xfs_dir2_leaf_log_ents(
* Log the header of the leaf1 or leafn block.
*/
void
-xfs_dir2_leaf_log_header(
- struct xfs_trans *tp,
+xfs_dir3_leaf_log_header(
+ struct xfs_da_args *args,
struct xfs_buf *bp)
{
- xfs_dir2_leaf_t *leaf; /* leaf structure */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
- leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
- leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
- xfs_trans_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf),
- (uint)(sizeof(leaf->hdr) - 1));
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)&leaf->hdr - (char *)leaf),
+ args->dp->d_ops->leaf_hdr_size - 1);
}
/*
* Log the tail of the leaf1 block.
*/
STATIC void
-xfs_dir2_leaf_log_tail(
- struct xfs_trans *tp,
+xfs_dir3_leaf_log_tail(
+ struct xfs_da_args *args,
struct xfs_buf *bp)
{
- xfs_dir2_leaf_t *leaf; /* leaf structure */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
- xfs_mount_t *mp; /* filesystem mount point */
- mp = tp->t_mountp;
- leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
- xfs_trans_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf),
- (uint)(mp->m_dirblksize - 1));
+ ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+ leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ xfs_trans_log_buf(args->trans, bp, (uint)((char *)ltp - (char *)leaf),
+ (uint)(args->geo->blksize - 1));
}
/*
@@ -1380,6 +1164,7 @@ xfs_dir2_leaf_lookup(
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_leaf_entry *ents;
trace_xfs_dir2_leaf_lookup(args);
@@ -1391,22 +1176,25 @@ xfs_dir2_leaf_lookup(
}
tp = args->trans;
dp = args->dp;
- xfs_dir2_leaf_check(dp, lbp);
+ xfs_dir3_leaf_check(dp, lbp);
leaf = lbp->b_addr;
+ ents = dp->d_ops->leaf_ents_p(leaf);
/*
* Get to the leaf entry and contained data entry address.
*/
- lep = &leaf->ents[index];
+ lep = &ents[index];
+
/*
* Point to the data entry.
*/
dep = (xfs_dir2_data_entry_t *)
((char *)dbp->b_addr +
- xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address)));
+ xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
/*
* Return the found inode number & CI name if appropriate
*/
args->inumber = be64_to_cpu(dep->inumber);
+ args->filetype = dp->d_ops->data_get_ftype(dep);
error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
xfs_trans_brelse(tp, dbp);
xfs_trans_brelse(tp, lbp);
@@ -1440,18 +1228,23 @@ xfs_dir2_leaf_lookup_int(
xfs_trans_t *tp; /* transaction pointer */
xfs_dir2_db_t cidb = -1; /* case match data block no. */
enum xfs_dacmp cmp; /* name compare result */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
+ error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
if (error)
return error;
*lbpp = lbp;
leaf = lbp->b_addr;
- xfs_dir2_leaf_check(dp, lbp);
+ xfs_dir3_leaf_check(dp, lbp);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
/*
* Look for the first leaf entry with our hash value.
*/
@@ -1460,9 +1253,9 @@ xfs_dir2_leaf_lookup_int(
* Loop over all the entries with the right hash value
* looking to match the name.
*/
- for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) &&
- be32_to_cpu(lep->hashval) == args->hashval;
- lep++, index++) {
+ for (lep = &ents[index];
+ index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
/*
* Skip over stale leaf entries.
*/
@@ -1471,7 +1264,8 @@ xfs_dir2_leaf_lookup_int(
/*
* Get the new data block number.
*/
- newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ newdb = xfs_dir2_dataptr_to_db(args->geo,
+ be32_to_cpu(lep->address));
/*
* If it's not the same as the old data block number,
* need to pitch the old one and read the new one.
@@ -1479,9 +1273,9 @@ xfs_dir2_leaf_lookup_int(
if (newdb != curdb) {
if (dbp)
xfs_trans_brelse(tp, dbp);
- error = xfs_dir2_data_read(tp, dp,
- xfs_dir2_db_to_da(mp, newdb),
- -1, &dbp);
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, newdb),
+ -1, &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
@@ -1492,7 +1286,8 @@ xfs_dir2_leaf_lookup_int(
* Point to the data entry.
*/
dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr +
- xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(lep->address)));
/*
* Compare name and if it's an exact match, return the index
* and buffer. If it's the first case-insensitive match, store
@@ -1520,9 +1315,9 @@ xfs_dir2_leaf_lookup_int(
ASSERT(cidb != -1);
if (cidb != curdb) {
xfs_trans_brelse(tp, dbp);
- error = xfs_dir2_data_read(tp, dp,
- xfs_dir2_db_to_da(mp, cidb),
- -1, &dbp);
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, cidb),
+ -1, &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
@@ -1566,6 +1361,9 @@ xfs_dir2_leaf_removename(
int needscan; /* need to rescan data frees */
xfs_dir2_data_off_t oldbest; /* old value of best free */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_removename(args);
@@ -1580,55 +1378,61 @@ xfs_dir2_leaf_removename(
mp = dp->i_mount;
leaf = lbp->b_addr;
hdr = dbp->b_addr;
- xfs_dir2_data_check(dp, dbp);
+ xfs_dir3_data_check(dp, dbp);
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
/*
* Point to the leaf entry, use that to point to the data entry.
*/
- lep = &leaf->ents[index];
- db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
- dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
+ lep = &ents[index];
+ db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
needscan = needlog = 0;
- oldbest = be16_to_cpu(hdr->bestfree[0].length);
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ oldbest = be16_to_cpu(bf[0].length);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
bestsp = xfs_dir2_leaf_bests_p(ltp);
ASSERT(be16_to_cpu(bestsp[db]) == oldbest);
/*
* Mark the former data entry unused.
*/
- xfs_dir2_data_make_free(tp, dbp,
+ xfs_dir2_data_make_free(args, dbp,
(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
- xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
+ dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
/*
* We just mark the leaf entry stale by putting a null in it.
*/
- be16_add_cpu(&leaf->hdr.stale, 1);
- xfs_dir2_leaf_log_header(tp, lbp);
+ leafhdr.stale++;
+ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
+
lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
- xfs_dir2_leaf_log_ents(tp, lbp, index, index);
+ xfs_dir3_leaf_log_ents(args, lbp, index, index);
+
/*
* Scan the freespace in the data block again if necessary,
* log the data block header if necessary.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, dbp);
+ xfs_dir2_data_log_header(args, dbp);
/*
* If the longest freespace in the data block has changed,
* put the new value in the bests table and log that.
*/
- if (be16_to_cpu(hdr->bestfree[0].length) != oldbest) {
- bestsp[db] = hdr->bestfree[0].length;
- xfs_dir2_leaf_log_bests(tp, lbp, db, db);
+ if (be16_to_cpu(bf[0].length) != oldbest) {
+ bestsp[db] = bf[0].length;
+ xfs_dir3_leaf_log_bests(args, lbp, db, db);
}
- xfs_dir2_data_check(dp, dbp);
+ xfs_dir3_data_check(dp, dbp);
/*
* If the data block is now empty then get rid of the data block.
*/
- if (be16_to_cpu(hdr->bestfree[0].length) ==
- mp->m_dirblksize - (uint)sizeof(*hdr)) {
- ASSERT(db != mp->m_dirdatablk);
+ if (be16_to_cpu(bf[0].length) ==
+ args->geo->blksize - dp->d_ops->data_entry_offset) {
+ ASSERT(db != args->geo->datablk);
if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
/*
* Nope, can't get rid of it because it caused
@@ -1638,7 +1442,7 @@ xfs_dir2_leaf_removename(
*/
if (error == ENOSPC && args->total == 0)
error = 0;
- xfs_dir2_leaf_check(dp, lbp);
+ xfs_dir3_leaf_check(dp, lbp);
return error;
}
dbp = NULL;
@@ -1661,18 +1465,19 @@ xfs_dir2_leaf_removename(
memmove(&bestsp[db - i], bestsp,
(be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
be32_add_cpu(&ltp->bestcount, -(db - i));
- xfs_dir2_leaf_log_tail(tp, lbp);
- xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0,
+ be32_to_cpu(ltp->bestcount) - 1);
} else
bestsp[db] = cpu_to_be16(NULLDATAOFF);
}
/*
* If the data block was not the first one, drop it.
*/
- else if (db != mp->m_dirdatablk)
+ else if (db != args->geo->datablk)
dbp = NULL;
- xfs_dir2_leaf_check(dp, lbp);
+ xfs_dir3_leaf_check(dp, lbp);
/*
* See if we can convert to block form.
*/
@@ -1695,6 +1500,7 @@ xfs_dir2_leaf_replace(
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_leaf_entry *ents;
trace_xfs_dir2_leaf_replace(args);
@@ -1706,24 +1512,26 @@ xfs_dir2_leaf_replace(
}
dp = args->dp;
leaf = lbp->b_addr;
+ ents = dp->d_ops->leaf_ents_p(leaf);
/*
* Point to the leaf entry, get data address from it.
*/
- lep = &leaf->ents[index];
+ lep = &ents[index];
/*
* Point to the data entry.
*/
dep = (xfs_dir2_data_entry_t *)
((char *)dbp->b_addr +
- xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address)));
+ xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
ASSERT(args->inumber != be64_to_cpu(dep->inumber));
/*
* Put the new inode number in, log it.
*/
dep->inumber = cpu_to_be64(args->inumber);
+ dp->d_ops->data_put_ftype(dep, args->filetype);
tp = args->trans;
- xfs_dir2_data_log_entry(tp, dbp, dep);
- xfs_dir2_leaf_check(dp, lbp);
+ xfs_dir2_data_log_entry(args, dbp, dep);
+ xfs_dir3_leaf_check(dp, lbp);
xfs_trans_brelse(tp, lbp);
return 0;
}
@@ -1745,17 +1553,18 @@ xfs_dir2_leaf_search_hash(
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
int mid=0; /* current leaf index */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
leaf = lbp->b_addr;
-#ifndef __KERNEL__
- if (!leaf->hdr.count)
- return 0;
-#endif
+ ents = args->dp->d_ops->leaf_ents_p(leaf);
+ args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
/*
* Note, the table cannot be empty, so we have to go through the loop.
* Binary search the leaf entries looking for our hash value.
*/
- for (lep = leaf->ents, low = 0, high = be16_to_cpu(leaf->hdr.count) - 1,
+ for (lep = ents, low = 0, high = leafhdr.count - 1,
hashwant = args->hashval;
low <= high; ) {
mid = (low + high) >> 1;
@@ -1807,20 +1616,23 @@ xfs_dir2_leaf_trim_data(
/*
* Read the offending data block. We need its buffer.
*/
- error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp);
+ error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db),
+ -1, &dbp);
if (error)
return error;
leaf = lbp->b_addr;
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
#ifdef DEBUG
{
struct xfs_dir2_data_hdr *hdr = dbp->b_addr;
+ struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr);
- ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
- ASSERT(be16_to_cpu(hdr->bestfree[0].length) ==
- mp->m_dirblksize - (uint)sizeof(*hdr));
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+ ASSERT(be16_to_cpu(bf[0].length) ==
+ args->geo->blksize - dp->d_ops->data_entry_offset);
ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
}
#endif
@@ -1839,23 +1651,29 @@ xfs_dir2_leaf_trim_data(
bestsp = xfs_dir2_leaf_bests_p(ltp);
be32_add_cpu(&ltp->bestcount, -1);
memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
- xfs_dir2_leaf_log_tail(tp, lbp);
- xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
return 0;
}
static inline size_t
-xfs_dir2_leaf_size(
- struct xfs_dir2_leaf_hdr *hdr,
+xfs_dir3_leaf_size(
+ struct xfs_dir3_icleaf_hdr *hdr,
int counts)
{
- int entries;
+ int entries;
+ int hdrsize;
+
+ entries = hdr->count - hdr->stale;
+ if (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+ hdr->magic == XFS_DIR2_LEAFN_MAGIC)
+ hdrsize = sizeof(struct xfs_dir2_leaf_hdr);
+ else
+ hdrsize = sizeof(struct xfs_dir3_leaf_hdr);
- entries = be16_to_cpu(hdr->count) - be16_to_cpu(hdr->stale);
- return sizeof(xfs_dir2_leaf_hdr_t) +
- entries * sizeof(xfs_dir2_leaf_entry_t) +
- counts * sizeof(xfs_dir2_data_off_t) +
- sizeof(xfs_dir2_leaf_tail_t);
+ return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t)
+ + counts * sizeof(xfs_dir2_data_off_t)
+ + sizeof(xfs_dir2_leaf_tail_t);
}
/*
@@ -1879,6 +1697,8 @@ xfs_dir2_node_to_leaf(
xfs_mount_t *mp; /* filesystem mount point */
int rval; /* successful free trim? */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir3_icfree_hdr freehdr;
/*
* There's more than a leaf level in the btree, so there must
@@ -1896,22 +1716,22 @@ xfs_dir2_node_to_leaf(
/*
* Get the last offset in the file.
*/
- if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) {
+ if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) {
return error;
}
- fo -= mp->m_dirblkfsbs;
+ fo -= args->geo->fsbcount;
/*
* If there are freespace blocks other than the first one,
* take this opportunity to remove trailing empty freespace blocks
* that may have been left behind during no-space-reservation
* operations.
*/
- while (fo > mp->m_dirfreeblk) {
+ while (fo > args->geo->freeblk) {
if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
return error;
}
if (rval)
- fo -= mp->m_dirblkfsbs;
+ fo -= args->geo->fsbcount;
else
return 0;
}
@@ -1924,60 +1744,71 @@ xfs_dir2_node_to_leaf(
/*
* If it's not the single leaf block, give up.
*/
- if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize)
+ if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize)
return 0;
lbp = state->path.blk[0].bp;
leaf = lbp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+
/*
* Read the freespace block.
*/
- error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp);
+ error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp);
if (error)
return error;
free = fbp->b_addr;
- ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
- ASSERT(!free->hdr.firstdb);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+ ASSERT(!freehdr.firstdb);
/*
* Now see if the leafn and free data will fit in a leaf1.
* If not, release the buffer and give up.
*/
- if (xfs_dir2_leaf_size(&leaf->hdr, be32_to_cpu(free->hdr.nvalid)) >
- mp->m_dirblksize) {
+ if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) {
xfs_trans_brelse(tp, fbp);
return 0;
}
/*
* If the leaf has any stale entries in it, compress them out.
- * The compact routine will log the header.
*/
- if (be16_to_cpu(leaf->hdr.stale))
- xfs_dir2_leaf_compact(args, lbp);
- else
- xfs_dir2_leaf_log_header(tp, lbp);
+ if (leafhdr.stale)
+ xfs_dir3_leaf_compact(args, &leafhdr, lbp);
- lbp->b_ops = &xfs_dir2_leaf1_buf_ops;
- leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC);
+ lbp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF);
+ leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC)
+ ? XFS_DIR2_LEAF1_MAGIC
+ : XFS_DIR3_LEAF1_MAGIC;
/*
* Set up the leaf tail from the freespace block.
*/
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
- ltp->bestcount = free->hdr.nvalid;
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ ltp->bestcount = cpu_to_be32(freehdr.nvalid);
+
/*
* Set up the leaf bests table.
*/
- memcpy(xfs_dir2_leaf_bests_p(ltp), free->bests,
- be32_to_cpu(ltp->bestcount) * sizeof(xfs_dir2_data_off_t));
- xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
- xfs_dir2_leaf_log_tail(tp, lbp);
- xfs_dir2_leaf_check(dp, lbp);
+ memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free),
+ freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
+
+ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_check(dp, lbp);
+
/*
* Get rid of the freespace block.
*/
- error = xfs_dir2_shrink_inode(args, XFS_DIR2_FREE_FIRSTDB(mp), fbp);
+ error = xfs_dir2_shrink_inode(args,
+ xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET),
+ fbp);
if (error) {
/*
* This can't fail here because it can only happen when
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 5980f9b7fa9..da43d304fca 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -17,35 +18,29 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
-#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
/*
* Function declarations.
*/
static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args,
int index);
-#ifdef DEBUG
-static void xfs_dir2_leafn_check(struct xfs_inode *dp, struct xfs_buf *bp);
-#else
-#define xfs_dir2_leafn_check(dp, bp)
-#endif
-static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, struct xfs_buf *bp_s,
- int start_s, struct xfs_buf *bp_d,
- int start_d, int count);
static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
xfs_da_state_blk_t *blk1,
xfs_da_state_blk_t *blk2);
@@ -55,52 +50,127 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
xfs_da_state_blk_t *fblk);
-static void
-xfs_dir2_free_verify(
+/*
+ * Check internal consistency of a leafn block.
+ */
+#ifdef DEBUG
+#define xfs_dir3_leaf_check(dp, bp) \
+do { \
+ if (!xfs_dir3_leafn_check((dp), (bp))) \
+ ASSERT(0); \
+} while (0);
+
+static bool
+xfs_dir3_leafn_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+ if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
+ struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+ if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+ return false;
+ } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
+ return false;
+
+ return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
+}
+#else
+#define xfs_dir3_leaf_check(dp, bp)
+#endif
+
+static bool
+xfs_dir3_free_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir2_free_hdr *hdr = bp->b_addr;
- int block_ok = 0;
- block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC);
- if (!block_ok) {
- XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic",
- XFS_ERRLEVEL_LOW, mp, hdr);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
+ return false;
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ return false;
+ } else {
+ if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
+ return false;
}
+
+ /* XXX: should bounds check the xfs_dir3_icfree_hdr here */
+
+ return true;
}
static void
-xfs_dir2_free_read_verify(
+xfs_dir3_free_read_verify(
struct xfs_buf *bp)
{
- xfs_dir2_free_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_dir3_free_verify(bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
}
static void
-xfs_dir2_free_write_verify(
+xfs_dir3_free_write_verify(
struct xfs_buf *bp)
{
- xfs_dir2_free_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (!xfs_dir3_free_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF);
}
-static const struct xfs_buf_ops xfs_dir2_free_buf_ops = {
- .verify_read = xfs_dir2_free_read_verify,
- .verify_write = xfs_dir2_free_write_verify,
+const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
+ .verify_read = xfs_dir3_free_read_verify,
+ .verify_write = xfs_dir3_free_write_verify,
};
static int
-__xfs_dir2_free_read(
+__xfs_dir3_free_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t fbno,
xfs_daddr_t mappedbno,
struct xfs_buf **bpp)
{
- return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
- XFS_DATA_FORK, &xfs_dir2_free_buf_ops);
+ int err;
+
+ err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+ XFS_DATA_FORK, &xfs_dir3_free_buf_ops);
+
+ /* try read returns without an error or *bpp if it lands in a hole */
+ if (!err && tp && *bpp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF);
+ return err;
}
int
@@ -110,7 +180,7 @@ xfs_dir2_free_read(
xfs_dablk_t fbno,
struct xfs_buf **bpp)
{
- return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp);
+ return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp);
}
static int
@@ -120,7 +190,50 @@ xfs_dir2_free_try_read(
xfs_dablk_t fbno,
struct xfs_buf **bpp)
{
- return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp);
+ return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp);
+}
+
+static int
+xfs_dir3_free_get_buf(
+ xfs_da_args_t *args,
+ xfs_dir2_db_t fbno,
+ struct xfs_buf **bpp)
+{
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_buf *bp;
+ int error;
+ struct xfs_dir3_icfree_hdr hdr;
+
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno),
+ -1, &bp, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF);
+ bp->b_ops = &xfs_dir3_free_buf_ops;
+
+ /*
+ * Initialize the new block to be empty, and remember
+ * its first slot as our empty slot.
+ */
+ memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr));
+ memset(&hdr, 0, sizeof(hdr));
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
+
+ hdr.magic = XFS_DIR3_FREE_MAGIC;
+
+ hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
+ hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
+ uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
+ } else
+ hdr.magic = XFS_DIR2_FREE_MAGIC;
+ dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr);
+ *bpp = bp;
+ return 0;
}
/*
@@ -128,19 +241,22 @@ xfs_dir2_free_try_read(
*/
STATIC void
xfs_dir2_free_log_bests(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
int first, /* first entry to log */
int last) /* last entry to log */
{
xfs_dir2_free_t *free; /* freespace structure */
+ __be16 *bests;
free = bp->b_addr;
- ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
- xfs_trans_log_buf(tp, bp,
- (uint)((char *)&free->bests[first] - (char *)free),
- (uint)((char *)&free->bests[last] - (char *)free +
- sizeof(free->bests[0]) - 1));
+ bests = args->dp->d_ops->free_bests_p(free);
+ ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+ free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)&bests[first] - (char *)free),
+ (uint)((char *)&bests[last] - (char *)free +
+ sizeof(bests[0]) - 1));
}
/*
@@ -148,15 +264,18 @@ xfs_dir2_free_log_bests(
*/
static void
xfs_dir2_free_log_header(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp)
{
+#ifdef DEBUG
xfs_dir2_free_t *free; /* freespace structure */
free = bp->b_addr;
- ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
- xfs_trans_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free),
- (uint)(sizeof(xfs_dir2_free_hdr_t) - 1));
+ ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+ free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+#endif
+ xfs_trans_log_buf(args->trans, bp, 0,
+ args->dp->d_ops->free_hdr_size - 1);
}
/*
@@ -183,6 +302,7 @@ xfs_dir2_leaf_to_node(
xfs_dir2_data_off_t off; /* freespace entry value */
__be16 *to; /* pointer to freespace entry */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icfree_hdr freehdr;
trace_xfs_dir2_leaf_to_node(args);
@@ -195,48 +315,57 @@ xfs_dir2_leaf_to_node(
if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) {
return error;
}
- ASSERT(fdb == XFS_DIR2_FREE_FIRSTDB(mp));
+ ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
/*
* Get the buffer for the new freespace block.
*/
- error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
- XFS_DATA_FORK);
+ error = xfs_dir3_free_get_buf(args, fdb, &fbp);
if (error)
return error;
- fbp->b_ops = &xfs_dir2_free_buf_ops;
free = fbp->b_addr;
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
leaf = lbp->b_addr;
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
- /*
- * Initialize the freespace block header.
- */
- free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC);
- free->hdr.firstdb = 0;
- ASSERT(be32_to_cpu(ltp->bestcount) <= (uint)dp->i_d.di_size / mp->m_dirblksize);
- free->hdr.nvalid = ltp->bestcount;
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ ASSERT(be32_to_cpu(ltp->bestcount) <=
+ (uint)dp->i_d.di_size / args->geo->blksize);
+
/*
* Copy freespace entries from the leaf block to the new block.
* Count active entries.
*/
- for (i = n = 0, from = xfs_dir2_leaf_bests_p(ltp), to = free->bests;
- i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
+ from = xfs_dir2_leaf_bests_p(ltp);
+ to = dp->d_ops->free_bests_p(free);
+ for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
if ((off = be16_to_cpu(*from)) != NULLDATAOFF)
n++;
*to = cpu_to_be16(off);
}
- free->hdr.nused = cpu_to_be32(n);
- lbp->b_ops = &xfs_dir2_leafn_buf_ops;
- leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
+ /*
+ * Now initialize the freespace block header.
+ */
+ freehdr.nused = n;
+ freehdr.nvalid = be32_to_cpu(ltp->bestcount);
+
+ dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+ xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1);
+ xfs_dir2_free_log_header(args, fbp);
/*
- * Log everything.
+ * Converting the leaf to a leafnode is just a matter of changing the
+ * magic number and the ops. Do the change directly to the buffer as
+ * it's less work (and less code) than decoding the header to host
+ * format and back again.
*/
- xfs_dir2_leaf_log_header(tp, lbp);
- xfs_dir2_free_log_header(tp, fbp);
- xfs_dir2_free_log_bests(tp, fbp, 0, be32_to_cpu(free->hdr.nvalid) - 1);
- xfs_dir2_leafn_check(dp, lbp);
+ if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC))
+ leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
+ else
+ leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+ lbp->b_ops = &xfs_dir3_leafn_buf_ops;
+ xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF);
+ xfs_dir3_leaf_log_header(args, lbp);
+ xfs_dir3_leaf_check(dp, lbp);
return 0;
}
@@ -260,6 +389,8 @@ xfs_dir2_leafn_add(
int lowstale; /* previous stale entry */
xfs_mount_t *mp; /* filesystem mount point */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
trace_xfs_dir2_leafn_add(args, index);
@@ -267,6 +398,8 @@ xfs_dir2_leafn_add(
mp = dp->i_mount;
tp = args->trans;
leaf = bp->b_addr;
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
/*
* Quick check just to make sure we are not going to index
@@ -282,15 +415,15 @@ xfs_dir2_leafn_add(
* a compact.
*/
- if (be16_to_cpu(leaf->hdr.count) == xfs_dir2_max_leaf_ents(mp)) {
- if (!leaf->hdr.stale)
+ if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) {
+ if (!leafhdr.stale)
return XFS_ERROR(ENOSPC);
- compact = be16_to_cpu(leaf->hdr.stale) > 1;
+ compact = leafhdr.stale > 1;
} else
compact = 0;
- ASSERT(index == 0 || be32_to_cpu(leaf->ents[index - 1].hashval) <= args->hashval);
- ASSERT(index == be16_to_cpu(leaf->hdr.count) ||
- be32_to_cpu(leaf->ents[index].hashval) >= args->hashval);
+ ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval);
+ ASSERT(index == leafhdr.count ||
+ be32_to_cpu(ents[index].hashval) >= args->hashval);
if (args->op_flags & XFS_DA_OP_JUSTCHECK)
return 0;
@@ -299,61 +432,52 @@ xfs_dir2_leafn_add(
* Compact out all but one stale leaf entry. Leaves behind
* the entry closest to index.
*/
- if (compact) {
- xfs_dir2_leaf_compact_x1(bp, &index, &lowstale, &highstale,
- &lfloglow, &lfloghigh);
- }
- /*
- * Set impossible logging indices for this case.
- */
- else if (leaf->hdr.stale) {
- lfloglow = be16_to_cpu(leaf->hdr.count);
+ if (compact)
+ xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+ &highstale, &lfloglow, &lfloghigh);
+ else if (leafhdr.stale) {
+ /*
+ * Set impossible logging indices for this case.
+ */
+ lfloglow = leafhdr.count;
lfloghigh = -1;
}
/*
* Insert the new entry, log everything.
*/
- lep = xfs_dir2_leaf_find_entry(leaf, index, compact, lowstale,
+ lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
highstale, &lfloglow, &lfloghigh);
lep->hashval = cpu_to_be32(args->hashval);
- lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp,
+ lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo,
args->blkno, args->index));
- xfs_dir2_leaf_log_header(tp, bp);
- xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh);
- xfs_dir2_leafn_check(dp, bp);
+
+ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, bp);
+ xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh);
+ xfs_dir3_leaf_check(dp, bp);
return 0;
}
#ifdef DEBUG
-/*
- * Check internal consistency of a leafn block.
- */
-void
-xfs_dir2_leafn_check(
+static void
+xfs_dir2_free_hdr_check(
struct xfs_inode *dp,
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ xfs_dir2_db_t db)
{
- int i; /* leaf index */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
- xfs_mount_t *mp; /* filesystem mount point */
- int stale; /* count of stale leaves */
+ struct xfs_dir3_icfree_hdr hdr;
- leaf = bp->b_addr;
- mp = dp->i_mount;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
- ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp));
- for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) {
- if (i + 1 < be16_to_cpu(leaf->hdr.count)) {
- ASSERT(be32_to_cpu(leaf->ents[i].hashval) <=
- be32_to_cpu(leaf->ents[i + 1].hashval));
- }
- if (leaf->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
- stale++;
- }
- ASSERT(be16_to_cpu(leaf->hdr.stale) == stale);
+ dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr);
+
+ ASSERT((hdr.firstdb %
+ dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0);
+ ASSERT(hdr.firstdb <= db);
+ ASSERT(db < hdr.firstdb + hdr.nvalid);
}
+#else
+#define xfs_dir2_free_hdr_check(dp, bp, db)
#endif /* DEBUG */
/*
@@ -362,18 +486,26 @@ xfs_dir2_leafn_check(
*/
xfs_dahash_t /* hash value */
xfs_dir2_leafn_lasthash(
+ struct xfs_inode *dp,
struct xfs_buf *bp, /* leaf buffer */
int *count) /* count of entries in leaf */
{
- xfs_dir2_leaf_t *leaf; /* leaf structure */
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+
+ ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+ leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
- leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
if (count)
- *count = be16_to_cpu(leaf->hdr.count);
- if (!leaf->hdr.count)
+ *count = leafhdr.count;
+ if (!leafhdr.count)
return 0;
- return be32_to_cpu(leaf->ents[be16_to_cpu(leaf->hdr.count) - 1].hashval);
+
+ ents = dp->d_ops->leaf_ents_p(leaf);
+ return be32_to_cpu(ents[leafhdr.count - 1].hashval);
}
/*
@@ -402,16 +534,19 @@ xfs_dir2_leafn_lookup_for_addname(
xfs_dir2_db_t newdb; /* new data block number */
xfs_dir2_db_t newfdb; /* new free block number */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
-#ifdef __KERNEL__
- ASSERT(be16_to_cpu(leaf->hdr.count) > 0);
-#endif
- xfs_dir2_leafn_check(dp, bp);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+
+ xfs_dir3_leaf_check(dp, bp);
+ ASSERT(leafhdr.count > 0);
+
/*
* Look up the hash value in the leaf entries.
*/
@@ -424,15 +559,16 @@ xfs_dir2_leafn_lookup_for_addname(
curbp = state->extrablk.bp;
curfdb = state->extrablk.blkno;
free = curbp->b_addr;
- ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
+ ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+ free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
}
- length = xfs_dir2_data_entsize(args->namelen);
+ length = dp->d_ops->data_entsize(args->namelen);
/*
* Loop over leaf entries with the right hash value.
*/
- for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) &&
- be32_to_cpu(lep->hashval) == args->hashval;
- lep++, index++) {
+ for (lep = &ents[index];
+ index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
/*
* Skip stale leaf entries.
*/
@@ -441,7 +577,8 @@ xfs_dir2_leafn_lookup_for_addname(
/*
* Pull the data block number from the entry.
*/
- newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ newdb = xfs_dir2_dataptr_to_db(args->geo,
+ be32_to_cpu(lep->address));
/*
* For addname, we're looking for a place to put the new entry.
* We want to use a data block with an entry of equal
@@ -451,12 +588,14 @@ xfs_dir2_leafn_lookup_for_addname(
* in hand, take a look at it.
*/
if (newdb != curdb) {
+ __be16 *bests;
+
curdb = newdb;
/*
* Convert the data block to the free block
* holding its freespace information.
*/
- newfdb = xfs_dir2_db_to_fdb(mp, newdb);
+ newfdb = dp->d_ops->db_to_fdb(args->geo, newdb);
/*
* If it's not the one we have in hand, read it in.
*/
@@ -468,28 +607,24 @@ xfs_dir2_leafn_lookup_for_addname(
xfs_trans_brelse(tp, curbp);
error = xfs_dir2_free_read(tp, dp,
- xfs_dir2_db_to_da(mp, newfdb),
+ xfs_dir2_db_to_da(args->geo,
+ newfdb),
&curbp);
if (error)
return error;
free = curbp->b_addr;
- ASSERT(be32_to_cpu(free->hdr.magic) ==
- XFS_DIR2_FREE_MAGIC);
- ASSERT((be32_to_cpu(free->hdr.firstdb) %
- xfs_dir2_free_max_bests(mp)) == 0);
- ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb);
- ASSERT(curdb < be32_to_cpu(free->hdr.firstdb) +
- be32_to_cpu(free->hdr.nvalid));
+
+ xfs_dir2_free_hdr_check(dp, curbp, curdb);
}
/*
* Get the index for our entry.
*/
- fi = xfs_dir2_db_to_fdindex(mp, curdb);
+ fi = dp->d_ops->db_to_fdindex(args->geo, curdb);
/*
* If it has room, return it.
*/
- if (unlikely(free->bests[fi] ==
- cpu_to_be16(NULLDATAOFF))) {
+ bests = dp->d_ops->free_bests_p(free);
+ if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) {
XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
XFS_ERRLEVEL_LOW, mp);
if (curfdb != newfdb)
@@ -497,7 +632,7 @@ xfs_dir2_leafn_lookup_for_addname(
return XFS_ERROR(EFSCORRUPTED);
}
curfdb = newfdb;
- if (be16_to_cpu(free->bests[fi]) >= length)
+ if (be16_to_cpu(bests[fi]) >= length)
goto out;
}
}
@@ -511,6 +646,12 @@ out:
state->extrablk.bp = curbp;
state->extrablk.index = fi;
state->extrablk.blkno = curfdb;
+
+ /*
+ * Important: this magic number is not in the buffer - it's for
+ * buffer type information and therefore only the free/data type
+ * matters here, not whether CRCs are enabled or not.
+ */
state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
} else {
state->extravalid = 0;
@@ -545,16 +686,19 @@ xfs_dir2_leafn_lookup_for_entry(
xfs_dir2_db_t newdb; /* new data block number */
xfs_trans_t *tp; /* transaction pointer */
enum xfs_dacmp cmp; /* comparison result */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
-#ifdef __KERNEL__
- ASSERT(be16_to_cpu(leaf->hdr.count) > 0);
-#endif
- xfs_dir2_leafn_check(dp, bp);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+
+ xfs_dir3_leaf_check(dp, bp);
+ ASSERT(leafhdr.count > 0);
+
/*
* Look up the hash value in the leaf entries.
*/
@@ -569,9 +713,9 @@ xfs_dir2_leafn_lookup_for_entry(
/*
* Loop over leaf entries with the right hash value.
*/
- for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) &&
- be32_to_cpu(lep->hashval) == args->hashval;
- lep++, index++) {
+ for (lep = &ents[index];
+ index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
/*
* Skip stale leaf entries.
*/
@@ -580,7 +724,8 @@ xfs_dir2_leafn_lookup_for_entry(
/*
* Pull the data block number from the entry.
*/
- newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ newdb = xfs_dir2_dataptr_to_db(args->geo,
+ be32_to_cpu(lep->address));
/*
* Not adding a new entry, so we really want to find
* the name given to us.
@@ -604,20 +749,22 @@ xfs_dir2_leafn_lookup_for_entry(
ASSERT(state->extravalid);
curbp = state->extrablk.bp;
} else {
- error = xfs_dir2_data_read(tp, dp,
- xfs_dir2_db_to_da(mp, newdb),
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo,
+ newdb),
-1, &curbp);
if (error)
return error;
}
- xfs_dir2_data_check(dp, curbp);
+ xfs_dir3_data_check(dp, curbp);
curdb = newdb;
}
/*
* Point to the data entry.
*/
dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr +
- xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(lep->address)));
/*
* Compare the entry and if it's an exact match, return
* EEXIST immediately. If it's the first case-insensitive
@@ -631,6 +778,7 @@ xfs_dir2_leafn_lookup_for_entry(
xfs_trans_brelse(tp, state->extrablk.bp);
args->cmpresult = cmp;
args->inumber = be64_to_cpu(dep->inumber);
+ args->filetype = dp->d_ops->data_get_ftype(dep);
*indexp = index;
state->extravalid = 1;
state->extrablk.bp = curbp;
@@ -638,13 +786,13 @@ xfs_dir2_leafn_lookup_for_entry(
state->extrablk.index = (int)((char *)dep -
(char *)curbp->b_addr);
state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
- curbp->b_ops = &xfs_dir2_data_buf_ops;
+ curbp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
if (cmp == XFS_CMP_EXACT)
return XFS_ERROR(EEXIST);
}
}
- ASSERT(index == be16_to_cpu(leaf->hdr.count) ||
- (args->op_flags & XFS_DA_OP_OKNOENT));
+ ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT));
if (curbp) {
if (args->cmpresult == XFS_CMP_DIFFERENT) {
/* Giving back last used data block. */
@@ -653,7 +801,8 @@ xfs_dir2_leafn_lookup_for_entry(
state->extrablk.index = -1;
state->extrablk.blkno = curdb;
state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
- curbp->b_ops = &xfs_dir2_data_buf_ops;
+ curbp->b_ops = &xfs_dir3_data_buf_ops;
+ xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
} else {
/* If the curbp is not the CI match block, drop it */
if (state->extrablk.bp != curbp)
@@ -689,52 +838,49 @@ xfs_dir2_leafn_lookup_int(
* Log entries and headers. Stale entries are preserved.
*/
static void
-xfs_dir2_leafn_moveents(
- xfs_da_args_t *args, /* operation arguments */
- struct xfs_buf *bp_s, /* source leaf buffer */
- int start_s, /* source leaf index */
- struct xfs_buf *bp_d, /* destination leaf buffer */
- int start_d, /* destination leaf index */
- int count) /* count of leaves to copy */
+xfs_dir3_leafn_moveents(
+ xfs_da_args_t *args, /* operation arguments */
+ struct xfs_buf *bp_s, /* source */
+ struct xfs_dir3_icleaf_hdr *shdr,
+ struct xfs_dir2_leaf_entry *sents,
+ int start_s,/* source leaf index */
+ struct xfs_buf *bp_d, /* destination */
+ struct xfs_dir3_icleaf_hdr *dhdr,
+ struct xfs_dir2_leaf_entry *dents,
+ int start_d,/* destination leaf index */
+ int count) /* count of leaves to copy */
{
- xfs_dir2_leaf_t *leaf_d; /* destination leaf structure */
- xfs_dir2_leaf_t *leaf_s; /* source leaf structure */
- int stale; /* count stale leaves copied */
- xfs_trans_t *tp; /* transaction pointer */
+ int stale; /* count stale leaves copied */
trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
/*
* Silently return if nothing to do.
*/
- if (count == 0) {
+ if (count == 0)
return;
- }
- tp = args->trans;
- leaf_s = bp_s->b_addr;
- leaf_d = bp_d->b_addr;
+
/*
* If the destination index is not the end of the current
* destination leaf entries, open up a hole in the destination
* to hold the new entries.
*/
- if (start_d < be16_to_cpu(leaf_d->hdr.count)) {
- memmove(&leaf_d->ents[start_d + count], &leaf_d->ents[start_d],
- (be16_to_cpu(leaf_d->hdr.count) - start_d) *
- sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir2_leaf_log_ents(tp, bp_d, start_d + count,
- count + be16_to_cpu(leaf_d->hdr.count) - 1);
+ if (start_d < dhdr->count) {
+ memmove(&dents[start_d + count], &dents[start_d],
+ (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t));
+ xfs_dir3_leaf_log_ents(args, bp_d, start_d + count,
+ count + dhdr->count - 1);
}
/*
* If the source has stale leaves, count the ones in the copy range
* so we can update the header correctly.
*/
- if (leaf_s->hdr.stale) {
+ if (shdr->stale) {
int i; /* temp leaf index */
for (i = start_s, stale = 0; i < start_s + count; i++) {
- if (leaf_s->ents[i].address ==
- cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ if (sents[i].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
stale++;
}
} else
@@ -742,29 +888,27 @@ xfs_dir2_leafn_moveents(
/*
* Copy the leaf entries from source to destination.
*/
- memcpy(&leaf_d->ents[start_d], &leaf_s->ents[start_s],
+ memcpy(&dents[start_d], &sents[start_s],
count * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir2_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1);
+ xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1);
+
/*
* If there are source entries after the ones we copied,
* delete the ones we copied by sliding the next ones down.
*/
- if (start_s + count < be16_to_cpu(leaf_s->hdr.count)) {
- memmove(&leaf_s->ents[start_s], &leaf_s->ents[start_s + count],
+ if (start_s + count < shdr->count) {
+ memmove(&sents[start_s], &sents[start_s + count],
count * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir2_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1);
+ xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1);
}
+
/*
* Update the headers and log them.
*/
- be16_add_cpu(&leaf_s->hdr.count, -(count));
- be16_add_cpu(&leaf_s->hdr.stale, -(stale));
- be16_add_cpu(&leaf_d->hdr.count, count);
- be16_add_cpu(&leaf_d->hdr.stale, stale);
- xfs_dir2_leaf_log_header(tp, bp_s);
- xfs_dir2_leaf_log_header(tp, bp_d);
- xfs_dir2_leafn_check(args->dp, bp_s);
- xfs_dir2_leafn_check(args->dp, bp_d);
+ shdr->count -= count;
+ shdr->stale -= stale;
+ dhdr->count += count;
+ dhdr->stale += stale;
}
/*
@@ -773,21 +917,26 @@ xfs_dir2_leafn_moveents(
*/
int /* sort order */
xfs_dir2_leafn_order(
- struct xfs_buf *leaf1_bp, /* leaf1 buffer */
- struct xfs_buf *leaf2_bp) /* leaf2 buffer */
+ struct xfs_inode *dp,
+ struct xfs_buf *leaf1_bp, /* leaf1 buffer */
+ struct xfs_buf *leaf2_bp) /* leaf2 buffer */
{
- xfs_dir2_leaf_t *leaf1; /* leaf1 structure */
- xfs_dir2_leaf_t *leaf2; /* leaf2 structure */
-
- leaf1 = leaf1_bp->b_addr;
- leaf2 = leaf2_bp->b_addr;
- ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
- ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
- if (be16_to_cpu(leaf1->hdr.count) > 0 &&
- be16_to_cpu(leaf2->hdr.count) > 0 &&
- (be32_to_cpu(leaf2->ents[0].hashval) < be32_to_cpu(leaf1->ents[0].hashval) ||
- be32_to_cpu(leaf2->ents[be16_to_cpu(leaf2->hdr.count) - 1].hashval) <
- be32_to_cpu(leaf1->ents[be16_to_cpu(leaf1->hdr.count) - 1].hashval)))
+ struct xfs_dir2_leaf *leaf1 = leaf1_bp->b_addr;
+ struct xfs_dir2_leaf *leaf2 = leaf2_bp->b_addr;
+ struct xfs_dir2_leaf_entry *ents1;
+ struct xfs_dir2_leaf_entry *ents2;
+ struct xfs_dir3_icleaf_hdr hdr1;
+ struct xfs_dir3_icleaf_hdr hdr2;
+
+ dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
+ dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
+ ents1 = dp->d_ops->leaf_ents_p(leaf1);
+ ents2 = dp->d_ops->leaf_ents_p(leaf2);
+
+ if (hdr1.count > 0 && hdr2.count > 0 &&
+ (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) ||
+ be32_to_cpu(ents2[hdr2.count - 1].hashval) <
+ be32_to_cpu(ents1[hdr1.count - 1].hashval)))
return 1;
return 0;
}
@@ -811,17 +960,22 @@ xfs_dir2_leafn_rebalance(
xfs_dir2_leaf_t *leaf1; /* first leaf structure */
xfs_dir2_leaf_t *leaf2; /* second leaf structure */
int mid; /* midpoint leaf index */
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
int oldstale; /* old count of stale leaves */
#endif
int oldsum; /* old total leaf count */
int swap; /* swapped leaf blocks */
+ struct xfs_dir2_leaf_entry *ents1;
+ struct xfs_dir2_leaf_entry *ents2;
+ struct xfs_dir3_icleaf_hdr hdr1;
+ struct xfs_dir3_icleaf_hdr hdr2;
+ struct xfs_inode *dp = state->args->dp;
args = state->args;
/*
* If the block order is wrong, swap the arguments.
*/
- if ((swap = xfs_dir2_leafn_order(blk1->bp, blk2->bp))) {
+ if ((swap = xfs_dir2_leafn_order(dp, blk1->bp, blk2->bp))) {
xfs_da_state_blk_t *tmp; /* temp for block swap */
tmp = blk1;
@@ -830,11 +984,17 @@ xfs_dir2_leafn_rebalance(
}
leaf1 = blk1->bp->b_addr;
leaf2 = blk2->bp->b_addr;
- oldsum = be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count);
-#ifdef DEBUG
- oldstale = be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale);
+ dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
+ dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
+ ents1 = dp->d_ops->leaf_ents_p(leaf1);
+ ents2 = dp->d_ops->leaf_ents_p(leaf2);
+
+ oldsum = hdr1.count + hdr2.count;
+#if defined(DEBUG) || defined(XFS_WARN)
+ oldstale = hdr1.stale + hdr2.stale;
#endif
mid = oldsum >> 1;
+
/*
* If the old leaf count was odd then the new one will be even,
* so we need to divide the new count evenly.
@@ -842,10 +1002,10 @@ xfs_dir2_leafn_rebalance(
if (oldsum & 1) {
xfs_dahash_t midhash; /* middle entry hash value */
- if (mid >= be16_to_cpu(leaf1->hdr.count))
- midhash = be32_to_cpu(leaf2->ents[mid - be16_to_cpu(leaf1->hdr.count)].hashval);
+ if (mid >= hdr1.count)
+ midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval);
else
- midhash = be32_to_cpu(leaf1->ents[mid].hashval);
+ midhash = be32_to_cpu(ents1[mid].hashval);
isleft = args->hashval <= midhash;
}
/*
@@ -859,46 +1019,58 @@ xfs_dir2_leafn_rebalance(
* Calculate moved entry count. Positive means left-to-right,
* negative means right-to-left. Then move the entries.
*/
- count = be16_to_cpu(leaf1->hdr.count) - mid + (isleft == 0);
+ count = hdr1.count - mid + (isleft == 0);
if (count > 0)
- xfs_dir2_leafn_moveents(args, blk1->bp,
- be16_to_cpu(leaf1->hdr.count) - count, blk2->bp, 0, count);
+ xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1,
+ hdr1.count - count, blk2->bp,
+ &hdr2, ents2, 0, count);
else if (count < 0)
- xfs_dir2_leafn_moveents(args, blk2->bp, 0, blk1->bp,
- be16_to_cpu(leaf1->hdr.count), count);
- ASSERT(be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count) == oldsum);
- ASSERT(be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale) == oldstale);
+ xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0,
+ blk1->bp, &hdr1, ents1,
+ hdr1.count, count);
+
+ ASSERT(hdr1.count + hdr2.count == oldsum);
+ ASSERT(hdr1.stale + hdr2.stale == oldstale);
+
+ /* log the changes made when moving the entries */
+ dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1);
+ dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2);
+ xfs_dir3_leaf_log_header(args, blk1->bp);
+ xfs_dir3_leaf_log_header(args, blk2->bp);
+
+ xfs_dir3_leaf_check(dp, blk1->bp);
+ xfs_dir3_leaf_check(dp, blk2->bp);
+
/*
* Mark whether we're inserting into the old or new leaf.
*/
- if (be16_to_cpu(leaf1->hdr.count) < be16_to_cpu(leaf2->hdr.count))
+ if (hdr1.count < hdr2.count)
state->inleaf = swap;
- else if (be16_to_cpu(leaf1->hdr.count) > be16_to_cpu(leaf2->hdr.count))
+ else if (hdr1.count > hdr2.count)
state->inleaf = !swap;
else
- state->inleaf =
- swap ^ (blk1->index <= be16_to_cpu(leaf1->hdr.count));
+ state->inleaf = swap ^ (blk1->index <= hdr1.count);
/*
* Adjust the expected index for insertion.
*/
if (!state->inleaf)
- blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count);
+ blk2->index = blk1->index - hdr1.count;
/*
* Finally sanity check just to make sure we are not returning a
* negative index
*/
- if(blk2->index < 0) {
+ if (blk2->index < 0) {
state->inleaf = 1;
blk2->index = 0;
- xfs_alert(args->dp->i_mount,
- "%s: picked the wrong leaf? reverting original leaf: blk1->index %d\n",
+ xfs_alert(dp->i_mount,
+ "%s: picked the wrong leaf? reverting original leaf: blk1->index %d",
__func__, blk1->index);
}
}
static int
-xfs_dir2_data_block_free(
+xfs_dir3_data_block_free(
xfs_da_args_t *args,
struct xfs_dir2_data_hdr *hdr,
struct xfs_dir2_free *free,
@@ -907,64 +1079,72 @@ xfs_dir2_data_block_free(
struct xfs_buf *fbp,
int longest)
{
- struct xfs_trans *tp = args->trans;
int logfree = 0;
+ __be16 *bests;
+ struct xfs_dir3_icfree_hdr freehdr;
+ struct xfs_inode *dp = args->dp;
- if (!hdr) {
- /* One less used entry in the free table. */
- be32_add_cpu(&free->hdr.nused, -1);
- xfs_dir2_free_log_header(tp, fbp);
-
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ bests = dp->d_ops->free_bests_p(free);
+ if (hdr) {
/*
- * If this was the last entry in the table, we can trim the
- * table size back. There might be other entries at the end
- * referring to non-existent data blocks, get those too.
+ * Data block is not empty, just set the free entry to the new
+ * value.
*/
- if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
- int i; /* free entry index */
+ bests[findex] = cpu_to_be16(longest);
+ xfs_dir2_free_log_bests(args, fbp, findex, findex);
+ return 0;
+ }
- for (i = findex - 1; i >= 0; i--) {
- if (free->bests[i] != cpu_to_be16(NULLDATAOFF))
- break;
- }
- free->hdr.nvalid = cpu_to_be32(i + 1);
- logfree = 0;
- } else {
- /* Not the last entry, just punch it out. */
- free->bests[findex] = cpu_to_be16(NULLDATAOFF);
- logfree = 1;
- }
- /*
- * If there are no useful entries left in the block,
- * get rid of the block if we can.
- */
- if (!free->hdr.nused) {
- int error;
+ /* One less used entry in the free table. */
+ freehdr.nused--;
- error = xfs_dir2_shrink_inode(args, fdb, fbp);
- if (error == 0) {
- fbp = NULL;
- logfree = 0;
- } else if (error != ENOSPC || args->total != 0)
- return error;
- /*
- * It's possible to get ENOSPC if there is no
- * space reservation. In this case some one
- * else will eventually get rid of this block.
- */
+ /*
+ * If this was the last entry in the table, we can trim the table size
+ * back. There might be other entries at the end referring to
+ * non-existent data blocks, get those too.
+ */
+ if (findex == freehdr.nvalid - 1) {
+ int i; /* free entry index */
+
+ for (i = findex - 1; i >= 0; i--) {
+ if (bests[i] != cpu_to_be16(NULLDATAOFF))
+ break;
}
+ freehdr.nvalid = i + 1;
+ logfree = 0;
} else {
+ /* Not the last entry, just punch it out. */
+ bests[findex] = cpu_to_be16(NULLDATAOFF);
+ logfree = 1;
+ }
+
+ dp->d_ops->free_hdr_to_disk(free, &freehdr);
+ xfs_dir2_free_log_header(args, fbp);
+
+ /*
+ * If there are no useful entries left in the block, get rid of the
+ * block if we can.
+ */
+ if (!freehdr.nused) {
+ int error;
+
+ error = xfs_dir2_shrink_inode(args, fdb, fbp);
+ if (error == 0) {
+ fbp = NULL;
+ logfree = 0;
+ } else if (error != ENOSPC || args->total != 0)
+ return error;
/*
- * Data block is not empty, just set the free entry to the new
- * value.
+ * It's possible to get ENOSPC if there is no
+ * space reservation. In this case some one
+ * else will eventually get rid of this block.
*/
- free->bests[findex] = cpu_to_be16(longest);
- logfree = 1;
}
/* Log the free entry that changed, unless we got rid of it. */
if (logfree)
- xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+ xfs_dir2_free_log_bests(args, fbp, findex, findex);
return 0;
}
@@ -994,6 +1174,9 @@ xfs_dir2_leafn_remove(
int needlog; /* need to log data header */
int needscan; /* need to rescan data frees */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
trace_xfs_dir2_leafn_remove(args, index);
@@ -1001,26 +1184,33 @@ xfs_dir2_leafn_remove(
tp = args->trans;
mp = dp->i_mount;
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+
/*
* Point to the entry we're removing.
*/
- lep = &leaf->ents[index];
+ lep = &ents[index];
+
/*
* Extract the data block and offset from the entry.
*/
- db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
ASSERT(dblk->blkno == db);
- off = xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address));
+ off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address));
ASSERT(dblk->index == off);
+
/*
* Kill the leaf entry by marking it stale.
* Log the leaf block changes.
*/
- be16_add_cpu(&leaf->hdr.stale, 1);
- xfs_dir2_leaf_log_header(tp, bp);
+ leafhdr.stale++;
+ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, bp);
+
lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
- xfs_dir2_leaf_log_ents(tp, bp, index, index);
+ xfs_dir3_leaf_log_ents(args, bp, index, index);
+
/*
* Make the data entry free. Keep track of the longest freespace
* in the data block in case it changes.
@@ -1028,24 +1218,25 @@ xfs_dir2_leafn_remove(
dbp = dblk->bp;
hdr = dbp->b_addr;
dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
- longest = be16_to_cpu(hdr->bestfree[0].length);
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ longest = be16_to_cpu(bf[0].length);
needlog = needscan = 0;
- xfs_dir2_data_make_free(tp, dbp, off,
- xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
+ xfs_dir2_data_make_free(args, dbp, off,
+ dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
/*
* Rescan the data block freespaces for bestfree.
* Log the data block header if needed.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, dbp);
- xfs_dir2_data_check(dp, dbp);
+ xfs_dir2_data_log_header(args, dbp);
+ xfs_dir3_data_check(dp, dbp);
/*
* If the longest data block freespace changes, need to update
* the corresponding freeblock entry.
*/
- if (longest < be16_to_cpu(hdr->bestfree[0].length)) {
+ if (longest < be16_to_cpu(bf[0].length)) {
int error; /* error return value */
struct xfs_buf *fbp; /* freeblock buffer */
xfs_dir2_db_t fdb; /* freeblock block number */
@@ -1056,26 +1247,33 @@ xfs_dir2_leafn_remove(
* Convert the data block number to a free block,
* read in the free block.
*/
- fdb = xfs_dir2_db_to_fdb(mp, db);
- error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb),
+ fdb = dp->d_ops->db_to_fdb(args->geo, db);
+ error = xfs_dir2_free_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, fdb),
&fbp);
if (error)
return error;
free = fbp->b_addr;
- ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
- ASSERT(be32_to_cpu(free->hdr.firstdb) ==
- xfs_dir2_free_max_bests(mp) *
- (fdb - XFS_DIR2_FREE_FIRSTDB(mp)));
+#ifdef DEBUG
+ {
+ struct xfs_dir3_icfree_hdr freehdr;
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) *
+ (fdb - xfs_dir2_byte_to_db(args->geo,
+ XFS_DIR2_FREE_OFFSET)));
+ }
+#endif
/*
* Calculate which entry we need to fix.
*/
- findex = xfs_dir2_db_to_fdindex(mp, db);
- longest = be16_to_cpu(hdr->bestfree[0].length);
+ findex = dp->d_ops->db_to_fdindex(args->geo, db);
+ longest = be16_to_cpu(bf[0].length);
/*
* If the data block is now empty we can get rid of it
* (usually).
*/
- if (longest == mp->m_dirblksize - (uint)sizeof(*hdr)) {
+ if (longest == args->geo->blksize -
+ dp->d_ops->data_entry_offset) {
/*
* Try to punch out the data block.
*/
@@ -1096,22 +1294,20 @@ xfs_dir2_leafn_remove(
* If we got rid of the data block, we can eliminate that entry
* in the free block.
*/
- error = xfs_dir2_data_block_free(args, hdr, free,
+ error = xfs_dir3_data_block_free(args, hdr, free,
fdb, findex, fbp, longest);
if (error)
return error;
}
- xfs_dir2_leafn_check(dp, bp);
+ xfs_dir3_leaf_check(dp, bp);
/*
* Return indication of whether this leaf block is empty enough
* to justify trying to join it with a neighbor.
*/
- *rval =
- ((uint)sizeof(leaf->hdr) +
- (uint)sizeof(leaf->ents[0]) *
- (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale))) <
- mp->m_dir_magicpct;
+ *rval = (dp->d_ops->leaf_hdr_size +
+ (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) <
+ args->geo->magicpct;
return 0;
}
@@ -1128,13 +1324,14 @@ xfs_dir2_leafn_split(
xfs_dablk_t blkno; /* new leaf block number */
int error; /* error return value */
xfs_mount_t *mp; /* filesystem mount point */
+ struct xfs_inode *dp;
/*
* Allocate space for a new leaf node.
*/
args = state->args;
- mp = args->dp->i_mount;
- ASSERT(args != NULL);
+ dp = args->dp;
+ mp = dp->i_mount;
ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
error = xfs_da_grow_inode(args, &blkno);
if (error) {
@@ -1143,11 +1340,11 @@ xfs_dir2_leafn_split(
/*
* Initialize the new leaf block.
*/
- error = xfs_dir2_leaf_init(args, xfs_dir2_da_to_db(mp, blkno),
- &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
- if (error) {
+ error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno),
+ &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
+ if (error)
return error;
- }
+
newblk->blkno = blkno;
newblk->magic = XFS_DIR2_LEAFN_MAGIC;
/*
@@ -1155,7 +1352,7 @@ xfs_dir2_leafn_split(
* block into the leaves.
*/
xfs_dir2_leafn_rebalance(state, oldblk, newblk);
- error = xfs_da_blk_link(state, oldblk, newblk);
+ error = xfs_da3_blk_link(state, oldblk, newblk);
if (error) {
return error;
}
@@ -1169,10 +1366,10 @@ xfs_dir2_leafn_split(
/*
* Update last hashval in each block since we added the name.
*/
- oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL);
- newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL);
- xfs_dir2_leafn_check(args->dp, oldblk->bp);
- xfs_dir2_leafn_check(args->dp, newblk->bp);
+ oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL);
+ newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL);
+ xfs_dir3_leaf_check(dp, oldblk->bp);
+ xfs_dir3_leaf_check(dp, newblk->bp);
return error;
}
@@ -1198,9 +1395,11 @@ xfs_dir2_leafn_toosmall(
int error; /* error return value */
int forward; /* sibling block direction */
int i; /* sibling counter */
- xfs_da_blkinfo_t *info; /* leaf block header */
xfs_dir2_leaf_t *leaf; /* leaf structure */
int rval; /* result from path_shift */
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_inode *dp = state->args->dp;
/*
* Check for the degenerate case of the block being over 50% full.
@@ -1208,12 +1407,14 @@ xfs_dir2_leafn_toosmall(
* to coalesce with a sibling.
*/
blk = &state->path.blk[state->path.active - 1];
- info = blk->bp->b_addr;
- ASSERT(info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
- leaf = (xfs_dir2_leaf_t *)info;
- count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale);
- bytes = (uint)sizeof(leaf->hdr) + count * (uint)sizeof(leaf->ents[0]);
- if (bytes > (state->blocksize >> 1)) {
+ leaf = blk->bp->b_addr;
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir3_leaf_check(dp, blk->bp);
+
+ count = leafhdr.count - leafhdr.stale;
+ bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]);
+ if (bytes > (state->args->geo->blksize >> 1)) {
/*
* Blk over 50%, don't try to join.
*/
@@ -1231,9 +1432,9 @@ xfs_dir2_leafn_toosmall(
* Make altpath point to the block we want to keep and
* path point to the block we want to drop (this one).
*/
- forward = (info->forw != 0);
+ forward = (leafhdr.forw != 0);
memcpy(&state->altpath, &state->path, sizeof(state->path));
- error = xfs_da_path_shift(state, &state->altpath, forward, 0,
+ error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
&rval);
if (error)
return error;
@@ -1247,15 +1448,17 @@ xfs_dir2_leafn_toosmall(
* We prefer coalescing with the lower numbered sibling so as
* to shrink a directory over time.
*/
- forward = be32_to_cpu(info->forw) < be32_to_cpu(info->back);
+ forward = leafhdr.forw < leafhdr.back;
for (i = 0, bp = NULL; i < 2; forward = !forward, i++) {
- blkno = forward ? be32_to_cpu(info->forw) : be32_to_cpu(info->back);
+ struct xfs_dir3_icleaf_hdr hdr2;
+
+ blkno = forward ? leafhdr.forw : leafhdr.back;
if (blkno == 0)
continue;
/*
* Read the sibling leaf block.
*/
- error = xfs_dir2_leafn_read(state->args->trans, state->args->dp,
+ error = xfs_dir3_leafn_read(state->args->trans, dp,
blkno, -1, &bp);
if (error)
return error;
@@ -1263,13 +1466,16 @@ xfs_dir2_leafn_toosmall(
/*
* Count bytes in the two blocks combined.
*/
- leaf = (xfs_dir2_leaf_t *)info;
- count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale);
- bytes = state->blocksize - (state->blocksize >> 2);
+ count = leafhdr.count - leafhdr.stale;
+ bytes = state->args->geo->blksize -
+ (state->args->geo->blksize >> 2);
+
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
- count += be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale);
- bytes -= count * (uint)sizeof(leaf->ents[0]);
+ dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+ count += hdr2.count - hdr2.stale;
+ bytes -= count * sizeof(ents[0]);
+
/*
* Fits with at least 25% to spare.
*/
@@ -1291,10 +1497,10 @@ xfs_dir2_leafn_toosmall(
*/
memcpy(&state->altpath, &state->path, sizeof(state->path));
if (blkno < blk->blkno)
- error = xfs_da_path_shift(state, &state->altpath, forward, 0,
+ error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
&rval);
else
- error = xfs_da_path_shift(state, &state->path, forward, 0,
+ error = xfs_da3_path_shift(state, &state->path, forward, 0,
&rval);
if (error) {
return error;
@@ -1316,34 +1522,54 @@ xfs_dir2_leafn_unbalance(
xfs_da_args_t *args; /* operation arguments */
xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */
xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */
+ struct xfs_dir3_icleaf_hdr savehdr;
+ struct xfs_dir3_icleaf_hdr drophdr;
+ struct xfs_dir2_leaf_entry *sents;
+ struct xfs_dir2_leaf_entry *dents;
+ struct xfs_inode *dp = state->args->dp;
args = state->args;
ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
drop_leaf = drop_blk->bp->b_addr;
save_leaf = save_blk->bp->b_addr;
- ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
- ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+
+ dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf);
+ dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf);
+ sents = dp->d_ops->leaf_ents_p(save_leaf);
+ dents = dp->d_ops->leaf_ents_p(drop_leaf);
+
/*
* If there are any stale leaf entries, take this opportunity
* to purge them.
*/
- if (drop_leaf->hdr.stale)
- xfs_dir2_leaf_compact(args, drop_blk->bp);
- if (save_leaf->hdr.stale)
- xfs_dir2_leaf_compact(args, save_blk->bp);
+ if (drophdr.stale)
+ xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp);
+ if (savehdr.stale)
+ xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp);
+
/*
* Move the entries from drop to the appropriate end of save.
*/
- drop_blk->hashval = be32_to_cpu(drop_leaf->ents[be16_to_cpu(drop_leaf->hdr.count) - 1].hashval);
- if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp))
- xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, 0,
- be16_to_cpu(drop_leaf->hdr.count));
+ drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval);
+ if (xfs_dir2_leafn_order(dp, save_blk->bp, drop_blk->bp))
+ xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+ save_blk->bp, &savehdr, sents, 0,
+ drophdr.count);
else
- xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp,
- be16_to_cpu(save_leaf->hdr.count), be16_to_cpu(drop_leaf->hdr.count));
- save_blk->hashval = be32_to_cpu(save_leaf->ents[be16_to_cpu(save_leaf->hdr.count) - 1].hashval);
- xfs_dir2_leafn_check(args->dp, save_blk->bp);
+ xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+ save_blk->bp, &savehdr, sents,
+ savehdr.count, drophdr.count);
+ save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval);
+
+ /* log the changes made when moving the entries */
+ dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr);
+ dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr);
+ xfs_dir3_leaf_log_header(args, save_blk->bp);
+ xfs_dir3_leaf_log_header(args, drop_blk->bp);
+
+ xfs_dir3_leaf_check(dp, save_blk->bp);
+ xfs_dir3_leaf_check(dp, drop_blk->bp);
}
/*
@@ -1366,13 +1592,11 @@ xfs_dir2_node_addname(
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_dirblksize;
- state->node_ents = state->mp->m_dir_node_ents;
/*
* Look up the name. We're not supposed to find it, but
* this gives us the insertion point.
*/
- error = xfs_da_node_lookup_int(state, &rval);
+ error = xfs_da3_node_lookup_int(state, &rval);
if (error)
rval = error;
if (rval != ENOENT) {
@@ -1398,7 +1622,7 @@ xfs_dir2_node_addname(
* It worked, fix the hash values up the btree.
*/
if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
- xfs_da_fixhashpath(state, &state->path);
+ xfs_da3_fixhashpath(state, &state->path);
} else {
/*
* It didn't work, we need to split the leaf block.
@@ -1410,7 +1634,7 @@ xfs_dir2_node_addname(
/*
* Split the leaf block and insert the new entry.
*/
- rval = xfs_da_split(state);
+ rval = xfs_da3_split(state);
}
done:
xfs_da_state_free(state);
@@ -1447,11 +1671,14 @@ xfs_dir2_node_addname_int(
int needscan; /* need to rescan data frees */
__be16 *tagp; /* data entry tag pointer */
xfs_trans_t *tp; /* transaction pointer */
+ __be16 *bests;
+ struct xfs_dir3_icfree_hdr freehdr;
+ struct xfs_dir2_data_free *bf;
dp = args->dp;
mp = dp->i_mount;
tp = args->trans;
- length = xfs_dir2_data_entsize(args->namelen);
+ length = dp->d_ops->data_entsize(args->namelen);
/*
* If we came in with a freespace block that means that lookup
* found an entry with our hash value. This is the freespace
@@ -1464,36 +1691,37 @@ xfs_dir2_node_addname_int(
*/
ifbno = fblk->blkno;
free = fbp->b_addr;
- ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
findex = fblk->index;
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
/*
* This means the free entry showed that the data block had
* space for our entry, so we remembered it.
* Use that data block.
*/
if (findex >= 0) {
- ASSERT(findex < be32_to_cpu(free->hdr.nvalid));
- ASSERT(be16_to_cpu(free->bests[findex]) != NULLDATAOFF);
- ASSERT(be16_to_cpu(free->bests[findex]) >= length);
- dbno = be32_to_cpu(free->hdr.firstdb) + findex;
- }
- /*
- * The data block looked at didn't have enough room.
- * We'll start at the beginning of the freespace entries.
- */
- else {
+ ASSERT(findex < freehdr.nvalid);
+ ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
+ ASSERT(be16_to_cpu(bests[findex]) >= length);
+ dbno = freehdr.firstdb + findex;
+ } else {
+ /*
+ * The data block looked at didn't have enough room.
+ * We'll start at the beginning of the freespace entries.
+ */
dbno = -1;
findex = 0;
}
- }
- /*
- * Didn't come in with a freespace block, so don't have a data block.
- */
- else {
+ } else {
+ /*
+ * Didn't come in with a freespace block, so no data block.
+ */
ifbno = dbno = -1;
fbp = NULL;
findex = 0;
}
+
/*
* If we don't have a data block yet, we're going to scan the
* freespace blocks looking for one. Figure out what the
@@ -1502,9 +1730,9 @@ xfs_dir2_node_addname_int(
if (dbno == -1) {
xfs_fileoff_t fo; /* freespace block number */
- if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK)))
+ if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK)))
return error;
- lastfbno = xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo);
+ lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
fbno = ifbno;
}
/*
@@ -1522,7 +1750,8 @@ xfs_dir2_node_addname_int(
* us a freespace block to start with.
*/
if (++fbno == 0)
- fbno = XFS_DIR2_FREE_FIRSTDB(mp);
+ fbno = xfs_dir2_byte_to_db(args->geo,
+ XFS_DIR2_FREE_OFFSET);
/*
* If it's ifbno we already looked at it.
*/
@@ -1540,27 +1769,33 @@ xfs_dir2_node_addname_int(
* to avoid it.
*/
error = xfs_dir2_free_try_read(tp, dp,
- xfs_dir2_db_to_da(mp, fbno),
- &fbp);
+ xfs_dir2_db_to_da(args->geo, fbno),
+ &fbp);
if (error)
return error;
if (!fbp)
continue;
free = fbp->b_addr;
- ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
findex = 0;
}
/*
* Look at the current free entry. Is it good enough?
+ *
+ * The bests initialisation should be where the bufer is read in
+ * the above branch. But gcc is too stupid to realise that bests
+ * and the freehdr are actually initialised if they are placed
+ * there, so we have to do it here to avoid warnings. Blech.
*/
- if (be16_to_cpu(free->bests[findex]) != NULLDATAOFF &&
- be16_to_cpu(free->bests[findex]) >= length)
- dbno = be32_to_cpu(free->hdr.firstdb) + findex;
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
+ be16_to_cpu(bests[findex]) >= length)
+ dbno = freehdr.firstdb + findex;
else {
/*
* Are we done with the freeblock?
*/
- if (++findex == be32_to_cpu(free->hdr.nvalid)) {
+ if (++findex == freehdr.nvalid) {
/*
* Drop the block.
*/
@@ -1588,7 +1823,7 @@ xfs_dir2_node_addname_int(
if (unlikely((error = xfs_dir2_grow_inode(args,
XFS_DIR2_DATA_SPACE,
&dbno)) ||
- (error = xfs_dir2_data_init(args, dbno, &dbp))))
+ (error = xfs_dir3_data_init(args, dbno, &dbp))))
return error;
/*
@@ -1603,10 +1838,10 @@ xfs_dir2_node_addname_int(
* Get the freespace block corresponding to the data block
* that was just allocated.
*/
- fbno = xfs_dir2_db_to_fdb(mp, dbno);
+ fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
error = xfs_dir2_free_try_read(tp, dp,
- xfs_dir2_db_to_da(mp, fbno),
- &fbp);
+ xfs_dir2_db_to_da(args->geo, fbno),
+ &fbp);
if (error)
return error;
@@ -1614,18 +1849,19 @@ xfs_dir2_node_addname_int(
* If there wasn't a freespace block, the read will
* return a NULL fbp. Allocate and initialize a new one.
*/
- if( fbp == NULL ) {
- if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
- &fbno))) {
+ if (!fbp) {
+ error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
+ &fbno);
+ if (error)
return error;
- }
- if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) {
+ if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
xfs_alert(mp,
"%s: dir ino %llu needed freesp block %lld for\n"
" data block %lld, got %lld ifbno %llu lastfbno %d",
__func__, (unsigned long long)dp->i_ino,
- (long long)xfs_dir2_db_to_fdb(mp, dbno),
+ (long long)dp->d_ops->db_to_fdb(
+ args->geo, dbno),
(long long)dbno, (long long)fbno,
(unsigned long long)ifbno, lastfbno);
if (fblk) {
@@ -1646,52 +1882,50 @@ xfs_dir2_node_addname_int(
/*
* Get a buffer for the new block.
*/
- error = xfs_da_get_buf(tp, dp,
- xfs_dir2_db_to_da(mp, fbno),
- -1, &fbp, XFS_DATA_FORK);
+ error = xfs_dir3_free_get_buf(args, fbno, &fbp);
if (error)
return error;
- fbp->b_ops = &xfs_dir2_free_buf_ops;
+ free = fbp->b_addr;
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
/*
- * Initialize the new block to be empty, and remember
- * its first slot as our empty slot.
+ * Remember the first slot as our empty slot.
*/
- free = fbp->b_addr;
- free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC);
- free->hdr.firstdb = cpu_to_be32(
- (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) *
- xfs_dir2_free_max_bests(mp));
- free->hdr.nvalid = 0;
- free->hdr.nused = 0;
+ freehdr.firstdb =
+ (fbno - xfs_dir2_byte_to_db(args->geo,
+ XFS_DIR2_FREE_OFFSET)) *
+ dp->d_ops->free_max_bests(args->geo);
} else {
free = fbp->b_addr;
- ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
}
/*
* Set the freespace block index from the data block number.
*/
- findex = xfs_dir2_db_to_fdindex(mp, dbno);
+ findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
/*
* If it's after the end of the current entries in the
* freespace block, extend that table.
*/
- if (findex >= be32_to_cpu(free->hdr.nvalid)) {
- ASSERT(findex < xfs_dir2_free_max_bests(mp));
- free->hdr.nvalid = cpu_to_be32(findex + 1);
+ if (findex >= freehdr.nvalid) {
+ ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
+ freehdr.nvalid = findex + 1;
/*
* Tag new entry so nused will go up.
*/
- free->bests[findex] = cpu_to_be16(NULLDATAOFF);
+ bests[findex] = cpu_to_be16(NULLDATAOFF);
}
/*
* If this entry was for an empty data block
* (this should always be true) then update the header.
*/
- if (free->bests[findex] == cpu_to_be16(NULLDATAOFF)) {
- be32_add_cpu(&free->hdr.nused, 1);
- xfs_dir2_free_log_header(tp, fbp);
+ if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
+ freehdr.nused++;
+ dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+ xfs_dir2_free_log_header(args, fbp);
}
/*
* Update the real value in the table.
@@ -1699,7 +1933,8 @@ xfs_dir2_node_addname_int(
* change again.
*/
hdr = dbp->b_addr;
- free->bests[findex] = hdr->bestfree[0].length;
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ bests[findex] = bf[0].length;
logfree = 1;
}
/*
@@ -1715,24 +1950,26 @@ xfs_dir2_node_addname_int(
/*
* Read the data block in.
*/
- error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno),
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, dbno),
-1, &dbp);
if (error)
return error;
hdr = dbp->b_addr;
+ bf = dp->d_ops->data_bestfree_p(hdr);
logfree = 0;
}
- ASSERT(be16_to_cpu(hdr->bestfree[0].length) >= length);
+ ASSERT(be16_to_cpu(bf[0].length) >= length);
/*
* Point to the existing unused space.
*/
dup = (xfs_dir2_data_unused_t *)
- ((char *)hdr + be16_to_cpu(hdr->bestfree[0].offset));
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
needscan = needlog = 0;
/*
* Mark the first part of the unused space, inuse for us.
*/
- xfs_dir2_data_use_free(tp, dbp, dup,
+ xfs_dir2_data_use_free(args, dbp, dup,
(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
&needlog, &needscan);
/*
@@ -1742,31 +1979,33 @@ xfs_dir2_node_addname_int(
dep->inumber = cpu_to_be64(args->inumber);
dep->namelen = args->namelen;
memcpy(dep->name, args->name, dep->namelen);
- tagp = xfs_dir2_data_entry_tag_p(dep);
+ dp->d_ops->data_put_ftype(dep, args->filetype);
+ tagp = dp->d_ops->data_entry_tag_p(dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
- xfs_dir2_data_log_entry(tp, dbp, dep);
+ xfs_dir2_data_log_entry(args, dbp, dep);
/*
* Rescan the block for bestfree if needed.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
/*
* Log the data block header if needed.
*/
if (needlog)
- xfs_dir2_data_log_header(tp, dbp);
+ xfs_dir2_data_log_header(args, dbp);
/*
* If the freespace entry is now wrong, update it.
*/
- if (be16_to_cpu(free->bests[findex]) != be16_to_cpu(hdr->bestfree[0].length)) {
- free->bests[findex] = hdr->bestfree[0].length;
+ bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */
+ if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) {
+ bests[findex] = bf[0].length;
logfree = 1;
}
/*
* Log the freespace entry if needed.
*/
if (logfree)
- xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+ xfs_dir2_free_log_bests(args, fbp, findex, findex);
/*
* Return the data block and offset in args, then drop the data block.
*/
@@ -1777,7 +2016,7 @@ xfs_dir2_node_addname_int(
/*
* Lookup an entry in a node-format directory.
- * All the real work happens in xfs_da_node_lookup_int.
+ * All the real work happens in xfs_da3_node_lookup_int.
* The only real output is the inode number of the entry.
*/
int /* error */
@@ -1797,12 +2036,10 @@ xfs_dir2_node_lookup(
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_dirblksize;
- state->node_ents = state->mp->m_dir_node_ents;
/*
* Fill in the path to the entry in the cursor.
*/
- error = xfs_da_node_lookup_int(state, &rval);
+ error = xfs_da3_node_lookup_int(state, &rval);
if (error)
rval = error;
else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) {
@@ -1837,12 +2074,12 @@ xfs_dir2_node_lookup(
*/
int /* error */
xfs_dir2_node_removename(
- xfs_da_args_t *args) /* operation arguments */
+ struct xfs_da_args *args) /* operation arguments */
{
- xfs_da_state_blk_t *blk; /* leaf block */
+ struct xfs_da_state_blk *blk; /* leaf block */
int error; /* error return value */
int rval; /* operation return value */
- xfs_da_state_t *state; /* btree cursor */
+ struct xfs_da_state *state; /* btree cursor */
trace_xfs_dir2_node_removename(args);
@@ -1852,21 +2089,18 @@ xfs_dir2_node_removename(
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_dirblksize;
- state->node_ents = state->mp->m_dir_node_ents;
- /*
- * Look up the entry we're deleting, set up the cursor.
- */
- error = xfs_da_node_lookup_int(state, &rval);
+
+ /* Look up the entry we're deleting, set up the cursor. */
+ error = xfs_da3_node_lookup_int(state, &rval);
if (error)
- rval = error;
- /*
- * Didn't find it, upper layer screwed up.
- */
+ goto out_free;
+
+ /* Didn't find it, upper layer screwed up. */
if (rval != EEXIST) {
- xfs_da_state_free(state);
- return rval;
+ error = rval;
+ goto out_free;
}
+
blk = &state->path.blk[state->path.active - 1];
ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
ASSERT(state->extravalid);
@@ -1877,21 +2111,22 @@ xfs_dir2_node_removename(
error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
&state->extrablk, &rval);
if (error)
- return error;
+ goto out_free;
/*
* Fix the hash values up the btree.
*/
- xfs_da_fixhashpath(state, &state->path);
+ xfs_da3_fixhashpath(state, &state->path);
/*
* If we need to join leaf blocks, do it.
*/
if (rval && state->path.active > 1)
- error = xfs_da_join(state);
+ error = xfs_da3_join(state);
/*
* If no errors so far, try conversion to leaf format.
*/
if (!error)
error = xfs_dir2_node_to_leaf(state);
+out_free:
xfs_da_state_free(state);
return error;
}
@@ -1922,13 +2157,11 @@ xfs_dir2_node_replace(
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_dirblksize;
- state->node_ents = state->mp->m_dir_node_ents;
inum = args->inumber;
/*
* Lookup the entry to change in the btree.
*/
- error = xfs_da_node_lookup_int(state, &rval);
+ error = xfs_da3_node_lookup_int(state, &rval);
if (error) {
rval = error;
}
@@ -1937,28 +2170,33 @@ xfs_dir2_node_replace(
* and locked it. But paranoia is good.
*/
if (rval == EEXIST) {
+ struct xfs_dir2_leaf_entry *ents;
/*
* Find the leaf entry.
*/
blk = &state->path.blk[state->path.active - 1];
ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
leaf = blk->bp->b_addr;
- lep = &leaf->ents[blk->index];
+ ents = args->dp->d_ops->leaf_ents_p(leaf);
+ lep = &ents[blk->index];
ASSERT(state->extravalid);
/*
* Point to the data entry.
*/
hdr = state->extrablk.bp->b_addr;
- ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
+ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+ hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
dep = (xfs_dir2_data_entry_t *)
((char *)hdr +
- xfs_dir2_dataptr_to_off(state->mp, be32_to_cpu(lep->address)));
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(lep->address)));
ASSERT(inum != be64_to_cpu(dep->inumber));
/*
* Fill in the new inode number and log the entry.
*/
dep->inumber = cpu_to_be64(inum);
- xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep);
+ args->dp->d_ops->data_put_ftype(dep, args->filetype);
+ xfs_dir2_data_log_entry(args, state->extrablk.bp, dep);
rval = 0;
}
/*
@@ -1995,6 +2233,7 @@ xfs_dir2_node_trim_free(
xfs_dir2_free_t *free; /* freespace structure */
xfs_mount_t *mp; /* filesystem mount point */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_dir3_icfree_hdr freehdr;
dp = args->dp;
mp = dp->i_mount;
@@ -2012,11 +2251,12 @@ xfs_dir2_node_trim_free(
if (!bp)
return 0;
free = bp->b_addr;
- ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
/*
* If there are used entries, there's nothing to do.
*/
- if (be32_to_cpu(free->hdr.nused) > 0) {
+ if (freehdr.nused > 0) {
xfs_trans_brelse(tp, bp);
*rvalp = 0;
return 0;
@@ -2024,9 +2264,9 @@ xfs_dir2_node_trim_free(
/*
* Blow the block away.
*/
- if ((error =
- xfs_dir2_shrink_inode(args, xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo),
- bp))) {
+ error = xfs_dir2_shrink_inode(args,
+ xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp);
+ if (error) {
/*
* Can't fail with ENOSPC since that only happens with no
* space reservation, when breaking up an extent into two
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 7da79f6515f..27ce0794d19 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -18,23 +18,160 @@
#ifndef __XFS_DIR2_PRIV_H__
#define __XFS_DIR2_PRIV_H__
+struct dir_context;
+
+/*
+ * Directory offset/block conversion functions.
+ *
+ * DB blocks here are logical directory block numbers, not filesystem blocks.
+ */
+
+/*
+ * Convert dataptr to byte in file space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
+{
+ return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
+}
+
+/*
+ * Convert byte in file space to dataptr. It had better be aligned.
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
+{
+ return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
+}
+
+/*
+ * Convert byte in space to (DB) block
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return (xfs_dir2_db_t)(by >> geo->blklog);
+}
+
+/*
+ * Convert dataptr to a block number
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+ return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert byte in space to offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1));
+}
+
+/*
+ * Convert dataptr to a byte offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+ return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert block and offset to byte in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+ xfs_dir2_data_aoff_t o)
+{
+ return ((xfs_dir2_off_t)db << geo->blklog) + o;
+}
+
+/*
+ * Convert block (DB) to block (dablk)
+ */
+static inline xfs_dablk_t
+xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert byte in space to (DA) block
+ */
+static inline xfs_dablk_t
+xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by));
+}
+
+/*
+ * Convert block and offset to dataptr
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+ xfs_dir2_data_aoff_t o)
+{
+ return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o));
+}
+
+/*
+ * Convert block (dablk) to block (DB)
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+ return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert block (dablk) to byte offset in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+ return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0);
+}
+
+/*
+ * Directory tail pointer accessor functions. Based on block geometry.
+ */
+static inline struct xfs_dir2_block_tail *
+xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
+{
+ return ((struct xfs_dir2_block_tail *)
+ ((char *)hdr + geo->blksize)) - 1;
+}
+
+static inline struct xfs_dir2_leaf_tail *
+xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
+{
+ return (struct xfs_dir2_leaf_tail *)
+ ((char *)lp + geo->blksize -
+ sizeof(struct xfs_dir2_leaf_tail));
+}
+
/* xfs_dir2.c */
extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
-extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
-extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
xfs_dir2_db_t *dbp);
-extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
- struct xfs_buf *bp);
extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
const unsigned char *name, int len);
-/* xfs_dir2_block.c */
-extern const struct xfs_buf_ops xfs_dir2_block_buf_ops;
+#define S_SHIFT 12
+extern const unsigned char xfs_mode_to_ftype[];
+extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp,
+ __uint8_t filetype);
+
+
+/* xfs_dir2_block.c */
+extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_buf **bpp);
extern int xfs_dir2_block_addname(struct xfs_da_args *args);
-extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent,
- xfs_off_t *offset, filldir_t filldir);
extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
extern int xfs_dir2_block_removename(struct xfs_da_args *args);
extern int xfs_dir2_block_replace(struct xfs_da_args *args);
@@ -43,58 +180,40 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
/* xfs_dir2_data.c */
#ifdef DEBUG
-#define xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp);
+#define xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp);
#else
-#define xfs_dir2_data_check(dp,bp)
+#define xfs_dir3_data_check(dp,bp)
#endif
-extern const struct xfs_buf_ops xfs_dir2_data_buf_ops;
-
-extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
-extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
+extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
-extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t bno, xfs_daddr_t mapped_bno);
+extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
+ xfs_daddr_t mapped_bno);
extern struct xfs_dir2_data_free *
xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
- struct xfs_dir2_data_unused *dup, int *loghead);
-extern void xfs_dir2_data_freescan(struct xfs_mount *mp,
- struct xfs_dir2_data_hdr *hdr, int *loghead);
-extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
+ struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup,
+ int *loghead);
+extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
struct xfs_buf **bpp);
-extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp,
- struct xfs_dir2_data_entry *dep);
-extern void xfs_dir2_data_log_header(struct xfs_trans *tp,
- struct xfs_buf *bp);
-extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp,
- struct xfs_dir2_data_unused *dup);
-extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp,
- xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
- int *needlogp, int *needscanp);
-extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
- struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset,
- xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
/* xfs_dir2_leaf.c */
-extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops;
-
-extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
+extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
struct xfs_buf *dbp);
extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
-extern void xfs_dir2_leaf_compact(struct xfs_da_args *args,
- struct xfs_buf *bp);
-extern void xfs_dir2_leaf_compact_x1(struct xfs_buf *bp, int *indexp,
+extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
+ struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp);
+extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents, int *indexp,
int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
-extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent,
- size_t bufsize, xfs_off_t *offset, filldir_t filldir);
-extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno,
- struct xfs_buf **bpp, int magic);
-extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp,
- int first, int last);
-extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp,
+extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
+ struct xfs_buf **bpp, __uint16_t magic);
+extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args,
+ struct xfs_buf *bp, int first, int last);
+extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args,
struct xfs_buf *bp);
extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
@@ -104,19 +223,23 @@ extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args,
struct xfs_buf *lbp, xfs_dir2_db_t db);
extern struct xfs_dir2_leaf_entry *
-xfs_dir2_leaf_find_entry(struct xfs_dir2_leaf *leaf, int index, int compact,
- int lowstale, int highstale,
- int *lfloglow, int *lfloghigh);
+xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr,
+ struct xfs_dir2_leaf_entry *ents, int index, int compact,
+ int lowstale, int highstale, int *lfloglow, int *lfloghigh);
extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
+extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp,
+ struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf);
+
/* xfs_dir2_node.c */
extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
struct xfs_buf *lbp);
-extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_buf *bp, int *count);
+extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp,
+ struct xfs_buf *bp, int *count);
extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp,
struct xfs_da_args *args, int *indexp,
struct xfs_da_state *state);
-extern int xfs_dir2_leafn_order(struct xfs_buf *leaf1_bp,
+extern int xfs_dir2_leafn_order(struct xfs_inode *dp, struct xfs_buf *leaf1_bp,
struct xfs_buf *leaf2_bp);
extern int xfs_dir2_leafn_split(struct xfs_da_state *state,
struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk);
@@ -134,19 +257,18 @@ extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_dablk_t fbno, struct xfs_buf **bpp);
/* xfs_dir2_sf.c */
-extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
-extern xfs_ino_t xfs_dir2_sfe_get_ino(struct xfs_dir2_sf_hdr *sfp,
- struct xfs_dir2_sf_entry *sfep);
extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp);
extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
int size, xfs_dir2_sf_hdr_t *sfhp);
extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
-extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, void *dirent,
- xfs_off_t *offset, filldir_t filldir);
extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
+/* xfs_dir2_readdir.c */
+extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx,
+ size_t bufsize);
+
#endif /* __XFS_DIR2_PRIV_H__ */
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
new file mode 100644
index 00000000000..48e99afb9cb
--- /dev/null
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -0,0 +1,700 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_bmap.h"
+#include "xfs_trans.h"
+#include "xfs_dinode.h"
+
+/*
+ * Directory file type support functions
+ */
+static unsigned char xfs_dir3_filetype_table[] = {
+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK,
+ DT_FIFO, DT_SOCK, DT_LNK, DT_WHT,
+};
+
+unsigned char
+xfs_dir3_get_dtype(
+ struct xfs_mount *mp,
+ __uint8_t filetype)
+{
+ if (!xfs_sb_version_hasftype(&mp->m_sb))
+ return DT_UNKNOWN;
+
+ if (filetype >= XFS_DIR3_FT_MAX)
+ return DT_UNKNOWN;
+
+ return xfs_dir3_filetype_table[filetype];
+}
+/*
+ * @mode, if set, indicates that the type field needs to be set up.
+ * This uses the transformation from file mode to DT_* as defined in linux/fs.h
+ * for file type specification. This will be propagated into the directory
+ * structure if appropriate for the given operation and filesystem config.
+ */
+const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = {
+ [0] = XFS_DIR3_FT_UNKNOWN,
+ [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE,
+ [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR,
+ [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV,
+ [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV,
+ [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO,
+ [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK,
+ [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK,
+};
+
+STATIC int
+xfs_dir2_sf_getdents(
+ struct xfs_da_args *args,
+ struct dir_context *ctx)
+{
+ int i; /* shortform entry number */
+ struct xfs_inode *dp = args->dp; /* incore directory inode */
+ xfs_dir2_dataptr_t off; /* current entry's offset */
+ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
+ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ xfs_dir2_dataptr_t dot_offset;
+ xfs_dir2_dataptr_t dotdot_offset;
+ xfs_ino_t ino;
+ struct xfs_da_geometry *geo = args->geo;
+
+ ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+ /*
+ * Give up if the directory is way too short.
+ */
+ if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+ ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+ return XFS_ERROR(EIO);
+ }
+
+ ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+ ASSERT(dp->i_df.if_u1.if_data != NULL);
+
+ sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+
+ ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+
+ /*
+ * If the block number in the offset is out of range, we're done.
+ */
+ if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
+ return 0;
+
+ /*
+ * Precalculate offsets for . and .. as we will always need them.
+ *
+ * XXX(hch): the second argument is sometimes 0 and sometimes
+ * geo->datablk
+ */
+ dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ dp->d_ops->data_dot_offset);
+ dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ dp->d_ops->data_dotdot_offset);
+
+ /*
+ * Put . entry unless we're starting past it.
+ */
+ if (ctx->pos <= dot_offset) {
+ ctx->pos = dot_offset & 0x7fffffff;
+ if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
+ return 0;
+ }
+
+ /*
+ * Put .. entry unless we're starting past it.
+ */
+ if (ctx->pos <= dotdot_offset) {
+ ino = dp->d_ops->sf_get_parent_ino(sfp);
+ ctx->pos = dotdot_offset & 0x7fffffff;
+ if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
+ return 0;
+ }
+
+ /*
+ * Loop while there are more entries and put'ing works.
+ */
+ sfep = xfs_dir2_sf_firstentry(sfp);
+ for (i = 0; i < sfp->count; i++) {
+ __uint8_t filetype;
+
+ off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ xfs_dir2_sf_get_offset(sfep));
+
+ if (ctx->pos > off) {
+ sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+ continue;
+ }
+
+ ino = dp->d_ops->sf_get_ino(sfp, sfep);
+ filetype = dp->d_ops->sf_get_ftype(sfep);
+ ctx->pos = off & 0x7fffffff;
+ if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
+ xfs_dir3_get_dtype(dp->i_mount, filetype)))
+ return 0;
+ sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+ }
+
+ ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
+ 0x7fffffff;
+ return 0;
+}
+
+/*
+ * Readdir for block directories.
+ */
+STATIC int
+xfs_dir2_block_getdents(
+ struct xfs_da_args *args,
+ struct dir_context *ctx)
+{
+ struct xfs_inode *dp = args->dp; /* incore directory inode */
+ xfs_dir2_data_hdr_t *hdr; /* block header */
+ struct xfs_buf *bp; /* buffer for block */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_dir2_data_entry_t *dep; /* block data entry */
+ xfs_dir2_data_unused_t *dup; /* block unused entry */
+ char *endptr; /* end of the data entries */
+ int error; /* error return value */
+ char *ptr; /* current data entry */
+ int wantoff; /* starting block offset */
+ xfs_off_t cook;
+ struct xfs_da_geometry *geo = args->geo;
+
+ /*
+ * If the block number in the offset is out of range, we're done.
+ */
+ if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
+ return 0;
+
+ error = xfs_dir3_block_read(NULL, dp, &bp);
+ if (error)
+ return error;
+
+ /*
+ * Extract the byte offset we start at from the seek pointer.
+ * We'll skip entries before this.
+ */
+ wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos);
+ hdr = bp->b_addr;
+ xfs_dir3_data_check(dp, bp);
+ /*
+ * Set up values for the loop.
+ */
+ btp = xfs_dir2_block_tail_p(geo, hdr);
+ ptr = (char *)dp->d_ops->data_entry_p(hdr);
+ endptr = (char *)xfs_dir2_block_leaf_p(btp);
+
+ /*
+ * Loop over the data portion of the block.
+ * Each object is a real entry (dep) or an unused one (dup).
+ */
+ while (ptr < endptr) {
+ __uint8_t filetype;
+
+ dup = (xfs_dir2_data_unused_t *)ptr;
+ /*
+ * Unused, skip it.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ ptr += be16_to_cpu(dup->length);
+ continue;
+ }
+
+ dep = (xfs_dir2_data_entry_t *)ptr;
+
+ /*
+ * Bump pointer for the next iteration.
+ */
+ ptr += dp->d_ops->data_entsize(dep->namelen);
+ /*
+ * The entry is before the desired starting point, skip it.
+ */
+ if ((char *)dep - (char *)hdr < wantoff)
+ continue;
+
+ cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ (char *)dep - (char *)hdr);
+
+ ctx->pos = cook & 0x7fffffff;
+ filetype = dp->d_ops->data_get_ftype(dep);
+ /*
+ * If it didn't fit, set the final offset to here & return.
+ */
+ if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
+ be64_to_cpu(dep->inumber),
+ xfs_dir3_get_dtype(dp->i_mount, filetype))) {
+ xfs_trans_brelse(NULL, bp);
+ return 0;
+ }
+ }
+
+ /*
+ * Reached the end of the block.
+ * Set the offset to a non-existent block 1 and return.
+ */
+ ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
+ 0x7fffffff;
+ xfs_trans_brelse(NULL, bp);
+ return 0;
+}
+
+struct xfs_dir2_leaf_map_info {
+ xfs_extlen_t map_blocks; /* number of fsbs in map */
+ xfs_dablk_t map_off; /* last mapped file offset */
+ int map_size; /* total entries in *map */
+ int map_valid; /* valid entries in *map */
+ int nmap; /* mappings to ask xfs_bmapi */
+ xfs_dir2_db_t curdb; /* db for current block */
+ int ra_current; /* number of read-ahead blks */
+ int ra_index; /* *map index for read-ahead */
+ int ra_offset; /* map entry offset for ra */
+ int ra_want; /* readahead count wanted */
+ struct xfs_bmbt_irec map[]; /* map vector for blocks */
+};
+
+STATIC int
+xfs_dir2_leaf_readbuf(
+ struct xfs_da_args *args,
+ size_t bufsize,
+ struct xfs_dir2_leaf_map_info *mip,
+ xfs_dir2_off_t *curoff,
+ struct xfs_buf **bpp)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp = *bpp;
+ struct xfs_bmbt_irec *map = mip->map;
+ struct blk_plug plug;
+ int error = 0;
+ int length;
+ int i;
+ int j;
+ struct xfs_da_geometry *geo = args->geo;
+
+ /*
+ * If we have a buffer, we need to release it and
+ * take it out of the mapping.
+ */
+
+ if (bp) {
+ xfs_trans_brelse(NULL, bp);
+ bp = NULL;
+ mip->map_blocks -= geo->fsbcount;
+ /*
+ * Loop to get rid of the extents for the
+ * directory block.
+ */
+ for (i = geo->fsbcount; i > 0; ) {
+ j = min_t(int, map->br_blockcount, i);
+ map->br_blockcount -= j;
+ map->br_startblock += j;
+ map->br_startoff += j;
+ /*
+ * If mapping is done, pitch it from
+ * the table.
+ */
+ if (!map->br_blockcount && --mip->map_valid)
+ memmove(&map[0], &map[1],
+ sizeof(map[0]) * mip->map_valid);
+ i -= j;
+ }
+ }
+
+ /*
+ * Recalculate the readahead blocks wanted.
+ */
+ mip->ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)) - 1;
+ ASSERT(mip->ra_want >= 0);
+
+ /*
+ * If we don't have as many as we want, and we haven't
+ * run out of data blocks, get some more mappings.
+ */
+ if (1 + mip->ra_want > mip->map_blocks &&
+ mip->map_off < xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET)) {
+ /*
+ * Get more bmaps, fill in after the ones
+ * we already have in the table.
+ */
+ mip->nmap = mip->map_size - mip->map_valid;
+ error = xfs_bmapi_read(dp, mip->map_off,
+ xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET) -
+ mip->map_off,
+ &map[mip->map_valid], &mip->nmap, 0);
+
+ /*
+ * Don't know if we should ignore this or try to return an
+ * error. The trouble with returning errors is that readdir
+ * will just stop without actually passing the error through.
+ */
+ if (error)
+ goto out; /* XXX */
+
+ /*
+ * If we got all the mappings we asked for, set the final map
+ * offset based on the last bmap value received. Otherwise,
+ * we've reached the end.
+ */
+ if (mip->nmap == mip->map_size - mip->map_valid) {
+ i = mip->map_valid + mip->nmap - 1;
+ mip->map_off = map[i].br_startoff + map[i].br_blockcount;
+ } else
+ mip->map_off = xfs_dir2_byte_to_da(geo,
+ XFS_DIR2_LEAF_OFFSET);
+
+ /*
+ * Look for holes in the mapping, and eliminate them. Count up
+ * the valid blocks.
+ */
+ for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) {
+ if (map[i].br_startblock == HOLESTARTBLOCK) {
+ mip->nmap--;
+ length = mip->map_valid + mip->nmap - i;
+ if (length)
+ memmove(&map[i], &map[i + 1],
+ sizeof(map[i]) * length);
+ } else {
+ mip->map_blocks += map[i].br_blockcount;
+ i++;
+ }
+ }
+ mip->map_valid += mip->nmap;
+ }
+
+ /*
+ * No valid mappings, so no more data blocks.
+ */
+ if (!mip->map_valid) {
+ *curoff = xfs_dir2_da_to_byte(geo, mip->map_off);
+ goto out;
+ }
+
+ /*
+ * Read the directory block starting at the first mapping.
+ */
+ mip->curdb = xfs_dir2_da_to_db(geo, map->br_startoff);
+ error = xfs_dir3_data_read(NULL, dp, map->br_startoff,
+ map->br_blockcount >= geo->fsbcount ?
+ XFS_FSB_TO_DADDR(dp->i_mount, map->br_startblock) :
+ -1, &bp);
+ /*
+ * Should just skip over the data block instead of giving up.
+ */
+ if (error)
+ goto out; /* XXX */
+
+ /*
+ * Adjust the current amount of read-ahead: we just read a block that
+ * was previously ra.
+ */
+ if (mip->ra_current)
+ mip->ra_current -= geo->fsbcount;
+
+ /*
+ * Do we need more readahead?
+ */
+ blk_start_plug(&plug);
+ for (mip->ra_index = mip->ra_offset = i = 0;
+ mip->ra_want > mip->ra_current && i < mip->map_blocks;
+ i += geo->fsbcount) {
+ ASSERT(mip->ra_index < mip->map_valid);
+ /*
+ * Read-ahead a contiguous directory block.
+ */
+ if (i > mip->ra_current &&
+ map[mip->ra_index].br_blockcount >= geo->fsbcount) {
+ xfs_dir3_data_readahead(dp,
+ map[mip->ra_index].br_startoff + mip->ra_offset,
+ XFS_FSB_TO_DADDR(dp->i_mount,
+ map[mip->ra_index].br_startblock +
+ mip->ra_offset));
+ mip->ra_current = i;
+ }
+
+ /*
+ * Read-ahead a non-contiguous directory block. This doesn't
+ * use our mapping, but this is a very rare case.
+ */
+ else if (i > mip->ra_current) {
+ xfs_dir3_data_readahead(dp,
+ map[mip->ra_index].br_startoff +
+ mip->ra_offset, -1);
+ mip->ra_current = i;
+ }
+
+ /*
+ * Advance offset through the mapping table.
+ */
+ for (j = 0; j < geo->fsbcount; j += length ) {
+ /*
+ * The rest of this extent but not more than a dir
+ * block.
+ */
+ length = min_t(int, geo->fsbcount,
+ map[mip->ra_index].br_blockcount -
+ mip->ra_offset);
+ mip->ra_offset += length;
+
+ /*
+ * Advance to the next mapping if this one is used up.
+ */
+ if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
+ mip->ra_offset = 0;
+ mip->ra_index++;
+ }
+ }
+ }
+ blk_finish_plug(&plug);
+
+out:
+ *bpp = bp;
+ return error;
+}
+
+/*
+ * Getdents (readdir) for leaf and node directories.
+ * This reads the data blocks only, so is the same for both forms.
+ */
+STATIC int
+xfs_dir2_leaf_getdents(
+ struct xfs_da_args *args,
+ struct dir_context *ctx,
+ size_t bufsize)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp = NULL; /* data block buffer */
+ xfs_dir2_data_hdr_t *hdr; /* data block header */
+ xfs_dir2_data_entry_t *dep; /* data entry */
+ xfs_dir2_data_unused_t *dup; /* unused entry */
+ int error = 0; /* error return value */
+ int length; /* temporary length value */
+ int byteoff; /* offset in current block */
+ xfs_dir2_off_t curoff; /* current overall offset */
+ xfs_dir2_off_t newoff; /* new curoff after new blk */
+ char *ptr = NULL; /* pointer to current data */
+ struct xfs_dir2_leaf_map_info *map_info;
+ struct xfs_da_geometry *geo = args->geo;
+
+ /*
+ * If the offset is at or past the largest allowed value,
+ * give up right away.
+ */
+ if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
+ return 0;
+
+ /*
+ * Set up to bmap a number of blocks based on the caller's
+ * buffer size, the directory block size, and the filesystem
+ * block size.
+ */
+ length = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
+ map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
+ (length * sizeof(struct xfs_bmbt_irec)),
+ KM_SLEEP | KM_NOFS);
+ map_info->map_size = length;
+
+ /*
+ * Inside the loop we keep the main offset value as a byte offset
+ * in the directory file.
+ */
+ curoff = xfs_dir2_dataptr_to_byte(ctx->pos);
+
+ /*
+ * Force this conversion through db so we truncate the offset
+ * down to get the start of the data block.
+ */
+ map_info->map_off = xfs_dir2_db_to_da(geo,
+ xfs_dir2_byte_to_db(geo, curoff));
+
+ /*
+ * Loop over directory entries until we reach the end offset.
+ * Get more blocks and readahead as necessary.
+ */
+ while (curoff < XFS_DIR2_LEAF_OFFSET) {
+ __uint8_t filetype;
+
+ /*
+ * If we have no buffer, or we're off the end of the
+ * current buffer, need to get another one.
+ */
+ if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
+
+ error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
+ &curoff, &bp);
+ if (error || !map_info->map_valid)
+ break;
+
+ /*
+ * Having done a read, we need to set a new offset.
+ */
+ newoff = xfs_dir2_db_off_to_byte(geo,
+ map_info->curdb, 0);
+ /*
+ * Start of the current block.
+ */
+ if (curoff < newoff)
+ curoff = newoff;
+ /*
+ * Make sure we're in the right block.
+ */
+ else if (curoff > newoff)
+ ASSERT(xfs_dir2_byte_to_db(geo, curoff) ==
+ map_info->curdb);
+ hdr = bp->b_addr;
+ xfs_dir3_data_check(dp, bp);
+ /*
+ * Find our position in the block.
+ */
+ ptr = (char *)dp->d_ops->data_entry_p(hdr);
+ byteoff = xfs_dir2_byte_to_off(geo, curoff);
+ /*
+ * Skip past the header.
+ */
+ if (byteoff == 0)
+ curoff += dp->d_ops->data_entry_offset;
+ /*
+ * Skip past entries until we reach our offset.
+ */
+ else {
+ while ((char *)ptr - (char *)hdr < byteoff) {
+ dup = (xfs_dir2_data_unused_t *)ptr;
+
+ if (be16_to_cpu(dup->freetag)
+ == XFS_DIR2_DATA_FREE_TAG) {
+
+ length = be16_to_cpu(dup->length);
+ ptr += length;
+ continue;
+ }
+ dep = (xfs_dir2_data_entry_t *)ptr;
+ length =
+ dp->d_ops->data_entsize(dep->namelen);
+ ptr += length;
+ }
+ /*
+ * Now set our real offset.
+ */
+ curoff =
+ xfs_dir2_db_off_to_byte(geo,
+ xfs_dir2_byte_to_db(geo, curoff),
+ (char *)ptr - (char *)hdr);
+ if (ptr >= (char *)hdr + geo->blksize) {
+ continue;
+ }
+ }
+ }
+ /*
+ * We have a pointer to an entry.
+ * Is it a live one?
+ */
+ dup = (xfs_dir2_data_unused_t *)ptr;
+ /*
+ * No, it's unused, skip over it.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ length = be16_to_cpu(dup->length);
+ ptr += length;
+ curoff += length;
+ continue;
+ }
+
+ dep = (xfs_dir2_data_entry_t *)ptr;
+ length = dp->d_ops->data_entsize(dep->namelen);
+ filetype = dp->d_ops->data_get_ftype(dep);
+
+ ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
+ if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
+ be64_to_cpu(dep->inumber),
+ xfs_dir3_get_dtype(dp->i_mount, filetype)))
+ break;
+
+ /*
+ * Advance to next entry in the block.
+ */
+ ptr += length;
+ curoff += length;
+ /* bufsize may have just been a guess; don't go negative */
+ bufsize = bufsize > length ? bufsize - length : 0;
+ }
+
+ /*
+ * All done. Set output offset value to current offset.
+ */
+ if (curoff > xfs_dir2_dataptr_to_byte(XFS_DIR2_MAX_DATAPTR))
+ ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
+ else
+ ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
+ kmem_free(map_info);
+ if (bp)
+ xfs_trans_brelse(NULL, bp);
+ return error;
+}
+
+/*
+ * Read a directory.
+ */
+int
+xfs_readdir(
+ struct xfs_inode *dp,
+ struct dir_context *ctx,
+ size_t bufsize)
+{
+ struct xfs_da_args args = { NULL };
+ int rval;
+ int v;
+ uint lock_mode;
+
+ trace_xfs_readdir(dp);
+
+ if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ return XFS_ERROR(EIO);
+
+ ASSERT(S_ISDIR(dp->i_d.di_mode));
+ XFS_STATS_INC(xs_dir_getdents);
+
+ args.dp = dp;
+ args.geo = dp->i_mount->m_dir_geo;
+
+ lock_mode = xfs_ilock_data_map_shared(dp);
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+ rval = xfs_dir2_sf_getdents(&args, ctx);
+ else if ((rval = xfs_dir2_isblock(&args, &v)))
+ ;
+ else if (v)
+ rval = xfs_dir2_block_getdents(&args, ctx);
+ else
+ rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
+ xfs_iunlock(dp, lock_mode);
+
+ return rval;
+}
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 1b9fc3ec7e4..53c3be619db 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -17,22 +17,22 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_error.h"
#include "xfs_dir2.h"
-#include "xfs_dir2_format.h"
#include "xfs_dir2_priv.h"
#include "xfs_trace.h"
+#include "xfs_dinode.h"
/*
* Prototypes for internal functions.
@@ -57,82 +57,6 @@ static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
#endif /* XFS_BIG_INUMS */
/*
- * Inode numbers in short-form directories can come in two versions,
- * either 4 bytes or 8 bytes wide. These helpers deal with the
- * two forms transparently by looking at the headers i8count field.
- *
- * For 64-bit inode number the most significant byte must be zero.
- */
-static xfs_ino_t
-xfs_dir2_sf_get_ino(
- struct xfs_dir2_sf_hdr *hdr,
- xfs_dir2_inou_t *from)
-{
- if (hdr->i8count)
- return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
- else
- return get_unaligned_be32(&from->i4.i);
-}
-
-static void
-xfs_dir2_sf_put_ino(
- struct xfs_dir2_sf_hdr *hdr,
- xfs_dir2_inou_t *to,
- xfs_ino_t ino)
-{
- ASSERT((ino & 0xff00000000000000ULL) == 0);
-
- if (hdr->i8count)
- put_unaligned_be64(ino, &to->i8.i);
- else
- put_unaligned_be32(ino, &to->i4.i);
-}
-
-xfs_ino_t
-xfs_dir2_sf_get_parent_ino(
- struct xfs_dir2_sf_hdr *hdr)
-{
- return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
-}
-
-static void
-xfs_dir2_sf_put_parent_ino(
- struct xfs_dir2_sf_hdr *hdr,
- xfs_ino_t ino)
-{
- xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino);
-}
-
-/*
- * In short-form directory entries the inode numbers are stored at variable
- * offset behind the entry name. The inode numbers may only be accessed
- * through the helpers below.
- */
-static xfs_dir2_inou_t *
-xfs_dir2_sfe_inop(
- struct xfs_dir2_sf_entry *sfep)
-{
- return (xfs_dir2_inou_t *)&sfep->name[sfep->namelen];
-}
-
-xfs_ino_t
-xfs_dir2_sfe_get_ino(
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep)
-{
- return xfs_dir2_sf_get_ino(hdr, xfs_dir2_sfe_inop(sfep));
-}
-
-static void
-xfs_dir2_sfe_put_ino(
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep,
- xfs_ino_t ino)
-{
- xfs_dir2_sf_put_ino(hdr, xfs_dir2_sfe_inop(sfep), ino);
-}
-
-/*
* Given a block directory (dp/block), calculate its size as a shortform (sf)
* directory and a header for the sf directory, if it will fit it the
* space currently present in the inode. If it won't fit, the output
@@ -157,11 +81,20 @@ xfs_dir2_block_sfsize(
int namelen; /* total name bytes */
xfs_ino_t parent = 0; /* parent inode number */
int size=0; /* total computed size */
+ int has_ftype;
+ struct xfs_da_geometry *geo;
mp = dp->i_mount;
+ geo = mp->m_dir_geo;
+
+ /*
+ * if there is a filetype field, add the extra byte to the namelen
+ * for each entry that we see.
+ */
+ has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0;
count = i8count = namelen = 0;
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
@@ -173,8 +106,8 @@ xfs_dir2_block_sfsize(
/*
* Calculate the pointer to the entry at hand.
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + xfs_dir2_dataptr_to_off(mp, addr));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(geo, addr));
/*
* Detect . and .., so we can special-case them.
* . is not included in sf directories.
@@ -188,9 +121,10 @@ xfs_dir2_block_sfsize(
if (!isdot)
i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
#endif
+ /* take into account the file type field */
if (!isdot && !isdotdot) {
count++;
- namelen += dep->namelen;
+ namelen += dep->namelen + has_ftype;
} else if (isdotdot)
parent = be64_to_cpu(dep->inumber);
/*
@@ -211,7 +145,7 @@ xfs_dir2_block_sfsize(
*/
sfhp->count = count;
sfhp->i8count = i8count;
- xfs_dir2_sf_put_parent_ino(sfhp, parent);
+ dp->d_ops->sf_put_parent_ino(sfhp, parent);
return size;
}
@@ -238,6 +172,7 @@ xfs_dir2_block_to_sf(
char *ptr; /* current data pointer */
xfs_dir2_sf_entry_t *sfep; /* shortform entry */
xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */
+ xfs_dir2_sf_hdr_t *dst; /* temporary data buffer */
trace_xfs_dir2_block_to_sf(args);
@@ -245,40 +180,25 @@ xfs_dir2_block_to_sf(
mp = dp->i_mount;
/*
- * Make a copy of the block data, so we can shrink the inode
- * and add local data.
+ * allocate a temporary destination buffer the size of the inode
+ * to format the data into. Once we have formatted the data, we
+ * can free the block and copy the formatted data into the inode literal
+ * area.
*/
- hdr = kmem_alloc(mp->m_dirblksize, KM_SLEEP);
- memcpy(hdr, bp->b_addr, mp->m_dirblksize);
- logflags = XFS_ILOG_CORE;
- if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) {
- ASSERT(error != ENOSPC);
- goto out;
- }
+ dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
+ hdr = bp->b_addr;
/*
- * The buffer is now unconditionally gone, whether
- * xfs_dir2_shrink_inode worked or not.
- *
- * Convert the inode to local format.
- */
- dp->i_df.if_flags &= ~XFS_IFEXTENTS;
- dp->i_df.if_flags |= XFS_IFINLINE;
- dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
- ASSERT(dp->i_df.if_bytes == 0);
- xfs_idata_realloc(dp, size, XFS_DATA_FORK);
- logflags |= XFS_ILOG_DDATA;
- /*
* Copy the header into the newly allocate local space.
*/
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = (xfs_dir2_sf_hdr_t *)dst;
memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
- dp->i_d.di_size = size;
+
/*
* Set up to loop over the block's entries.
*/
- btp = xfs_dir2_block_tail_p(mp, hdr);
- ptr = (char *)(hdr + 1);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ ptr = (char *)dp->d_ops->data_entry_p(hdr);
endptr = (char *)xfs_dir2_block_leaf_p(btp);
sfep = xfs_dir2_sf_firstentry(sfp);
/*
@@ -306,7 +226,7 @@ xfs_dir2_block_to_sf(
else if (dep->namelen == 2 &&
dep->name[0] == '.' && dep->name[1] == '.')
ASSERT(be64_to_cpu(dep->inumber) ==
- xfs_dir2_sf_get_parent_ino(sfp));
+ dp->d_ops->sf_get_parent_ino(sfp));
/*
* Normal entry, copy it into shortform.
*/
@@ -316,18 +236,44 @@ xfs_dir2_block_to_sf(
(xfs_dir2_data_aoff_t)
((char *)dep - (char *)hdr));
memcpy(sfep->name, dep->name, dep->namelen);
- xfs_dir2_sfe_put_ino(sfp, sfep,
- be64_to_cpu(dep->inumber));
+ dp->d_ops->sf_put_ino(sfp, sfep,
+ be64_to_cpu(dep->inumber));
+ dp->d_ops->sf_put_ftype(sfep,
+ dp->d_ops->data_get_ftype(dep));
- sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+ sfep = dp->d_ops->sf_nextentry(sfp, sfep);
}
- ptr += xfs_dir2_data_entsize(dep->namelen);
+ ptr += dp->d_ops->data_entsize(dep->namelen);
}
ASSERT((char *)sfep - (char *)sfp == size);
+
+ /* now we are done with the block, we can shrink the inode */
+ logflags = XFS_ILOG_CORE;
+ error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp);
+ if (error) {
+ ASSERT(error != ENOSPC);
+ goto out;
+ }
+
+ /*
+ * The buffer is now unconditionally gone, whether
+ * xfs_dir2_shrink_inode worked or not.
+ *
+ * Convert the inode to local format and copy the data in.
+ */
+ dp->i_df.if_flags &= ~XFS_IFEXTENTS;
+ dp->i_df.if_flags |= XFS_IFINLINE;
+ dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+ ASSERT(dp->i_df.if_bytes == 0);
+ xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+
+ logflags |= XFS_ILOG_DDATA;
+ memcpy(dp->i_df.if_u1.if_data, dst, size);
+ dp->i_d.di_size = size;
xfs_dir2_sf_check(args);
out:
xfs_trans_log_inode(args->trans, dp, logflags);
- kmem_free(hdr);
+ kmem_free(dst);
return error;
}
@@ -341,14 +287,12 @@ int /* error */
xfs_dir2_sf_addname(
xfs_da_args_t *args) /* operation arguments */
{
- int add_entsize; /* size of the new entry */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return value */
int incr_isize; /* total change in size */
int new_isize; /* di_size after adding name */
int objchange; /* changing to 8-byte inodes */
xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */
- int old_isize; /* di_size before adding name */
int pick; /* which algorithm to use */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */
@@ -372,8 +316,7 @@ xfs_dir2_sf_addname(
/*
* Compute entry (and change in) size.
*/
- add_entsize = xfs_dir2_sf_entsize(sfp, args->namelen);
- incr_isize = add_entsize;
+ incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen);
objchange = 0;
#if XFS_BIG_INUMS
/*
@@ -381,11 +324,8 @@ xfs_dir2_sf_addname(
*/
if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
/*
- * Yes, adjust the entry size and the total size.
+ * Yes, adjust the inode size. old count + (parent + new)
*/
- add_entsize +=
- (uint)sizeof(xfs_dir2_ino8_t) -
- (uint)sizeof(xfs_dir2_ino4_t);
incr_isize +=
(sfp->count + 2) *
((uint)sizeof(xfs_dir2_ino8_t) -
@@ -393,8 +333,7 @@ xfs_dir2_sf_addname(
objchange = 1;
}
#endif
- old_isize = (int)dp->i_d.di_size;
- new_isize = old_isize + incr_isize;
+ new_isize = (int)dp->i_d.di_size + incr_isize;
/*
* Won't fit as shortform any more (due to size),
* or the pick routine says it won't (due to offset values).
@@ -466,8 +405,8 @@ xfs_dir2_sf_addname_easy(
/*
* Grow the in-inode space.
*/
- xfs_idata_realloc(dp, xfs_dir2_sf_entsize(sfp, args->namelen),
- XFS_DATA_FORK);
+ xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen),
+ XFS_DATA_FORK);
/*
* Need to set up again due to realloc of the inode data.
*/
@@ -479,7 +418,9 @@ xfs_dir2_sf_addname_easy(
sfep->namelen = args->namelen;
xfs_dir2_sf_put_offset(sfep, offset);
memcpy(sfep->name, args->name, sfep->namelen);
- xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber);
+ dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+ dp->d_ops->sf_put_ftype(sfep, args->filetype);
+
/*
* Update the header and inode.
*/
@@ -519,11 +460,13 @@ xfs_dir2_sf_addname_hard(
xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */
xfs_dir2_sf_entry_t *sfep; /* entry in new dir */
xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */
+ struct xfs_mount *mp;
/*
* Copy the old directory to the stack buffer.
*/
dp = args->dp;
+ mp = dp->i_mount;
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
old_isize = (int)dp->i_d.di_size;
@@ -535,13 +478,13 @@ xfs_dir2_sf_addname_hard(
* to insert the new entry.
* If it's going to end up at the end then oldsfep will point there.
*/
- for (offset = XFS_DIR2_DATA_FIRST_OFFSET,
+ for (offset = dp->d_ops->data_first_offset,
oldsfep = xfs_dir2_sf_firstentry(oldsfp),
- add_datasize = xfs_dir2_data_entsize(args->namelen),
+ add_datasize = dp->d_ops->data_entsize(args->namelen),
eof = (char *)oldsfep == &buf[old_isize];
!eof;
- offset = new_offset + xfs_dir2_data_entsize(oldsfep->namelen),
- oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep),
+ offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen),
+ oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep),
eof = (char *)oldsfep == &buf[old_isize]) {
new_offset = xfs_dir2_sf_get_offset(oldsfep);
if (offset + add_datasize <= new_offset)
@@ -570,7 +513,8 @@ xfs_dir2_sf_addname_hard(
sfep->namelen = args->namelen;
xfs_dir2_sf_put_offset(sfep, offset);
memcpy(sfep->name, args->name, sfep->namelen);
- xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber);
+ dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+ dp->d_ops->sf_put_ftype(sfep, args->filetype);
sfp->count++;
#if XFS_BIG_INUMS
if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
@@ -580,7 +524,7 @@ xfs_dir2_sf_addname_hard(
* If there's more left to copy, do that.
*/
if (!eof) {
- sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+ sfep = dp->d_ops->sf_nextentry(sfp, sfep);
memcpy(sfep, oldsfep, old_isize - nbytes);
}
kmem_free(buf);
@@ -616,8 +560,8 @@ xfs_dir2_sf_addname_pick(
mp = dp->i_mount;
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
- size = xfs_dir2_data_entsize(args->namelen);
- offset = XFS_DIR2_DATA_FIRST_OFFSET;
+ size = dp->d_ops->data_entsize(args->namelen);
+ offset = dp->d_ops->data_first_offset;
sfep = xfs_dir2_sf_firstentry(sfp);
holefit = 0;
/*
@@ -629,8 +573,8 @@ xfs_dir2_sf_addname_pick(
if (!holefit)
holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
offset = xfs_dir2_sf_get_offset(sfep) +
- xfs_dir2_data_entsize(sfep->namelen);
- sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+ dp->d_ops->data_entsize(sfep->namelen);
+ sfep = dp->d_ops->sf_nextentry(sfp, sfep);
}
/*
* Calculate data bytes used excluding the new entry, if this
@@ -644,7 +588,7 @@ xfs_dir2_sf_addname_pick(
* we'll go back, convert to block, then try the insert and convert
* to leaf.
*/
- if (used + (holefit ? 0 : size) > mp->m_dirblksize)
+ if (used + (holefit ? 0 : size) > args->geo->blksize)
return 0;
/*
* If changing the inode number size, do it the hard way.
@@ -659,7 +603,7 @@ xfs_dir2_sf_addname_pick(
/*
* If it won't fit at the end then do it the hard way (use the hole).
*/
- if (used + size > mp->m_dirblksize)
+ if (used + size > args->geo->blksize)
return 2;
/*
* Do it the easy way.
@@ -684,31 +628,33 @@ xfs_dir2_sf_check(
int offset; /* data offset */
xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ struct xfs_mount *mp;
dp = args->dp;
+ mp = dp->i_mount;
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
- offset = XFS_DIR2_DATA_FIRST_OFFSET;
- ino = xfs_dir2_sf_get_parent_ino(sfp);
+ offset = dp->d_ops->data_first_offset;
+ ino = dp->d_ops->sf_get_parent_ino(sfp);
i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
i < sfp->count;
- i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+ i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
- ino = xfs_dir2_sfe_get_ino(sfp, sfep);
+ ino = dp->d_ops->sf_get_ino(sfp, sfep);
i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
offset =
xfs_dir2_sf_get_offset(sfep) +
- xfs_dir2_data_entsize(sfep->namelen);
+ dp->d_ops->data_entsize(sfep->namelen);
+ ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX);
}
ASSERT(i8count == sfp->i8count);
ASSERT(XFS_BIG_INUMS || i8count == 0);
ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
ASSERT(offset +
(sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
- (uint)sizeof(xfs_dir2_block_tail_t) <=
- dp->i_mount->m_dirblksize);
+ (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize);
}
#endif /* DEBUG */
@@ -757,7 +703,7 @@ xfs_dir2_sf_create(
/*
* Now can put in the inode number, since i8count is set.
*/
- xfs_dir2_sf_put_parent_ino(sfp, pino);
+ dp->d_ops->sf_put_parent_ino(sfp, pino);
sfp->count = 0;
dp->i_d.di_size = size;
xfs_dir2_sf_check(args);
@@ -765,105 +711,6 @@ xfs_dir2_sf_create(
return 0;
}
-int /* error */
-xfs_dir2_sf_getdents(
- xfs_inode_t *dp, /* incore directory inode */
- void *dirent,
- xfs_off_t *offset,
- filldir_t filldir)
-{
- int i; /* shortform entry number */
- xfs_mount_t *mp; /* filesystem mount point */
- xfs_dir2_dataptr_t off; /* current entry's offset */
- xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
- xfs_dir2_dataptr_t dot_offset;
- xfs_dir2_dataptr_t dotdot_offset;
- xfs_ino_t ino;
-
- mp = dp->i_mount;
-
- ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
- /*
- * Give up if the directory is way too short.
- */
- if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
- return XFS_ERROR(EIO);
- }
-
- ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
-
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-
- ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
-
- /*
- * If the block number in the offset is out of range, we're done.
- */
- if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
- return 0;
-
- /*
- * Precalculate offsets for . and .. as we will always need them.
- *
- * XXX(hch): the second argument is sometimes 0 and sometimes
- * mp->m_dirdatablk.
- */
- dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
- XFS_DIR2_DATA_DOT_OFFSET);
- dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
- XFS_DIR2_DATA_DOTDOT_OFFSET);
-
- /*
- * Put . entry unless we're starting past it.
- */
- if (*offset <= dot_offset) {
- if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) {
- *offset = dot_offset & 0x7fffffff;
- return 0;
- }
- }
-
- /*
- * Put .. entry unless we're starting past it.
- */
- if (*offset <= dotdot_offset) {
- ino = xfs_dir2_sf_get_parent_ino(sfp);
- if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) {
- *offset = dotdot_offset & 0x7fffffff;
- return 0;
- }
- }
-
- /*
- * Loop while there are more entries and put'ing works.
- */
- sfep = xfs_dir2_sf_firstentry(sfp);
- for (i = 0; i < sfp->count; i++) {
- off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
- xfs_dir2_sf_get_offset(sfep));
-
- if (*offset > off) {
- sfep = xfs_dir2_sf_nextentry(sfp, sfep);
- continue;
- }
-
- ino = xfs_dir2_sfe_get_ino(sfp, sfep);
- if (filldir(dirent, (char *)sfep->name, sfep->namelen,
- off & 0x7fffffff, ino, DT_UNKNOWN)) {
- *offset = off & 0x7fffffff;
- return 0;
- }
- sfep = xfs_dir2_sf_nextentry(sfp, sfep);
- }
-
- *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
- 0x7fffffff;
- return 0;
-}
-
/*
* Lookup an entry in a shortform directory.
* Returns EEXIST if found, ENOENT if not found.
@@ -903,6 +750,7 @@ xfs_dir2_sf_lookup(
if (args->namelen == 1 && args->name[0] == '.') {
args->inumber = dp->i_ino;
args->cmpresult = XFS_CMP_EXACT;
+ args->filetype = XFS_DIR3_FT_DIR;
return XFS_ERROR(EEXIST);
}
/*
@@ -910,8 +758,9 @@ xfs_dir2_sf_lookup(
*/
if (args->namelen == 2 &&
args->name[0] == '.' && args->name[1] == '.') {
- args->inumber = xfs_dir2_sf_get_parent_ino(sfp);
+ args->inumber = dp->d_ops->sf_get_parent_ino(sfp);
args->cmpresult = XFS_CMP_EXACT;
+ args->filetype = XFS_DIR3_FT_DIR;
return XFS_ERROR(EEXIST);
}
/*
@@ -919,7 +768,7 @@ xfs_dir2_sf_lookup(
*/
ci_sfep = NULL;
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
- i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+ i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
/*
* Compare name and if it's an exact match, return the inode
* number. If it's the first case-insensitive match, store the
@@ -929,7 +778,8 @@ xfs_dir2_sf_lookup(
sfep->namelen);
if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
args->cmpresult = cmp;
- args->inumber = xfs_dir2_sfe_get_ino(sfp, sfep);
+ args->inumber = dp->d_ops->sf_get_ino(sfp, sfep);
+ args->filetype = dp->d_ops->sf_get_ftype(sfep);
if (cmp == XFS_CMP_EXACT)
return XFS_ERROR(EEXIST);
ci_sfep = sfep;
@@ -985,10 +835,10 @@ xfs_dir2_sf_removename(
* Find the one we're deleting.
*/
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
- i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+ i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
XFS_CMP_EXACT) {
- ASSERT(xfs_dir2_sfe_get_ino(sfp, sfep) ==
+ ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) ==
args->inumber);
break;
}
@@ -1002,7 +852,7 @@ xfs_dir2_sf_removename(
* Calculate sizes.
*/
byteoff = (int)((char *)sfep - (char *)sfp);
- entsize = xfs_dir2_sf_entsize(sfp, args->namelen);
+ entsize = dp->d_ops->sf_entsize(sfp, args->namelen);
newsize = oldsize - entsize;
/*
* Copy the part if any after the removed entry, sliding it down.
@@ -1109,25 +959,25 @@ xfs_dir2_sf_replace(
if (args->namelen == 2 &&
args->name[0] == '.' && args->name[1] == '.') {
#if XFS_BIG_INUMS || defined(DEBUG)
- ino = xfs_dir2_sf_get_parent_ino(sfp);
+ ino = dp->d_ops->sf_get_parent_ino(sfp);
ASSERT(args->inumber != ino);
#endif
- xfs_dir2_sf_put_parent_ino(sfp, args->inumber);
+ dp->d_ops->sf_put_parent_ino(sfp, args->inumber);
}
/*
* Normal entry, look for the name.
*/
else {
- for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
- i < sfp->count;
- i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+ i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
XFS_CMP_EXACT) {
#if XFS_BIG_INUMS || defined(DEBUG)
- ino = xfs_dir2_sfe_get_ino(sfp, sfep);
+ ino = dp->d_ops->sf_get_ino(sfp, sfep);
ASSERT(args->inumber != ino);
#endif
- xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber);
+ dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+ dp->d_ops->sf_put_ftype(sfep, args->filetype);
break;
}
}
@@ -1194,10 +1044,12 @@ xfs_dir2_sf_toino4(
int oldsize; /* old inode size */
xfs_dir2_sf_entry_t *sfep; /* new sf entry */
xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
+ struct xfs_mount *mp;
trace_xfs_dir2_sf_toino4(args);
dp = args->dp;
+ mp = dp->i_mount;
/*
* Copy the old directory to the buffer.
@@ -1228,20 +1080,21 @@ xfs_dir2_sf_toino4(
*/
sfp->count = oldsfp->count;
sfp->i8count = 0;
- xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp));
+ dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
/*
* Copy the entries field by field.
*/
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
oldsfep = xfs_dir2_sf_firstentry(oldsfp);
i < sfp->count;
- i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep),
- oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) {
+ i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
+ oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
sfep->namelen = oldsfep->namelen;
sfep->offset = oldsfep->offset;
memcpy(sfep->name, oldsfep->name, sfep->namelen);
- xfs_dir2_sfe_put_ino(sfp, sfep,
- xfs_dir2_sfe_get_ino(oldsfp, oldsfep));
+ dp->d_ops->sf_put_ino(sfp, sfep,
+ dp->d_ops->sf_get_ino(oldsfp, oldsfep));
+ dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
}
/*
* Clean up the inode.
@@ -1252,9 +1105,9 @@ xfs_dir2_sf_toino4(
}
/*
- * Convert from 4-byte inode numbers to 8-byte inode numbers.
- * The new 8-byte inode number is not there yet, we leave with the
- * count 1 but no corresponding entry.
+ * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers.
+ * The new entry w/ an 8-byte inode number is not there yet; we leave with
+ * i8count set to 1, but no corresponding 8-byte entry.
*/
static void
xfs_dir2_sf_toino8(
@@ -1269,10 +1122,12 @@ xfs_dir2_sf_toino8(
int oldsize; /* old inode size */
xfs_dir2_sf_entry_t *sfep; /* new sf entry */
xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
+ struct xfs_mount *mp;
trace_xfs_dir2_sf_toino8(args);
dp = args->dp;
+ mp = dp->i_mount;
/*
* Copy the old directory to the buffer.
@@ -1285,7 +1140,7 @@ xfs_dir2_sf_toino8(
ASSERT(oldsfp->i8count == 0);
memcpy(buf, oldsfp, oldsize);
/*
- * Compute the new inode size.
+ * Compute the new inode size (nb: entry count + 1 for parent)
*/
newsize =
oldsize +
@@ -1303,20 +1158,21 @@ xfs_dir2_sf_toino8(
*/
sfp->count = oldsfp->count;
sfp->i8count = 1;
- xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp));
+ dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
/*
* Copy the entries field by field.
*/
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
oldsfep = xfs_dir2_sf_firstentry(oldsfp);
i < sfp->count;
- i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep),
- oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) {
+ i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
+ oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
sfep->namelen = oldsfep->namelen;
sfep->offset = oldsfep->offset;
memcpy(sfep->name, oldsfep->name, sfep->namelen);
- xfs_dir2_sfe_put_ino(sfp, sfep,
- xfs_dir2_sfe_get_ino(oldsfp, oldsfep));
+ dp->d_ops->sf_put_ino(sfp, sfep,
+ dp->d_ops->sf_get_ino(oldsfp, oldsfep));
+ dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
}
/*
* Clean up the inode.
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 69cf4fcde03..4f11ef01113 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -16,22 +16,22 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
-#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_quota.h"
-#include "xfs_trans.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_btree.h"
#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_discard.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
STATIC int
xfs_trim_extents(
@@ -157,7 +157,7 @@ xfs_ioc_trim(
struct xfs_mount *mp,
struct fstrim_range __user *urange)
{
- struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
+ struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev);
unsigned int granularity = q->limits.discard_granularity;
struct fstrim_range range;
xfs_daddr_t start, end, minlen;
@@ -180,7 +180,8 @@ xfs_ioc_trim(
* matter as trimming blocks is an advisory interface.
*/
if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
- range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)))
+ range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) ||
+ range.len < mp->m_sb.sb_blocksize)
return -XFS_ERROR(EINVAL);
start = BTOBB(range.start);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 9e1bf5294c9..3ee0cd43edc 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -17,26 +17,29 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
+#include "xfs_bmap_util.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_attr.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_space.h"
#include "xfs_trans_priv.h"
#include "xfs_qm.h"
+#include "xfs_cksum.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_bmap_btree.h"
/*
* Lock order:
@@ -61,7 +64,8 @@ int xfs_dqerror_mod = 33;
struct kmem_zone *xfs_qm_dqtrxzone;
static struct kmem_zone *xfs_qm_dqzone;
-static struct lock_class_key xfs_dquot_other_class;
+static struct lock_class_key xfs_dquot_group_class;
+static struct lock_class_key xfs_dquot_project_class;
/*
* This is called to free all the memory associated with a dquot
@@ -85,17 +89,23 @@ xfs_qm_dqdestroy(
*/
void
xfs_qm_adjust_dqlimits(
- xfs_mount_t *mp,
- xfs_disk_dquot_t *d)
+ struct xfs_mount *mp,
+ struct xfs_dquot *dq)
{
- xfs_quotainfo_t *q = mp->m_quotainfo;
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ struct xfs_disk_dquot *d = &dq->q_core;
+ int prealloc = 0;
ASSERT(d->d_id);
- if (q->qi_bsoftlimit && !d->d_blk_softlimit)
+ if (q->qi_bsoftlimit && !d->d_blk_softlimit) {
d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit);
- if (q->qi_bhardlimit && !d->d_blk_hardlimit)
+ prealloc = 1;
+ }
+ if (q->qi_bhardlimit && !d->d_blk_hardlimit) {
d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit);
+ prealloc = 1;
+ }
if (q->qi_isoftlimit && !d->d_ino_softlimit)
d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit);
if (q->qi_ihardlimit && !d->d_ino_hardlimit)
@@ -104,6 +114,9 @@ xfs_qm_adjust_dqlimits(
d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit);
if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit)
d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit);
+
+ if (prealloc)
+ xfs_dquot_set_prealloc_limits(dq);
}
/*
@@ -239,6 +252,11 @@ xfs_qm_init_dquot_blk(
d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
d->dd_diskdq.d_id = cpu_to_be32(curid);
d->dd_diskdq.d_flags = type;
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+ xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
}
xfs_trans_dquot_buf(tp, bp,
@@ -248,60 +266,32 @@ xfs_qm_init_dquot_blk(
xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
}
-static void
-xfs_dquot_buf_verify(
- struct xfs_buf *bp)
+/*
+ * Initialize the dynamic speculative preallocation thresholds. The lo/hi
+ * watermarks correspond to the soft and hard limits by default. If a soft limit
+ * is not specified, we use 95% of the hard limit.
+ */
+void
+xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
- struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
- struct xfs_disk_dquot *ddq;
- xfs_dqid_t id = 0;
- int i;
-
- /*
- * On the first read of the buffer, verify that each dquot is valid.
- * We don't know what the id of the dquot is supposed to be, just that
- * they should be increasing monotonically within the buffer. If the
- * first id is corrupt, then it will fail on the second dquot in the
- * buffer so corruptions could point to the wrong dquot in this case.
- */
- for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
- int error;
-
- ddq = &d[i].dd_diskdq;
-
- if (i == 0)
- id = be32_to_cpu(ddq->d_id);
-
- error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
- "xfs_dquot_read_verify");
- if (error) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
- break;
- }
+ __uint64_t space;
+
+ dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
+ dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit);
+ if (!dqp->q_prealloc_lo_wmark) {
+ dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark;
+ do_div(dqp->q_prealloc_lo_wmark, 100);
+ dqp->q_prealloc_lo_wmark *= 95;
}
-}
-static void
-xfs_dquot_buf_read_verify(
- struct xfs_buf *bp)
-{
- xfs_dquot_buf_verify(bp);
-}
+ space = dqp->q_prealloc_hi_wmark;
-void
-xfs_dquot_buf_write_verify(
- struct xfs_buf *bp)
-{
- xfs_dquot_buf_verify(bp);
+ do_div(space, 100);
+ dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space;
+ dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3;
+ dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5;
}
-const struct xfs_buf_ops xfs_dquot_buf_ops = {
- .verify_read = xfs_dquot_buf_read_verify,
- .verify_write = xfs_dquot_buf_write_verify,
-};
-
/*
* Allocate a block and fill it with dquots.
* This is called when the bmapi finds a hole.
@@ -363,10 +353,10 @@ xfs_qm_dqalloc(
dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen,
0);
-
- error = xfs_buf_geterror(bp);
- if (error)
+ if (!bp) {
+ error = ENOMEM;
goto error1;
+ }
bp->b_ops = &xfs_dquot_buf_ops;
/*
@@ -412,6 +402,7 @@ xfs_qm_dqalloc(
return (error);
}
+
STATIC int
xfs_qm_dqrepair(
struct xfs_mount *mp,
@@ -445,7 +436,7 @@ xfs_qm_dqrepair(
/* Do the actual repair of dquots in this buffer */
for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
ddq = &d[i].dd_diskdq;
- error = xfs_qm_dqcheck(mp, ddq, firstid + i,
+ error = xfs_dqcheck(mp, ddq, firstid + i,
dqp->dq_flags & XFS_DQ_ALLTYPES,
XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair");
if (error) {
@@ -471,23 +462,24 @@ xfs_qm_dqtobp(
xfs_buf_t **O_bpp,
uint flags)
{
- xfs_bmbt_irec_t map;
- int nmaps = 1, error;
- xfs_buf_t *bp;
- xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp);
- xfs_mount_t *mp = dqp->q_mount;
- xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
- xfs_trans_t *tp = (tpp ? *tpp : NULL);
+ struct xfs_bmbt_irec map;
+ int nmaps = 1, error;
+ struct xfs_buf *bp;
+ struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp);
+ struct xfs_mount *mp = dqp->q_mount;
+ xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
+ struct xfs_trans *tp = (tpp ? *tpp : NULL);
+ uint lock_mode;
dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
- xfs_ilock(quotip, XFS_ILOCK_SHARED);
+ lock_mode = xfs_ilock_data_map_shared(quotip);
if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
/*
* Return if this type of quotas is turned off while we
* didn't have the quota inode lock.
*/
- xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+ xfs_iunlock(quotip, lock_mode);
return ESRCH;
}
@@ -497,7 +489,7 @@ xfs_qm_dqtobp(
error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
- xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+ xfs_iunlock(quotip, lock_mode);
if (error)
return error;
@@ -602,8 +594,20 @@ xfs_qm_dqread(
* Make sure group quotas have a different lock class than user
* quotas.
*/
- if (!(type & XFS_DQ_USER))
- lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
+ switch (type) {
+ case XFS_DQ_USER:
+ /* uses the default lock class */
+ break;
+ case XFS_DQ_GROUP:
+ lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class);
+ break;
+ case XFS_DQ_PROJ:
+ lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class);
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
XFS_STATS_INC(xs_qm_dquot);
@@ -611,16 +615,8 @@ xfs_qm_dqread(
if (flags & XFS_QMOPT_DQALLOC) {
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
- error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
- XFS_WRITE_LOG_RES(mp) +
- /*
- * Round the chunklen up to the next multiple
- * of 128 (buf log item chunk size)).
- */
- BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + 128,
- 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_WRITE_LOG_COUNT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc,
+ XFS_QM_DQALLOC_SPACE_RES(mp), 0);
if (error)
goto error1;
cancelflags = XFS_TRANS_RELEASE_LOG_RES;
@@ -654,6 +650,9 @@ xfs_qm_dqread(
dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
+ /* initialize the dquot speculative prealloc thresholds */
+ xfs_dquot_set_prealloc_limits(dqp);
+
/* Mark the buf so that this will stay incore a little longer */
xfs_buf_set_ref(bp, XFS_DQUOT_REF);
@@ -708,7 +707,7 @@ xfs_qm_dqget(
xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */
{
struct xfs_quotainfo *qi = mp->m_quotainfo;
- struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
struct xfs_dquot *dqp;
int error;
@@ -833,43 +832,6 @@ restart:
return (0);
}
-
-STATIC void
-xfs_qm_dqput_final(
- struct xfs_dquot *dqp)
-{
- struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
- struct xfs_dquot *gdqp;
-
- trace_xfs_dqput_free(dqp);
-
- mutex_lock(&qi->qi_lru_lock);
- if (list_empty(&dqp->q_lru)) {
- list_add_tail(&dqp->q_lru, &qi->qi_lru_list);
- qi->qi_lru_count++;
- XFS_STATS_INC(xs_qm_dquot_unused);
- }
- mutex_unlock(&qi->qi_lru_lock);
-
- /*
- * If we just added a udquot to the freelist, then we want to release
- * the gdquot reference that it (probably) has. Otherwise it'll keep
- * the gdquot from getting reclaimed.
- */
- gdqp = dqp->q_gdquot;
- if (gdqp) {
- xfs_dqlock(gdqp);
- dqp->q_gdquot = NULL;
- }
- xfs_dqunlock(dqp);
-
- /*
- * If we had a group quota hint, release it now.
- */
- if (gdqp)
- xfs_qm_dqput(gdqp);
-}
-
/*
* Release a reference to the dquot (decrement ref-count) and unlock it.
*
@@ -885,10 +847,14 @@ xfs_qm_dqput(
trace_xfs_dqput(dqp);
- if (--dqp->q_nrefs > 0)
- xfs_dqunlock(dqp);
- else
- xfs_qm_dqput_final(dqp);
+ if (--dqp->q_nrefs == 0) {
+ struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
+ trace_xfs_dqput_free(dqp);
+
+ if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
+ XFS_STATS_INC(xs_qm_dquot_unused);
+ }
+ xfs_dqunlock(dqp);
}
/*
@@ -1020,7 +986,7 @@ xfs_qm_dqflush(
/*
* A simple sanity check in case we got a corrupted dquot..
*/
- error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
+ error = xfs_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
XFS_QMOPT_DOWARN, "dqflush (incore copy)");
if (error) {
xfs_buf_relse(bp);
@@ -1041,6 +1007,23 @@ xfs_qm_dqflush(
&dqp->q_logitem.qli_item.li_lsn);
/*
+ * copy the lsn into the on-disk dquot now while we have the in memory
+ * dquot here. This can't be done later in the write verifier as we
+ * can't get access to the log item at that point in time.
+ *
+ * We also calculate the CRC here so that the on-disk dquot in the
+ * buffer always has a valid CRC. This ensures there is no possibility
+ * of a dquot without an up-to-date CRC getting to disk.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp;
+
+ dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
+ xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
+
+ /*
* Attach an iodone routine so that we can remove this dquot from the
* AIL and release the flush lock once the dquot is synced to disk.
*/
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index c694a8469c4..68a68f70483 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -32,6 +32,13 @@
struct xfs_mount;
struct xfs_trans;
+enum {
+ XFS_QLOWSP_1_PCNT = 0,
+ XFS_QLOWSP_3_PCNT,
+ XFS_QLOWSP_5_PCNT,
+ XFS_QLOWSP_MAX
+};
+
/*
* The incore dquot structure
*/
@@ -45,12 +52,14 @@ typedef struct xfs_dquot {
int q_bufoffset; /* off of dq in buffer (# dquots) */
xfs_fileoff_t q_fileoffset; /* offset in quotas file */
- struct xfs_dquot*q_gdquot; /* group dquot, hint only */
xfs_disk_dquot_t q_core; /* actual usage & quotas */
xfs_dq_logitem_t q_logitem; /* dquot log item */
xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */
xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
+ xfs_qcnt_t q_prealloc_lo_wmark;/* prealloc throttle wmark */
+ xfs_qcnt_t q_prealloc_hi_wmark;/* prealloc disabled wmark */
+ int64_t q_low_space[XFS_QLOWSP_MAX];
struct mutex q_qlock; /* quota lock */
struct completion q_flush; /* flush completion queue */
atomic_t q_pincount; /* dquot pin count */
@@ -108,8 +117,9 @@ static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
case XFS_DQ_USER:
return XFS_IS_UQUOTA_ON(mp);
case XFS_DQ_GROUP:
+ return XFS_IS_GQUOTA_ON(mp);
case XFS_DQ_PROJ:
- return XFS_IS_OQUOTA_ON(mp);
+ return XFS_IS_PQUOTA_ON(mp);
default:
return 0;
}
@@ -121,8 +131,9 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
case XFS_DQ_USER:
return ip->i_udquot;
case XFS_DQ_GROUP:
- case XFS_DQ_PROJ:
return ip->i_gdquot;
+ case XFS_DQ_PROJ:
+ return ip->i_pdquot;
default:
return NULL;
}
@@ -133,10 +144,6 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP)
-#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo)
-#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? \
- XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
- XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
uint, struct xfs_dquot **);
@@ -145,14 +152,16 @@ extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **);
extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
xfs_disk_dquot_t *);
-extern void xfs_qm_adjust_dqlimits(xfs_mount_t *,
- xfs_disk_dquot_t *);
+extern void xfs_qm_adjust_dqlimits(struct xfs_mount *,
+ struct xfs_dquot *);
extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
xfs_dqid_t, uint, uint, xfs_dquot_t **);
extern void xfs_qm_dqput(xfs_dquot_t *);
extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
+extern void xfs_dquot_set_prealloc_limits(struct xfs_dquot *);
+
static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
{
xfs_dqlock(dqp);
@@ -161,6 +170,4 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
return dqp;
}
-extern const struct xfs_buf_ops xfs_dquot_buf_ops;
-
#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c
new file mode 100644
index 00000000000..c2ac0c611ad
--- /dev/null
+++ b/fs/xfs/xfs_dquot_buf.c
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_qm.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+
+int
+xfs_calc_dquots_per_chunk(
+ unsigned int nbblks) /* basic block units */
+{
+ unsigned int ndquots;
+
+ ASSERT(nbblks > 0);
+ ndquots = BBTOB(nbblks);
+ do_div(ndquots, sizeof(xfs_dqblk_t));
+
+ return ndquots;
+}
+
+/*
+ * Do some primitive error checking on ondisk dquot data structures.
+ */
+int
+xfs_dqcheck(
+ struct xfs_mount *mp,
+ xfs_disk_dquot_t *ddq,
+ xfs_dqid_t id,
+ uint type, /* used only when IO_dorepair is true */
+ uint flags,
+ char *str)
+{
+ xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
+ int errs = 0;
+
+ /*
+ * We can encounter an uninitialized dquot buffer for 2 reasons:
+ * 1. If we crash while deleting the quotainode(s), and those blks got
+ * used for user data. This is because we take the path of regular
+ * file deletion; however, the size field of quotainodes is never
+ * updated, so all the tricks that we play in itruncate_finish
+ * don't quite matter.
+ *
+ * 2. We don't play the quota buffers when there's a quotaoff logitem.
+ * But the allocation will be replayed so we'll end up with an
+ * uninitialized quota block.
+ *
+ * This is all fine; things are still consistent, and we haven't lost
+ * any quota information. Just don't complain about bad dquot blks.
+ */
+ if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
+ str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
+ errs++;
+ }
+ if (ddq->d_version != XFS_DQUOT_VERSION) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
+ str, id, ddq->d_version, XFS_DQUOT_VERSION);
+ errs++;
+ }
+
+ if (ddq->d_flags != XFS_DQ_USER &&
+ ddq->d_flags != XFS_DQ_PROJ &&
+ ddq->d_flags != XFS_DQ_GROUP) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
+ str, id, ddq->d_flags);
+ errs++;
+ }
+
+ if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : ondisk-dquot 0x%p, ID mismatch: "
+ "0x%x expected, found id 0x%x",
+ str, ddq, id, be32_to_cpu(ddq->d_id));
+ errs++;
+ }
+
+ if (!errs && ddq->d_id) {
+ if (ddq->d_blk_softlimit &&
+ be64_to_cpu(ddq->d_bcount) >
+ be64_to_cpu(ddq->d_blk_softlimit)) {
+ if (!ddq->d_btimer) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
+ str, (int)be32_to_cpu(ddq->d_id), ddq);
+ errs++;
+ }
+ }
+ if (ddq->d_ino_softlimit &&
+ be64_to_cpu(ddq->d_icount) >
+ be64_to_cpu(ddq->d_ino_softlimit)) {
+ if (!ddq->d_itimer) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
+ str, (int)be32_to_cpu(ddq->d_id), ddq);
+ errs++;
+ }
+ }
+ if (ddq->d_rtb_softlimit &&
+ be64_to_cpu(ddq->d_rtbcount) >
+ be64_to_cpu(ddq->d_rtb_softlimit)) {
+ if (!ddq->d_rtbtimer) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
+ str, (int)be32_to_cpu(ddq->d_id), ddq);
+ errs++;
+ }
+ }
+ }
+
+ if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
+ return errs;
+
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
+
+ /*
+ * Typically, a repair is only requested by quotacheck.
+ */
+ ASSERT(id != -1);
+ ASSERT(flags & XFS_QMOPT_DQREPAIR);
+ memset(d, 0, sizeof(xfs_dqblk_t));
+
+ d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+ d->dd_diskdq.d_flags = type;
+ d->dd_diskdq.d_id = cpu_to_be32(id);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+ xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
+
+ return errs;
+}
+
+STATIC bool
+xfs_dquot_buf_verify_crc(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp)
+{
+ struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
+ int ndquots;
+ int i;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return true;
+
+ /*
+ * if we are in log recovery, the quota subsystem has not been
+ * initialised so we have no quotainfo structure. In that case, we need
+ * to manually calculate the number of dquots in the buffer.
+ */
+ if (mp->m_quotainfo)
+ ndquots = mp->m_quotainfo->qi_dqperchunk;
+ else
+ ndquots = xfs_calc_dquots_per_chunk(
+ XFS_BB_TO_FSB(mp, bp->b_length));
+
+ for (i = 0; i < ndquots; i++, d++) {
+ if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF))
+ return false;
+ if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ }
+ return true;
+}
+
+STATIC bool
+xfs_dquot_buf_verify(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp)
+{
+ struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
+ xfs_dqid_t id = 0;
+ int ndquots;
+ int i;
+
+ /*
+ * if we are in log recovery, the quota subsystem has not been
+ * initialised so we have no quotainfo structure. In that case, we need
+ * to manually calculate the number of dquots in the buffer.
+ */
+ if (mp->m_quotainfo)
+ ndquots = mp->m_quotainfo->qi_dqperchunk;
+ else
+ ndquots = xfs_calc_dquots_per_chunk(bp->b_length);
+
+ /*
+ * On the first read of the buffer, verify that each dquot is valid.
+ * We don't know what the id of the dquot is supposed to be, just that
+ * they should be increasing monotonically within the buffer. If the
+ * first id is corrupt, then it will fail on the second dquot in the
+ * buffer so corruptions could point to the wrong dquot in this case.
+ */
+ for (i = 0; i < ndquots; i++) {
+ struct xfs_disk_dquot *ddq;
+ int error;
+
+ ddq = &d[i].dd_diskdq;
+
+ if (i == 0)
+ id = be32_to_cpu(ddq->d_id);
+
+ error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
+ "xfs_dquot_buf_verify");
+ if (error)
+ return false;
+ }
+ return true;
+}
+
+static void
+xfs_dquot_buf_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (!xfs_dquot_buf_verify_crc(mp, bp))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_dquot_buf_verify(mp, bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
+}
+
+/*
+ * we don't calculate the CRC here as that is done when the dquot is flushed to
+ * the buffer after the update is done. This ensures that the dquot in the
+ * buffer always has an up-to-date CRC value.
+ */
+static void
+xfs_dquot_buf_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (!xfs_dquot_buf_verify(mp, bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+}
+
+const struct xfs_buf_ops xfs_dquot_buf_ops = {
+ .verify_read = xfs_dquot_buf_read_verify,
+ .verify_write = xfs_dquot_buf_write_verify,
+};
+
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 57aa4b03720..f33fbaaa4d8 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -17,23 +17,20 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
+#include "xfs_quota.h"
#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_attr.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
#include "xfs_qm.h"
+#include "xfs_log.h"
static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
{
@@ -43,14 +40,15 @@ static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
/*
* returns the number of iovecs needed to log the given dquot item.
*/
-STATIC uint
+STATIC void
xfs_qm_dquot_logitem_size(
- struct xfs_log_item *lip)
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
{
- /*
- * we need only two iovecs, one for the format, one for the real thing
- */
- return 2;
+ *nvecs += 2;
+ *nbytes += sizeof(struct xfs_dq_logformat) +
+ sizeof(struct xfs_disk_dquot);
}
/*
@@ -59,20 +57,24 @@ xfs_qm_dquot_logitem_size(
STATIC void
xfs_qm_dquot_logitem_format(
struct xfs_log_item *lip,
- struct xfs_log_iovec *logvec)
+ struct xfs_log_vec *lv)
{
struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
-
- logvec->i_addr = &qlip->qli_format;
- logvec->i_len = sizeof(xfs_dq_logformat_t);
- logvec->i_type = XLOG_REG_TYPE_QFORMAT;
- logvec++;
- logvec->i_addr = &qlip->qli_dquot->q_core;
- logvec->i_len = sizeof(xfs_disk_dquot_t);
- logvec->i_type = XLOG_REG_TYPE_DQUOT;
-
- qlip->qli_format.qlf_size = 2;
-
+ struct xfs_log_iovec *vecp = NULL;
+ struct xfs_dq_logformat *qlf;
+
+ qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT);
+ qlf->qlf_type = XFS_LI_DQUOT;
+ qlf->qlf_size = 2;
+ qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id);
+ qlf->qlf_blkno = qlip->qli_dquot->q_blkno;
+ qlf->qlf_len = 1;
+ qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset;
+ xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat));
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT,
+ &qlip->qli_dquot->q_core,
+ sizeof(struct xfs_disk_dquot));
}
/*
@@ -140,7 +142,8 @@ xfs_qm_dqunpin_wait(
STATIC uint
xfs_qm_dquot_logitem_push(
struct xfs_log_item *lip,
- struct list_head *buffer_list)
+ struct list_head *buffer_list) __releases(&lip->li_ailp->xa_lock)
+ __acquires(&lip->li_ailp->xa_lock)
{
struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
struct xfs_buf *bp = NULL;
@@ -258,18 +261,6 @@ xfs_qm_dquot_logitem_init(
xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
&xfs_dquot_item_ops);
lp->qli_dquot = dqp;
- lp->qli_format.qlf_type = XFS_LI_DQUOT;
- lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
- lp->qli_format.qlf_blkno = dqp->q_blkno;
- lp->qli_format.qlf_len = 1;
- /*
- * This is just the offset of this dquot within its buffer
- * (which is currently 1 FSB and probably won't change).
- * Hence 32 bits for this offset should be just fine.
- * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t))
- * here, and recompute it at recovery time.
- */
- lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset;
}
/*------------------ QUOTAOFF LOG ITEMS -------------------*/
@@ -285,33 +276,30 @@ static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
* We only need 1 iovec for an quotaoff item. It just logs the
* quotaoff_log_format structure.
*/
-STATIC uint
+STATIC void
xfs_qm_qoff_logitem_size(
- struct xfs_log_item *lip)
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
{
- return 1;
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_qoff_logitem);
}
-/*
- * This is called to fill in the vector of log iovecs for the
- * given quotaoff log item. We use only 1 iovec, and we point that
- * at the quotaoff_log_format structure embedded in the quotaoff item.
- * It is at this point that we assert that all of the extent
- * slots in the quotaoff item have been filled.
- */
STATIC void
xfs_qm_qoff_logitem_format(
struct xfs_log_item *lip,
- struct xfs_log_iovec *log_vector)
+ struct xfs_log_vec *lv)
{
struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip);
-
- ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF);
-
- log_vector->i_addr = &qflip->qql_format;
- log_vector->i_len = sizeof(xfs_qoff_logitem_t);
- log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
- qflip->qql_format.qf_size = 1;
+ struct xfs_log_iovec *vecp = NULL;
+ struct xfs_qoff_logformat *qlf;
+
+ qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF);
+ qlf->qf_type = XFS_LI_QUOTAOFF;
+ qlf->qf_size = 1;
+ qlf->qf_flags = qflip->qql_flags;
+ xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem));
}
/*
@@ -451,8 +439,7 @@ xfs_qm_qoff_logitem_init(
xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
&xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
qf->qql_item.li_mountp = mp;
- qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
- qf->qql_format.qf_flags = flags;
qf->qql_start_lip = start;
+ qf->qql_flags = flags;
return qf;
}
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index 5acae2ada70..502e9464634 100644
--- a/fs/xfs/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
@@ -27,13 +27,12 @@ typedef struct xfs_dq_logitem {
xfs_log_item_t qli_item; /* common portion */
struct xfs_dquot *qli_dquot; /* dquot ptr */
xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
- xfs_dq_logformat_t qli_format; /* logged structure */
} xfs_dq_logitem_t;
typedef struct xfs_qoff_logitem {
xfs_log_item_t qql_item; /* common portion */
struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
- xfs_qoff_logformat_t qql_format; /* logged structure */
+ unsigned int qql_flags;
} xfs_qoff_logitem_t;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 610456054dc..edac5b057d2 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -16,17 +16,13 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
+#include "xfs_format.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_utils.h"
#include "xfs_error.h"
#ifdef DEBUG
@@ -66,7 +62,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
int i;
int64_t fsid;
- if (random32() % randfactor)
+ if (prandom_u32() % randfactor)
return 0;
memcpy(&fsid, fsidp, sizeof(xfs_fsid_t));
@@ -160,7 +156,7 @@ xfs_error_report(
{
if (level <= xfs_error_level) {
xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
- "Internal error %s at line %d of file %s. Caller 0x%p\n",
+ "Internal error %s at line %d of file %s. Caller %pF",
tag, linenum, filename, ra);
xfs_stack_trace();
@@ -178,7 +174,32 @@ xfs_corruption_error(
inst_t *ra)
{
if (level <= xfs_error_level)
- xfs_hex_dump(p, 16);
+ xfs_hex_dump(p, 64);
xfs_error_report(tag, level, mp, filename, linenum, ra);
xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
}
+
+/*
+ * Warnings specifically for verifier errors. Differentiate CRC vs. invalid
+ * values, and omit the stack trace unless the error level is tuned high.
+ */
+void
+xfs_verifier_error(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx",
+ bp->b_error == EFSBADCRC ? "CRC error" : "corruption",
+ __return_address, bp->b_bn);
+
+ xfs_alert(mp, "Unmount and run xfs_repair");
+
+ if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
+ xfs_alert(mp, "First 64 bytes of corrupted metadata buffer:");
+ xfs_hex_dump(xfs_buf_offset(bp, 0), 64);
+ }
+
+ if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
+}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 079a367f44e..c1c57d4a4b5 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -34,6 +34,7 @@ extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
extern void xfs_corruption_error(const char *tag, int level,
struct xfs_mount *mp, void *p, const char *filename,
int linenum, inst_t *ra);
+extern void xfs_verifier_error(struct xfs_buf *bp);
#define XFS_ERROR_REPORT(e, lvl, mp) \
xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index a83611849ce..753e467aa1a 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -16,20 +16,21 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_dir2.h"
#include "xfs_export.h"
-#include "xfs_vnodeops.h"
-#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_log.h"
/*
* Note that we only accept fileids which are long enough rather than allow
@@ -48,7 +49,7 @@ static int xfs_fileid_length(int fileid_type)
case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
return 6;
}
- return 255; /* invalid */
+ return FILEID_INVALID;
}
STATIC int
@@ -90,7 +91,7 @@ xfs_fs_encode_fh(
len = xfs_fileid_length(fileid_type);
if (*max_len < len) {
*max_len = len;
- return 255;
+ return FILEID_INVALID;
}
*max_len = len;
@@ -236,7 +237,7 @@ xfs_fs_nfs_commit_metadata(
if (!lsn)
return 0;
- return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
}
const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 85e9f87a1a7..fd22f69049d 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -19,17 +19,18 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
#include "xfs_alloc.h"
-#include "xfs_inode.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
void
xfs_extent_busy_insert(
@@ -147,7 +148,7 @@ xfs_extent_busy_search(
* extent. If the overlap covers the beginning, the end, or all of the busy
* extent, the overlapping portion can be made unbusy and used for the
* allocation. We can't split a busy extent because we can't modify a
- * transaction/CIL context busy list, but we can update an entries block
+ * transaction/CIL context busy list, but we can update an entry's block
* number or length.
*
* Returns true if the extent can safely be reused, or false if the search
@@ -160,7 +161,8 @@ xfs_extent_busy_update_extent(
struct xfs_extent_busy *busyp,
xfs_agblock_t fbno,
xfs_extlen_t flen,
- bool userdata)
+ bool userdata) __releases(&pag->pagb_lock)
+ __acquires(&pag->pagb_lock)
{
xfs_agblock_t fend = fbno + flen;
xfs_agblock_t bbno = busyp->bno;
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index 985412d65ba..bfff284d2dc 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -20,6 +20,10 @@
#ifndef __XFS_EXTENT_BUSY_H__
#define __XFS_EXTENT_BUSY_H__
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_alloc_arg;
+
/*
* Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
* have been freed but whose transactions aren't committed to disk yet.
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index feb36d7551a..fb7a4c1ce1c 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -17,15 +17,16 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_trans.h"
#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
#include "xfs_extfree_item.h"
+#include "xfs_log.h"
kmem_zone_t *xfs_efi_zone;
@@ -50,9 +51,8 @@ xfs_efi_item_free(
* Freeing the efi requires that we remove it from the AIL if it has already
* been placed there. However, the EFI may not yet have been placed in the AIL
* when called by xfs_efi_release() from EFD processing due to the ordering of
- * committed vs unpin operations in bulk insert operations. Hence the
- * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
- * the EFI.
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the EFI.
*/
STATIC void
__xfs_efi_release(
@@ -60,7 +60,7 @@ __xfs_efi_release(
{
struct xfs_ail *ailp = efip->efi_item.li_ailp;
- if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
+ if (atomic_dec_and_test(&efip->efi_refcount)) {
spin_lock(&ailp->xa_lock);
/* xfs_trans_ail_delete() drops the AIL lock. */
xfs_trans_ail_delete(ailp, &efip->efi_item,
@@ -74,11 +74,22 @@ __xfs_efi_release(
* We only need 1 iovec for an efi item. It just logs the efi_log_format
* structure.
*/
-STATIC uint
+static inline int
+xfs_efi_item_sizeof(
+ struct xfs_efi_log_item *efip)
+{
+ return sizeof(struct xfs_efi_log_format) +
+ (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
+}
+
+STATIC void
xfs_efi_item_size(
- struct xfs_log_item *lip)
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
{
- return 1;
+ *nvecs += 1;
+ *nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip));
}
/*
@@ -91,24 +102,20 @@ xfs_efi_item_size(
STATIC void
xfs_efi_item_format(
struct xfs_log_item *lip,
- struct xfs_log_iovec *log_vector)
+ struct xfs_log_vec *lv)
{
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
- uint size;
+ struct xfs_log_iovec *vecp = NULL;
ASSERT(atomic_read(&efip->efi_next_extent) ==
efip->efi_format.efi_nextents);
efip->efi_format.efi_type = XFS_LI_EFI;
-
- size = sizeof(xfs_efi_log_format_t);
- size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
efip->efi_format.efi_size = 1;
- log_vector->i_addr = &efip->efi_format;
- log_vector->i_len = size;
- log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
- ASSERT(size >= sizeof(xfs_efi_log_format_t));
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT,
+ &efip->efi_format,
+ xfs_efi_item_sizeof(efip));
}
@@ -126,8 +133,8 @@ xfs_efi_item_pin(
* which the EFI is manipulated during a transaction. If we are being asked to
* remove the EFI it's because the transaction has been cancelled and by
* definition that means the EFI cannot be in the AIL so remove it from the
- * transaction and free it. Otherwise coordinate with xfs_efi_release() (via
- * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
+ * transaction and free it. Otherwise coordinate with xfs_efi_release()
+ * to determine who gets to free the EFI.
*/
STATIC void
xfs_efi_item_unpin(
@@ -171,19 +178,13 @@ xfs_efi_item_unlock(
/*
* The EFI is logged only once and cannot be moved in the log, so simply return
- * the lsn at which it's been logged. For bulk transaction committed
- * processing, the EFI may be processed but not yet unpinned prior to the EFD
- * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
- * when processing the EFD.
+ * the lsn at which it's been logged.
*/
STATIC xfs_lsn_t
xfs_efi_item_committed(
struct xfs_log_item *lip,
xfs_lsn_t lsn)
{
- struct xfs_efi_log_item *efip = EFI_ITEM(lip);
-
- set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
return lsn;
}
@@ -241,6 +242,7 @@ xfs_efi_init(
efip->efi_format.efi_nextents = nextents;
efip->efi_format.efi_id = (__psint_t)(void*)efip;
atomic_set(&efip->efi_next_extent, 0);
+ atomic_set(&efip->efi_refcount, 2);
return efip;
}
@@ -310,8 +312,14 @@ xfs_efi_release(xfs_efi_log_item_t *efip,
uint nextents)
{
ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
- if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
+ if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) {
+ /* recovery needs us to drop the EFI reference, too */
+ if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
+ __xfs_efi_release(efip);
+
__xfs_efi_release(efip);
+ /* efip may now have been freed, do not reference it again. */
+ }
}
static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
@@ -333,11 +341,22 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp)
* We only need 1 iovec for an efd item. It just logs the efd_log_format
* structure.
*/
-STATIC uint
+static inline int
+xfs_efd_item_sizeof(
+ struct xfs_efd_log_item *efdp)
+{
+ return sizeof(xfs_efd_log_format_t) +
+ (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
+}
+
+STATIC void
xfs_efd_item_size(
- struct xfs_log_item *lip)
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
{
- return 1;
+ *nvecs += 1;
+ *nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip));
}
/*
@@ -350,23 +369,19 @@ xfs_efd_item_size(
STATIC void
xfs_efd_item_format(
struct xfs_log_item *lip,
- struct xfs_log_iovec *log_vector)
+ struct xfs_log_vec *lv)
{
struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
- uint size;
+ struct xfs_log_iovec *vecp = NULL;
ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
efdp->efd_format.efd_type = XFS_LI_EFD;
-
- size = sizeof(xfs_efd_log_format_t);
- size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
efdp->efd_format.efd_size = 1;
- log_vector->i_addr = &efdp->efd_format;
- log_vector->i_len = size;
- log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
- ASSERT(size >= sizeof(xfs_efd_log_format_t));
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT,
+ &efdp->efd_format,
+ xfs_efd_item_sizeof(efdp));
}
/*
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 375f68e4253..0ffbce32d56 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -18,93 +18,11 @@
#ifndef __XFS_EXTFREE_ITEM_H__
#define __XFS_EXTFREE_ITEM_H__
+/* kernel only EFI/EFD definitions */
+
struct xfs_mount;
struct kmem_zone;
-typedef struct xfs_extent {
- xfs_dfsbno_t ext_start;
- xfs_extlen_t ext_len;
-} xfs_extent_t;
-
-/*
- * Since an xfs_extent_t has types (start:64, len: 32)
- * there are different alignments on 32 bit and 64 bit kernels.
- * So we provide the different variants for use by a
- * conversion routine.
- */
-
-typedef struct xfs_extent_32 {
- __uint64_t ext_start;
- __uint32_t ext_len;
-} __attribute__((packed)) xfs_extent_32_t;
-
-typedef struct xfs_extent_64 {
- __uint64_t ext_start;
- __uint32_t ext_len;
- __uint32_t ext_pad;
-} xfs_extent_64_t;
-
-/*
- * This is the structure used to lay out an efi log item in the
- * log. The efi_extents field is a variable size array whose
- * size is given by efi_nextents.
- */
-typedef struct xfs_efi_log_format {
- __uint16_t efi_type; /* efi log item type */
- __uint16_t efi_size; /* size of this item */
- __uint32_t efi_nextents; /* # extents to free */
- __uint64_t efi_id; /* efi identifier */
- xfs_extent_t efi_extents[1]; /* array of extents to free */
-} xfs_efi_log_format_t;
-
-typedef struct xfs_efi_log_format_32 {
- __uint16_t efi_type; /* efi log item type */
- __uint16_t efi_size; /* size of this item */
- __uint32_t efi_nextents; /* # extents to free */
- __uint64_t efi_id; /* efi identifier */
- xfs_extent_32_t efi_extents[1]; /* array of extents to free */
-} __attribute__((packed)) xfs_efi_log_format_32_t;
-
-typedef struct xfs_efi_log_format_64 {
- __uint16_t efi_type; /* efi log item type */
- __uint16_t efi_size; /* size of this item */
- __uint32_t efi_nextents; /* # extents to free */
- __uint64_t efi_id; /* efi identifier */
- xfs_extent_64_t efi_extents[1]; /* array of extents to free */
-} xfs_efi_log_format_64_t;
-
-/*
- * This is the structure used to lay out an efd log item in the
- * log. The efd_extents array is a variable size array whose
- * size is given by efd_nextents;
- */
-typedef struct xfs_efd_log_format {
- __uint16_t efd_type; /* efd log item type */
- __uint16_t efd_size; /* size of this item */
- __uint32_t efd_nextents; /* # of extents freed */
- __uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_t efd_extents[1]; /* array of extents freed */
-} xfs_efd_log_format_t;
-
-typedef struct xfs_efd_log_format_32 {
- __uint16_t efd_type; /* efd log item type */
- __uint16_t efd_size; /* size of this item */
- __uint32_t efd_nextents; /* # of extents freed */
- __uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_32_t efd_extents[1]; /* array of extents freed */
-} __attribute__((packed)) xfs_efd_log_format_32_t;
-
-typedef struct xfs_efd_log_format_64 {
- __uint16_t efd_type; /* efd log item type */
- __uint16_t efd_size; /* size of this item */
- __uint32_t efd_nextents; /* # of extents freed */
- __uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_64_t efd_extents[1]; /* array of extents freed */
-} xfs_efd_log_format_64_t;
-
-
-#ifdef __KERNEL__
-
/*
* Max number of extents in fast allocation path.
*/
@@ -114,16 +32,20 @@ typedef struct xfs_efd_log_format_64 {
* Define EFI flag bits. Manipulated by set/clear/test_bit operators.
*/
#define XFS_EFI_RECOVERED 1
-#define XFS_EFI_COMMITTED 2
/*
- * This is the "extent free intention" log item. It is used
- * to log the fact that some extents need to be free. It is
- * used in conjunction with the "extent free done" log item
- * described below.
+ * This is the "extent free intention" log item. It is used to log the fact
+ * that some extents need to be free. It is used in conjunction with the
+ * "extent free done" log item described below.
+ *
+ * The EFI is reference counted so that it is not freed prior to both the EFI
+ * and EFD being committed and unpinned. This ensures that when the last
+ * reference goes away the EFI will always be in the AIL as it has been
+ * unpinned, regardless of whether the EFD is processed before or after the EFI.
*/
typedef struct xfs_efi_log_item {
xfs_log_item_t efi_item;
+ atomic_t efi_refcount;
atomic_t efi_next_extent;
unsigned long efi_flags; /* misc flags */
xfs_efi_log_format_t efi_format;
@@ -156,6 +78,4 @@ int xfs_efi_copy_format(xfs_log_iovec_t *buf,
xfs_efi_log_format_t *dst_efi_fmt);
void xfs_efi_item_free(xfs_efi_log_item_t *);
-#endif /* __KERNEL__ */
-
#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 67284edb84d..1f66779d7a4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -17,25 +17,29 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_log.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_trans.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_dinode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
#include "xfs_error.h"
-#include "xfs_vnodeops.h"
-#include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_dinode.h"
+#include <linux/aio.h>
#include <linux/dcache.h>
#include <linux/falloc.h>
#include <linux/pagevec.h>
@@ -151,7 +155,7 @@ xfs_dir_fsync(
if (!lsn)
return 0;
- return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
}
STATIC int
@@ -225,39 +229,33 @@ xfs_file_fsync(
}
STATIC ssize_t
-xfs_file_aio_read(
+xfs_file_read_iter(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos)
+ struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- size_t size = 0;
+ size_t size = iov_iter_count(to);
ssize_t ret = 0;
int ioflags = 0;
xfs_fsize_t n;
+ loff_t pos = iocb->ki_pos;
XFS_STATS_INC(xs_read_calls);
- BUG_ON(iocb->ki_pos != pos);
-
if (unlikely(file->f_flags & O_DIRECT))
ioflags |= IO_ISDIRECT;
if (file->f_mode & FMODE_NOCMTIME)
ioflags |= IO_INVIS;
- ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE);
- if (ret < 0)
- return ret;
-
if (unlikely(ioflags & IO_ISDIRECT)) {
xfs_buftarg_t *target =
XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
- if ((pos & target->bt_smask) || (size & target->bt_smask)) {
+ /* DIO must be aligned to device logical sector size */
+ if ((pos | size) & target->bt_logical_sectormask) {
if (pos == i_size_read(inode))
return 0;
return -XFS_ERROR(EINVAL);
@@ -290,7 +288,7 @@ xfs_file_aio_read(
xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
if (inode->i_mapping->nrpages) {
- ret = -filemap_write_and_wait_range(
+ ret = filemap_write_and_wait_range(
VFS_I(ip)->i_mapping,
pos, -1);
if (ret) {
@@ -304,7 +302,7 @@ xfs_file_aio_read(
trace_xfs_file_read(ip, size, pos, ioflags);
- ret = generic_file_aio_read(iocb, iovp, nr_segs, pos);
+ ret = generic_file_read_iter(iocb, to);
if (ret > 0)
XFS_STATS_ADD(xs_read_bytes, ret);
@@ -345,47 +343,6 @@ xfs_file_splice_read(
}
/*
- * xfs_file_splice_write() does not use xfs_rw_ilock() because
- * generic_file_splice_write() takes the i_mutex itself. This, in theory,
- * couuld cause lock inversions between the aio_write path and the splice path
- * if someone is doing concurrent splice(2) based writes and write(2) based
- * writes to the same inode. The only real way to fix this is to re-implement
- * the generic code here with correct locking orders.
- */
-STATIC ssize_t
-xfs_file_splice_write(
- struct pipe_inode_info *pipe,
- struct file *outfilp,
- loff_t *ppos,
- size_t count,
- unsigned int flags)
-{
- struct inode *inode = outfilp->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- int ioflags = 0;
- ssize_t ret;
-
- XFS_STATS_INC(xs_write_calls);
-
- if (outfilp->f_mode & FMODE_NOCMTIME)
- ioflags |= IO_INVIS;
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return -EIO;
-
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
- trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
-
- ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
- if (ret > 0)
- XFS_STATS_ADD(xs_write_bytes, ret);
-
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return ret;
-}
-
-/*
* This routine is called to handle zeroing any space in the last block of the
* file that is beyond the EOF. We do this since the size is being increased
* without writing anything to that block and we don't want to read the
@@ -620,10 +577,7 @@ restart:
STATIC ssize_t
xfs_file_dio_aio_write(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos,
- size_t ocount)
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -631,15 +585,18 @@ xfs_file_dio_aio_write(
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
ssize_t ret = 0;
- size_t count = ocount;
int unaligned_io = 0;
int iolock;
+ size_t count = iov_iter_count(from);
+ loff_t pos = iocb->ki_pos;
struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
- if ((pos & target->bt_smask) || (count & target->bt_smask))
+ /* DIO must be aligned to device logical sector size */
+ if ((pos | count) & target->bt_logical_sectormask)
return -XFS_ERROR(EINVAL);
+ /* "unaligned" here means not aligned to a filesystem block */
if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
unaligned_io = 1;
@@ -670,9 +627,10 @@ xfs_file_dio_aio_write(
ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
if (ret)
goto out;
+ iov_iter_truncate(from, count);
if (mapping->nrpages) {
- ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
pos, -1);
if (ret)
goto out;
@@ -691,8 +649,7 @@ xfs_file_dio_aio_write(
}
trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
- ret = generic_file_direct_write(iocb, iovp,
- &nr_segs, pos, &iocb->ki_pos, count, ocount);
+ ret = generic_file_direct_write(iocb, from, pos);
out:
xfs_rw_iunlock(ip, iolock);
@@ -705,10 +662,7 @@ out:
STATIC ssize_t
xfs_file_buffered_aio_write(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos,
- size_t ocount)
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -717,7 +671,8 @@ xfs_file_buffered_aio_write(
ssize_t ret;
int enospc = 0;
int iolock = XFS_IOLOCK_EXCL;
- size_t count = ocount;
+ loff_t pos = iocb->ki_pos;
+ size_t count = iov_iter_count(from);
xfs_rw_ilock(ip, iolock);
@@ -725,14 +680,15 @@ xfs_file_buffered_aio_write(
if (ret)
goto out;
+ iov_iter_truncate(from, count);
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
write_retry:
trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
- ret = generic_file_buffered_write(iocb, iovp, nr_segs,
- pos, &iocb->ki_pos, count, 0);
-
+ ret = generic_perform_write(file, from, pos);
+ if (likely(ret >= 0))
+ iocb->ki_pos = pos + ret;
/*
* If we just got an ENOSPC, try to write back all dirty inodes to
* convert delalloc space to free up some of the excess reserved
@@ -751,42 +707,29 @@ out:
}
STATIC ssize_t
-xfs_file_aio_write(
+xfs_file_write_iter(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos)
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
- size_t ocount = 0;
+ size_t ocount = iov_iter_count(from);
XFS_STATS_INC(xs_write_calls);
- BUG_ON(iocb->ki_pos != pos);
-
- ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
- if (ret)
- return ret;
-
if (ocount == 0)
return 0;
- sb_start_write(inode->i_sb);
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- ret = -EIO;
- goto out;
- }
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return -EIO;
if (unlikely(file->f_flags & O_DIRECT))
- ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
+ ret = xfs_file_dio_aio_write(iocb, from);
else
- ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
- ocount);
+ ret = xfs_file_buffered_aio_write(iocb, from);
if (ret > 0) {
ssize_t err;
@@ -794,56 +737,99 @@ xfs_file_aio_write(
XFS_STATS_ADD(xs_write_bytes, ret);
/* Handle various SYNC-type writes */
- err = generic_write_sync(file, pos, ret);
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
if (err < 0)
ret = err;
}
-
-out:
- sb_end_write(inode->i_sb);
return ret;
}
STATIC long
xfs_file_fallocate(
- struct file *file,
- int mode,
- loff_t offset,
- loff_t len)
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
{
- struct inode *inode = file->f_path.dentry->d_inode;
- long error;
- loff_t new_size = 0;
- xfs_flock64_t bf;
- xfs_inode_t *ip = XFS_I(inode);
- int cmd = XFS_IOC_RESVSP;
- int attr_flags = XFS_ATTR_NOLOCK;
+ struct inode *inode = file_inode(file);
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_trans *tp;
+ long error;
+ loff_t new_size = 0;
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
return -EOPNOTSUPP;
- bf.l_whence = 0;
- bf.l_start = offset;
- bf.l_len = len;
-
xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ error = xfs_free_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+ unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
- if (mode & FALLOC_FL_PUNCH_HOLE)
- cmd = XFS_IOC_UNRESVSP;
+ if (offset & blksize_mask || len & blksize_mask) {
+ error = EINVAL;
+ goto out_unlock;
+ }
- /* check the new inode size is valid before allocating */
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- offset + len > i_size_read(inode)) {
- new_size = offset + len;
- error = inode_newsize_ok(inode, new_size);
+ /*
+ * There is no need to overlap collapse range with EOF,
+ * in which case it is effectively a truncate operation
+ */
+ if (offset + len >= i_size_read(inode)) {
+ error = EINVAL;
+ goto out_unlock;
+ }
+
+ new_size = i_size_read(inode) - len;
+
+ error = xfs_collapse_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ } else {
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ error = -inode_newsize_ok(inode, new_size);
+ if (error)
+ goto out_unlock;
+ }
+
+ if (mode & FALLOC_FL_ZERO_RANGE)
+ error = xfs_zero_file_space(ip, offset, len);
+ else
+ error = xfs_alloc_file_space(ip, offset, len,
+ XFS_BMAPI_PREALLOC);
if (error)
goto out_unlock;
}
- if (file->f_flags & O_DSYNC)
- attr_flags |= XFS_ATTR_SYNC;
+ tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
+ error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto out_unlock;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ ip->i_d.di_mode &= ~S_ISUID;
+ if (ip->i_d.di_mode & S_IXGRP)
+ ip->i_d.di_mode &= ~S_ISGID;
+
+ if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)))
+ ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
- error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags);
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ if (file->f_flags & O_DSYNC)
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp, 0);
if (error)
goto out_unlock;
@@ -853,12 +839,12 @@ xfs_file_fallocate(
iattr.ia_valid = ATTR_SIZE;
iattr.ia_size = new_size;
- error = -xfs_setattr_size(ip, &iattr, XFS_ATTR_NOLOCK);
+ error = xfs_setattr_size(ip, &iattr);
}
out_unlock:
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return error;
+ return -error;
}
@@ -891,9 +877,9 @@ xfs_dir_open(
* If there are any blocks, read-ahead block 0 as we're almost
* certain to have the next operation be a read there.
*/
- mode = xfs_ilock_map_shared(ip);
+ mode = xfs_ilock_data_map_shared(ip);
if (ip->i_d.di_nextents > 0)
- xfs_dir2_data_readahead(NULL, ip, 0, -1);
+ xfs_dir3_data_readahead(ip, 0, -1);
xfs_iunlock(ip, mode);
return 0;
}
@@ -908,11 +894,10 @@ xfs_file_release(
STATIC int
xfs_file_readdir(
- struct file *filp,
- void *dirent,
- filldir_t filldir)
+ struct file *file,
+ struct dir_context *ctx)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
xfs_inode_t *ip = XFS_I(inode);
int error;
size_t bufsize;
@@ -931,8 +916,7 @@ xfs_file_readdir(
*/
bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
- error = xfs_readdir(ip, dirent, bufsize,
- (xfs_off_t *)&filp->f_pos, filldir);
+ error = xfs_readdir(ip, ctx, bufsize);
if (error)
return -error;
return 0;
@@ -1196,7 +1180,7 @@ xfs_seek_data(
uint lock;
int error;
- lock = xfs_ilock_map_shared(ip);
+ lock = xfs_ilock_data_map_shared(ip);
isize = i_size_read(inode);
if (start >= isize) {
@@ -1272,11 +1256,10 @@ xfs_seek_data(
}
out:
- if (offset != file->f_pos)
- file->f_pos = offset;
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out_unlock:
- xfs_iunlock_map_shared(ip, lock);
+ xfs_iunlock(ip, lock);
if (error)
return -error;
@@ -1301,7 +1284,7 @@ xfs_seek_hole(
if (XFS_FORCED_SHUTDOWN(mp))
return -XFS_ERROR(EIO);
- lock = xfs_ilock_map_shared(ip);
+ lock = xfs_ilock_data_map_shared(ip);
isize = i_size_read(inode);
if (start >= isize) {
@@ -1381,11 +1364,10 @@ out:
* situation in particular.
*/
offset = min_t(loff_t, offset, isize);
- if (offset != file->f_pos)
- file->f_pos = offset;
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out_unlock:
- xfs_iunlock_map_shared(ip, lock);
+ xfs_iunlock(ip, lock);
if (error)
return -error;
@@ -1414,12 +1396,12 @@ xfs_file_llseek(
const struct file_operations xfs_file_operations = {
.llseek = xfs_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = xfs_file_aio_read,
- .aio_write = xfs_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = xfs_file_read_iter,
+ .write_iter = xfs_file_write_iter,
.splice_read = xfs_file_splice_read,
- .splice_write = xfs_file_splice_write,
+ .splice_write = iter_file_splice_write,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
@@ -1434,7 +1416,7 @@ const struct file_operations xfs_file_operations = {
const struct file_operations xfs_dir_file_operations = {
.open = xfs_dir_open,
.read = generic_read_dir,
- .readdir = xfs_file_readdir,
+ .iterate = xfs_file_readdir,
.llseek = generic_file_llseek,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
@@ -1445,6 +1427,7 @@ const struct file_operations xfs_dir_file_operations = {
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = xfs_vm_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 5170306a100..8ec81bed799 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * Copyright (c) 2014 Christoph Hellwig.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -16,116 +17,36 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inum.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_ag.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
+#include "xfs_inum.h"
+#include "xfs_inode.h"
#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
#include "xfs_alloc.h"
-#include "xfs_utils.h"
#include "xfs_mru_cache.h"
+#include "xfs_dinode.h"
#include "xfs_filestream.h"
#include "xfs_trace.h"
-#ifdef XFS_FILESTREAMS_TRACE
-
-ktrace_t *xfs_filestreams_trace_buf;
-
-STATIC void
-xfs_filestreams_trace(
- xfs_mount_t *mp, /* mount point */
- int type, /* type of trace */
- const char *func, /* source function */
- int line, /* source line number */
- __psunsigned_t arg0,
- __psunsigned_t arg1,
- __psunsigned_t arg2,
- __psunsigned_t arg3,
- __psunsigned_t arg4,
- __psunsigned_t arg5)
-{
- ktrace_enter(xfs_filestreams_trace_buf,
- (void *)(__psint_t)(type | (line << 16)),
- (void *)func,
- (void *)(__psunsigned_t)current_pid(),
- (void *)mp,
- (void *)(__psunsigned_t)arg0,
- (void *)(__psunsigned_t)arg1,
- (void *)(__psunsigned_t)arg2,
- (void *)(__psunsigned_t)arg3,
- (void *)(__psunsigned_t)arg4,
- (void *)(__psunsigned_t)arg5,
- NULL, NULL, NULL, NULL, NULL, NULL);
-}
-
-#define TRACE0(mp,t) TRACE6(mp,t,0,0,0,0,0,0)
-#define TRACE1(mp,t,a0) TRACE6(mp,t,a0,0,0,0,0,0)
-#define TRACE2(mp,t,a0,a1) TRACE6(mp,t,a0,a1,0,0,0,0)
-#define TRACE3(mp,t,a0,a1,a2) TRACE6(mp,t,a0,a1,a2,0,0,0)
-#define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0)
-#define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0)
-#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \
- xfs_filestreams_trace(mp, t, __func__, __LINE__, \
- (__psunsigned_t)a0, (__psunsigned_t)a1, \
- (__psunsigned_t)a2, (__psunsigned_t)a3, \
- (__psunsigned_t)a4, (__psunsigned_t)a5)
-
-#define TRACE_AG_SCAN(mp, ag, ag2) \
- TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2);
-#define TRACE_AG_PICK1(mp, max_ag, maxfree) \
- TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree);
-#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \
- TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \
- cnt, free, scan, flag)
-#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \
- TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, ag2, cnt2)
-#define TRACE_FREE(mp, ip, pip, ag, cnt) \
- TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt)
-#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \
- TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt)
-#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \
- TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt)
-#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \
- TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt)
-#define TRACE_ORPHAN(mp, ip, ag) \
- TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag);
-
-
-#else
-#define TRACE_AG_SCAN(mp, ag, ag2)
-#define TRACE_AG_PICK1(mp, max_ag, maxfree)
-#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag)
-#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2)
-#define TRACE_FREE(mp, ip, pip, ag, cnt)
-#define TRACE_LOOKUP(mp, ip, pip, ag, cnt)
-#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt)
-#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt)
-#define TRACE_ORPHAN(mp, ip, ag)
-#endif
-
-static kmem_zone_t *item_zone;
+struct xfs_fstrm_item {
+ struct xfs_mru_cache_elem mru;
+ struct xfs_inode *ip;
+ xfs_agnumber_t ag; /* AG in use for this directory */
+};
-/*
- * Structure for associating a file or a directory with an allocation group.
- * The parent directory pointer is only needed for files, but since there will
- * generally be vastly more files than directories in the cache, using the same
- * data structure simplifies the code with very little memory overhead.
- */
-typedef struct fstrm_item
-{
- xfs_agnumber_t ag; /* AG currently in use for the file/directory. */
- xfs_inode_t *ip; /* inode self-pointer. */
- xfs_inode_t *pip; /* Parent directory inode pointer. */
-} fstrm_item_t;
+enum xfs_fstrm_alloc {
+ XFS_PICK_USERDATA = 1,
+ XFS_PICK_LOWSPACE = 2,
+};
/*
* Allocation group filestream associations are tracked with per-ag atomic
- * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a
+ * counters. These counters allow xfs_filestream_pick_ag() to tell whether a
* particular AG already has active filestreams associated with it. The mount
* point's m_peraglock is used to protect these counters from per-ag array
* re-allocation during a growfs operation. When xfs_growfs_data_private() is
@@ -160,7 +81,7 @@ typedef struct fstrm_item
* the cache that reference per-ag array elements that have since been
* reallocated.
*/
-static int
+int
xfs_filestream_peek_ag(
xfs_mount_t *mp,
xfs_agnumber_t agno)
@@ -200,23 +121,40 @@ xfs_filestream_put_ag(
xfs_perag_put(pag);
}
+static void
+xfs_fstrm_free_func(
+ struct xfs_mru_cache_elem *mru)
+{
+ struct xfs_fstrm_item *item =
+ container_of(mru, struct xfs_fstrm_item, mru);
+
+ xfs_filestream_put_ag(item->ip->i_mount, item->ag);
+
+ trace_xfs_filestream_free(item->ip, item->ag);
+
+ kmem_free(item);
+}
+
/*
* Scan the AGs starting at startag looking for an AG that isn't in use and has
* at least minlen blocks free.
*/
static int
-_xfs_filestream_pick_ag(
- xfs_mount_t *mp,
- xfs_agnumber_t startag,
- xfs_agnumber_t *agp,
- int flags,
- xfs_extlen_t minlen)
+xfs_filestream_pick_ag(
+ struct xfs_inode *ip,
+ xfs_agnumber_t startag,
+ xfs_agnumber_t *agp,
+ int flags,
+ xfs_extlen_t minlen)
{
- int streams, max_streams;
- int err, trylock, nscan;
- xfs_extlen_t longest, free, minfree, maxfree = 0;
- xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
- struct xfs_perag *pag;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_fstrm_item *item;
+ struct xfs_perag *pag;
+ xfs_extlen_t longest, free = 0, minfree, maxfree = 0;
+ xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
+ int err, trylock, nscan;
+
+ ASSERT(S_ISDIR(ip->i_d.di_mode));
/* 2% of an AG's blocks must be free for it to be chosen. */
minfree = mp->m_sb.sb_agblocks / 50;
@@ -228,8 +166,9 @@ _xfs_filestream_pick_ag(
trylock = XFS_ALLOC_FLAG_TRYLOCK;
for (nscan = 0; 1; nscan++) {
+ trace_xfs_filestream_scan(ip, ag);
+
pag = xfs_perag_get(mp, ag);
- TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms));
if (!pag->pagf_init) {
err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
@@ -246,7 +185,6 @@ _xfs_filestream_pick_ag(
/* Keep track of the AG with the most free blocks. */
if (pag->pagf_freeblks > maxfree) {
maxfree = pag->pagf_freeblks;
- max_streams = atomic_read(&pag->pagf_fstrms);
max_ag = ag;
}
@@ -269,7 +207,6 @@ _xfs_filestream_pick_ag(
/* Break out, retaining the reference on the AG. */
free = pag->pagf_freeblks;
- streams = atomic_read(&pag->pagf_fstrms);
xfs_perag_put(pag);
*agp = ag;
break;
@@ -305,317 +242,98 @@ next_ag:
*/
if (max_ag != NULLAGNUMBER) {
xfs_filestream_get_ag(mp, max_ag);
- TRACE_AG_PICK1(mp, max_ag, maxfree);
- streams = max_streams;
free = maxfree;
*agp = max_ag;
break;
}
/* take AG 0 if none matched */
- TRACE_AG_PICK1(mp, max_ag, maxfree);
+ trace_xfs_filestream_pick(ip, *agp, free, nscan);
*agp = 0;
return 0;
}
- TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags);
-
- return 0;
-}
-
-/*
- * Set the allocation group number for a file or a directory, updating inode
- * references and per-AG references as appropriate.
- */
-static int
-_xfs_filestream_update_ag(
- xfs_inode_t *ip,
- xfs_inode_t *pip,
- xfs_agnumber_t ag)
-{
- int err = 0;
- xfs_mount_t *mp;
- xfs_mru_cache_t *cache;
- fstrm_item_t *item;
- xfs_agnumber_t old_ag;
- xfs_inode_t *old_pip;
+ trace_xfs_filestream_pick(ip, *agp, free, nscan);
- /*
- * Either ip is a regular file and pip is a directory, or ip is a
- * directory and pip is NULL.
- */
- ASSERT(ip && ((S_ISREG(ip->i_d.di_mode) && pip &&
- S_ISDIR(pip->i_d.di_mode)) ||
- (S_ISDIR(ip->i_d.di_mode) && !pip)));
-
- mp = ip->i_mount;
- cache = mp->m_filestream;
-
- item = xfs_mru_cache_lookup(cache, ip->i_ino);
- if (item) {
- ASSERT(item->ip == ip);
- old_ag = item->ag;
- item->ag = ag;
- old_pip = item->pip;
- item->pip = pip;
- xfs_mru_cache_done(cache);
-
- /*
- * If the AG has changed, drop the old ref and take a new one,
- * effectively transferring the reference from old to new AG.
- */
- if (ag != old_ag) {
- xfs_filestream_put_ag(mp, old_ag);
- xfs_filestream_get_ag(mp, ag);
- }
-
- /*
- * If ip is a file and its pip has changed, drop the old ref and
- * take a new one.
- */
- if (pip && pip != old_pip) {
- IRELE(old_pip);
- IHOLD(pip);
- }
-
- TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag),
- ag, xfs_filestream_peek_ag(mp, ag));
+ if (*agp == NULLAGNUMBER)
return 0;
- }
- item = kmem_zone_zalloc(item_zone, KM_MAYFAIL);
+ err = ENOMEM;
+ item = kmem_alloc(sizeof(*item), KM_MAYFAIL);
if (!item)
- return ENOMEM;
+ goto out_put_ag;
- item->ag = ag;
+ item->ag = *agp;
item->ip = ip;
- item->pip = pip;
- err = xfs_mru_cache_insert(cache, ip->i_ino, item);
+ err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru);
if (err) {
- kmem_zone_free(item_zone, item);
- return err;
+ if (err == EEXIST)
+ err = 0;
+ goto out_free_item;
}
- /* Take a reference on the AG. */
- xfs_filestream_get_ag(mp, ag);
-
- /*
- * Take a reference on the inode itself regardless of whether it's a
- * regular file or a directory.
- */
- IHOLD(ip);
-
- /*
- * In the case of a regular file, take a reference on the parent inode
- * as well to ensure it remains in-core.
- */
- if (pip)
- IHOLD(pip);
-
- TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag),
- ag, xfs_filestream_peek_ag(mp, ag));
-
- return 0;
-}
-
-/* xfs_fstrm_free_func(): callback for freeing cached stream items. */
-STATIC void
-xfs_fstrm_free_func(
- unsigned long ino,
- void *data)
-{
- fstrm_item_t *item = (fstrm_item_t *)data;
- xfs_inode_t *ip = item->ip;
-
- ASSERT(ip->i_ino == ino);
-
- xfs_iflags_clear(ip, XFS_IFILESTREAM);
-
- /* Drop the reference taken on the AG when the item was added. */
- xfs_filestream_put_ag(ip->i_mount, item->ag);
-
- TRACE_FREE(ip->i_mount, ip, item->pip, item->ag,
- xfs_filestream_peek_ag(ip->i_mount, item->ag));
-
- /*
- * _xfs_filestream_update_ag() always takes a reference on the inode
- * itself, whether it's a file or a directory. Release it here.
- * This can result in the inode being freed and so we must
- * not hold any inode locks when freeing filesstreams objects
- * otherwise we can deadlock here.
- */
- IRELE(ip);
-
- /*
- * In the case of a regular file, _xfs_filestream_update_ag() also
- * takes a ref on the parent inode to keep it in-core. Release that
- * too.
- */
- if (item->pip)
- IRELE(item->pip);
-
- /* Finally, free the memory allocated for the item. */
- kmem_zone_free(item_zone, item);
-}
-
-/*
- * xfs_filestream_init() is called at xfs initialisation time to set up the
- * memory zone that will be used for filestream data structure allocation.
- */
-int
-xfs_filestream_init(void)
-{
- item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item");
- if (!item_zone)
- return -ENOMEM;
-
return 0;
-}
-
-/*
- * xfs_filestream_uninit() is called at xfs termination time to destroy the
- * memory zone that was used for filestream data structure allocation.
- */
-void
-xfs_filestream_uninit(void)
-{
- kmem_zone_destroy(item_zone);
-}
-
-/*
- * xfs_filestream_mount() is called when a file system is mounted with the
- * filestream option. It is responsible for allocating the data structures
- * needed to track the new file system's file streams.
- */
-int
-xfs_filestream_mount(
- xfs_mount_t *mp)
-{
- int err;
- unsigned int lifetime, grp_count;
-
- /*
- * The filestream timer tunable is currently fixed within the range of
- * one second to four minutes, with five seconds being the default. The
- * group count is somewhat arbitrary, but it'd be nice to adhere to the
- * timer tunable to within about 10 percent. This requires at least 10
- * groups.
- */
- lifetime = xfs_fstrm_centisecs * 10;
- grp_count = 10;
-
- err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count,
- xfs_fstrm_free_func);
+out_free_item:
+ kmem_free(item);
+out_put_ag:
+ xfs_filestream_put_ag(mp, *agp);
return err;
}
-/*
- * xfs_filestream_unmount() is called when a file system that was mounted with
- * the filestream option is unmounted. It drains the data structures created
- * to track the file system's file streams and frees all the memory that was
- * allocated.
- */
-void
-xfs_filestream_unmount(
- xfs_mount_t *mp)
+static struct xfs_inode *
+xfs_filestream_get_parent(
+ struct xfs_inode *ip)
{
- xfs_mru_cache_destroy(mp->m_filestream);
-}
+ struct inode *inode = VFS_I(ip), *dir = NULL;
+ struct dentry *dentry, *parent;
-/*
- * Return the AG of the filestream the file or directory belongs to, or
- * NULLAGNUMBER otherwise.
- */
-xfs_agnumber_t
-xfs_filestream_lookup_ag(
- xfs_inode_t *ip)
-{
- xfs_mru_cache_t *cache;
- fstrm_item_t *item;
- xfs_agnumber_t ag;
- int ref;
-
- if (!S_ISREG(ip->i_d.di_mode) && !S_ISDIR(ip->i_d.di_mode)) {
- ASSERT(0);
- return NULLAGNUMBER;
- }
+ dentry = d_find_alias(inode);
+ if (!dentry)
+ goto out;
- cache = ip->i_mount->m_filestream;
- item = xfs_mru_cache_lookup(cache, ip->i_ino);
- if (!item) {
- TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0);
- return NULLAGNUMBER;
- }
+ parent = dget_parent(dentry);
+ if (!parent)
+ goto out_dput;
- ASSERT(ip == item->ip);
- ag = item->ag;
- ref = xfs_filestream_peek_ag(ip->i_mount, ag);
- xfs_mru_cache_done(cache);
+ dir = igrab(parent->d_inode);
+ dput(parent);
- TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref);
- return ag;
+out_dput:
+ dput(dentry);
+out:
+ return dir ? XFS_I(dir) : NULL;
}
/*
- * xfs_filestream_associate() should only be called to associate a regular file
- * with its parent directory. Calling it with a child directory isn't
- * appropriate because filestreams don't apply to entire directory hierarchies.
- * Creating a file in a child directory of an existing filestream directory
- * starts a new filestream with its own allocation group association.
+ * Find the right allocation group for a file, either by finding an
+ * existing file stream or creating a new one.
*
- * Returns < 0 on error, 0 if successful association occurred, > 0 if
- * we failed to get an association because of locking issues.
+ * Returns NULLAGNUMBER in case of an error.
*/
-int
-xfs_filestream_associate(
- xfs_inode_t *pip,
- xfs_inode_t *ip)
+xfs_agnumber_t
+xfs_filestream_lookup_ag(
+ struct xfs_inode *ip)
{
- xfs_mount_t *mp;
- xfs_mru_cache_t *cache;
- fstrm_item_t *item;
- xfs_agnumber_t ag, rotorstep, startag;
- int err = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_inode *pip = NULL;
+ xfs_agnumber_t startag, ag = NULLAGNUMBER;
+ struct xfs_mru_cache_elem *mru;
- ASSERT(S_ISDIR(pip->i_d.di_mode));
ASSERT(S_ISREG(ip->i_d.di_mode));
- if (!S_ISDIR(pip->i_d.di_mode) || !S_ISREG(ip->i_d.di_mode))
- return -EINVAL;
-
- mp = pip->i_mount;
- cache = mp->m_filestream;
-
- /*
- * We have a problem, Houston.
- *
- * Taking the iolock here violates inode locking order - we already
- * hold the ilock. Hence if we block getting this lock we may never
- * wake. Unfortunately, that means if we can't get the lock, we're
- * screwed in terms of getting a stream association - we can't spin
- * waiting for the lock because someone else is waiting on the lock we
- * hold and we cannot drop that as we are in a transaction here.
- *
- * Lucky for us, this inversion is not a problem because it's a
- * directory inode that we are trying to lock here.
- *
- * So, if we can't get the iolock without sleeping then just give up
- */
- if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL))
- return 1;
- /* If the parent directory is already in the cache, use its AG. */
- item = xfs_mru_cache_lookup(cache, pip->i_ino);
- if (item) {
- ASSERT(item->ip == pip);
- ag = item->ag;
- xfs_mru_cache_done(cache);
+ pip = xfs_filestream_get_parent(ip);
+ if (!pip)
+ goto out;
- TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag));
- err = _xfs_filestream_update_ag(ip, pip, ag);
+ mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino);
+ if (mru) {
+ ag = container_of(mru, struct xfs_fstrm_item, mru)->ag;
+ xfs_mru_cache_done(mp->m_filestream);
- goto exit;
+ trace_xfs_filestream_lookup(ip, ag);
+ goto out;
}
/*
@@ -623,202 +341,94 @@ xfs_filestream_associate(
* use the directory inode's AG.
*/
if (mp->m_flags & XFS_MOUNT_32BITINODES) {
- rotorstep = xfs_rotorstep;
+ xfs_agnumber_t rotorstep = xfs_rotorstep;
startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
mp->m_agfrotor = (mp->m_agfrotor + 1) %
(mp->m_sb.sb_agcount * rotorstep);
} else
startag = XFS_INO_TO_AGNO(mp, pip->i_ino);
- /* Pick a new AG for the parent inode starting at startag. */
- err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0);
- if (err || ag == NULLAGNUMBER)
- goto exit_did_pick;
-
- /* Associate the parent inode with the AG. */
- err = _xfs_filestream_update_ag(pip, NULL, ag);
- if (err)
- goto exit_did_pick;
-
- /* Associate the file inode with the AG. */
- err = _xfs_filestream_update_ag(ip, pip, ag);
- if (err)
- goto exit_did_pick;
-
- TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag));
-
-exit_did_pick:
- /*
- * If _xfs_filestream_pick_ag() returned a valid AG, remove the
- * reference it took on it, since the file and directory will have taken
- * their own now if they were successfully cached.
- */
- if (ag != NULLAGNUMBER)
- xfs_filestream_put_ag(mp, ag);
-
-exit:
- xfs_iunlock(pip, XFS_IOLOCK_EXCL);
- return -err;
+ if (xfs_filestream_pick_ag(pip, startag, &ag, 0, 0))
+ ag = NULLAGNUMBER;
+out:
+ IRELE(pip);
+ return ag;
}
/*
- * Pick a new allocation group for the current file and its file stream. This
- * function is called by xfs_bmap_filestreams() with the mount point's per-ag
- * lock held.
+ * Pick a new allocation group for the current file and its file stream.
+ *
+ * This is called when the allocator can't find a suitable extent in the
+ * current AG, and we have to move the stream into a new AG with more space.
*/
int
xfs_filestream_new_ag(
- xfs_bmalloca_t *ap,
- xfs_agnumber_t *agp)
+ struct xfs_bmalloca *ap,
+ xfs_agnumber_t *agp)
{
- int flags, err;
- xfs_inode_t *ip, *pip = NULL;
- xfs_mount_t *mp;
- xfs_mru_cache_t *cache;
- xfs_extlen_t minlen;
- fstrm_item_t *dir, *file;
- xfs_agnumber_t ag = NULLAGNUMBER;
-
- ip = ap->ip;
- mp = ip->i_mount;
- cache = mp->m_filestream;
- minlen = ap->length;
- *agp = NULLAGNUMBER;
+ struct xfs_inode *ip = ap->ip, *pip;
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_extlen_t minlen = ap->length;
+ xfs_agnumber_t startag = 0;
+ int flags, err = 0;
+ struct xfs_mru_cache_elem *mru;
- /*
- * Look for the file in the cache, removing it if it's found. Doing
- * this allows it to be held across the dir lookup that follows.
- */
- file = xfs_mru_cache_remove(cache, ip->i_ino);
- if (file) {
- ASSERT(ip == file->ip);
-
- /* Save the file's parent inode and old AG number for later. */
- pip = file->pip;
- ag = file->ag;
-
- /* Look for the file's directory in the cache. */
- dir = xfs_mru_cache_lookup(cache, pip->i_ino);
- if (dir) {
- ASSERT(pip == dir->ip);
-
- /*
- * If the directory has already moved on to a new AG,
- * use that AG as the new AG for the file. Don't
- * forget to twiddle the AG refcounts to match the
- * movement.
- */
- if (dir->ag != file->ag) {
- xfs_filestream_put_ag(mp, file->ag);
- xfs_filestream_get_ag(mp, dir->ag);
- *agp = file->ag = dir->ag;
- }
-
- xfs_mru_cache_done(cache);
- }
+ *agp = NULLAGNUMBER;
- /*
- * Put the file back in the cache. If this fails, the free
- * function needs to be called to tidy up in the same way as if
- * the item had simply expired from the cache.
- */
- err = xfs_mru_cache_insert(cache, ip->i_ino, file);
- if (err) {
- xfs_fstrm_free_func(ip->i_ino, file);
- return err;
- }
+ pip = xfs_filestream_get_parent(ip);
+ if (!pip)
+ goto exit;
- /*
- * If the file's AG was moved to the directory's new AG, there's
- * nothing more to be done.
- */
- if (*agp != NULLAGNUMBER) {
- TRACE_MOVEAG(mp, ip, pip,
- ag, xfs_filestream_peek_ag(mp, ag),
- *agp, xfs_filestream_peek_ag(mp, *agp));
- return 0;
- }
+ mru = xfs_mru_cache_remove(mp->m_filestream, pip->i_ino);
+ if (mru) {
+ struct xfs_fstrm_item *item =
+ container_of(mru, struct xfs_fstrm_item, mru);
+ startag = (item->ag + 1) % mp->m_sb.sb_agcount;
}
- /*
- * If the file's parent directory is known, take its iolock in exclusive
- * mode to prevent two sibling files from racing each other to migrate
- * themselves and their parent to different AGs.
- *
- * Note that we lock the parent directory iolock inside the child
- * iolock here. That's fine as we never hold both parent and child
- * iolock in any other place. This is different from the ilock,
- * which requires locking of the child after the parent for namespace
- * operations.
- */
- if (pip)
- xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
-
- /*
- * A new AG needs to be found for the file. If the file's parent
- * directory is also known, it will be moved to the new AG as well to
- * ensure that files created inside it in future use the new AG.
- */
- ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount;
flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
(ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0);
- err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen);
- if (err || *agp == NULLAGNUMBER)
- goto exit;
+ err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen);
/*
- * If the file wasn't found in the file cache, then its parent directory
- * inode isn't known. For this to have happened, the file must either
- * be pre-existing, or it was created long enough ago that its cache
- * entry has expired. This isn't the sort of usage that the filestreams
- * allocator is trying to optimise, so there's no point trying to track
- * its new AG somehow in the filestream data structures.
+ * Only free the item here so we skip over the old AG earlier.
*/
- if (!pip) {
- TRACE_ORPHAN(mp, ip, *agp);
- goto exit;
- }
-
- /* Associate the parent inode with the AG. */
- err = _xfs_filestream_update_ag(pip, NULL, *agp);
- if (err)
- goto exit;
-
- /* Associate the file inode with the AG. */
- err = _xfs_filestream_update_ag(ip, pip, *agp);
- if (err)
- goto exit;
-
- TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0,
- *agp, xfs_filestream_peek_ag(mp, *agp));
+ if (mru)
+ xfs_fstrm_free_func(mru);
+ IRELE(pip);
exit:
- /*
- * If _xfs_filestream_pick_ag() returned a valid AG, remove the
- * reference it took on it, since the file and directory will have taken
- * their own now if they were successfully cached.
- */
- if (*agp != NULLAGNUMBER)
- xfs_filestream_put_ag(mp, *agp);
- else
+ if (*agp == NULLAGNUMBER)
*agp = 0;
-
- if (pip)
- xfs_iunlock(pip, XFS_IOLOCK_EXCL);
-
return err;
}
-/*
- * Remove an association between an inode and a filestream object.
- * Typically this is done on last close of an unlinked file.
- */
void
xfs_filestream_deassociate(
- xfs_inode_t *ip)
+ struct xfs_inode *ip)
{
- xfs_mru_cache_t *cache = ip->i_mount->m_filestream;
+ xfs_mru_cache_delete(ip->i_mount->m_filestream, ip->i_ino);
+}
- xfs_mru_cache_delete(cache, ip->i_ino);
+int
+xfs_filestream_mount(
+ xfs_mount_t *mp)
+{
+ /*
+ * The filestream timer tunable is currently fixed within the range of
+ * one second to four minutes, with five seconds being the default. The
+ * group count is somewhat arbitrary, but it'd be nice to adhere to the
+ * timer tunable to within about 10 percent. This requires at least 10
+ * groups.
+ */
+ return xfs_mru_cache_create(&mp->m_filestream, xfs_fstrm_centisecs * 10,
+ 10, xfs_fstrm_free_func);
+}
+
+void
+xfs_filestream_unmount(
+ xfs_mount_t *mp)
+{
+ xfs_mru_cache_destroy(mp->m_filestream);
}
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 09dd9af4543..2ef43406e53 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -18,57 +18,23 @@
#ifndef __XFS_FILESTREAM_H__
#define __XFS_FILESTREAM_H__
-#ifdef __KERNEL__
-
struct xfs_mount;
struct xfs_inode;
-struct xfs_perag;
struct xfs_bmalloca;
-#ifdef XFS_FILESTREAMS_TRACE
-#define XFS_FSTRM_KTRACE_INFO 1
-#define XFS_FSTRM_KTRACE_AGSCAN 2
-#define XFS_FSTRM_KTRACE_AGPICK1 3
-#define XFS_FSTRM_KTRACE_AGPICK2 4
-#define XFS_FSTRM_KTRACE_UPDATE 5
-#define XFS_FSTRM_KTRACE_FREE 6
-#define XFS_FSTRM_KTRACE_ITEM_LOOKUP 7
-#define XFS_FSTRM_KTRACE_ASSOCIATE 8
-#define XFS_FSTRM_KTRACE_MOVEAG 9
-#define XFS_FSTRM_KTRACE_ORPHAN 10
-
-#define XFS_FSTRM_KTRACE_SIZE 16384
-extern ktrace_t *xfs_filestreams_trace_buf;
-
-#endif
-
-/* allocation selection flags */
-typedef enum xfs_fstrm_alloc {
- XFS_PICK_USERDATA = 1,
- XFS_PICK_LOWSPACE = 2,
-} xfs_fstrm_alloc_t;
-
-/* prototypes for filestream.c */
-int xfs_filestream_init(void);
-void xfs_filestream_uninit(void);
int xfs_filestream_mount(struct xfs_mount *mp);
void xfs_filestream_unmount(struct xfs_mount *mp);
-xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
-int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip);
void xfs_filestream_deassociate(struct xfs_inode *ip);
+xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp);
+int xfs_filestream_peek_ag(struct xfs_mount *mp, xfs_agnumber_t agno);
-
-/* filestreams for the inode? */
static inline int
xfs_inode_is_filestream(
struct xfs_inode *ip)
{
return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) ||
- xfs_iflags_test(ip, XFS_IFILESTREAM) ||
(ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM);
}
-#endif /* __KERNEL__ */
-
#endif /* __XFS_FILESTREAM_H__ */
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h
new file mode 100644
index 00000000000..34d85aca305
--- /dev/null
+++ b/fs/xfs/xfs_format.h
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_FORMAT_H__
+#define __XFS_FORMAT_H__
+
+/*
+ * XFS On Disk Format Definitions
+ *
+ * This header file defines all the on-disk format definitions for
+ * general XFS objects. Directory and attribute related objects are defined in
+ * xfs_da_format.h, which log and log item formats are defined in
+ * xfs_log_format.h. Everything else goes here.
+ */
+
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+struct xfs_buf;
+struct xfs_ifork;
+
+/*
+ * RealTime Device format definitions
+ */
+
+/* Min and max rt extent sizes, specified in bytes */
+#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */
+#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */
+#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
+
+#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
+#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
+#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize)
+#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask)
+
+/*
+ * RT Summary and bit manipulation macros.
+ */
+#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
+#define XFS_SUMOFFSTOBLOCK(mp,s) \
+ (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
+#define XFS_SUMPTR(mp,bp,so) \
+ ((xfs_suminfo_t *)((bp)->b_addr + \
+ (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
+
+#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log)
+#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log)
+#define XFS_BITTOWORD(mp,bi) \
+ ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
+
+#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
+#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
+
+#define XFS_RTLOBIT(w) xfs_lowbit32(w)
+#define XFS_RTHIBIT(w) xfs_highbit32(w)
+
+#if XFS_BIG_BLKNOS
+#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
+#else
+#define XFS_RTBLOCKLOG(b) xfs_highbit32(b)
+#endif
+
+/*
+ * Dquot and dquot block format definitions
+ */
+#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
+#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */
+
+/*
+ * This is the main portion of the on-disk representation of quota
+ * information for a user. This is the q_core of the xfs_dquot_t that
+ * is kept in kernel memory. We pad this with some more expansion room
+ * to construct the on disk structure.
+ */
+typedef struct xfs_disk_dquot {
+ __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */
+ __u8 d_version; /* dquot version */
+ __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */
+ __be32 d_id; /* user,project,group id */
+ __be64 d_blk_hardlimit;/* absolute limit on disk blks */
+ __be64 d_blk_softlimit;/* preferred limit on disk blks */
+ __be64 d_ino_hardlimit;/* maximum # allocated inodes */
+ __be64 d_ino_softlimit;/* preferred inode limit */
+ __be64 d_bcount; /* disk blocks owned by the user */
+ __be64 d_icount; /* inodes owned by the user */
+ __be32 d_itimer; /* zero if within inode limits if not,
+ this is when we refuse service */
+ __be32 d_btimer; /* similar to above; for disk blocks */
+ __be16 d_iwarns; /* warnings issued wrt num inodes */
+ __be16 d_bwarns; /* warnings issued wrt disk blocks */
+ __be32 d_pad0; /* 64 bit align */
+ __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */
+ __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */
+ __be64 d_rtbcount; /* realtime blocks owned */
+ __be32 d_rtbtimer; /* similar to above; for RT disk blocks */
+ __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */
+ __be16 d_pad;
+} xfs_disk_dquot_t;
+
+/*
+ * This is what goes on disk. This is separated from the xfs_disk_dquot because
+ * carrying the unnecessary padding would be a waste of memory.
+ */
+typedef struct xfs_dqblk {
+ xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */
+ char dd_fill[4]; /* filling for posterity */
+
+ /*
+ * These two are only present on filesystems with the CRC bits set.
+ */
+ __be32 dd_crc; /* checksum */
+ __be64 dd_lsn; /* last modification in log */
+ uuid_t dd_uuid; /* location information */
+} xfs_dqblk_t;
+
+#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc)
+
+/*
+ * Remote symlink format and access functions.
+ */
+#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */
+
+struct xfs_dsymlink_hdr {
+ __be32 sl_magic;
+ __be32 sl_offset;
+ __be32 sl_bytes;
+ __be32 sl_crc;
+ uuid_t sl_uuid;
+ __be64 sl_owner;
+ __be64 sl_blkno;
+ __be64 sl_lsn;
+};
+
+#define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc)
+
+/*
+ * The maximum pathlen is 1024 bytes. Since the minimum file system
+ * blocksize is 512 bytes, we can get a max of 3 extents back from
+ * bmapi when crc headers are taken into account.
+ */
+#define XFS_SYMLINK_MAPS 3
+
+#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \
+ ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+ sizeof(struct xfs_dsymlink_hdr) : 0))
+
+
+/*
+ * Allocation Btree format definitions
+ *
+ * There are two on-disk btrees, one sorted by blockno and one sorted
+ * by blockcount and blockno. All blocks look the same to make the code
+ * simpler; if we have time later, we'll make the optimizations.
+ */
+#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */
+#define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */
+#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */
+#define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */
+
+/*
+ * Data record/key structure
+ */
+typedef struct xfs_alloc_rec {
+ __be32 ar_startblock; /* starting block number */
+ __be32 ar_blockcount; /* count of free blocks */
+} xfs_alloc_rec_t, xfs_alloc_key_t;
+
+typedef struct xfs_alloc_rec_incore {
+ xfs_agblock_t ar_startblock; /* starting block number */
+ xfs_extlen_t ar_blockcount; /* count of free blocks */
+} xfs_alloc_rec_incore_t;
+
+/* btree pointer type */
+typedef __be32 xfs_alloc_ptr_t;
+
+/*
+ * Block numbers in the AG:
+ * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
+ */
+#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
+#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
+
+
+/*
+ * Inode Allocation Btree format definitions
+ *
+ * There is a btree for the inode map per allocation group.
+ */
+#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
+#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */
+#define XFS_FIBT_MAGIC 0x46494254 /* 'FIBT' */
+#define XFS_FIBT_CRC_MAGIC 0x46494233 /* 'FIB3' */
+
+typedef __uint64_t xfs_inofree_t;
+#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
+#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
+#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
+#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
+
+static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
+{
+ return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
+}
+
+/*
+ * Data record structure
+ */
+typedef struct xfs_inobt_rec {
+ __be32 ir_startino; /* starting inode number */
+ __be32 ir_freecount; /* count of free inodes (set bits) */
+ __be64 ir_free; /* free inode mask */
+} xfs_inobt_rec_t;
+
+typedef struct xfs_inobt_rec_incore {
+ xfs_agino_t ir_startino; /* starting inode number */
+ __int32_t ir_freecount; /* count of free inodes (set bits) */
+ xfs_inofree_t ir_free; /* free inode mask */
+} xfs_inobt_rec_incore_t;
+
+
+/*
+ * Key structure
+ */
+typedef struct xfs_inobt_key {
+ __be32 ir_startino; /* starting inode number */
+} xfs_inobt_key_t;
+
+/* btree pointer type */
+typedef __be32 xfs_inobt_ptr_t;
+
+/*
+ * block numbers in the AG.
+ */
+#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
+#define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
+
+/*
+ * The first data block of an AG depends on whether the filesystem was formatted
+ * with the finobt feature. If so, account for the finobt reserved root btree
+ * block.
+ */
+#define XFS_PREALLOC_BLOCKS(mp) \
+ (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \
+ XFS_FIBT_BLOCK(mp) + 1 : \
+ XFS_IBT_BLOCK(mp) + 1)
+
+
+
+/*
+ * BMAP Btree format definitions
+ *
+ * This includes both the root block definition that sits inside an inode fork
+ * and the record/pointer formats for the leaf/node in the blocks.
+ */
+#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
+#define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */
+
+/*
+ * Bmap root header, on-disk form only.
+ */
+typedef struct xfs_bmdr_block {
+ __be16 bb_level; /* 0 is a leaf */
+ __be16 bb_numrecs; /* current # of data records */
+} xfs_bmdr_block_t;
+
+/*
+ * Bmap btree record and extent descriptor.
+ * l0:63 is an extent flag (value 1 indicates non-normal).
+ * l0:9-62 are startoff.
+ * l0:0-8 and l1:21-63 are startblock.
+ * l1:0-20 are blockcount.
+ */
+#define BMBT_EXNTFLAG_BITLEN 1
+#define BMBT_STARTOFF_BITLEN 54
+#define BMBT_STARTBLOCK_BITLEN 52
+#define BMBT_BLOCKCOUNT_BITLEN 21
+
+typedef struct xfs_bmbt_rec {
+ __be64 l0, l1;
+} xfs_bmbt_rec_t;
+
+typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */
+typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
+
+typedef struct xfs_bmbt_rec_host {
+ __uint64_t l0, l1;
+} xfs_bmbt_rec_host_t;
+
+/*
+ * Values and macros for delayed-allocation startblock fields.
+ */
+#define STARTBLOCKVALBITS 17
+#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20)
+#define DSTARTBLOCKMASKBITS (15 + 20)
+#define STARTBLOCKMASK \
+ (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+#define DSTARTBLOCKMASK \
+ (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+
+static inline int isnullstartblock(xfs_fsblock_t x)
+{
+ return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
+}
+
+static inline int isnulldstartblock(xfs_dfsbno_t x)
+{
+ return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
+}
+
+static inline xfs_fsblock_t nullstartblock(int k)
+{
+ ASSERT(k < (1 << STARTBLOCKVALBITS));
+ return STARTBLOCKMASK | (k);
+}
+
+static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
+{
+ return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
+}
+
+/*
+ * Possible extent formats.
+ */
+typedef enum {
+ XFS_EXTFMT_NOSTATE = 0,
+ XFS_EXTFMT_HASSTATE
+} xfs_exntfmt_t;
+
+/*
+ * Possible extent states.
+ */
+typedef enum {
+ XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
+ XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
+} xfs_exntst_t;
+
+/*
+ * Incore version of above.
+ */
+typedef struct xfs_bmbt_irec
+{
+ xfs_fileoff_t br_startoff; /* starting file offset */
+ xfs_fsblock_t br_startblock; /* starting block number */
+ xfs_filblks_t br_blockcount; /* number of blocks */
+ xfs_exntst_t br_state; /* extent state */
+} xfs_bmbt_irec_t;
+
+/*
+ * Key structure for non-leaf levels of the tree.
+ */
+typedef struct xfs_bmbt_key {
+ __be64 br_startoff; /* starting file offset */
+} xfs_bmbt_key_t, xfs_bmdr_key_t;
+
+/* btree pointer type */
+typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
+
+
+/*
+ * Generic Btree block format definitions
+ *
+ * This is a combination of the actual format used on disk for short and long
+ * format btrees. The first three fields are shared by both format, but the
+ * pointers are different and should be used with care.
+ *
+ * To get the size of the actual short or long form headers please use the size
+ * macros below. Never use sizeof(xfs_btree_block).
+ *
+ * The blkno, crc, lsn, owner and uuid fields are only available in filesystems
+ * with the crc feature bit, and all accesses to them must be conditional on
+ * that flag.
+ */
+struct xfs_btree_block {
+ __be32 bb_magic; /* magic number for block type */
+ __be16 bb_level; /* 0 is a leaf */
+ __be16 bb_numrecs; /* current # of data records */
+ union {
+ struct {
+ __be32 bb_leftsib;
+ __be32 bb_rightsib;
+
+ __be64 bb_blkno;
+ __be64 bb_lsn;
+ uuid_t bb_uuid;
+ __be32 bb_owner;
+ __le32 bb_crc;
+ } s; /* short form pointers */
+ struct {
+ __be64 bb_leftsib;
+ __be64 bb_rightsib;
+
+ __be64 bb_blkno;
+ __be64 bb_lsn;
+ uuid_t bb_uuid;
+ __be64 bb_owner;
+ __le32 bb_crc;
+ __be32 bb_pad; /* padding for alignment */
+ } l; /* long form pointers */
+ } bb_u; /* rest */
+};
+
+#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */
+#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */
+
+/* sizes of CRC enabled btree blocks */
+#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40)
+#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48)
+
+#define XFS_BTREE_SBLOCK_CRC_OFF \
+ offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
+#define XFS_BTREE_LBLOCK_CRC_OFF \
+ offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
+
+#endif /* __XFS_FORMAT_H__ */
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 6dda3f949b0..d34703dbcb4 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -233,13 +233,17 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */
#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */
#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */
-#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */
+#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */
#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */
#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */
-
+#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */
+#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */
+#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */
/*
- * Minimum and maximum sizes need for growth checks
+ * Minimum and maximum sizes need for growth checks.
+ *
+ * Block counts are in units of filesystem blocks, not basic blocks.
*/
#define XFS_MIN_AG_BLOCKS 64
#define XFS_MIN_LOG_BLOCKS 512ULL
@@ -310,6 +314,17 @@ typedef struct xfs_bstat {
} xfs_bstat_t;
/*
+ * Project quota id helpers (previously projid was 16bit only
+ * and using two 16bit values to hold new 32bit projid was choosen
+ * to retain compatibility with "old" filesystems).
+ */
+static inline __uint32_t
+bstat_get_projid(struct xfs_bstat *bs)
+{
+ return (__uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo;
+}
+
+/*
* The user-level BulkStat Request interface structure.
*/
typedef struct xfs_fsop_bulkreq {
@@ -343,7 +358,7 @@ typedef struct xfs_error_injection {
* Speculative preallocation trimming.
*/
#define XFS_EOFBLOCKS_VERSION 1
-struct xfs_eofblocks {
+struct xfs_fs_eofblocks {
__u32 eof_version;
__u32 eof_flags;
uid_t eof_uid;
@@ -449,6 +464,21 @@ typedef struct xfs_handle {
+ (handle).ha_fid.fid_len)
/*
+ * Structure passed to XFS_IOC_SWAPEXT
+ */
+typedef struct xfs_swapext
+{
+ __int64_t sx_version; /* version */
+#define XFS_SX_VERSION 0
+ __int64_t sx_fdtarget; /* fd of target file */
+ __int64_t sx_fdtmp; /* fd of tmp file */
+ xfs_off_t sx_offset; /* offset into file */
+ xfs_off_t sx_length; /* leng from offset */
+ char sx_pad[16]; /* pad space, unused */
+ xfs_bstat_t sx_stat; /* stat of target b4 copy */
+} xfs_swapext_t;
+
+/*
* Flags for going down operation
*/
#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
@@ -486,7 +516,7 @@ typedef struct xfs_handle {
/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */
#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
-#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_eofblocks)
+#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks)
/*
* ioctl commands that replace IRIX syssgi()'s
@@ -510,8 +540,14 @@ typedef struct xfs_handle {
#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection)
#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection)
/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */
+
/* XFS_IOC_FREEZE -- FIFREEZE 119 */
/* XFS_IOC_THAW -- FITHAW 120 */
+#ifndef FIFREEZE
+#define XFS_IOC_FREEZE _IOWR('X', 119, int)
+#define XFS_IOC_THAW _IOWR('X', 120, int)
+#endif
+
#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 94eaeedc549..d2295561570 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -17,28 +17,31 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
-#include "xfs_btree.h"
#include "xfs_error.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_fsops.h"
#include "xfs_itable.h"
#include "xfs_trans_space.h"
#include "xfs_rtalloc.h"
-#include "xfs_filestream.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_dinode.h"
+#include "xfs_filestream.h"
/*
* File system operations
@@ -73,23 +76,18 @@ xfs_fs_geometry(
}
if (new_version >= 3) {
geo->version = XFS_FSOP_GEOM_VERSION;
- geo->flags =
+ geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
+ XFS_FSOP_GEOM_FLAGS_DIRV2 |
(xfs_sb_version_hasattr(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_ATTR : 0) |
- (xfs_sb_version_hasnlink(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_NLINK : 0) |
(xfs_sb_version_hasquota(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_QUOTA : 0) |
(xfs_sb_version_hasalign(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_IALIGN : 0) |
(xfs_sb_version_hasdalign(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_DALIGN : 0) |
- (xfs_sb_version_hasshared(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_SHARED : 0) |
(xfs_sb_version_hasextflgbit(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) |
- (xfs_sb_version_hasdirv2(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) |
(xfs_sb_version_hassector(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_SECTOR : 0) |
(xfs_sb_version_hasasciici(&mp->m_sb) ?
@@ -99,11 +97,17 @@ xfs_fs_geometry(
(xfs_sb_version_hasattr2(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) |
(xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_PROJID32 : 0);
+ XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) |
+ (xfs_sb_version_hascrc(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_V5SB : 0) |
+ (xfs_sb_version_hasftype(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_FTYPE : 0) |
+ (xfs_sb_version_hasfinobt(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_FINOBT : 0);
geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
mp->m_sb.sb_logsectsize : BBSIZE;
geo->rtsectsize = mp->m_sb.sb_blocksize;
- geo->dirblocksize = mp->m_dirblksize;
+ geo->dirblocksize = mp->m_dir_geo->blksize;
}
if (new_version >= 4) {
geo->flags |=
@@ -151,7 +155,7 @@ xfs_growfs_data_private(
xfs_buf_t *bp;
int bucket;
int dpct;
- int error;
+ int error, saved_error = 0;
xfs_agnumber_t nagcount;
xfs_agnumber_t nagimax = 0;
xfs_rfsblock_t nb, nb_mod;
@@ -174,7 +178,7 @@ xfs_growfs_data_private(
if (!bp)
return EIO;
if (bp->b_error) {
- int error = bp->b_error;
+ error = bp->b_error;
xfs_buf_relse(bp);
return error;
}
@@ -201,8 +205,9 @@ xfs_growfs_data_private(
tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
tp->t_flags |= XFS_TRANS_RESERVE;
- if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
- XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) {
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
+ XFS_GROWFS_SPACE_RES(mp), 0);
+ if (error) {
xfs_trans_cancel(tp, 0);
return error;
}
@@ -214,6 +219,8 @@ xfs_growfs_data_private(
*/
nfree = 0;
for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
+ __be32 *agfl_bno;
+
/*
* AG freespace header block
*/
@@ -247,6 +254,9 @@ xfs_growfs_data_private(
tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
agf->agf_freeblks = cpu_to_be32(tmpsize);
agf->agf_longest = cpu_to_be32(tmpsize);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid);
+
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
if (error)
@@ -265,8 +275,15 @@ xfs_growfs_data_private(
}
agfl = XFS_BUF_TO_AGFL(bp);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
+ agfl->agfl_seqno = cpu_to_be32(agno);
+ uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid);
+ }
+
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
- agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+ agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
@@ -296,8 +313,15 @@ xfs_growfs_data_private(
agi->agi_freecount = 0;
agi->agi_newino = cpu_to_be32(NULLAGINO);
agi->agi_dirino = cpu_to_be32(NULLAGINO);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid);
+ if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
+ agi->agi_free_level = cpu_to_be32(1);
+ }
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
if (error)
@@ -316,7 +340,13 @@ xfs_growfs_data_private(
goto error0;
}
- xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1,
+ agno, XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1,
+ agno, 0);
+
arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
arec->ar_blockcount = cpu_to_be32(
@@ -339,7 +369,13 @@ xfs_growfs_data_private(
goto error0;
}
- xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1,
+ agno, XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1,
+ agno, 0);
+
arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
arec->ar_blockcount = cpu_to_be32(
@@ -363,12 +399,45 @@ xfs_growfs_data_private(
goto error0;
}
- xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0,
+ agno, XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0,
+ agno, 0);
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
if (error)
goto error0;
+
+ /*
+ * FINO btree root block
+ */
+ if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)),
+ BTOBB(mp->m_sb.sb_blocksize), 0,
+ &xfs_inobt_buf_ops);
+ if (!bp) {
+ error = ENOMEM;
+ goto error0;
+ }
+
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC,
+ 0, 0, agno,
+ XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0,
+ 0, agno, 0);
+
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
+ goto error0;
+ }
+
}
xfs_trans_agblocks_delta(tp, nfree);
/*
@@ -465,29 +534,33 @@ xfs_growfs_data_private(
error = ENOMEM;
}
+ /*
+ * If we get an error reading or writing alternate superblocks,
+ * continue. xfs_repair chooses the "best" superblock based
+ * on most matches; if we break early, we'll leave more
+ * superblocks un-updated than updated, and xfs_repair may
+ * pick them over the properly-updated primary.
+ */
if (error) {
xfs_warn(mp,
"error %d reading secondary superblock for ag %d",
error, agno);
- break;
+ saved_error = error;
+ continue;
}
xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
- /*
- * If we get an error writing out the alternate superblocks,
- * just issue a warning and continue. The real work is
- * already done and committed.
- */
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
if (error) {
xfs_warn(mp,
"write error %d updating secondary superblock for ag %d",
error, agno);
- break; /* no point in continuing */
+ saved_error = error;
+ continue;
}
}
- return error;
+ return saved_error ? saved_error : error;
error0:
xfs_trans_cancel(tp, XFS_TRANS_ABORT);
@@ -709,8 +782,7 @@ xfs_fs_log_dummy(
int error;
tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
- error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
- XFS_DEFAULT_LOG_COUNT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
if (error) {
xfs_trans_cancel(tp, 0);
return error;
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index a815412eab8..5960e5593fe 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -17,25 +17,30 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_bmap.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_icreate_item.h"
+#include "xfs_icache.h"
+#include "xfs_dinode.h"
+#include "xfs_trace.h"
/*
@@ -47,7 +52,7 @@ xfs_ialloc_cluster_alignment(
{
if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
args->mp->m_sb.sb_inoalignmt >=
- XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp)))
+ XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size))
return args->mp->m_sb.sb_inoalignmt;
return 1;
}
@@ -107,6 +112,66 @@ xfs_inobt_get_rec(
}
/*
+ * Insert a single inobt record. Cursor must already point to desired location.
+ */
+STATIC int
+xfs_inobt_insert_rec(
+ struct xfs_btree_cur *cur,
+ __int32_t freecount,
+ xfs_inofree_t free,
+ int *stat)
+{
+ cur->bc_rec.i.ir_freecount = freecount;
+ cur->bc_rec.i.ir_free = free;
+ return xfs_btree_insert(cur, stat);
+}
+
+/*
+ * Insert records describing a newly allocated inode chunk into the inobt.
+ */
+STATIC int
+xfs_inobt_insert(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agino_t newino,
+ xfs_agino_t newlen,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ xfs_agino_t thisino;
+ int i;
+ int error;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+ for (thisino = newino;
+ thisino < newino + newlen;
+ thisino += XFS_INODES_PER_CHUNK) {
+ error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
+ if (error) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ ASSERT(i == 0);
+
+ error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
+ XFS_INOBT_ALL_FREE, &i);
+ if (error) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ ASSERT(i == 1);
+ }
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+ return 0;
+}
+
+/*
* Verify that the number of free inodes in the AGI is correct.
*/
#ifdef DEBUG
@@ -148,12 +213,16 @@ xfs_check_agi_freecount(
#endif
/*
- * Initialise a new set of inodes.
+ * Initialise a new set of inodes. When called without a transaction context
+ * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
+ * than logging them (which in a transaction context puts them into the AIL
+ * for writeback rather than the xfsbufd queue).
*/
-STATIC int
+int
xfs_ialloc_inode_init(
struct xfs_mount *mp,
struct xfs_trans *tp,
+ struct list_head *buffer_list,
xfs_agnumber_t agno,
xfs_agblock_t agbno,
xfs_agblock_t length,
@@ -161,38 +230,58 @@ xfs_ialloc_inode_init(
{
struct xfs_buf *fbuf;
struct xfs_dinode *free;
- int blks_per_cluster, nbufs, ninodes;
+ int nbufs, blks_per_cluster, inodes_per_cluster;
int version;
int i, j;
xfs_daddr_t d;
+ xfs_ino_t ino = 0;
/*
- * Loop over the new block(s), filling in the inodes.
- * For small block sizes, manipulate the inodes in buffers
- * which are multiples of the blocks size.
+ * Loop over the new block(s), filling in the inodes. For small block
+ * sizes, manipulate the inodes in buffers which are multiples of the
+ * blocks size.
*/
- if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
- blks_per_cluster = 1;
- nbufs = length;
- ninodes = mp->m_sb.sb_inopblock;
- } else {
- blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
- mp->m_sb.sb_blocksize;
- nbufs = length / blks_per_cluster;
- ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
- }
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
+ inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+ nbufs = length / blks_per_cluster;
/*
- * Figure out what version number to use in the inodes we create.
- * If the superblock version has caught up to the one that supports
- * the new inode format, then use the new inode version. Otherwise
- * use the old version so that old kernels will continue to be
- * able to use the file system.
+ * Figure out what version number to use in the inodes we create. If
+ * the superblock version has caught up to the one that supports the new
+ * inode format, then use the new inode version. Otherwise use the old
+ * version so that old kernels will continue to be able to use the file
+ * system.
+ *
+ * For v3 inodes, we also need to write the inode number into the inode,
+ * so calculate the first inode number of the chunk here as
+ * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
+ * across multiple filesystem blocks (such as a cluster) and so cannot
+ * be used in the cluster buffer loop below.
+ *
+ * Further, because we are writing the inode directly into the buffer
+ * and calculating a CRC on the entire inode, we have ot log the entire
+ * inode so that the entire range the CRC covers is present in the log.
+ * That means for v3 inode we log the entire buffer rather than just the
+ * inode cores.
*/
- if (xfs_sb_version_hasnlink(&mp->m_sb))
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ version = 3;
+ ino = XFS_AGINO_TO_INO(mp, agno,
+ XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
+
+ /*
+ * log the initialisation that is about to take place as an
+ * logical operation. This means the transaction does not
+ * need to log the physical changes to the inode buffers as log
+ * recovery will know what initialisation is actually needed.
+ * Hence we only need to log the buffers as "ordered" buffers so
+ * they track in the AIL as if they were physically logged.
+ */
+ if (tp)
+ xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
+ mp->m_sb.sb_inodesize, length, gen);
+ } else
version = 2;
- else
- version = 1;
for (j = 0; j < nbufs; j++) {
/*
@@ -204,27 +293,58 @@ xfs_ialloc_inode_init(
XBF_UNMAPPED);
if (!fbuf)
return ENOMEM;
- /*
- * Initialize all inodes in this buffer and then log them.
- *
- * XXX: It would be much better if we had just one transaction
- * to log a whole cluster of inodes instead of all the
- * individual transactions causing a lot of log traffic.
- */
+
+ /* Initialize the inode buffers and log them appropriately. */
fbuf->b_ops = &xfs_inode_buf_ops;
- xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
- for (i = 0; i < ninodes; i++) {
+ xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
+ for (i = 0; i < inodes_per_cluster; i++) {
int ioffset = i << mp->m_sb.sb_inodelog;
- uint isize = sizeof(struct xfs_dinode);
+ uint isize = xfs_dinode_size(version);
free = xfs_make_iptr(mp, fbuf, i);
free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
free->di_version = version;
free->di_gen = cpu_to_be32(gen);
free->di_next_unlinked = cpu_to_be32(NULLAGINO);
- xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
+
+ if (version == 3) {
+ free->di_ino = cpu_to_be64(ino);
+ ino++;
+ uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
+ xfs_dinode_calc_crc(mp, free);
+ } else if (tp) {
+ /* just log the inode core */
+ xfs_trans_log_buf(tp, fbuf, ioffset,
+ ioffset + isize - 1);
+ }
+ }
+
+ if (tp) {
+ /*
+ * Mark the buffer as an inode allocation buffer so it
+ * sticks in AIL at the point of this allocation
+ * transaction. This ensures the they are on disk before
+ * the tail of the log can be moved past this
+ * transaction (i.e. by preventing relogging from moving
+ * it forward in the log).
+ */
+ xfs_trans_inode_alloc_buf(tp, fbuf);
+ if (version == 3) {
+ /*
+ * Mark the buffer as ordered so that they are
+ * not physically logged in the transaction but
+ * still tracked in the AIL as part of the
+ * transaction and pin the log appropriately.
+ */
+ xfs_trans_ordered_buf(tp, fbuf);
+ xfs_trans_log_buf(tp, fbuf, 0,
+ BBTOB(fbuf->b_length) - 1);
+ }
+ } else {
+ fbuf->b_flags |= XBF_DONE;
+ xfs_buf_delwri_queue(fbuf, buffer_list);
+ xfs_buf_relse(fbuf);
}
- xfs_trans_inode_alloc_buf(tp, fbuf);
}
return 0;
}
@@ -241,13 +361,10 @@ xfs_ialloc_ag_alloc(
{
xfs_agi_t *agi; /* allocation group header */
xfs_alloc_arg_t args; /* allocation argument structure */
- xfs_btree_cur_t *cur; /* inode btree cursor */
xfs_agnumber_t agno;
int error;
- int i;
xfs_agino_t newino; /* new first inode's number */
xfs_agino_t newlen; /* new number of inodes */
- xfs_agino_t thisino; /* current inode number, for loop */
int isaligned = 0; /* inode allocation at stripe unit */
/* boundary */
struct xfs_perag *pag;
@@ -260,27 +377,25 @@ xfs_ialloc_ag_alloc(
* Locking will ensure that we don't have two callers in here
* at one time.
*/
- newlen = XFS_IALLOC_INODES(args.mp);
+ newlen = args.mp->m_ialloc_inos;
if (args.mp->m_maxicount &&
args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
return XFS_ERROR(ENOSPC);
- args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp);
+ args.minlen = args.maxlen = args.mp->m_ialloc_blks;
/*
* First try to allocate inodes contiguous with the last-allocated
* chunk of inodes. If the filesystem is striped, this will fill
* an entire stripe unit with inodes.
- */
+ */
agi = XFS_BUF_TO_AGI(agbp);
newino = be32_to_cpu(agi->agi_newino);
agno = be32_to_cpu(agi->agi_seqno);
args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
- XFS_IALLOC_BLOCKS(args.mp);
+ args.mp->m_ialloc_blks;
if (likely(newino != NULLAGINO &&
(args.agbno < be32_to_cpu(agi->agi_length)))) {
args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
args.type = XFS_ALLOCTYPE_THIS_BNO;
- args.mod = args.total = args.wasdel = args.isfl =
- args.userdata = args.minalignslop = 0;
args.prod = 1;
/*
@@ -303,6 +418,18 @@ xfs_ialloc_ag_alloc(
args.minleft = args.mp->m_in_maxlevels - 1;
if ((error = xfs_alloc_vextent(&args)))
return error;
+
+ /*
+ * This request might have dirtied the transaction if the AG can
+ * satisfy the request, but the exact block was not available.
+ * If the allocation did fail, subsequent requests will relax
+ * the exact agbno requirement and increase the alignment
+ * instead. It is critical that the total size of the request
+ * (len + alignment + slop) does not increase from this point
+ * on, so reset minalignslop to ensure it is not included in
+ * subsequent requests.
+ */
+ args.minalignslop = 0;
} else
args.fsbno = NULLFSBLOCK;
@@ -333,8 +460,6 @@ xfs_ialloc_ag_alloc(
* Allocate a fixed-size extent of inodes.
*/
args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.mod = args.total = args.wasdel = args.isfl =
- args.userdata = args.minalignslop = 0;
args.prod = 1;
/*
* Allow space for the inode btree to split.
@@ -372,8 +497,8 @@ xfs_ialloc_ag_alloc(
* rather than a linear progression to prevent the next generation
* number from being easily guessable.
*/
- error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno,
- args.len, random32());
+ error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
+ args.len, prandom_u32());
if (error)
return error;
@@ -389,29 +514,19 @@ xfs_ialloc_ag_alloc(
agi->agi_newino = cpu_to_be32(newino);
/*
- * Insert records describing the new inode chunk into the btree.
+ * Insert records describing the new inode chunk into the btrees.
*/
- cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno);
- for (thisino = newino;
- thisino < newino + newlen;
- thisino += XFS_INODES_PER_CHUNK) {
- cur->bc_rec.i.ir_startino = thisino;
- cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK;
- cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE;
- error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i);
- if (error) {
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- return error;
- }
- ASSERT(i == 0);
- error = xfs_btree_insert(cur, &i);
- if (error) {
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+ XFS_BTNUM_INO);
+ if (error)
+ return error;
+
+ if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+ error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+ XFS_BTNUM_FINO);
+ if (error)
return error;
- }
- ASSERT(i == 1);
}
- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
/*
* Log allocation group header fields
*/
@@ -443,7 +558,7 @@ xfs_ialloc_next_ag(
/*
* Select an allocation group to look for a free inode in, based on the parent
- * inode and then mode. Return the allocation group buffer.
+ * inode and the mode. Return the allocation group buffer.
*/
STATIC xfs_agnumber_t
xfs_ialloc_ag_select(
@@ -520,7 +635,7 @@ xfs_ialloc_ag_select(
* Is there enough free space for the file plus a block of
* inodes? (if we need to allocate some)?
*/
- ineed = XFS_IALLOC_BLOCKS(mp);
+ ineed = mp->m_ialloc_blks;
longest = pag->pagf_longest;
if (!longest)
longest = pag->pagf_flcount > 0;
@@ -585,8 +700,7 @@ xfs_ialloc_get_rec(
struct xfs_btree_cur *cur,
xfs_agino_t agino,
xfs_inobt_rec_incore_t *rec,
- int *done,
- int left)
+ int *done)
{
int error;
int i;
@@ -606,13 +720,10 @@ xfs_ialloc_get_rec(
}
/*
- * Allocate an inode.
- *
- * The caller selected an AG for us, and made sure that free inodes are
- * available.
+ * Allocate an inode using the inobt-only algorithm.
*/
STATIC int
-xfs_dialloc_ag(
+xfs_dialloc_ag_inobt(
struct xfs_trans *tp,
struct xfs_buf *agbp,
xfs_ino_t parent,
@@ -638,7 +749,7 @@ xfs_dialloc_ag(
ASSERT(pag->pagi_freecount > 0);
restart_pagno:
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
/*
* If pagino is 0 (this is the root inode allocation) use newino.
* This must work because we've just allocated some.
@@ -666,7 +777,7 @@ xfs_dialloc_ag(
error = xfs_inobt_get_rec(cur, &rec, &j);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(j == 1, error0);
if (rec.ir_freecount > 0) {
/*
@@ -694,12 +805,12 @@ xfs_dialloc_ag(
pag->pagl_leftrec != NULLAGINO &&
pag->pagl_rightrec != NULLAGINO) {
error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
- &trec, &doneleft, 1);
+ &trec, &doneleft);
if (error)
goto error1;
error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
- &rec, &doneright, 0);
+ &rec, &doneright);
if (error)
goto error1;
} else {
@@ -871,6 +982,294 @@ error0:
}
/*
+ * Use the free inode btree to allocate an inode based on distance from the
+ * parent. Note that the provided cursor may be deleted and replaced.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_near(
+ xfs_agino_t pagino,
+ struct xfs_btree_cur **ocur,
+ struct xfs_inobt_rec_incore *rec)
+{
+ struct xfs_btree_cur *lcur = *ocur; /* left search cursor */
+ struct xfs_btree_cur *rcur; /* right search cursor */
+ struct xfs_inobt_rec_incore rrec;
+ int error;
+ int i, j;
+
+ error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
+ if (error)
+ return error;
+
+ if (i == 1) {
+ error = xfs_inobt_get_rec(lcur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ /*
+ * See if we've landed in the parent inode record. The finobt
+ * only tracks chunks with at least one free inode, so record
+ * existence is enough.
+ */
+ if (pagino >= rec->ir_startino &&
+ pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
+ return 0;
+ }
+
+ error = xfs_btree_dup_cursor(lcur, &rcur);
+ if (error)
+ return error;
+
+ error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
+ if (error)
+ goto error_rcur;
+ if (j == 1) {
+ error = xfs_inobt_get_rec(rcur, &rrec, &j);
+ if (error)
+ goto error_rcur;
+ XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur);
+ }
+
+ XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur);
+ if (i == 1 && j == 1) {
+ /*
+ * Both the left and right records are valid. Choose the closer
+ * inode chunk to the target.
+ */
+ if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
+ (rrec.ir_startino - pagino)) {
+ *rec = rrec;
+ xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+ *ocur = rcur;
+ } else {
+ xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+ }
+ } else if (j == 1) {
+ /* only the right record is valid */
+ *rec = rrec;
+ xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+ *ocur = rcur;
+ } else if (i == 1) {
+ /* only the left record is valid */
+ xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+ }
+
+ return 0;
+
+error_rcur:
+ xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Use the free inode btree to find a free inode based on a newino hint. If
+ * the hint is NULL, find the first free inode in the AG.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_newino(
+ struct xfs_agi *agi,
+ struct xfs_btree_cur *cur,
+ struct xfs_inobt_rec_incore *rec)
+{
+ int error;
+ int i;
+
+ if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+ error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ,
+ &i);
+ if (error)
+ return error;
+ if (i == 1) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ return 0;
+ }
+ }
+
+ /*
+ * Find the first inode available in the AG.
+ */
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ return 0;
+}
+
+/*
+ * Update the inobt based on a modification made to the finobt. Also ensure that
+ * the records from both trees are equivalent post-modification.
+ */
+STATIC int
+xfs_dialloc_ag_update_inobt(
+ struct xfs_btree_cur *cur, /* inobt cursor */
+ struct xfs_inobt_rec_incore *frec, /* finobt record */
+ int offset) /* inode offset */
+{
+ struct xfs_inobt_rec_incore rec;
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
+ XFS_INODES_PER_CHUNK) == 0);
+
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
+ rec.ir_freecount--;
+
+ XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) &&
+ (rec.ir_freecount == frec->ir_freecount));
+
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
+ return error;
+
+ return 0;
+}
+
+/*
+ * Allocate an inode using the free inode btree, if available. Otherwise, fall
+ * back to the inobt search algorithm.
+ *
+ * The caller selected an AG for us, and made sure that free inodes are
+ * available.
+ */
+STATIC int
+xfs_dialloc_ag(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_ino_t parent,
+ xfs_ino_t *inop)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
+ xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
+ struct xfs_perag *pag;
+ struct xfs_btree_cur *cur; /* finobt cursor */
+ struct xfs_btree_cur *icur; /* inobt cursor */
+ struct xfs_inobt_rec_incore rec;
+ xfs_ino_t ino;
+ int error;
+ int offset;
+ int i;
+
+ if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+ return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
+
+ pag = xfs_perag_get(mp, agno);
+
+ /*
+ * If pagino is 0 (this is the root inode allocation) use newino.
+ * This must work because we've just allocated some.
+ */
+ if (!pagino)
+ pagino = be32_to_cpu(agi->agi_newino);
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error_cur;
+
+ /*
+ * The search algorithm depends on whether we're in the same AG as the
+ * parent. If so, find the closest available inode to the parent. If
+ * not, consider the agi hint or find the first free inode in the AG.
+ */
+ if (agno == pagno)
+ error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
+ else
+ error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
+ if (error)
+ goto error_cur;
+
+ offset = xfs_lowbit64(rec.ir_free);
+ ASSERT(offset >= 0);
+ ASSERT(offset < XFS_INODES_PER_CHUNK);
+ ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+ XFS_INODES_PER_CHUNK) == 0);
+ ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+
+ /*
+ * Modify or remove the finobt record.
+ */
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
+ rec.ir_freecount--;
+ if (rec.ir_freecount)
+ error = xfs_inobt_update(cur, &rec);
+ else
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto error_cur;
+
+ /*
+ * The finobt has now been updated appropriately. We haven't updated the
+ * agi and superblock yet, so we can create an inobt cursor and validate
+ * the original freecount. If all is well, make the equivalent update to
+ * the inobt using the finobt record and offset information.
+ */
+ icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+
+ error = xfs_check_agi_freecount(icur, agi);
+ if (error)
+ goto error_icur;
+
+ error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
+ if (error)
+ goto error_icur;
+
+ /*
+ * Both trees have now been updated. We must update the perag and
+ * superblock before we can check the freecount for each btree.
+ */
+ be32_add_cpu(&agi->agi_freecount, -1);
+ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+ pag->pagi_freecount--;
+
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+
+ error = xfs_check_agi_freecount(icur, agi);
+ if (error)
+ goto error_icur;
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error_icur;
+
+ xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ xfs_perag_put(pag);
+ *inop = ino;
+ return 0;
+
+error_icur:
+ xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
+error_cur:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
* Allocate an inode on disk.
*
* Mode is used to tell whether the new inode will need space, and whether it
@@ -935,7 +1334,7 @@ xfs_dialloc(
* inode.
*/
if (mp->m_maxicount &&
- mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) {
+ mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) {
noroom = 1;
okalloc = 0;
}
@@ -1029,78 +1428,34 @@ out_error:
return XFS_ERROR(error);
}
-/*
- * Free disk inode. Carefully avoids touching the incore inode, all
- * manipulations incore are the caller's responsibility.
- * The on-disk inode is not changed by this operation, only the
- * btree (free inode mask) is changed.
- */
-int
-xfs_difree(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_ino_t inode, /* inode to be freed */
- xfs_bmap_free_t *flist, /* extents to free */
- int *delete, /* set if inode cluster was deleted */
- xfs_ino_t *first_ino) /* first inode in deleted cluster */
+STATIC int
+xfs_difree_inobt(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agino_t agino,
+ struct xfs_bmap_free *flist,
+ int *deleted,
+ xfs_ino_t *first_ino,
+ struct xfs_inobt_rec_incore *orec)
{
- /* REFERENCED */
- xfs_agblock_t agbno; /* block number containing inode */
- xfs_buf_t *agbp; /* buffer containing allocation group header */
- xfs_agino_t agino; /* inode number relative to allocation group */
- xfs_agnumber_t agno; /* allocation group number */
- xfs_agi_t *agi; /* allocation group header */
- xfs_btree_cur_t *cur; /* inode btree cursor */
- int error; /* error return value */
- int i; /* result code */
- int ilen; /* inodes in an inode cluster */
- xfs_mount_t *mp; /* mount structure for filesystem */
- int off; /* offset of inode in inode chunk */
- xfs_inobt_rec_incore_t rec; /* btree record */
- struct xfs_perag *pag;
-
- mp = tp->t_mountp;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ struct xfs_perag *pag;
+ struct xfs_btree_cur *cur;
+ struct xfs_inobt_rec_incore rec;
+ int ilen;
+ int error;
+ int i;
+ int off;
- /*
- * Break up inode number into its components.
- */
- agno = XFS_INO_TO_AGNO(mp, inode);
- if (agno >= mp->m_sb.sb_agcount) {
- xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
- __func__, agno, mp->m_sb.sb_agcount);
- ASSERT(0);
- return XFS_ERROR(EINVAL);
- }
- agino = XFS_INO_TO_AGINO(mp, inode);
- if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
- xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
- __func__, (unsigned long long)inode,
- (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
- ASSERT(0);
- return XFS_ERROR(EINVAL);
- }
- agbno = XFS_AGINO_TO_AGBNO(mp, agino);
- if (agbno >= mp->m_sb.sb_agblocks) {
- xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
- __func__, agbno, mp->m_sb.sb_agblocks);
- ASSERT(0);
- return XFS_ERROR(EINVAL);
- }
- /*
- * Get the allocation group header.
- */
- error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
- if (error) {
- xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
- __func__, error);
- return error;
- }
- agi = XFS_BUF_TO_AGI(agbp);
ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
- ASSERT(agbno < be32_to_cpu(agi->agi_length));
+ ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
+
/*
* Initialize the cursor.
*/
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
error = xfs_check_agi_freecount(cur, agi);
if (error)
@@ -1138,9 +1493,9 @@ xfs_difree(
* When an inode cluster is free, it becomes eligible for removal
*/
if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
- (rec.ir_freecount == XFS_IALLOC_INODES(mp))) {
+ (rec.ir_freecount == mp->m_ialloc_inos)) {
- *delete = 1;
+ *deleted = 1;
*first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
/*
@@ -1148,7 +1503,7 @@ xfs_difree(
* AGI and Superblock inode counts, and mark the disk space
* to be freed when the transaction is committed.
*/
- ilen = XFS_IALLOC_INODES(mp);
+ ilen = mp->m_ialloc_inos;
be32_add_cpu(&agi->agi_count, -ilen);
be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
@@ -1164,11 +1519,11 @@ xfs_difree(
goto error0;
}
- xfs_bmap_add_free(XFS_AGB_TO_FSB(mp,
- agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)),
- XFS_IALLOC_BLOCKS(mp), flist, mp);
+ xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
+ XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
+ mp->m_ialloc_blks, flist, mp);
} else {
- *delete = 0;
+ *deleted = 0;
error = xfs_inobt_update(cur, &rec);
if (error) {
@@ -1192,6 +1547,7 @@ xfs_difree(
if (error)
goto error0;
+ *orec = rec;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return 0;
@@ -1200,6 +1556,182 @@ error0:
return error;
}
+/*
+ * Free an inode in the free inode btree.
+ */
+STATIC int
+xfs_difree_finobt(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agino_t agino,
+ struct xfs_inobt_rec_incore *ibtrec) /* inobt record */
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ struct xfs_btree_cur *cur;
+ struct xfs_inobt_rec_incore rec;
+ int offset = agino - ibtrec->ir_startino;
+ int error;
+ int i;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+
+ error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error;
+ if (i == 0) {
+ /*
+ * If the record does not exist in the finobt, we must have just
+ * freed an inode in a previously fully allocated chunk. If not,
+ * something is out of sync.
+ */
+ XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error);
+
+ error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+ ibtrec->ir_free, &i);
+ if (error)
+ goto error;
+ ASSERT(i == 1);
+
+ goto out;
+ }
+
+ /*
+ * Read and update the existing record. We could just copy the ibtrec
+ * across here, but that would defeat the purpose of having redundant
+ * metadata. By making the modifications independently, we can catch
+ * corruptions that we wouldn't see if we just copied from one record
+ * to another.
+ */
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error);
+
+ rec.ir_free |= XFS_INOBT_MASK(offset);
+ rec.ir_freecount++;
+
+ XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) &&
+ (rec.ir_freecount == ibtrec->ir_freecount),
+ error);
+
+ /*
+ * The content of inobt records should always match between the inobt
+ * and finobt. The lifecycle of records in the finobt is different from
+ * the inobt in that the finobt only tracks records with at least one
+ * free inode. Hence, if all of the inodes are free and we aren't
+ * keeping inode chunks permanently on disk, remove the record.
+ * Otherwise, update the record with the new information.
+ */
+ if (rec.ir_freecount == mp->m_ialloc_inos &&
+ !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto error;
+ ASSERT(i == 1);
+ } else {
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
+ goto error;
+ }
+
+out:
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error;
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+
+error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Free disk inode. Carefully avoids touching the incore inode, all
+ * manipulations incore are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
+ */
+int
+xfs_difree(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_ino_t inode, /* inode to be freed */
+ struct xfs_bmap_free *flist, /* extents to free */
+ int *deleted,/* set if inode cluster was deleted */
+ xfs_ino_t *first_ino)/* first inode in deleted cluster */
+{
+ /* REFERENCED */
+ xfs_agblock_t agbno; /* block number containing inode */
+ struct xfs_buf *agbp; /* buffer for allocation group header */
+ xfs_agino_t agino; /* allocation group inode number */
+ xfs_agnumber_t agno; /* allocation group number */
+ int error; /* error return value */
+ struct xfs_mount *mp; /* mount structure for filesystem */
+ struct xfs_inobt_rec_incore rec;/* btree record */
+
+ mp = tp->t_mountp;
+
+ /*
+ * Break up inode number into its components.
+ */
+ agno = XFS_INO_TO_AGNO(mp, inode);
+ if (agno >= mp->m_sb.sb_agcount) {
+ xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
+ __func__, agno, mp->m_sb.sb_agcount);
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ agino = XFS_INO_TO_AGINO(mp, inode);
+ if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
+ xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
+ __func__, (unsigned long long)inode,
+ (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ if (agbno >= mp->m_sb.sb_agblocks) {
+ xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
+ __func__, agbno, mp->m_sb.sb_agblocks);
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ /*
+ * Get the allocation group header.
+ */
+ error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
+ __func__, error);
+ return error;
+ }
+
+ /*
+ * Fix up the inode allocation btree.
+ */
+ error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
+ &rec);
+ if (error)
+ goto error0;
+
+ /*
+ * Fix up the free inode btree.
+ */
+ if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ error = xfs_difree_finobt(mp, tp, agbp, agino, &rec);
+ if (error)
+ goto error0;
+ }
+
+ return 0;
+
+error0:
+ return error;
+}
+
STATIC int
xfs_imap_lookup(
struct xfs_mount *mp,
@@ -1231,7 +1763,7 @@ xfs_imap_lookup(
* we have a record, we need to ensure it contains the inode number
* we are looking up.
*/
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
if (!error) {
if (i)
@@ -1247,7 +1779,7 @@ xfs_imap_lookup(
/* check that the returned record contains the required inode */
if (rec.ir_startino > agino ||
- rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino)
+ rec.ir_startino + mp->m_ialloc_inos <= agino)
return EINVAL;
/* for untrusted inodes check it is allocated first */
@@ -1279,7 +1811,7 @@ xfs_imap(
xfs_agblock_t cluster_agbno; /* first block in inode cluster */
int error; /* error code */
int offset; /* index of inode in its buffer */
- int offset_agbno; /* blks from chunk start to inode */
+ xfs_agblock_t offset_agbno; /* blks from chunk start to inode */
ASSERT(ino != NULLFSINO);
@@ -1320,7 +1852,7 @@ xfs_imap(
return XFS_ERROR(EINVAL);
}
- blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
/*
* For bulkstat and handle lookups, we have an untrusted inode number
@@ -1341,7 +1873,7 @@ xfs_imap(
* If the inode cluster size is the same as the blocksize or
* smaller we get to the buffer by simple arithmetics.
*/
- if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) {
+ if (blks_per_cluster == 1) {
offset = XFS_INO_TO_OFFSET(mp, ino);
ASSERT(offset < mp->m_sb.sb_inopblock);
@@ -1419,7 +1951,16 @@ xfs_ialloc_compute_maxlevels(
}
/*
- * Log specified fields for the ag hdr (inode section)
+ * Log specified fields for the ag hdr (inode section). The growth of the agi
+ * structure over time requires that we interpret the buffer as two logical
+ * regions delineated by the end of the unlinked list. This is due to the size
+ * of the hash table and its location in the middle of the agi.
+ *
+ * For example, a request to log a field before agi_unlinked and a field after
+ * agi_unlinked could cause us to log the entire hash table and use an excessive
+ * amount of log space. To avoid this behavior, log the region up through
+ * agi_unlinked in one call and the region after agi_unlinked through the end of
+ * the structure in another.
*/
void
xfs_ialloc_log_agi(
@@ -1442,6 +1983,8 @@ xfs_ialloc_log_agi(
offsetof(xfs_agi_t, agi_newino),
offsetof(xfs_agi_t, agi_dirino),
offsetof(xfs_agi_t, agi_unlinked),
+ offsetof(xfs_agi_t, agi_free_root),
+ offsetof(xfs_agi_t, agi_free_level),
sizeof(xfs_agi_t)
};
#ifdef DEBUG
@@ -1450,14 +1993,30 @@ xfs_ialloc_log_agi(
agi = XFS_BUF_TO_AGI(bp);
ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
#endif
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
+
/*
- * Compute byte offsets for the first and last fields.
+ * Compute byte offsets for the first and last fields in the first
+ * region and log the agi buffer. This only logs up through
+ * agi_unlinked.
*/
- xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last);
+ if (fields & XFS_AGI_ALL_BITS_R1) {
+ xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
+ &first, &last);
+ xfs_trans_log_buf(tp, bp, first, last);
+ }
+
/*
- * Log the allocation group inode header buffer.
+ * Mask off the bits in the first region and calculate the first and
+ * last field offsets for any bits in the second region.
*/
- xfs_trans_log_buf(tp, bp, first, last);
+ fields &= ~XFS_AGI_ALL_BITS_R1;
+ if (fields) {
+ xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
+ &first, &last);
+ xfs_trans_log_buf(tp, bp, first, last);
+ }
}
#ifdef DEBUG
@@ -1474,19 +2033,23 @@ xfs_check_agi_unlinked(
#define xfs_check_agi_unlinked(agi)
#endif
-static void
+static bool
xfs_agi_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
- int agi_ok;
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid))
+ return false;
/*
* Validate the magic number of the agi block.
*/
- agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
- XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
+ if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
+ return false;
+ if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
+ return false;
/*
* during growfs operations, the perag is not fully initialised,
@@ -1494,30 +2057,50 @@ xfs_agi_verify(
* use it by using uncached buffers that don't have the perag attached
* so we can detect and avoid this problem.
*/
- if (bp->b_pag)
- agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) ==
- bp->b_pag->pag_agno;
+ if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno)
+ return false;
- if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
- XFS_RANDOM_IALLOC_READ_AGI))) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
xfs_check_agi_unlinked(agi);
+ return true;
}
static void
xfs_agi_read_verify(
struct xfs_buf *bp)
{
- xfs_agi_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
+ XFS_ERRTAG_IALLOC_READ_AGI,
+ XFS_RANDOM_IALLOC_READ_AGI))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
}
static void
xfs_agi_write_verify(
struct xfs_buf *bp)
{
- xfs_agi_verify(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ if (!xfs_agi_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
}
const struct xfs_buf_ops xfs_agi_buf_ops = {
@@ -1537,15 +2120,15 @@ xfs_read_agi(
{
int error;
- ASSERT(agno != NULLAGNUMBER);
+ trace_xfs_read_agi(mp, agno);
+ ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
if (error)
return error;
- ASSERT(!xfs_buf_geterror(*bpp));
xfs_buf_set_ref(*bpp, XFS_AGI_REF);
return 0;
}
@@ -1561,6 +2144,8 @@ xfs_ialloc_read_agi(
struct xfs_perag *pag; /* per allocation group data */
int error;
+ trace_xfs_ialloc_read_agi(mp, agno);
+
error = xfs_read_agi(mp, tp, agno, bpp);
if (error)
return error;
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index c8da3df271e..95ad1c002d6 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -23,18 +23,20 @@ struct xfs_dinode;
struct xfs_imap;
struct xfs_mount;
struct xfs_trans;
+struct xfs_btree_cur;
-/*
- * Allocation parameters for inode allocation.
- */
-#define XFS_IALLOC_INODES(mp) (mp)->m_ialloc_inos
-#define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks
-
-/*
- * Move inodes in clusters of this size.
- */
+/* Move inodes in clusters of this size */
#define XFS_INODE_BIG_CLUSTER_SIZE 8192
-#define XFS_INODE_CLUSTER_SIZE(mp) (mp)->m_inode_cluster_size
+
+/* Calculate and return the number of filesystem blocks per inode cluster */
+static inline int
+xfs_icluster_size_fsb(
+ struct xfs_mount *mp)
+{
+ if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size)
+ return 1;
+ return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
+}
/*
* Make an inode pointer out of the buffer/offset.
@@ -42,7 +44,7 @@ struct xfs_trans;
static inline struct xfs_dinode *
xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
{
- return (xfs_dinode_t *)
+ return (struct xfs_dinode *)
(xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog));
}
@@ -88,7 +90,7 @@ xfs_difree(
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t inode, /* inode to be freed */
struct xfs_bmap_free *flist, /* extents to free */
- int *delete, /* set if inode cluster was deleted */
+ int *deleted, /* set if inode cluster was deleted */
xfs_ino_t *first_ino); /* first inode in deleted cluster */
/*
@@ -150,6 +152,12 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
xfs_inobt_rec_incore_t *rec, int *stat);
-extern const struct xfs_buf_ops xfs_agi_buf_ops;
+/*
+ * Inode chunk initialisation routine
+ */
+int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct list_head *buffer_list,
+ xfs_agnumber_t agno, xfs_agblock_t agbno,
+ xfs_agblock_t length, unsigned int gen);
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index bec344b3650..726f83a681a 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -17,23 +17,23 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
STATIC int
@@ -49,7 +49,8 @@ xfs_inobt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agbp, cur->bc_private.a.agno);
+ cur->bc_private.a.agbp, cur->bc_private.a.agno,
+ cur->bc_btnum);
}
STATIC void
@@ -66,12 +67,26 @@ xfs_inobt_set_root(
xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
}
+STATIC void
+xfs_finobt_set_root(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *nptr,
+ int inc) /* level change */
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+
+ agi->agi_free_root = nptr->s;
+ be32_add_cpu(&agi->agi_free_level, inc);
+ xfs_ialloc_log_agi(cur->bc_tp, agbp,
+ XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL);
+}
+
STATIC int
xfs_inobt_alloc_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *start,
union xfs_btree_ptr *new,
- int length,
int *stat)
{
xfs_alloc_arg_t args; /* block allocation args */
@@ -173,6 +188,17 @@ xfs_inobt_init_ptr_from_cur(
ptr->s = agi->agi_root;
}
+STATIC void
+xfs_finobt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
+
+ ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+ ptr->s = agi->agi_free_root;
+}
+
STATIC __int64_t
xfs_inobt_key_diff(
struct xfs_btree_cur *cur,
@@ -182,52 +208,92 @@ xfs_inobt_key_diff(
cur->bc_rec.i.ir_startino;
}
-void
+static int
xfs_inobt_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
unsigned int level;
- int sblock_ok; /* block passes checks */
- /* magic number and level verification */
- level = be16_to_cpu(block->bb_level);
- sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) &&
- level < mp->m_in_maxlevels;
+ /*
+ * During growfs operations, we can't verify the exact owner as the
+ * perag is not fully initialised and hence not attached to the buffer.
+ *
+ * Similarly, during log recovery we will have a perag structure
+ * attached, but the agi information will not yet have been initialised
+ * from the on disk AGI. We don't currently use any of this information,
+ * but beware of the landmine (i.e. need to check pag->pagi_init) if we
+ * ever do.
+ */
+ switch (block->bb_magic) {
+ case cpu_to_be32(XFS_IBT_CRC_MAGIC):
+ case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+ return false;
+ if (pag &&
+ be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ return false;
+ /* fall through */
+ case cpu_to_be32(XFS_IBT_MAGIC):
+ case cpu_to_be32(XFS_FIBT_MAGIC):
+ break;
+ default:
+ return 0;
+ }
- /* numrecs verification */
- sblock_ok = sblock_ok &&
- be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0];
+ /* numrecs and level verification */
+ level = be16_to_cpu(block->bb_level);
+ if (level >= mp->m_in_maxlevels)
+ return false;
+ if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0])
+ return false;
/* sibling pointer verification */
- sblock_ok = sblock_ok &&
- (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
- be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
- block->bb_u.s.bb_leftsib &&
- (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
- be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
- block->bb_u.s.bb_rightsib;
-
- if (!sblock_ok) {
- trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
+ if (!block->bb_u.s.bb_leftsib ||
+ (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+ block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+ return false;
+ if (!block->bb_u.s.bb_rightsib ||
+ (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+ block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+ return false;
+
+ return true;
}
static void
xfs_inobt_read_verify(
struct xfs_buf *bp)
{
- xfs_inobt_verify(bp);
+ if (!xfs_btree_sblock_verify_crc(bp))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_inobt_verify(bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp);
+ }
}
static void
xfs_inobt_write_verify(
struct xfs_buf *bp)
{
- xfs_inobt_verify(bp);
+ if (!xfs_inobt_verify(bp)) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+ xfs_btree_sblock_calc_crc(bp);
+
}
const struct xfs_buf_ops xfs_inobt_buf_ops = {
@@ -235,7 +301,7 @@ const struct xfs_buf_ops xfs_inobt_buf_ops = {
.verify_write = xfs_inobt_write_verify,
};
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
STATIC int
xfs_inobt_keys_inorder(
struct xfs_btree_cur *cur,
@@ -273,7 +339,29 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
.init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
.key_diff = xfs_inobt_key_diff,
.buf_ops = &xfs_inobt_buf_ops,
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
+ .keys_inorder = xfs_inobt_keys_inorder,
+ .recs_inorder = xfs_inobt_recs_inorder,
+#endif
+};
+
+static const struct xfs_btree_ops xfs_finobt_ops = {
+ .rec_len = sizeof(xfs_inobt_rec_t),
+ .key_len = sizeof(xfs_inobt_key_t),
+
+ .dup_cursor = xfs_inobt_dup_cursor,
+ .set_root = xfs_finobt_set_root,
+ .alloc_block = xfs_inobt_alloc_block,
+ .free_block = xfs_inobt_free_block,
+ .get_minrecs = xfs_inobt_get_minrecs,
+ .get_maxrecs = xfs_inobt_get_maxrecs,
+ .init_key_from_rec = xfs_inobt_init_key_from_rec,
+ .init_rec_from_key = xfs_inobt_init_rec_from_key,
+ .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
+ .key_diff = xfs_inobt_key_diff,
+ .buf_ops = &xfs_inobt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
#endif
@@ -287,7 +375,8 @@ xfs_inobt_init_cursor(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
struct xfs_buf *agbp, /* buffer for agi structure */
- xfs_agnumber_t agno) /* allocation group number */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_btnum_t btnum) /* ialloc or free ino btree */
{
struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
struct xfs_btree_cur *cur;
@@ -296,11 +385,19 @@ xfs_inobt_init_cursor(
cur->bc_tp = tp;
cur->bc_mp = mp;
- cur->bc_nlevels = be32_to_cpu(agi->agi_level);
- cur->bc_btnum = XFS_BTNUM_INO;
+ cur->bc_btnum = btnum;
+ if (btnum == XFS_BTNUM_INO) {
+ cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+ cur->bc_ops = &xfs_inobt_ops;
+ } else {
+ cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+ cur->bc_ops = &xfs_finobt_ops;
+ }
+
cur->bc_blocklog = mp->m_sb.sb_blocklog;
- cur->bc_ops = &xfs_inobt_ops;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
cur->bc_private.a.agbp = agbp;
cur->bc_private.a.agno = agno;
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 25c0239a8ea..d7ebea72c2d 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -27,59 +27,11 @@ struct xfs_btree_cur;
struct xfs_mount;
/*
- * There is a btree for the inode map per allocation group.
- */
-#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
-
-typedef __uint64_t xfs_inofree_t;
-#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
-#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
-#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
-#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
-
-static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
-{
- return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
-}
-
-/*
- * Data record structure
- */
-typedef struct xfs_inobt_rec {
- __be32 ir_startino; /* starting inode number */
- __be32 ir_freecount; /* count of free inodes (set bits) */
- __be64 ir_free; /* free inode mask */
-} xfs_inobt_rec_t;
-
-typedef struct xfs_inobt_rec_incore {
- xfs_agino_t ir_startino; /* starting inode number */
- __int32_t ir_freecount; /* count of free inodes (set bits) */
- xfs_inofree_t ir_free; /* free inode mask */
-} xfs_inobt_rec_incore_t;
-
-
-/*
- * Key structure
- */
-typedef struct xfs_inobt_key {
- __be32 ir_startino; /* starting inode number */
-} xfs_inobt_key_t;
-
-/* btree pointer type */
-typedef __be32 xfs_inobt_ptr_t;
-
-/*
- * block numbers in the AG.
- */
-#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
-#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
-
-/*
* Btree block header size depends on a superblock flag.
- *
- * (not quite yet, but soon)
*/
-#define XFS_INOBT_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN
+#define XFS_INOBT_BLOCK_LEN(mp) \
+ (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
/*
* Record, key, and pointer address macros for btree blocks.
@@ -106,9 +58,8 @@ typedef __be32 xfs_inobt_ptr_t;
((index) - 1) * sizeof(xfs_inobt_ptr_t)))
extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
- struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
+ struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t,
+ xfs_btnum_t);
extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
-extern const struct xfs_buf_ops xfs_inobt_buf_ops;
-
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 96e344e3e92..c48df5f25b9 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -17,26 +17,22 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_log_priv.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
-#include "xfs_dinode.h"
#include "xfs_error.h"
-#include "xfs_filestream.h"
-#include "xfs_vnodeops.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
-#include "xfs_fsops.h"
#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -47,7 +43,7 @@ STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
/*
* Allocate and initialise an xfs_inode.
*/
-STATIC struct xfs_inode *
+struct xfs_inode *
xfs_inode_alloc(
struct xfs_mount *mp,
xfs_ino_t ino)
@@ -97,7 +93,7 @@ xfs_inode_free_callback(
kmem_zone_free(xfs_inode_zone, ip);
}
-STATIC void
+void
xfs_inode_free(
struct xfs_inode *ip)
{
@@ -118,11 +114,6 @@ xfs_inode_free(
ip->i_itemp = NULL;
}
- /* asserts to verify all state is correct here */
- ASSERT(atomic_read(&ip->i_pincount) == 0);
- ASSERT(!spin_is_locked(&ip->i_flags_lock));
- ASSERT(!xfs_isiflocked(ip));
-
/*
* Because we use RCU freeing we need to ensure the inode always
* appears to be reclaimed with an invalid inode number when in the
@@ -134,6 +125,10 @@ xfs_inode_free(
ip->i_ino = 0;
spin_unlock(&ip->i_flags_lock);
+ /* asserts to verify all state is correct here */
+ ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(!xfs_isiflocked(ip));
+
call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}
@@ -335,7 +330,9 @@ xfs_iget_cache_miss(
iflags = XFS_INEW;
if (flags & XFS_IGET_DONTCACHE)
iflags |= XFS_IDONTCACHE;
- ip->i_udquot = ip->i_gdquot = NULL;
+ ip->i_udquot = NULL;
+ ip->i_gdquot = NULL;
+ ip->i_pdquot = NULL;
xfs_iflags_set(ip, iflags);
/* insert the new inode */
@@ -498,11 +495,6 @@ xfs_inode_ag_walk_grab(
if (!igrab(inode))
return ENOENT;
- if (is_bad_inode(inode)) {
- IRELE(ip);
- return ENOENT;
- }
-
/* inode is valid */
return 0;
@@ -515,8 +507,7 @@ STATIC int
xfs_inode_ag_walk(
struct xfs_mount *mp,
struct xfs_perag *pag,
- int (*execute)(struct xfs_inode *ip,
- struct xfs_perag *pag, int flags,
+ int (*execute)(struct xfs_inode *ip, int flags,
void *args),
int flags,
void *args,
@@ -590,7 +581,7 @@ restart:
for (i = 0; i < nr_found; i++) {
if (!batch[i])
continue;
- error = execute(batch[i], pag, flags, args);
+ error = execute(batch[i], flags, args);
IRELE(batch[i]);
if (error == EAGAIN) {
skipped++;
@@ -617,7 +608,7 @@ restart:
/*
* Background scanning to trim post-EOF preallocated space. This is queued
- * based on the 'background_prealloc_discard_period' tunable (5m by default).
+ * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
*/
STATIC void
xfs_queue_eofblocks(
@@ -644,8 +635,7 @@ xfs_eofblocks_worker(
int
xfs_inode_ag_iterator(
struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip,
- struct xfs_perag *pag, int flags,
+ int (*execute)(struct xfs_inode *ip, int flags,
void *args),
int flags,
void *args)
@@ -672,8 +662,7 @@ xfs_inode_ag_iterator(
int
xfs_inode_ag_iterator_tag(
struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip,
- struct xfs_perag *pag, int flags,
+ int (*execute)(struct xfs_inode *ip, int flags,
void *args),
int flags,
void *args,
@@ -916,8 +905,6 @@ restart:
xfs_iflock(ip);
}
- if (is_bad_inode(VFS_I(ip)))
- goto reclaim;
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
xfs_iunpin_wait(ip);
xfs_iflush_abort(ip, false);
@@ -1164,7 +1151,7 @@ xfs_reclaim_inodes(
* them to be cleaned, which we hope will not be very long due to the
* background walker having already kicked the IO off on those dirty inodes.
*/
-void
+long
xfs_reclaim_inodes_nr(
struct xfs_mount *mp,
int nr_to_scan)
@@ -1173,7 +1160,7 @@ xfs_reclaim_inodes_nr(
xfs_reclaim_work_queue(mp);
xfs_ail_push_all(mp->m_ail);
- xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+ return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
}
/*
@@ -1201,15 +1188,15 @@ xfs_inode_match_id(
struct xfs_inode *ip,
struct xfs_eofblocks *eofb)
{
- if (eofb->eof_flags & XFS_EOF_FLAGS_UID &&
- ip->i_d.di_uid != eofb->eof_uid)
+ if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
+ !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
return 0;
- if (eofb->eof_flags & XFS_EOF_FLAGS_GID &&
- ip->i_d.di_gid != eofb->eof_gid)
+ if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
+ !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
return 0;
- if (eofb->eof_flags & XFS_EOF_FLAGS_PRID &&
+ if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
xfs_get_projid(ip) != eofb->eof_prid)
return 0;
@@ -1219,7 +1206,6 @@ xfs_inode_match_id(
STATIC int
xfs_inode_free_eofblocks(
struct xfs_inode *ip,
- struct xfs_perag *pag,
int flags,
void *args)
{
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index e0f138c70a2..9cf017b899b 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -21,17 +21,36 @@
struct xfs_mount;
struct xfs_perag;
+struct xfs_eofblocks {
+ __u32 eof_flags;
+ kuid_t eof_uid;
+ kgid_t eof_gid;
+ prid_t eof_prid;
+ __u64 eof_min_file_size;
+};
+
#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
+/*
+ * Flags for xfs_iget()
+ */
+#define XFS_IGET_CREATE 0x1
+#define XFS_IGET_UNTRUSTED 0x2
+#define XFS_IGET_DONTCACHE 0x4
+
int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
uint flags, uint lock_flags, xfs_inode_t **ipp);
+/* recovery needs direct inode allocation capability */
+struct xfs_inode * xfs_inode_alloc(struct xfs_mount *mp, xfs_ino_t ino);
+void xfs_inode_free(struct xfs_inode *ip);
+
void xfs_reclaim_worker(struct work_struct *work);
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
int xfs_reclaim_inodes_count(struct xfs_mount *mp);
-void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
+long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
@@ -40,14 +59,46 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
void xfs_eofblocks_worker(struct work_struct *);
-int xfs_sync_inode_grab(struct xfs_inode *ip);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
- int flags, void *args),
+ int (*execute)(struct xfs_inode *ip, int flags, void *args),
int flags, void *args);
int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
- int flags, void *args),
+ int (*execute)(struct xfs_inode *ip, int flags, void *args),
int flags, void *args, int tag);
+static inline int
+xfs_fs_eofblocks_from_user(
+ struct xfs_fs_eofblocks *src,
+ struct xfs_eofblocks *dst)
+{
+ if (src->eof_version != XFS_EOFBLOCKS_VERSION)
+ return EINVAL;
+
+ if (src->eof_flags & ~XFS_EOF_FLAGS_VALID)
+ return EINVAL;
+
+ if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) ||
+ memchr_inv(src->pad64, 0, sizeof(src->pad64)))
+ return EINVAL;
+
+ dst->eof_flags = src->eof_flags;
+ dst->eof_prid = src->eof_prid;
+ dst->eof_min_file_size = src->eof_min_file_size;
+
+ dst->eof_uid = INVALID_UID;
+ if (src->eof_flags & XFS_EOF_FLAGS_UID) {
+ dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
+ if (!uid_valid(dst->eof_uid))
+ return EINVAL;
+ }
+
+ dst->eof_gid = INVALID_GID;
+ if (src->eof_flags & XFS_EOF_FLAGS_GID) {
+ dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
+ if (!gid_valid(dst->eof_gid))
+ return EINVAL;
+ }
+ return 0;
+}
+
#endif
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
new file mode 100644
index 00000000000..7e454923325
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2008-2010, 2013 Dave Chinner
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_error.h"
+#include "xfs_icreate_item.h"
+#include "xfs_log.h"
+
+kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+
+static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_icreate_item, ic_item);
+}
+
+/*
+ * This returns the number of iovecs needed to log the given inode item.
+ *
+ * We only need one iovec for the icreate log structure.
+ */
+STATIC void
+xfs_icreate_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_icreate_log);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given inode create log item.
+ */
+STATIC void
+xfs_icreate_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_icreate_item *icp = ICR_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE,
+ &icp->ic_format,
+ sizeof(struct xfs_icreate_log));
+}
+
+
+/* Pinning has no meaning for the create item, so just return. */
+STATIC void
+xfs_icreate_item_pin(
+ struct xfs_log_item *lip)
+{
+}
+
+
+/* pinning has no meaning for the create item, so just return. */
+STATIC void
+xfs_icreate_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+}
+
+STATIC void
+xfs_icreate_item_unlock(
+ struct xfs_log_item *lip)
+{
+ struct xfs_icreate_item *icp = ICR_ITEM(lip);
+
+ if (icp->ic_item.li_flags & XFS_LI_ABORTED)
+ kmem_zone_free(xfs_icreate_zone, icp);
+ return;
+}
+
+/*
+ * Because we have ordered buffers being tracked in the AIL for the inode
+ * creation, we don't need the create item after this. Hence we can free
+ * the log item and return -1 to tell the caller we're done with the item.
+ */
+STATIC xfs_lsn_t
+xfs_icreate_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ struct xfs_icreate_item *icp = ICR_ITEM(lip);
+
+ kmem_zone_free(xfs_icreate_zone, icp);
+ return (xfs_lsn_t)-1;
+}
+
+/* item can never get into the AIL */
+STATIC uint
+xfs_icreate_item_push(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+{
+ ASSERT(0);
+ return XFS_ITEM_SUCCESS;
+}
+
+/* Ordered buffers do the dependency tracking here, so this does nothing. */
+STATIC void
+xfs_icreate_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all buf log items.
+ */
+static struct xfs_item_ops xfs_icreate_item_ops = {
+ .iop_size = xfs_icreate_item_size,
+ .iop_format = xfs_icreate_item_format,
+ .iop_pin = xfs_icreate_item_pin,
+ .iop_unpin = xfs_icreate_item_unpin,
+ .iop_push = xfs_icreate_item_push,
+ .iop_unlock = xfs_icreate_item_unlock,
+ .iop_committed = xfs_icreate_item_committed,
+ .iop_committing = xfs_icreate_item_committing,
+};
+
+
+/*
+ * Initialize the inode log item for a newly allocated (in-core) inode.
+ *
+ * Inode extents can only reside within an AG. Hence specify the starting
+ * block for the inode chunk by offset within an AG as well as the
+ * length of the allocated extent.
+ *
+ * This joins the item to the transaction and marks it dirty so
+ * that we don't need a separate call to do this, nor does the
+ * caller need to know anything about the icreate item.
+ */
+void
+xfs_icreate_log(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ unsigned int count,
+ unsigned int inode_size,
+ xfs_agblock_t length,
+ unsigned int generation)
+{
+ struct xfs_icreate_item *icp;
+
+ icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP);
+
+ xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
+ &xfs_icreate_item_ops);
+
+ icp->ic_format.icl_type = XFS_LI_ICREATE;
+ icp->ic_format.icl_size = 1; /* single vector */
+ icp->ic_format.icl_ag = cpu_to_be32(agno);
+ icp->ic_format.icl_agbno = cpu_to_be32(agbno);
+ icp->ic_format.icl_count = cpu_to_be32(count);
+ icp->ic_format.icl_isize = cpu_to_be32(inode_size);
+ icp->ic_format.icl_length = cpu_to_be32(length);
+ icp->ic_format.icl_gen = cpu_to_be32(generation);
+
+ xfs_trans_add_item(tp, &icp->ic_item);
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+}
diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h
new file mode 100644
index 00000000000..59e89f87c09
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2008-2010, Dave Chinner
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef XFS_ICREATE_ITEM_H
+#define XFS_ICREATE_ITEM_H 1
+
+/* in memory log item structure */
+struct xfs_icreate_item {
+ struct xfs_log_item ic_item;
+ struct xfs_icreate_log ic_format;
+};
+
+extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+
+void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, unsigned int count,
+ unsigned int inode_size, xfs_agblock_t length,
+ unsigned int generation);
+
+#endif /* XFS_ICREATE_ITEM_H */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 66282dcb821..a6115fe1ac9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -19,35 +19,38 @@
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr.h"
+#include "xfs_trans_space.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
-#include "xfs_btree.h"
-#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
#include "xfs_error.h"
-#include "xfs_utils.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
-#include "xfs_vnodeops.h"
+#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_symlink.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_bmap_btree.h"
-kmem_zone_t *xfs_ifork_zone;
kmem_zone_t *xfs_inode_zone;
/*
@@ -57,9 +60,8 @@ kmem_zone_t *xfs_inode_zone;
#define XFS_ITRUNC_MAX_EXTENTS 2
STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
-STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
-STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
-STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
+
+STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *);
/*
* helper function to extract extent size hint from inode
@@ -76,48 +78,44 @@ xfs_get_extsz_hint(
}
/*
- * This is a wrapper routine around the xfs_ilock() routine used to centralize
- * some grungy code. It is used in places that wish to lock the inode solely
- * for reading the extents. The reason these places can't just call
- * xfs_ilock(SHARED) is that the inode lock also guards to bringing in of the
- * extents from disk for a file in b-tree format. If the inode is in b-tree
- * format, then we need to lock the inode exclusively until the extents are read
- * in. Locking it exclusively all the time would limit our parallelism
- * unnecessarily, though. What we do instead is check to see if the extents
- * have been read in yet, and only lock the inode exclusively if they have not.
+ * These two are wrapper routines around the xfs_ilock() routine used to
+ * centralize some grungy code. They are used in places that wish to lock the
+ * inode solely for reading the extents. The reason these places can't just
+ * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards to
+ * bringing in of the extents from disk for a file in b-tree format. If the
+ * inode is in b-tree format, then we need to lock the inode exclusively until
+ * the extents are read in. Locking it exclusively all the time would limit
+ * our parallelism unnecessarily, though. What we do instead is check to see
+ * if the extents have been read in yet, and only lock the inode exclusively
+ * if they have not.
*
- * The function returns a value which should be given to the corresponding
- * xfs_iunlock_map_shared(). This value is the mode in which the lock was
- * actually taken.
+ * The functions return a value which should be given to the corresponding
+ * xfs_iunlock() call.
*/
uint
-xfs_ilock_map_shared(
- xfs_inode_t *ip)
+xfs_ilock_data_map_shared(
+ struct xfs_inode *ip)
{
- uint lock_mode;
+ uint lock_mode = XFS_ILOCK_SHARED;
- if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
- ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
+ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+ (ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
lock_mode = XFS_ILOCK_EXCL;
- } else {
- lock_mode = XFS_ILOCK_SHARED;
- }
-
xfs_ilock(ip, lock_mode);
-
return lock_mode;
}
-/*
- * This is simply the unlock routine to go with xfs_ilock_map_shared().
- * All it does is call xfs_iunlock() with the given lock_mode.
- */
-void
-xfs_iunlock_map_shared(
- xfs_inode_t *ip,
- unsigned int lock_mode)
+uint
+xfs_ilock_attr_map_shared(
+ struct xfs_inode *ip)
{
- xfs_iunlock(ip, lock_mode);
+ uint lock_mode = XFS_ILOCK_SHARED;
+
+ if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
+ (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
+ lock_mode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
}
/*
@@ -286,7 +284,7 @@ xfs_ilock_demote(
trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
int
xfs_isilocked(
xfs_inode_t *ip,
@@ -309,599 +307,202 @@ xfs_isilocked(
}
#endif
-void
-__xfs_iflock(
- struct xfs_inode *ip)
-{
- wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
- DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
-
- do {
- prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
- if (xfs_isiflocked(ip))
- io_schedule();
- } while (!xfs_iflock_nowait(ip));
-
- finish_wait(wq, &wait.wait);
-}
-
#ifdef DEBUG
+int xfs_locked_n;
+int xfs_small_retries;
+int xfs_middle_retries;
+int xfs_lots_retries;
+int xfs_lock_delays;
+#endif
+
/*
- * Make sure that the extents in the given memory buffer
- * are valid.
+ * Bump the subclass so xfs_lock_inodes() acquires each lock with
+ * a different value
*/
-STATIC void
-xfs_validate_extents(
- xfs_ifork_t *ifp,
- int nrecs,
- xfs_exntfmt_t fmt)
+static inline int
+xfs_lock_inumorder(int lock_mode, int subclass)
{
- xfs_bmbt_irec_t irec;
- xfs_bmbt_rec_host_t rec;
- int i;
+ if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+ lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
+ if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
+ lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
- for (i = 0; i < nrecs; i++) {
- xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
- rec.l0 = get_unaligned(&ep->l0);
- rec.l1 = get_unaligned(&ep->l1);
- xfs_bmbt_get_all(&rec, &irec);
- if (fmt == XFS_EXTFMT_NOSTATE)
- ASSERT(irec.br_state == XFS_EXT_NORM);
- }
+ return lock_mode;
}
-#else /* DEBUG */
-#define xfs_validate_extents(ifp, nrecs, fmt)
-#endif /* DEBUG */
/*
- * Check that none of the inode's in the buffer have a next
- * unlinked field of 0.
+ * The following routine will lock n inodes in exclusive mode.
+ * We assume the caller calls us with the inodes in i_ino order.
+ *
+ * We need to detect deadlock where an inode that we lock
+ * is in the AIL and we start waiting for another inode that is locked
+ * by a thread in a long running transaction (such as truncate). This can
+ * result in deadlock since the long running trans might need to wait
+ * for the inode we just locked in order to push the tail and free space
+ * in the log.
*/
-#if defined(DEBUG)
void
-xfs_inobp_check(
- xfs_mount_t *mp,
- xfs_buf_t *bp)
+xfs_lock_inodes(
+ xfs_inode_t **ips,
+ int inodes,
+ uint lock_mode)
{
- int i;
- int j;
- xfs_dinode_t *dip;
+ int attempts = 0, i, j, try_lock;
+ xfs_log_item_t *lp;
- j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
+ ASSERT(ips && (inodes >= 2)); /* we need at least two */
- for (i = 0; i < j; i++) {
- dip = (xfs_dinode_t *)xfs_buf_offset(bp,
- i * mp->m_sb.sb_inodesize);
- if (!dip->di_next_unlinked) {
- xfs_alert(mp,
- "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
- bp);
- ASSERT(dip->di_next_unlinked);
- }
- }
-}
-#endif
-
-static void
-xfs_inode_buf_verify(
- struct xfs_buf *bp)
-{
- struct xfs_mount *mp = bp->b_target->bt_mount;
- int i;
- int ni;
-
- /*
- * Validate the magic number and version of every inode in the buffer
- */
- ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
- for (i = 0; i < ni; i++) {
- int di_ok;
- xfs_dinode_t *dip;
-
- dip = (struct xfs_dinode *)xfs_buf_offset(bp,
- (i << mp->m_sb.sb_inodelog));
- di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
- XFS_DINODE_GOOD_VERSION(dip->di_version);
- if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
- XFS_ERRTAG_ITOBP_INOTOBP,
- XFS_RANDOM_ITOBP_INOTOBP))) {
- xfs_buf_ioerror(bp, EFSCORRUPTED);
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
- mp, dip);
-#ifdef DEBUG
- xfs_emerg(mp,
- "bad inode magic/vsn daddr %lld #%d (magic=%x)",
- (unsigned long long)bp->b_bn, i,
- be16_to_cpu(dip->di_magic));
- ASSERT(0);
-#endif
- }
- }
- xfs_inobp_check(mp, bp);
-}
-
-
-static void
-xfs_inode_buf_read_verify(
- struct xfs_buf *bp)
-{
- xfs_inode_buf_verify(bp);
-}
-
-static void
-xfs_inode_buf_write_verify(
- struct xfs_buf *bp)
-{
- xfs_inode_buf_verify(bp);
-}
+ try_lock = 0;
+ i = 0;
-const struct xfs_buf_ops xfs_inode_buf_ops = {
- .verify_read = xfs_inode_buf_read_verify,
- .verify_write = xfs_inode_buf_write_verify,
-};
+again:
+ for (; i < inodes; i++) {
+ ASSERT(ips[i]);
+ if (i && (ips[i] == ips[i-1])) /* Already locked */
+ continue;
-/*
- * This routine is called to map an inode to the buffer containing the on-disk
- * version of the inode. It returns a pointer to the buffer containing the
- * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
- * pointer to the on-disk inode within that buffer.
- *
- * If a non-zero error is returned, then the contents of bpp and dipp are
- * undefined.
- */
-int
-xfs_imap_to_bp(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- struct xfs_imap *imap,
- struct xfs_dinode **dipp,
- struct xfs_buf **bpp,
- uint buf_flags,
- uint iget_flags)
-{
- struct xfs_buf *bp;
- int error;
+ /*
+ * If try_lock is not set yet, make sure all locked inodes
+ * are not in the AIL.
+ * If any are, set try_lock to be used later.
+ */
- buf_flags |= XBF_UNMAPPED;
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
- (int)imap->im_len, buf_flags, &bp,
- &xfs_inode_buf_ops);
- if (error) {
- if (error == EAGAIN) {
- ASSERT(buf_flags & XBF_TRYLOCK);
- return error;
+ if (!try_lock) {
+ for (j = (i - 1); j >= 0 && !try_lock; j--) {
+ lp = (xfs_log_item_t *)ips[j]->i_itemp;
+ if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+ try_lock++;
+ }
+ }
}
- if (error == EFSCORRUPTED &&
- (iget_flags & XFS_IGET_UNTRUSTED))
- return XFS_ERROR(EINVAL);
-
- xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
- __func__, error);
- return error;
- }
-
- *bpp = bp;
- *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
- return 0;
-}
-
-/*
- * Move inode type and inode format specific information from the
- * on-disk inode to the in-core inode. For fifos, devs, and sockets
- * this means set if_rdev to the proper value. For files, directories,
- * and symlinks this means to bring in the in-line data or extent
- * pointers. For a file in B-tree format, only the root is immediately
- * brought in-core. The rest will be in-lined in if_extents when it
- * is first referenced (see xfs_iread_extents()).
- */
-STATIC int
-xfs_iformat(
- xfs_inode_t *ip,
- xfs_dinode_t *dip)
-{
- xfs_attr_shortform_t *atp;
- int size;
- int error = 0;
- xfs_fsize_t di_size;
-
- if (unlikely(be32_to_cpu(dip->di_nextents) +
- be16_to_cpu(dip->di_anextents) >
- be64_to_cpu(dip->di_nblocks))) {
- xfs_warn(ip->i_mount,
- "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
- (unsigned long long)ip->i_ino,
- (int)(be32_to_cpu(dip->di_nextents) +
- be16_to_cpu(dip->di_anextents)),
- (unsigned long long)
- be64_to_cpu(dip->di_nblocks));
- XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
- xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
- (unsigned long long)ip->i_ino,
- dip->di_forkoff);
- XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
- !ip->i_mount->m_rtdev_targp)) {
- xfs_warn(ip->i_mount,
- "corrupt dinode %Lu, has realtime flag set.",
- ip->i_ino);
- XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
- XFS_ERRLEVEL_LOW, ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- switch (ip->i_d.di_mode & S_IFMT) {
- case S_IFIFO:
- case S_IFCHR:
- case S_IFBLK:
- case S_IFSOCK:
- if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
- XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
- }
- ip->i_d.di_size = 0;
- ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
- break;
+ /*
+ * If any of the previous locks we have locked is in the AIL,
+ * we must TRY to get the second and subsequent locks. If
+ * we can't get any, we must release all we have
+ * and try again.
+ */
- case S_IFREG:
- case S_IFLNK:
- case S_IFDIR:
- switch (dip->di_format) {
- case XFS_DINODE_FMT_LOCAL:
+ if (try_lock) {
+ /* try_lock must be 0 if i is 0. */
/*
- * no local regular files yet
+ * try_lock means we have an inode locked
+ * that is in the AIL.
*/
- if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
- xfs_warn(ip->i_mount,
- "corrupt inode %Lu (local format for regular file).",
- (unsigned long long) ip->i_ino);
- XFS_CORRUPTION_ERROR("xfs_iformat(4)",
- XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
- }
+ ASSERT(i != 0);
+ if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
+ attempts++;
+
+ /*
+ * Unlock all previous guys and try again.
+ * xfs_iunlock will try to push the tail
+ * if the inode is in the AIL.
+ */
+
+ for(j = i - 1; j >= 0; j--) {
+
+ /*
+ * Check to see if we've already
+ * unlocked this one.
+ * Not the first one going back,
+ * and the inode ptr is the same.
+ */
+ if ((j != (i - 1)) && ips[j] ==
+ ips[j+1])
+ continue;
+
+ xfs_iunlock(ips[j], lock_mode);
+ }
- di_size = be64_to_cpu(dip->di_size);
- if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
- xfs_warn(ip->i_mount,
- "corrupt inode %Lu (bad size %Ld for local inode).",
- (unsigned long long) ip->i_ino,
- (long long) di_size);
- XFS_CORRUPTION_ERROR("xfs_iformat(5)",
- XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
+ if ((attempts % 5) == 0) {
+ delay(1); /* Don't just spin the CPU */
+#ifdef DEBUG
+ xfs_lock_delays++;
+#endif
+ }
+ i = 0;
+ try_lock = 0;
+ goto again;
}
-
- size = (int)di_size;
- error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
- break;
- case XFS_DINODE_FMT_EXTENTS:
- error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
- break;
- case XFS_DINODE_FMT_BTREE:
- error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
- break;
- default:
- XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
- ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
+ } else {
+ xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
}
- break;
-
- default:
- XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
}
- if (error) {
- return error;
- }
- if (!XFS_DFORK_Q(dip))
- return 0;
-
- ASSERT(ip->i_afp == NULL);
- ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
-
- switch (dip->di_aformat) {
- case XFS_DINODE_FMT_LOCAL:
- atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
- size = be16_to_cpu(atp->hdr.totsize);
-
- if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
- xfs_warn(ip->i_mount,
- "corrupt inode %Lu (bad attr fork size %Ld).",
- (unsigned long long) ip->i_ino,
- (long long) size);
- XFS_CORRUPTION_ERROR("xfs_iformat(8)",
- XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
- }
- error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
- break;
- case XFS_DINODE_FMT_EXTENTS:
- error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
- break;
- case XFS_DINODE_FMT_BTREE:
- error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
- break;
- default:
- error = XFS_ERROR(EFSCORRUPTED);
- break;
- }
- if (error) {
- kmem_zone_free(xfs_ifork_zone, ip->i_afp);
- ip->i_afp = NULL;
- xfs_idestroy_fork(ip, XFS_DATA_FORK);
+#ifdef DEBUG
+ if (attempts) {
+ if (attempts < 5) xfs_small_retries++;
+ else if (attempts < 100) xfs_middle_retries++;
+ else xfs_lots_retries++;
+ } else {
+ xfs_locked_n++;
}
- return error;
+#endif
}
/*
- * The file is in-lined in the on-disk inode.
- * If it fits into if_inline_data, then copy
- * it there, otherwise allocate a buffer for it
- * and copy the data there. Either way, set
- * if_data to point at the data.
- * If we allocate a buffer for the data, make
- * sure that its size is a multiple of 4 and
- * record the real size in i_real_bytes.
+ * xfs_lock_two_inodes() can only be used to lock one type of lock
+ * at a time - the iolock or the ilock, but not both at once. If
+ * we lock both at once, lockdep will report false positives saying
+ * we have violated locking orders.
*/
-STATIC int
-xfs_iformat_local(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
- int whichfork,
- int size)
+void
+xfs_lock_two_inodes(
+ xfs_inode_t *ip0,
+ xfs_inode_t *ip1,
+ uint lock_mode)
{
- xfs_ifork_t *ifp;
- int real_size;
-
- /*
- * If the size is unreasonable, then something
- * is wrong and we just bail out rather than crash in
- * kmem_alloc() or memcpy() below.
- */
- if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
- xfs_warn(ip->i_mount,
- "corrupt inode %Lu (bad size %d for local fork, size = %d).",
- (unsigned long long) ip->i_ino, size,
- XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
- XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
- }
- ifp = XFS_IFORK_PTR(ip, whichfork);
- real_size = 0;
- if (size == 0)
- ifp->if_u1.if_data = NULL;
- else if (size <= sizeof(ifp->if_u2.if_inline_data))
- ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
- else {
- real_size = roundup(size, 4);
- ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
- }
- ifp->if_bytes = size;
- ifp->if_real_bytes = real_size;
- if (size)
- memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
- ifp->if_flags &= ~XFS_IFEXTENTS;
- ifp->if_flags |= XFS_IFINLINE;
- return 0;
-}
+ xfs_inode_t *temp;
+ int attempts = 0;
+ xfs_log_item_t *lp;
-/*
- * The file consists of a set of extents all
- * of which fit into the on-disk inode.
- * If there are few enough extents to fit into
- * the if_inline_ext, then copy them there.
- * Otherwise allocate a buffer for them and copy
- * them into it. Either way, set if_extents
- * to point at the extents.
- */
-STATIC int
-xfs_iformat_extents(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
- int whichfork)
-{
- xfs_bmbt_rec_t *dp;
- xfs_ifork_t *ifp;
- int nex;
- int size;
- int i;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- nex = XFS_DFORK_NEXTENTS(dip, whichfork);
- size = nex * (uint)sizeof(xfs_bmbt_rec_t);
-
- /*
- * If the number of extents is unreasonable, then something
- * is wrong and we just bail out rather than crash in
- * kmem_alloc() or memcpy() below.
- */
- if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
- xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
- (unsigned long long) ip->i_ino, nex);
- XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- ifp->if_real_bytes = 0;
- if (nex == 0)
- ifp->if_u1.if_extents = NULL;
- else if (nex <= XFS_INLINE_EXTS)
- ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
- else
- xfs_iext_add(ifp, 0, nex);
-
- ifp->if_bytes = size;
- if (size) {
- dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
- xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
- for (i = 0; i < nex; i++, dp++) {
- xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
- ep->l0 = get_unaligned_be64(&dp->l0);
- ep->l1 = get_unaligned_be64(&dp->l1);
- }
- XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
- if (whichfork != XFS_DATA_FORK ||
- XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
- if (unlikely(xfs_check_nostate_extents(
- ifp, 0, nex))) {
- XFS_ERROR_REPORT("xfs_iformat_extents(2)",
- XFS_ERRLEVEL_LOW,
- ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
- }
+ if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+ ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
+ ASSERT(ip0->i_ino != ip1->i_ino);
+
+ if (ip0->i_ino > ip1->i_ino) {
+ temp = ip0;
+ ip0 = ip1;
+ ip1 = temp;
}
- ifp->if_flags |= XFS_IFEXTENTS;
- return 0;
-}
-/*
- * The file has too many extents to fit into
- * the inode, so they are in B-tree format.
- * Allocate a buffer for the root of the B-tree
- * and copy the root into it. The i_extents
- * field will remain NULL until all of the
- * extents are read in (when they are needed).
- */
-STATIC int
-xfs_iformat_btree(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
- int whichfork)
-{
- xfs_bmdr_block_t *dfp;
- xfs_ifork_t *ifp;
- /* REFERENCED */
- int nrecs;
- int size;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
- size = XFS_BMAP_BROOT_SPACE(dfp);
- nrecs = be16_to_cpu(dfp->bb_numrecs);
-
- /*
- * blow out if -- fork has less extents than can fit in
- * fork (fork shouldn't be a btree format), root btree
- * block has more records than can fit into the fork,
- * or the number of extents is greater than the number of
- * blocks.
- */
- if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
- XFS_IFORK_MAXEXT(ip, whichfork) ||
- XFS_BMDR_SPACE_CALC(nrecs) >
- XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
- XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
- xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
- (unsigned long long) ip->i_ino);
- XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- ifp->if_broot_bytes = size;
- ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
- ASSERT(ifp->if_broot != NULL);
- /*
- * Copy and convert from the on-disk structure
- * to the in-memory structure.
- */
- xfs_bmdr_to_bmbt(ip->i_mount, dfp,
- XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
- ifp->if_broot, size);
- ifp->if_flags &= ~XFS_IFEXTENTS;
- ifp->if_flags |= XFS_IFBROOT;
+ again:
+ xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
- return 0;
+ /*
+ * If the first lock we have locked is in the AIL, we must TRY to get
+ * the second lock. If we can't get it, we must release the first one
+ * and try again.
+ */
+ lp = (xfs_log_item_t *)ip0->i_itemp;
+ if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+ if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
+ xfs_iunlock(ip0, lock_mode);
+ if ((++attempts % 5) == 0)
+ delay(1); /* Don't just spin the CPU */
+ goto again;
+ }
+ } else {
+ xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
+ }
}
-STATIC void
-xfs_dinode_from_disk(
- xfs_icdinode_t *to,
- xfs_dinode_t *from)
-{
- to->di_magic = be16_to_cpu(from->di_magic);
- to->di_mode = be16_to_cpu(from->di_mode);
- to->di_version = from ->di_version;
- to->di_format = from->di_format;
- to->di_onlink = be16_to_cpu(from->di_onlink);
- to->di_uid = be32_to_cpu(from->di_uid);
- to->di_gid = be32_to_cpu(from->di_gid);
- to->di_nlink = be32_to_cpu(from->di_nlink);
- to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
- to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
- memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
- to->di_flushiter = be16_to_cpu(from->di_flushiter);
- to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
- to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
- to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
- to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
- to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
- to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
- to->di_size = be64_to_cpu(from->di_size);
- to->di_nblocks = be64_to_cpu(from->di_nblocks);
- to->di_extsize = be32_to_cpu(from->di_extsize);
- to->di_nextents = be32_to_cpu(from->di_nextents);
- to->di_anextents = be16_to_cpu(from->di_anextents);
- to->di_forkoff = from->di_forkoff;
- to->di_aformat = from->di_aformat;
- to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
- to->di_dmstate = be16_to_cpu(from->di_dmstate);
- to->di_flags = be16_to_cpu(from->di_flags);
- to->di_gen = be32_to_cpu(from->di_gen);
-}
void
-xfs_dinode_to_disk(
- xfs_dinode_t *to,
- xfs_icdinode_t *from)
+__xfs_iflock(
+ struct xfs_inode *ip)
{
- to->di_magic = cpu_to_be16(from->di_magic);
- to->di_mode = cpu_to_be16(from->di_mode);
- to->di_version = from ->di_version;
- to->di_format = from->di_format;
- to->di_onlink = cpu_to_be16(from->di_onlink);
- to->di_uid = cpu_to_be32(from->di_uid);
- to->di_gid = cpu_to_be32(from->di_gid);
- to->di_nlink = cpu_to_be32(from->di_nlink);
- to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
- to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
- memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
- to->di_flushiter = cpu_to_be16(from->di_flushiter);
- to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
- to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
- to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
- to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
- to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
- to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
- to->di_size = cpu_to_be64(from->di_size);
- to->di_nblocks = cpu_to_be64(from->di_nblocks);
- to->di_extsize = cpu_to_be32(from->di_extsize);
- to->di_nextents = cpu_to_be32(from->di_nextents);
- to->di_anextents = cpu_to_be16(from->di_anextents);
- to->di_forkoff = from->di_forkoff;
- to->di_aformat = from->di_aformat;
- to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
- to->di_dmstate = cpu_to_be16(from->di_dmstate);
- to->di_flags = cpu_to_be16(from->di_flags);
- to->di_gen = cpu_to_be32(from->di_gen);
+ wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+ DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+ do {
+ prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ if (xfs_isiflocked(ip))
+ io_schedule();
+ } while (!xfs_iflock_nowait(ip));
+
+ finish_wait(wq, &wait.wait);
}
STATIC uint
@@ -963,162 +564,49 @@ xfs_dic2xflags(
}
/*
- * Read the disk inode attributes into the in-core inode structure.
+ * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
+ * is allowed, otherwise it has to be an exact match. If a CI match is found,
+ * ci_name->name will point to a the actual name (caller must free) or
+ * will be set to NULL if an exact match is found.
*/
int
-xfs_iread(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- uint iget_flags)
+xfs_lookup(
+ xfs_inode_t *dp,
+ struct xfs_name *name,
+ xfs_inode_t **ipp,
+ struct xfs_name *ci_name)
{
- xfs_buf_t *bp;
- xfs_dinode_t *dip;
- int error;
+ xfs_ino_t inum;
+ int error;
+ uint lock_mode;
- /*
- * Fill in the location information in the in-core inode.
- */
- error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
- if (error)
- return error;
+ trace_xfs_lookup(dp, name);
- /*
- * Get pointers to the on-disk inode and the buffer containing it.
- */
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
- if (error)
- return error;
+ if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ return XFS_ERROR(EIO);
- /*
- * If we got something that isn't an inode it means someone
- * (nfs or dmi) has a stale handle.
- */
- if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) {
-#ifdef DEBUG
- xfs_alert(mp,
- "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
- __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
-#endif /* DEBUG */
- error = XFS_ERROR(EINVAL);
- goto out_brelse;
- }
+ lock_mode = xfs_ilock_data_map_shared(dp);
+ error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
+ xfs_iunlock(dp, lock_mode);
- /*
- * If the on-disk inode is already linked to a directory
- * entry, copy all of the inode into the in-core inode.
- * xfs_iformat() handles copying in the inode format
- * specific information.
- * Otherwise, just get the truly permanent information.
- */
- if (dip->di_mode) {
- xfs_dinode_from_disk(&ip->i_d, dip);
- error = xfs_iformat(ip, dip);
- if (error) {
-#ifdef DEBUG
- xfs_alert(mp, "%s: xfs_iformat() returned error %d",
- __func__, error);
-#endif /* DEBUG */
- goto out_brelse;
- }
- } else {
- ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
- ip->i_d.di_version = dip->di_version;
- ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
- ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
- /*
- * Make sure to pull in the mode here as well in
- * case the inode is released without being used.
- * This ensures that xfs_inactive() will see that
- * the inode is already free and not try to mess
- * with the uninitialized part of it.
- */
- ip->i_d.di_mode = 0;
- }
-
- /*
- * The inode format changed when we moved the link count and
- * made it 32 bits long. If this is an old format inode,
- * convert it in memory to look like a new one. If it gets
- * flushed to disk we will convert back before flushing or
- * logging it. We zero out the new projid field and the old link
- * count field. We'll handle clearing the pad field (the remains
- * of the old uuid field) when we actually convert the inode to
- * the new format. We don't change the version number so that we
- * can distinguish this from a real new format inode.
- */
- if (ip->i_d.di_version == 1) {
- ip->i_d.di_nlink = ip->i_d.di_onlink;
- ip->i_d.di_onlink = 0;
- xfs_set_projid(ip, 0);
- }
+ if (error)
+ goto out;
- ip->i_delayed_blks = 0;
+ error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
+ if (error)
+ goto out_free_name;
- /*
- * Mark the buffer containing the inode as something to keep
- * around for a while. This helps to keep recently accessed
- * meta-data in-core longer.
- */
- xfs_buf_set_ref(bp, XFS_INO_REF);
+ return 0;
- /*
- * Use xfs_trans_brelse() to release the buffer containing the
- * on-disk inode, because it was acquired with xfs_trans_read_buf()
- * in xfs_imap_to_bp() above. If tp is NULL, this is just a normal
- * brelse(). If we're within a transaction, then xfs_trans_brelse()
- * will only release the buffer if it is not dirty within the
- * transaction. It will be OK to release the buffer in this case,
- * because inodes on disk are never destroyed and we will be
- * locking the new in-core inode before putting it in the hash
- * table where other processes can find it. Thus we don't have
- * to worry about the inode being changed just because we released
- * the buffer.
- */
- out_brelse:
- xfs_trans_brelse(tp, bp);
+out_free_name:
+ if (ci_name)
+ kmem_free(ci_name->name);
+out:
+ *ipp = NULL;
return error;
}
/*
- * Read in extents from a btree-format inode.
- * Allocate and fill in if_extents. Real work is done in xfs_bmap.c.
- */
-int
-xfs_iread_extents(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- int whichfork)
-{
- int error;
- xfs_ifork_t *ifp;
- xfs_extnum_t nextents;
-
- if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
- XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
- ip->i_mount);
- return XFS_ERROR(EFSCORRUPTED);
- }
- nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
- ifp = XFS_IFORK_PTR(ip, whichfork);
-
- /*
- * We know that the size is valid (it's checked in iformat_btree)
- */
- ifp->if_bytes = ifp->if_real_bytes = 0;
- ifp->if_flags |= XFS_IFEXTENTS;
- xfs_iext_add(ifp, 0, nextents);
- error = xfs_bmap_read_extents(tp, ip, whichfork);
- if (error) {
- xfs_iext_destroy(ifp);
- ifp->if_flags &= ~XFS_IFEXTENTS;
- return error;
- }
- xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
- return 0;
-}
-
-/*
* Allocate an inode on disk and return a copy of its in-core version.
* The in-core inode is locked exclusively. Set mode, nlink, and rdev
* appropriately within the inode. The uid and gid for the inode are
@@ -1161,12 +649,12 @@ xfs_ialloc(
xfs_buf_t **ialloc_context,
xfs_inode_t **ipp)
{
+ struct xfs_mount *mp = tp->t_mountp;
xfs_ino_t ino;
xfs_inode_t *ip;
uint flags;
int error;
timespec_t tv;
- int filestreams = 0;
/*
* Call the space management code to pick
@@ -1187,42 +675,29 @@ xfs_ialloc(
* This is because we're setting fields here we need
* to prevent others from looking at until we're done.
*/
- error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
+ error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE,
XFS_ILOCK_EXCL, &ip);
if (error)
return error;
ASSERT(ip != NULL);
+ /*
+ * We always convert v1 inodes to v2 now - we only support filesystems
+ * with >= v2 inode capability, so there is no reason for ever leaving
+ * an inode in v1 format.
+ */
+ if (ip->i_d.di_version == 1)
+ ip->i_d.di_version = 2;
+
ip->i_d.di_mode = mode;
ip->i_d.di_onlink = 0;
ip->i_d.di_nlink = nlink;
ASSERT(ip->i_d.di_nlink == nlink);
- ip->i_d.di_uid = current_fsuid();
- ip->i_d.di_gid = current_fsgid();
+ ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
+ ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
xfs_set_projid(ip, prid);
memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
- /*
- * If the superblock version is up to where we support new format
- * inodes and this is currently an old format inode, then change
- * the inode version number now. This way we only do the conversion
- * here rather than here and in the flush/logging code.
- */
- if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
- ip->i_d.di_version == 1) {
- ip->i_d.di_version = 2;
- /*
- * We've already zeroed the old link count, the projid field,
- * and the pad field.
- */
- }
-
- /*
- * Project ids won't be stored on disk if we are using a version 1 inode.
- */
- if ((prid != 0) && (ip->i_d.di_version == 1))
- xfs_bump_ino_vers2(tp, ip);
-
if (pip && XFS_INHERIT_GID(pip)) {
ip->i_d.di_gid = pip->i_d.di_gid;
if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
@@ -1237,7 +712,7 @@ xfs_ialloc(
*/
if ((irix_sgid_inherit) &&
(ip->i_d.di_mode & S_ISGID) &&
- (!in_group_p((gid_t)ip->i_d.di_gid))) {
+ (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) {
ip->i_d.di_mode &= ~S_ISGID;
}
@@ -1258,6 +733,19 @@ xfs_ialloc(
ip->i_d.di_dmevmask = 0;
ip->i_d.di_dmstate = 0;
ip->i_d.di_flags = 0;
+
+ if (ip->i_d.di_version == 3) {
+ ASSERT(ip->i_d.di_ino == ino);
+ ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid));
+ ip->i_d.di_crc = 0;
+ ip->i_d.di_changecount = 1;
+ ip->i_d.di_lsn = 0;
+ ip->i_d.di_flags2 = 0;
+ memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2));
+ ip->i_d.di_crtime = ip->i_d.di_mtime;
+ }
+
+
flags = XFS_ILOG_CORE;
switch (mode & S_IFMT) {
case S_IFIFO:
@@ -1270,13 +758,6 @@ xfs_ialloc(
flags |= XFS_ILOG_DEV;
break;
case S_IFREG:
- /*
- * we can't set up filestreams until after the VFS inode
- * is set up properly.
- */
- if (pip && xfs_inode_is_filestream(pip))
- filestreams = 1;
- /* fall through */
case S_IFDIR:
if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
uint di_flags = 0;
@@ -1342,20 +823,653 @@ xfs_ialloc(
/* now that we have an i_mode we can setup inode ops and unlock */
xfs_setup_inode(ip);
- /* now we have set up the vfs inode we can associate the filestream */
- if (filestreams) {
- error = xfs_filestream_associate(pip, ip);
- if (error < 0)
- return -error;
- if (!error)
- xfs_iflags_set(ip, XFS_IFILESTREAM);
+ *ipp = ip;
+ return 0;
+}
+
+/*
+ * Allocates a new inode from disk and return a pointer to the
+ * incore copy. This routine will internally commit the current
+ * transaction and allocate a new one if the Space Manager needed
+ * to do an allocation to replenish the inode free-list.
+ *
+ * This routine is designed to be called from xfs_create and
+ * xfs_create_dir.
+ *
+ */
+int
+xfs_dir_ialloc(
+ xfs_trans_t **tpp, /* input: current transaction;
+ output: may be a new transaction. */
+ xfs_inode_t *dp, /* directory within whose allocate
+ the inode. */
+ umode_t mode,
+ xfs_nlink_t nlink,
+ xfs_dev_t rdev,
+ prid_t prid, /* project id */
+ int okalloc, /* ok to allocate new space */
+ xfs_inode_t **ipp, /* pointer to inode; it will be
+ locked. */
+ int *committed)
+
+{
+ xfs_trans_t *tp;
+ xfs_trans_t *ntp;
+ xfs_inode_t *ip;
+ xfs_buf_t *ialloc_context = NULL;
+ int code;
+ void *dqinfo;
+ uint tflags;
+
+ tp = *tpp;
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ /*
+ * xfs_ialloc will return a pointer to an incore inode if
+ * the Space Manager has an available inode on the free
+ * list. Otherwise, it will do an allocation and replenish
+ * the freelist. Since we can only do one allocation per
+ * transaction without deadlocks, we will need to commit the
+ * current transaction and start a new one. We will then
+ * need to call xfs_ialloc again to get the inode.
+ *
+ * If xfs_ialloc did an allocation to replenish the freelist,
+ * it returns the bp containing the head of the freelist as
+ * ialloc_context. We will hold a lock on it across the
+ * transaction commit so that no other process can steal
+ * the inode(s) that we've just allocated.
+ */
+ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
+ &ialloc_context, &ip);
+
+ /*
+ * Return an error if we were unable to allocate a new inode.
+ * This should only happen if we run out of space on disk or
+ * encounter a disk error.
+ */
+ if (code) {
+ *ipp = NULL;
+ return code;
+ }
+ if (!ialloc_context && !ip) {
+ *ipp = NULL;
+ return XFS_ERROR(ENOSPC);
+ }
+
+ /*
+ * If the AGI buffer is non-NULL, then we were unable to get an
+ * inode in one operation. We need to commit the current
+ * transaction and call xfs_ialloc() again. It is guaranteed
+ * to succeed the second time.
+ */
+ if (ialloc_context) {
+ struct xfs_trans_res tres;
+
+ /*
+ * Normally, xfs_trans_commit releases all the locks.
+ * We call bhold to hang on to the ialloc_context across
+ * the commit. Holding this buffer prevents any other
+ * processes from doing any allocations in this
+ * allocation group.
+ */
+ xfs_trans_bhold(tp, ialloc_context);
+ /*
+ * Save the log reservation so we can use
+ * them in the next transaction.
+ */
+ tres.tr_logres = xfs_trans_get_log_res(tp);
+ tres.tr_logcount = xfs_trans_get_log_count(tp);
+
+ /*
+ * We want the quota changes to be associated with the next
+ * transaction, NOT this one. So, detach the dqinfo from this
+ * and attach it to the next transaction.
+ */
+ dqinfo = NULL;
+ tflags = 0;
+ if (tp->t_dqinfo) {
+ dqinfo = (void *)tp->t_dqinfo;
+ tp->t_dqinfo = NULL;
+ tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
+ tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
+ }
+
+ ntp = xfs_trans_dup(tp);
+ code = xfs_trans_commit(tp, 0);
+ tp = ntp;
+ if (committed != NULL) {
+ *committed = 1;
+ }
+ /*
+ * If we get an error during the commit processing,
+ * release the buffer that is still held and return
+ * to the caller.
+ */
+ if (code) {
+ xfs_buf_relse(ialloc_context);
+ if (dqinfo) {
+ tp->t_dqinfo = dqinfo;
+ xfs_trans_free_dqinfo(tp);
+ }
+ *tpp = ntp;
+ *ipp = NULL;
+ return code;
+ }
+
+ /*
+ * transaction commit worked ok so we can drop the extra ticket
+ * reference that we gained in xfs_trans_dup()
+ */
+ xfs_log_ticket_put(tp->t_ticket);
+ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ code = xfs_trans_reserve(tp, &tres, 0, 0);
+
+ /*
+ * Re-attach the quota info that we detached from prev trx.
+ */
+ if (dqinfo) {
+ tp->t_dqinfo = dqinfo;
+ tp->t_flags |= tflags;
+ }
+
+ if (code) {
+ xfs_buf_relse(ialloc_context);
+ *tpp = ntp;
+ *ipp = NULL;
+ return code;
+ }
+ xfs_trans_bjoin(tp, ialloc_context);
+
+ /*
+ * Call ialloc again. Since we've locked out all
+ * other allocations in this allocation group,
+ * this call should always succeed.
+ */
+ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
+ okalloc, &ialloc_context, &ip);
+
+ /*
+ * If we get an error at this point, return to the caller
+ * so that the current transaction can be aborted.
+ */
+ if (code) {
+ *tpp = tp;
+ *ipp = NULL;
+ return code;
+ }
+ ASSERT(!ialloc_context && ip);
+
+ } else {
+ if (committed != NULL)
+ *committed = 0;
}
*ipp = ip;
+ *tpp = tp;
+
return 0;
}
/*
+ * Decrement the link count on an inode & log the change.
+ * If this causes the link count to go to zero, initiate the
+ * logging activity required to truncate a file.
+ */
+int /* error */
+xfs_droplink(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip)
+{
+ int error;
+
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+ ASSERT (ip->i_d.di_nlink > 0);
+ ip->i_d.di_nlink--;
+ drop_nlink(VFS_I(ip));
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = 0;
+ if (ip->i_d.di_nlink == 0) {
+ /*
+ * We're dropping the last link to this file.
+ * Move the on-disk inode to the AGI unlinked list.
+ * From xfs_inactive() we will pull the inode from
+ * the list and free it.
+ */
+ error = xfs_iunlink(tp, ip);
+ }
+ return error;
+}
+
+/*
+ * Increment the link count on an inode & log the change.
+ */
+int
+xfs_bumplink(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip)
+{
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+ ASSERT(ip->i_d.di_version > 1);
+ ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE));
+ ip->i_d.di_nlink++;
+ inc_nlink(VFS_I(ip));
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ return 0;
+}
+
+int
+xfs_create(
+ xfs_inode_t *dp,
+ struct xfs_name *name,
+ umode_t mode,
+ xfs_dev_t rdev,
+ xfs_inode_t **ipp)
+{
+ int is_dir = S_ISDIR(mode);
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_inode *ip = NULL;
+ struct xfs_trans *tp = NULL;
+ int error;
+ xfs_bmap_free_t free_list;
+ xfs_fsblock_t first_block;
+ bool unlock_dp_on_error = false;
+ uint cancel_flags;
+ int committed;
+ prid_t prid;
+ struct xfs_dquot *udqp = NULL;
+ struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
+ struct xfs_trans_res tres;
+ uint resblks;
+
+ trace_xfs_create(dp, name);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ prid = xfs_get_initial_prid(dp);
+
+ /*
+ * Make sure that we have allocated dquot(s) on disk.
+ */
+ error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
+ xfs_kgid_to_gid(current_fsgid()), prid,
+ XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+ &udqp, &gdqp, &pdqp);
+ if (error)
+ return error;
+
+ if (is_dir) {
+ rdev = 0;
+ resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
+ tres.tr_logres = M_RES(mp)->tr_mkdir.tr_logres;
+ tres.tr_logcount = XFS_MKDIR_LOG_COUNT;
+ tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
+ } else {
+ resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+ tres.tr_logres = M_RES(mp)->tr_create.tr_logres;
+ tres.tr_logcount = XFS_CREATE_LOG_COUNT;
+ tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
+ }
+
+ cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+
+ /*
+ * Initially assume that the file does not exist and
+ * reserve the resources for that case. If that is not
+ * the case we'll drop the one we have and get a more
+ * appropriate transaction later.
+ */
+ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ error = xfs_trans_reserve(tp, &tres, resblks, 0);
+ if (error == ENOSPC) {
+ /* flush outstanding delalloc blocks and retry */
+ xfs_flush_inodes(mp);
+ error = xfs_trans_reserve(tp, &tres, resblks, 0);
+ }
+ if (error == ENOSPC) {
+ /* No space at all so try a "no-allocation" reservation */
+ resblks = 0;
+ error = xfs_trans_reserve(tp, &tres, 0, 0);
+ }
+ if (error) {
+ cancel_flags = 0;
+ goto out_trans_cancel;
+ }
+
+ xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+ unlock_dp_on_error = true;
+
+ xfs_bmap_init(&free_list, &first_block);
+
+ /*
+ * Reserve disk quota and the inode.
+ */
+ error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+ pdqp, resblks, 1, 0);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_dir_canenter(tp, dp, name, resblks);
+ if (error)
+ goto out_trans_cancel;
+
+ /*
+ * A newly created regular or special file just has one directory
+ * entry pointing to them, but a directory also the "." entry
+ * pointing to itself.
+ */
+ error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
+ prid, resblks > 0, &ip, &committed);
+ if (error) {
+ if (error == ENOSPC)
+ goto out_trans_cancel;
+ goto out_trans_abort;
+ }
+
+ /*
+ * Now we join the directory inode to the transaction. We do not do it
+ * earlier because xfs_dir_ialloc might commit the previous transaction
+ * (and release all the locks). An error from here on will result in
+ * the transaction cancel unlocking dp so don't do it explicitly in the
+ * error path.
+ */
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ unlock_dp_on_error = false;
+
+ error = xfs_dir_createname(tp, dp, name, ip->i_ino,
+ &first_block, &free_list, resblks ?
+ resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+ if (error) {
+ ASSERT(error != ENOSPC);
+ goto out_trans_abort;
+ }
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+ if (is_dir) {
+ error = xfs_dir_init(tp, ip, dp);
+ if (error)
+ goto out_bmap_cancel;
+
+ error = xfs_bumplink(tp, dp);
+ if (error)
+ goto out_bmap_cancel;
+ }
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * create transaction goes to disk before returning to
+ * the user.
+ */
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+ xfs_trans_set_sync(tp);
+
+ /*
+ * Attach the dquot(s) to the inodes and modify them incore.
+ * These ids of the inode couldn't have changed since the new
+ * inode has been locked ever since it was created.
+ */
+ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ goto out_bmap_cancel;
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto out_release_inode;
+
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ *ipp = ip;
+ return 0;
+
+ out_bmap_cancel:
+ xfs_bmap_cancel(&free_list);
+ out_trans_abort:
+ cancel_flags |= XFS_TRANS_ABORT;
+ out_trans_cancel:
+ xfs_trans_cancel(tp, cancel_flags);
+ out_release_inode:
+ /*
+ * Wait until after the current transaction is aborted to
+ * release the inode. This prevents recursive transactions
+ * and deadlocks from xfs_inactive.
+ */
+ if (ip)
+ IRELE(ip);
+
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ if (unlock_dp_on_error)
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ return error;
+}
+
+int
+xfs_create_tmpfile(
+ struct xfs_inode *dp,
+ struct dentry *dentry,
+ umode_t mode,
+ struct xfs_inode **ipp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_inode *ip = NULL;
+ struct xfs_trans *tp = NULL;
+ int error;
+ uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+ prid_t prid;
+ struct xfs_dquot *udqp = NULL;
+ struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
+ struct xfs_trans_res *tres;
+ uint resblks;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ prid = xfs_get_initial_prid(dp);
+
+ /*
+ * Make sure that we have allocated dquot(s) on disk.
+ */
+ error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
+ xfs_kgid_to_gid(current_fsgid()), prid,
+ XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+ &udqp, &gdqp, &pdqp);
+ if (error)
+ return error;
+
+ resblks = XFS_IALLOC_SPACE_RES(mp);
+ tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE);
+
+ tres = &M_RES(mp)->tr_create_tmpfile;
+ error = xfs_trans_reserve(tp, tres, resblks, 0);
+ if (error == ENOSPC) {
+ /* No space at all so try a "no-allocation" reservation */
+ resblks = 0;
+ error = xfs_trans_reserve(tp, tres, 0, 0);
+ }
+ if (error) {
+ cancel_flags = 0;
+ goto out_trans_cancel;
+ }
+
+ error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+ pdqp, resblks, 1, 0);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
+ prid, resblks > 0, &ip, NULL);
+ if (error) {
+ if (error == ENOSPC)
+ goto out_trans_cancel;
+ goto out_trans_abort;
+ }
+
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
+ xfs_trans_set_sync(tp);
+
+ /*
+ * Attach the dquot(s) to the inodes and modify them incore.
+ * These ids of the inode couldn't have changed since the new
+ * inode has been locked ever since it was created.
+ */
+ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+
+ ip->i_d.di_nlink--;
+ error = xfs_iunlink(tp, ip);
+ if (error)
+ goto out_trans_abort;
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto out_release_inode;
+
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ *ipp = ip;
+ return 0;
+
+ out_trans_abort:
+ cancel_flags |= XFS_TRANS_ABORT;
+ out_trans_cancel:
+ xfs_trans_cancel(tp, cancel_flags);
+ out_release_inode:
+ /*
+ * Wait until after the current transaction is aborted to
+ * release the inode. This prevents recursive transactions
+ * and deadlocks from xfs_inactive.
+ */
+ if (ip)
+ IRELE(ip);
+
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ return error;
+}
+
+int
+xfs_link(
+ xfs_inode_t *tdp,
+ xfs_inode_t *sip,
+ struct xfs_name *target_name)
+{
+ xfs_mount_t *mp = tdp->i_mount;
+ xfs_trans_t *tp;
+ int error;
+ xfs_bmap_free_t free_list;
+ xfs_fsblock_t first_block;
+ int cancel_flags;
+ int committed;
+ int resblks;
+
+ trace_xfs_link(tdp, target_name);
+
+ ASSERT(!S_ISDIR(sip->i_d.di_mode));
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ error = xfs_qm_dqattach(sip, 0);
+ if (error)
+ goto std_return;
+
+ error = xfs_qm_dqattach(tdp, 0);
+ if (error)
+ goto std_return;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
+ cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+ resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
+ if (error == ENOSPC) {
+ resblks = 0;
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
+ }
+ if (error) {
+ cancel_flags = 0;
+ goto error_return;
+ }
+
+ xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
+
+ xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+
+ /*
+ * If we are using project inheritance, we only allow hard link
+ * creation in our tree when the project IDs are the same; else
+ * the tree quota mechanism could be circumvented.
+ */
+ if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
+ (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
+ error = XFS_ERROR(EXDEV);
+ goto error_return;
+ }
+
+ error = xfs_dir_canenter(tp, tdp, target_name, resblks);
+ if (error)
+ goto error_return;
+
+ xfs_bmap_init(&free_list, &first_block);
+
+ if (sip->i_d.di_nlink == 0) {
+ error = xfs_iunlink_remove(tp, sip);
+ if (error)
+ goto abort_return;
+ }
+
+ error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
+ &first_block, &free_list, resblks);
+ if (error)
+ goto abort_return;
+ xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
+
+ error = xfs_bumplink(tp, sip);
+ if (error)
+ goto abort_return;
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * link transaction goes to disk before returning to
+ * the user.
+ */
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+ xfs_trans_set_sync(tp);
+ }
+
+ error = xfs_bmap_finish (&tp, &free_list, &committed);
+ if (error) {
+ xfs_bmap_cancel(&free_list);
+ goto abort_return;
+ }
+
+ return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+ abort_return:
+ cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+ xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+ return error;
+}
+
+/*
* Free up the underlying blocks past new_size. The new size must be smaller
* than the current size. This routine can be used both for the attribute and
* data fork, and does not modify the inode size, which is left to the caller.
@@ -1465,10 +1579,7 @@ xfs_itruncate_extents(
* reference that we gained in xfs_trans_dup()
*/
xfs_log_ticket_put(tp->t_ticket);
- error = xfs_trans_reserve(tp, 0,
- XFS_ITRUNCATE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_ITRUNCATE_LOG_COUNT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error)
goto out;
}
@@ -1494,6 +1605,320 @@ out_bmap_cancel:
goto out;
}
+int
+xfs_release(
+ xfs_inode_t *ip)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ int error;
+
+ if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
+ return 0;
+
+ /* If this is a read-only mount, don't do this (would generate I/O) */
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ return 0;
+
+ if (!XFS_FORCED_SHUTDOWN(mp)) {
+ int truncated;
+
+ /*
+ * If we previously truncated this file and removed old data
+ * in the process, we want to initiate "early" writeout on
+ * the last close. This is an attempt to combat the notorious
+ * NULL files problem which is particularly noticeable from a
+ * truncate down, buffered (re-)write (delalloc), followed by
+ * a crash. What we are effectively doing here is
+ * significantly reducing the time window where we'd otherwise
+ * be exposed to that problem.
+ */
+ truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
+ if (truncated) {
+ xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
+ if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
+ error = -filemap_flush(VFS_I(ip)->i_mapping);
+ if (error)
+ return error;
+ }
+ }
+ }
+
+ if (ip->i_d.di_nlink == 0)
+ return 0;
+
+ if (xfs_can_free_eofblocks(ip, false)) {
+
+ /*
+ * If we can't get the iolock just skip truncating the blocks
+ * past EOF because we could deadlock with the mmap_sem
+ * otherwise. We'll get another chance to drop them once the
+ * last reference to the inode is dropped, so we'll never leak
+ * blocks permanently.
+ *
+ * Further, check if the inode is being opened, written and
+ * closed frequently and we have delayed allocation blocks
+ * outstanding (e.g. streaming writes from the NFS server),
+ * truncating the blocks past EOF will cause fragmentation to
+ * occur.
+ *
+ * In this case don't do the truncation, either, but we have to
+ * be careful how we detect this case. Blocks beyond EOF show
+ * up as i_delayed_blks even when the inode is clean, so we
+ * need to truncate them away first before checking for a dirty
+ * release. Hence on the first dirty close we will still remove
+ * the speculative allocation, but after that we will leave it
+ * in place.
+ */
+ if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+ return 0;
+
+ error = xfs_free_eofblocks(mp, ip, true);
+ if (error && error != EAGAIN)
+ return error;
+
+ /* delalloc blocks after truncation means it really is dirty */
+ if (ip->i_delayed_blks)
+ xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
+ }
+ return 0;
+}
+
+/*
+ * xfs_inactive_truncate
+ *
+ * Called to perform a truncate when an inode becomes unlinked.
+ */
+STATIC int
+xfs_inactive_truncate(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ if (error) {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ /*
+ * Log the inode size first to prevent stale data exposure in the event
+ * of a system crash before the truncate completes. See the related
+ * comment in xfs_setattr_size() for details.
+ */
+ ip->i_d.di_size = 0;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
+ if (error)
+ goto error_trans_cancel;
+
+ ASSERT(ip->i_d.di_nextents == 0);
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto error_unlock;
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
+
+error_trans_cancel:
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+error_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * xfs_inactive_ifree()
+ *
+ * Perform the inode free when an inode is unlinked.
+ */
+STATIC int
+xfs_inactive_ifree(
+ struct xfs_inode *ip)
+{
+ xfs_bmap_free_t free_list;
+ xfs_fsblock_t first_block;
+ int committed;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+
+ /*
+ * The ifree transaction might need to allocate blocks for record
+ * insertion to the finobt. We don't want to fail here at ENOSPC, so
+ * allow ifree to dip into the reserved block pool if necessary.
+ *
+ * Freeing large sets of inodes generally means freeing inode chunks,
+ * directory and file data blocks, so this should be relatively safe.
+ * Only under severe circumstances should it be possible to free enough
+ * inodes to exhaust the reserve block pool via finobt expansion while
+ * at the same time not creating free space in the filesystem.
+ *
+ * Send a warning if the reservation does happen to fail, as the inode
+ * now remains allocated and sits on the unlinked list until the fs is
+ * repaired.
+ */
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree,
+ XFS_IFREE_SPACE_RES(mp), 0);
+ if (error) {
+ if (error == ENOSPC) {
+ xfs_warn_ratelimited(mp,
+ "Failed to remove inode(s) from unlinked list. "
+ "Please free space, unmount and run xfs_repair.");
+ } else {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ }
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+ return error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ xfs_bmap_init(&free_list, &first_block);
+ error = xfs_ifree(tp, ip, &free_list);
+ if (error) {
+ /*
+ * If we fail to free the inode, shut down. The cancel
+ * might do that, we need to make sure. Otherwise the
+ * inode might be lost for a long time or forever.
+ */
+ if (!XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_notice(mp, "%s: xfs_ifree returned error %d",
+ __func__, error);
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ }
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+ }
+
+ /*
+ * Credit the quota account(s). The inode is gone.
+ */
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
+
+ /*
+ * Just ignore errors at this point. There is nothing we can
+ * do except to try to keep going. Make sure it's not a silent
+ * error.
+ */
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
+ __func__, error);
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
+ __func__, error);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/*
+ * xfs_inactive
+ *
+ * This is called when the vnode reference count for the vnode
+ * goes to zero. If the file has been unlinked, then it must
+ * now be truncated. Also, we clear all of the read-ahead state
+ * kept for the inode here since the file is now closed.
+ */
+void
+xfs_inactive(
+ xfs_inode_t *ip)
+{
+ struct xfs_mount *mp;
+ int error;
+ int truncate = 0;
+
+ /*
+ * If the inode is already free, then there can be nothing
+ * to clean up here.
+ */
+ if (ip->i_d.di_mode == 0) {
+ ASSERT(ip->i_df.if_real_bytes == 0);
+ ASSERT(ip->i_df.if_broot_bytes == 0);
+ return;
+ }
+
+ mp = ip->i_mount;
+
+ /* If this is a read-only mount, don't do this (would generate I/O) */
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ return;
+
+ if (ip->i_d.di_nlink != 0) {
+ /*
+ * force is true because we are evicting an inode from the
+ * cache. Post-eof blocks must be freed, lest we end up with
+ * broken free space accounting.
+ */
+ if (xfs_can_free_eofblocks(ip, true))
+ xfs_free_eofblocks(mp, ip, false);
+
+ return;
+ }
+
+ if (S_ISREG(ip->i_d.di_mode) &&
+ (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
+ ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
+ truncate = 1;
+
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return;
+
+ if (S_ISLNK(ip->i_d.di_mode))
+ error = xfs_inactive_symlink(ip);
+ else if (truncate)
+ error = xfs_inactive_truncate(ip);
+ if (error)
+ return;
+
+ /*
+ * If there are attributes associated with the file then blow them away
+ * now. The code calls a routine that recursively deconstructs the
+ * attribute fork. We need to just commit the current transaction
+ * because we can't use it for xfs_attr_inactive().
+ */
+ if (ip->i_d.di_anextents > 0) {
+ ASSERT(ip->i_d.di_forkoff != 0);
+
+ error = xfs_attr_inactive(ip);
+ if (error)
+ return;
+ }
+
+ if (ip->i_afp)
+ xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+ ASSERT(ip->i_d.di_anextents == 0);
+
+ /*
+ * Free the inode.
+ */
+ error = xfs_inactive_ifree(ip);
+ if (error)
+ return;
+
+ /*
+ * Release the dquots held by inode, if any.
+ */
+ xfs_qm_dqdetach(ip);
+}
+
/*
* This is called when the inode's link count goes to 0.
* We place the on-disk inode on a list in the AGI. It
@@ -1554,6 +1979,10 @@ xfs_iunlink(
dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
offset = ip->i_imap.im_boffset +
offsetof(xfs_dinode_t, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, dip);
+
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
@@ -1639,6 +2068,10 @@ xfs_iunlink_remove(
dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
offset = ip->i_imap.im_boffset +
offsetof(xfs_dinode_t, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, dip);
+
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
@@ -1712,6 +2145,10 @@ xfs_iunlink_remove(
dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
offset = ip->i_imap.im_boffset +
offsetof(xfs_dinode_t, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, dip);
+
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
@@ -1725,6 +2162,10 @@ xfs_iunlink_remove(
last_dip->di_next_unlinked = cpu_to_be32(next_agino);
ASSERT(next_agino != 0);
offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, last_dip);
+
xfs_trans_inode_buf(tp, last_ibp);
xfs_trans_log_buf(tp, last_ibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
@@ -1734,7 +2175,7 @@ xfs_iunlink_remove(
}
/*
- * A big issue when freeing the inode cluster is is that we _cannot_ skip any
+ * A big issue when freeing the inode cluster is that we _cannot_ skip any
* inodes that are in memory - they all must be marked stale and attached to
* the cluster buffer.
*/
@@ -1746,8 +2187,8 @@ xfs_ifree_cluster(
{
xfs_mount_t *mp = free_ip->i_mount;
int blks_per_cluster;
+ int inodes_per_cluster;
int nbufs;
- int ninodes;
int i, j;
xfs_daddr_t blkno;
xfs_buf_t *bp;
@@ -1757,18 +2198,11 @@ xfs_ifree_cluster(
struct xfs_perag *pag;
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
- if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
- blks_per_cluster = 1;
- ninodes = mp->m_sb.sb_inopblock;
- nbufs = XFS_IALLOC_BLOCKS(mp);
- } else {
- blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
- mp->m_sb.sb_blocksize;
- ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
- nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
- }
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
+ inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+ nbufs = mp->m_ialloc_blks / blks_per_cluster;
- for (j = 0; j < nbufs; j++, inum += ninodes) {
+ for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
XFS_INO_TO_AGBNO(mp, inum));
@@ -1830,7 +2264,7 @@ xfs_ifree_cluster(
* transaction stale above, which means there is no point in
* even trying to lock them.
*/
- for (i = 0; i < ninodes; i++) {
+ for (i = 0; i < inodes_per_cluster; i++) {
retry:
rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root,
@@ -1928,8 +2362,6 @@ xfs_ifree(
int error;
int delete;
xfs_ino_t first_ino;
- xfs_dinode_t *dip;
- xfs_buf_t *ibp;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(ip->i_d.di_nlink == 0);
@@ -1942,14 +2374,13 @@ xfs_ifree(
* Pull the on-disk inode from the AGI unlinked list.
*/
error = xfs_iunlink_remove(tp, ip);
- if (error != 0) {
+ if (error)
return error;
- }
error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
- if (error != 0) {
+ if (error)
return error;
- }
+
ip->i_d.di_mode = 0; /* mark incore inode as free */
ip->i_d.di_flags = 0;
ip->i_d.di_dmevmask = 0;
@@ -1961,300 +2392,15 @@ xfs_ifree(
* by reincarnations of this inode.
*/
ip->i_d.di_gen++;
-
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
- if (error)
- return error;
-
- /*
- * Clear the on-disk di_mode. This is to prevent xfs_bulkstat
- * from picking up this inode when it is reclaimed (its incore state
- * initialzed but not flushed to disk yet). The in-core di_mode is
- * already cleared and a corresponding transaction logged.
- * The hack here just synchronizes the in-core to on-disk
- * di_mode value in advance before the actual inode sync to disk.
- * This is OK because the inode is already unlinked and would never
- * change its di_mode again for this inode generation.
- * This is a temporary hack that would require a proper fix
- * in the future.
- */
- dip->di_mode = 0;
-
- if (delete) {
+ if (delete)
error = xfs_ifree_cluster(ip, tp, first_ino);
- }
return error;
}
/*
- * Reallocate the space for if_broot based on the number of records
- * being added or deleted as indicated in rec_diff. Move the records
- * and pointers in if_broot to fit the new size. When shrinking this
- * will eliminate holes between the records and pointers created by
- * the caller. When growing this will create holes to be filled in
- * by the caller.
- *
- * The caller must not request to add more records than would fit in
- * the on-disk inode root. If the if_broot is currently NULL, then
- * if we adding records one will be allocated. The caller must also
- * not request that the number of records go below zero, although
- * it can go to zero.
- *
- * ip -- the inode whose if_broot area is changing
- * ext_diff -- the change in the number of records, positive or negative,
- * requested for the if_broot array.
- */
-void
-xfs_iroot_realloc(
- xfs_inode_t *ip,
- int rec_diff,
- int whichfork)
-{
- struct xfs_mount *mp = ip->i_mount;
- int cur_max;
- xfs_ifork_t *ifp;
- struct xfs_btree_block *new_broot;
- int new_max;
- size_t new_size;
- char *np;
- char *op;
-
- /*
- * Handle the degenerate case quietly.
- */
- if (rec_diff == 0) {
- return;
- }
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (rec_diff > 0) {
- /*
- * If there wasn't any memory allocated before, just
- * allocate it now and get out.
- */
- if (ifp->if_broot_bytes == 0) {
- new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
- ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
- ifp->if_broot_bytes = (int)new_size;
- return;
- }
-
- /*
- * If there is already an existing if_broot, then we need
- * to realloc() it and shift the pointers to their new
- * location. The records don't change location because
- * they are kept butted up against the btree block header.
- */
- cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
- new_max = cur_max + rec_diff;
- new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
- ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
- (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
- KM_SLEEP | KM_NOFS);
- op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
- ifp->if_broot_bytes);
- np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
- (int)new_size);
- ifp->if_broot_bytes = (int)new_size;
- ASSERT(ifp->if_broot_bytes <=
- XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
- memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
- return;
- }
-
- /*
- * rec_diff is less than 0. In this case, we are shrinking the
- * if_broot buffer. It must already exist. If we go to zero
- * records, just get rid of the root and clear the status bit.
- */
- ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
- cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
- new_max = cur_max + rec_diff;
- ASSERT(new_max >= 0);
- if (new_max > 0)
- new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
- else
- new_size = 0;
- if (new_size > 0) {
- new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
- /*
- * First copy over the btree block header.
- */
- memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN);
- } else {
- new_broot = NULL;
- ifp->if_flags &= ~XFS_IFBROOT;
- }
-
- /*
- * Only copy the records and pointers if there are any.
- */
- if (new_max > 0) {
- /*
- * First copy the records.
- */
- op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
- np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
- memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
-
- /*
- * Then copy the pointers.
- */
- op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
- ifp->if_broot_bytes);
- np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
- (int)new_size);
- memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
- }
- kmem_free(ifp->if_broot);
- ifp->if_broot = new_broot;
- ifp->if_broot_bytes = (int)new_size;
- ASSERT(ifp->if_broot_bytes <=
- XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
- return;
-}
-
-
-/*
- * This is called when the amount of space needed for if_data
- * is increased or decreased. The change in size is indicated by
- * the number of bytes that need to be added or deleted in the
- * byte_diff parameter.
- *
- * If the amount of space needed has decreased below the size of the
- * inline buffer, then switch to using the inline buffer. Otherwise,
- * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
- * to what is needed.
- *
- * ip -- the inode whose if_data area is changing
- * byte_diff -- the change in the number of bytes, positive or negative,
- * requested for the if_data array.
- */
-void
-xfs_idata_realloc(
- xfs_inode_t *ip,
- int byte_diff,
- int whichfork)
-{
- xfs_ifork_t *ifp;
- int new_size;
- int real_size;
-
- if (byte_diff == 0) {
- return;
- }
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- new_size = (int)ifp->if_bytes + byte_diff;
- ASSERT(new_size >= 0);
-
- if (new_size == 0) {
- if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
- kmem_free(ifp->if_u1.if_data);
- }
- ifp->if_u1.if_data = NULL;
- real_size = 0;
- } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
- /*
- * If the valid extents/data can fit in if_inline_ext/data,
- * copy them from the malloc'd vector and free it.
- */
- if (ifp->if_u1.if_data == NULL) {
- ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
- } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
- ASSERT(ifp->if_real_bytes != 0);
- memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
- new_size);
- kmem_free(ifp->if_u1.if_data);
- ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
- }
- real_size = 0;
- } else {
- /*
- * Stuck with malloc/realloc.
- * For inline data, the underlying buffer must be
- * a multiple of 4 bytes in size so that it can be
- * logged and stay on word boundaries. We enforce
- * that here.
- */
- real_size = roundup(new_size, 4);
- if (ifp->if_u1.if_data == NULL) {
- ASSERT(ifp->if_real_bytes == 0);
- ifp->if_u1.if_data = kmem_alloc(real_size,
- KM_SLEEP | KM_NOFS);
- } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
- /*
- * Only do the realloc if the underlying size
- * is really changing.
- */
- if (ifp->if_real_bytes != real_size) {
- ifp->if_u1.if_data =
- kmem_realloc(ifp->if_u1.if_data,
- real_size,
- ifp->if_real_bytes,
- KM_SLEEP | KM_NOFS);
- }
- } else {
- ASSERT(ifp->if_real_bytes == 0);
- ifp->if_u1.if_data = kmem_alloc(real_size,
- KM_SLEEP | KM_NOFS);
- memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
- ifp->if_bytes);
- }
- }
- ifp->if_real_bytes = real_size;
- ifp->if_bytes = new_size;
- ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
-}
-
-void
-xfs_idestroy_fork(
- xfs_inode_t *ip,
- int whichfork)
-{
- xfs_ifork_t *ifp;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (ifp->if_broot != NULL) {
- kmem_free(ifp->if_broot);
- ifp->if_broot = NULL;
- }
-
- /*
- * If the format is local, then we can't have an extents
- * array so just look for an inline data array. If we're
- * not local then we may or may not have an extents list,
- * so check and free it up if we do.
- */
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
- (ifp->if_u1.if_data != NULL)) {
- ASSERT(ifp->if_real_bytes != 0);
- kmem_free(ifp->if_u1.if_data);
- ifp->if_u1.if_data = NULL;
- ifp->if_real_bytes = 0;
- }
- } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
- ((ifp->if_flags & XFS_IFEXTIREC) ||
- ((ifp->if_u1.if_extents != NULL) &&
- (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
- ASSERT(ifp->if_real_bytes != 0);
- xfs_iext_destroy(ifp);
- }
- ASSERT(ifp->if_u1.if_extents == NULL ||
- ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
- ASSERT(ifp->if_real_bytes == 0);
- if (whichfork == XFS_ATTR_FORK) {
- kmem_zone_free(xfs_ifork_zone, ip->i_afp);
- ip->i_afp = NULL;
- }
-}
-
-/*
* This is called to unpin an inode. The caller must have the inode locked
* in at least shared mode so that the buffer cannot be subsequently pinned
* once someone is waiting for it to be unpinned.
@@ -2298,165 +2444,480 @@ xfs_iunpin_wait(
}
/*
- * xfs_iextents_copy()
+ * Removing an inode from the namespace involves removing the directory entry
+ * and dropping the link count on the inode. Removing the directory entry can
+ * result in locking an AGF (directory blocks were freed) and removing a link
+ * count can result in placing the inode on an unlinked list which results in
+ * locking an AGI.
*
- * This is called to copy the REAL extents (as opposed to the delayed
- * allocation extents) from the inode into the given buffer. It
- * returns the number of bytes copied into the buffer.
+ * The big problem here is that we have an ordering constraint on AGF and AGI
+ * locking - inode allocation locks the AGI, then can allocate a new extent for
+ * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
+ * removes the inode from the unlinked list, requiring that we lock the AGI
+ * first, and then freeing the inode can result in an inode chunk being freed
+ * and hence freeing disk space requiring that we lock an AGF.
*
- * If there are no delayed allocation extents, then we can just
- * memcpy() the extents into the buffer. Otherwise, we need to
- * examine each extent in turn and skip those which are delayed.
+ * Hence the ordering that is imposed by other parts of the code is AGI before
+ * AGF. This means we cannot remove the directory entry before we drop the inode
+ * reference count and put it on the unlinked list as this results in a lock
+ * order of AGF then AGI, and this can deadlock against inode allocation and
+ * freeing. Therefore we must drop the link counts before we remove the
+ * directory entry.
+ *
+ * This is still safe from a transactional point of view - it is not until we
+ * get to xfs_bmap_finish() that we have the possibility of multiple
+ * transactions in this operation. Hence as long as we remove the directory
+ * entry and drop the link count in the first transaction of the remove
+ * operation, there are no transactional constraints on the ordering here.
*/
int
-xfs_iextents_copy(
- xfs_inode_t *ip,
- xfs_bmbt_rec_t *dp,
- int whichfork)
+xfs_remove(
+ xfs_inode_t *dp,
+ struct xfs_name *name,
+ xfs_inode_t *ip)
{
- int copied;
- int i;
- xfs_ifork_t *ifp;
- int nrecs;
- xfs_fsblock_t start_block;
+ xfs_mount_t *mp = dp->i_mount;
+ xfs_trans_t *tp = NULL;
+ int is_dir = S_ISDIR(ip->i_d.di_mode);
+ int error = 0;
+ xfs_bmap_free_t free_list;
+ xfs_fsblock_t first_block;
+ int cancel_flags;
+ int committed;
+ int link_zero;
+ uint resblks;
+ uint log_count;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(ifp->if_bytes > 0);
+ trace_xfs_remove(dp, name);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ error = xfs_qm_dqattach(dp, 0);
+ if (error)
+ goto std_return;
- nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
- ASSERT(nrecs > 0);
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ goto std_return;
+
+ if (is_dir) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
+ log_count = XFS_DEFAULT_LOG_COUNT;
+ } else {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
+ log_count = XFS_REMOVE_LOG_COUNT;
+ }
+ cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
/*
- * There are some delayed allocation extents in the
- * inode, so copy the extents one at a time and skip
- * the delayed ones. There must be at least one
- * non-delayed extent.
+ * We try to get the real space reservation first,
+ * allowing for directory btree deletion(s) implying
+ * possible bmap insert(s). If we can't get the space
+ * reservation then we use 0 instead, and avoid the bmap
+ * btree insert(s) in the directory code by, if the bmap
+ * insert tries to happen, instead trimming the LAST
+ * block from the directory.
*/
- copied = 0;
- for (i = 0; i < nrecs; i++) {
- xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
- start_block = xfs_bmbt_get_startblock(ep);
- if (isnullstartblock(start_block)) {
- /*
- * It's a delayed allocation extent, so skip it.
- */
- continue;
+ resblks = XFS_REMOVE_SPACE_RES(mp);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
+ if (error == ENOSPC) {
+ resblks = 0;
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
+ }
+ if (error) {
+ ASSERT(error != ENOSPC);
+ cancel_flags = 0;
+ goto out_trans_cancel;
+ }
+
+ xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
+
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ /*
+ * If we're removing a directory perform some additional validation.
+ */
+ cancel_flags |= XFS_TRANS_ABORT;
+ if (is_dir) {
+ ASSERT(ip->i_d.di_nlink >= 2);
+ if (ip->i_d.di_nlink != 2) {
+ error = XFS_ERROR(ENOTEMPTY);
+ goto out_trans_cancel;
+ }
+ if (!xfs_dir_isempty(ip)) {
+ error = XFS_ERROR(ENOTEMPTY);
+ goto out_trans_cancel;
}
- /* Translate to on disk format */
- put_unaligned(cpu_to_be64(ep->l0), &dp->l0);
- put_unaligned(cpu_to_be64(ep->l1), &dp->l1);
- dp++;
- copied++;
+ /* Drop the link from ip's "..". */
+ error = xfs_droplink(tp, dp);
+ if (error)
+ goto out_trans_cancel;
+
+ /* Drop the "." link from ip to self. */
+ error = xfs_droplink(tp, ip);
+ if (error)
+ goto out_trans_cancel;
+ } else {
+ /*
+ * When removing a non-directory we need to log the parent
+ * inode here. For a directory this is done implicitly
+ * by the xfs_droplink call for the ".." entry.
+ */
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+ }
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+ /* Drop the link from dp to ip. */
+ error = xfs_droplink(tp, ip);
+ if (error)
+ goto out_trans_cancel;
+
+ /* Determine if this is the last link while the inode is locked */
+ link_zero = (ip->i_d.di_nlink == 0);
+
+ xfs_bmap_init(&free_list, &first_block);
+ error = xfs_dir_removename(tp, dp, name, ip->i_ino,
+ &first_block, &free_list, resblks);
+ if (error) {
+ ASSERT(error != ENOENT);
+ goto out_bmap_cancel;
}
- ASSERT(copied != 0);
- xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
- return (copied * (uint)sizeof(xfs_bmbt_rec_t));
+ /*
+ * If this is a synchronous mount, make sure that the
+ * remove transaction goes to disk before returning to
+ * the user.
+ */
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+ xfs_trans_set_sync(tp);
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ goto out_bmap_cancel;
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto std_return;
+
+ if (is_dir && xfs_inode_is_filestream(ip))
+ xfs_filestream_deassociate(ip);
+
+ return 0;
+
+ out_bmap_cancel:
+ xfs_bmap_cancel(&free_list);
+ out_trans_cancel:
+ xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+ return error;
}
/*
- * Each of the following cases stores data into the same region
- * of the on-disk inode, so only one of them can be valid at
- * any given time. While it is possible to have conflicting formats
- * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
- * in EXTENTS format, this can only happen when the fork has
- * changed formats after being modified but before being flushed.
- * In these cases, the format always takes precedence, because the
- * format indicates the current state of the fork.
+ * Enter all inodes for a rename transaction into a sorted array.
*/
-/*ARGSUSED*/
STATIC void
-xfs_iflush_fork(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
- xfs_inode_log_item_t *iip,
- int whichfork,
- xfs_buf_t *bp)
-{
- char *cp;
- xfs_ifork_t *ifp;
- xfs_mount_t *mp;
-#ifdef XFS_TRANS_DEBUG
- int first;
-#endif
- static const short brootflag[2] =
- { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
- static const short dataflag[2] =
- { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
- static const short extflag[2] =
- { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
-
- if (!iip)
- return;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+xfs_sort_for_rename(
+ xfs_inode_t *dp1, /* in: old (source) directory inode */
+ xfs_inode_t *dp2, /* in: new (target) directory inode */
+ xfs_inode_t *ip1, /* in: inode of old entry */
+ xfs_inode_t *ip2, /* in: inode of new entry, if it
+ already exists, NULL otherwise. */
+ xfs_inode_t **i_tab,/* out: array of inode returned, sorted */
+ int *num_inodes) /* out: number of inodes in array */
+{
+ xfs_inode_t *temp;
+ int i, j;
+
/*
- * This can happen if we gave up in iformat in an error path,
- * for the attribute fork.
- */
- if (!ifp) {
- ASSERT(whichfork == XFS_ATTR_FORK);
- return;
+ * i_tab contains a list of pointers to inodes. We initialize
+ * the table here & we'll sort it. We will then use it to
+ * order the acquisition of the inode locks.
+ *
+ * Note that the table may contain duplicates. e.g., dp1 == dp2.
+ */
+ i_tab[0] = dp1;
+ i_tab[1] = dp2;
+ i_tab[2] = ip1;
+ if (ip2) {
+ *num_inodes = 4;
+ i_tab[3] = ip2;
+ } else {
+ *num_inodes = 3;
+ i_tab[3] = NULL;
}
- cp = XFS_DFORK_PTR(dip, whichfork);
- mp = ip->i_mount;
- switch (XFS_IFORK_FORMAT(ip, whichfork)) {
- case XFS_DINODE_FMT_LOCAL:
- if ((iip->ili_fields & dataflag[whichfork]) &&
- (ifp->if_bytes > 0)) {
- ASSERT(ifp->if_u1.if_data != NULL);
- ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
- memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
- }
- break;
- case XFS_DINODE_FMT_EXTENTS:
- ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
- !(iip->ili_fields & extflag[whichfork]));
- if ((iip->ili_fields & extflag[whichfork]) &&
- (ifp->if_bytes > 0)) {
- ASSERT(xfs_iext_get_ext(ifp, 0));
- ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
- (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
- whichfork);
+ /*
+ * Sort the elements via bubble sort. (Remember, there are at
+ * most 4 elements to sort, so this is adequate.)
+ */
+ for (i = 0; i < *num_inodes; i++) {
+ for (j = 1; j < *num_inodes; j++) {
+ if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
+ temp = i_tab[j];
+ i_tab[j] = i_tab[j-1];
+ i_tab[j-1] = temp;
+ }
}
- break;
+ }
+}
- case XFS_DINODE_FMT_BTREE:
- if ((iip->ili_fields & brootflag[whichfork]) &&
- (ifp->if_broot_bytes > 0)) {
- ASSERT(ifp->if_broot != NULL);
- ASSERT(ifp->if_broot_bytes <=
- (XFS_IFORK_SIZE(ip, whichfork) +
- XFS_BROOT_SIZE_ADJ));
- xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
- (xfs_bmdr_block_t *)cp,
- XFS_DFORK_SIZE(dip, mp, whichfork));
- }
- break;
+/*
+ * xfs_rename
+ */
+int
+xfs_rename(
+ xfs_inode_t *src_dp,
+ struct xfs_name *src_name,
+ xfs_inode_t *src_ip,
+ xfs_inode_t *target_dp,
+ struct xfs_name *target_name,
+ xfs_inode_t *target_ip)
+{
+ xfs_trans_t *tp = NULL;
+ xfs_mount_t *mp = src_dp->i_mount;
+ int new_parent; /* moving to a new dir */
+ int src_is_directory; /* src_name is a directory */
+ int error;
+ xfs_bmap_free_t free_list;
+ xfs_fsblock_t first_block;
+ int cancel_flags;
+ int committed;
+ xfs_inode_t *inodes[4];
+ int spaceres;
+ int num_inodes;
+
+ trace_xfs_rename(src_dp, target_dp, src_name, target_name);
+
+ new_parent = (src_dp != target_dp);
+ src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
+
+ xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
+ inodes, &num_inodes);
+
+ xfs_bmap_init(&free_list, &first_block);
+ tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
+ cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+ spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
+ if (error == ENOSPC) {
+ spaceres = 0;
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
+ }
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto std_return;
+ }
+
+ /*
+ * Attach the dquots to the inodes
+ */
+ error = xfs_qm_vop_rename_dqattach(inodes);
+ if (error) {
+ xfs_trans_cancel(tp, cancel_flags);
+ goto std_return;
+ }
+
+ /*
+ * Lock all the participating inodes. Depending upon whether
+ * the target_name exists in the target directory, and
+ * whether the target directory is the same as the source
+ * directory, we can lock from 2 to 4 inodes.
+ */
+ xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
+
+ /*
+ * Join all the inodes to the transaction. From this point on,
+ * we can rely on either trans_commit or trans_cancel to unlock
+ * them.
+ */
+ xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
+ if (new_parent)
+ xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
+ if (target_ip)
+ xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
+
+ /*
+ * If we are using project inheritance, we only allow renames
+ * into our tree when the project IDs are the same; else the
+ * tree quota mechanism would be circumvented.
+ */
+ if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
+ (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
+ error = XFS_ERROR(EXDEV);
+ goto error_return;
+ }
+
+ /*
+ * Set up the target.
+ */
+ if (target_ip == NULL) {
+ /*
+ * If there's no space reservation, check the entry will
+ * fit before actually inserting it.
+ */
+ error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
+ if (error)
+ goto error_return;
+ /*
+ * If target does not exist and the rename crosses
+ * directories, adjust the target directory link count
+ * to account for the ".." reference from the new entry.
+ */
+ error = xfs_dir_createname(tp, target_dp, target_name,
+ src_ip->i_ino, &first_block,
+ &free_list, spaceres);
+ if (error == ENOSPC)
+ goto error_return;
+ if (error)
+ goto abort_return;
- case XFS_DINODE_FMT_DEV:
- if (iip->ili_fields & XFS_ILOG_DEV) {
- ASSERT(whichfork == XFS_DATA_FORK);
- xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
+ xfs_trans_ichgtime(tp, target_dp,
+ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+ if (new_parent && src_is_directory) {
+ error = xfs_bumplink(tp, target_dp);
+ if (error)
+ goto abort_return;
+ }
+ } else { /* target_ip != NULL */
+ /*
+ * If target exists and it's a directory, check that both
+ * target and source are directories and that target can be
+ * destroyed, or that neither is a directory.
+ */
+ if (S_ISDIR(target_ip->i_d.di_mode)) {
+ /*
+ * Make sure target dir is empty.
+ */
+ if (!(xfs_dir_isempty(target_ip)) ||
+ (target_ip->i_d.di_nlink > 2)) {
+ error = XFS_ERROR(EEXIST);
+ goto error_return;
+ }
}
- break;
- case XFS_DINODE_FMT_UUID:
- if (iip->ili_fields & XFS_ILOG_UUID) {
- ASSERT(whichfork == XFS_DATA_FORK);
- memcpy(XFS_DFORK_DPTR(dip),
- &ip->i_df.if_u2.if_uuid,
- sizeof(uuid_t));
+ /*
+ * Link the source inode under the target name.
+ * If the source inode is a directory and we are moving
+ * it across directories, its ".." entry will be
+ * inconsistent until we replace that down below.
+ *
+ * In case there is already an entry with the same
+ * name at the destination directory, remove it first.
+ */
+ error = xfs_dir_replace(tp, target_dp, target_name,
+ src_ip->i_ino,
+ &first_block, &free_list, spaceres);
+ if (error)
+ goto abort_return;
+
+ xfs_trans_ichgtime(tp, target_dp,
+ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+ /*
+ * Decrement the link count on the target since the target
+ * dir no longer points to it.
+ */
+ error = xfs_droplink(tp, target_ip);
+ if (error)
+ goto abort_return;
+
+ if (src_is_directory) {
+ /*
+ * Drop the link from the old "." entry.
+ */
+ error = xfs_droplink(tp, target_ip);
+ if (error)
+ goto abort_return;
}
- break;
+ } /* target_ip != NULL */
- default:
- ASSERT(0);
- break;
+ /*
+ * Remove the source.
+ */
+ if (new_parent && src_is_directory) {
+ /*
+ * Rewrite the ".." entry to point to the new
+ * directory.
+ */
+ error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
+ target_dp->i_ino,
+ &first_block, &free_list, spaceres);
+ ASSERT(error != EEXIST);
+ if (error)
+ goto abort_return;
}
+
+ /*
+ * We always want to hit the ctime on the source inode.
+ *
+ * This isn't strictly required by the standards since the source
+ * inode isn't really being changed, but old unix file systems did
+ * it and some incremental backup programs won't work without it.
+ */
+ xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
+
+ /*
+ * Adjust the link count on src_dp. This is necessary when
+ * renaming a directory, either within one parent when
+ * the target existed, or across two parent directories.
+ */
+ if (src_is_directory && (new_parent || target_ip != NULL)) {
+
+ /*
+ * Decrement link count on src_directory since the
+ * entry that's moved no longer points to it.
+ */
+ error = xfs_droplink(tp, src_dp);
+ if (error)
+ goto abort_return;
+ }
+
+ error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
+ &first_block, &free_list, spaceres);
+ if (error)
+ goto abort_return;
+
+ xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
+ if (new_parent)
+ xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * rename transaction goes to disk before returning to
+ * the user.
+ */
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+ xfs_trans_set_sync(tp);
+ }
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error) {
+ xfs_bmap_cancel(&free_list);
+ xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
+ XFS_TRANS_ABORT));
+ goto std_return;
+ }
+
+ /*
+ * trans_commit will unlock src_ip, target_ip & decrement
+ * the vnode references.
+ */
+ return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+ abort_return:
+ cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+ xfs_bmap_cancel(&free_list);
+ xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+ return error;
}
STATIC int
@@ -2478,13 +2939,13 @@ xfs_iflush_cluster(
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
- inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
+ inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
if (!ilist)
goto out_put;
- mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
+ mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
rcu_read_lock();
/* really need a gang lookup range call here */
@@ -2715,26 +3176,21 @@ abort_out:
return error;
}
-
STATIC int
xfs_iflush_int(
- xfs_inode_t *ip,
- xfs_buf_t *bp)
+ struct xfs_inode *ip,
+ struct xfs_buf *bp)
{
- xfs_inode_log_item_t *iip;
- xfs_dinode_t *dip;
- xfs_mount_t *mp;
-#ifdef XFS_TRANS_DEBUG
- int first;
-#endif
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct xfs_dinode *dip;
+ struct xfs_mount *mp = ip->i_mount;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
ASSERT(xfs_isiflocked(ip));
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
-
- iip = ip->i_itemp;
- mp = ip->i_mount;
+ ASSERT(iip != NULL && iip->ili_fields != 0);
+ ASSERT(ip->i_d.di_version > 1);
/* set *dip = inode's place in the buffer */
dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -2793,12 +3249,18 @@ xfs_iflush_int(
__func__, ip->i_ino, ip->i_d.di_forkoff, ip);
goto corrupt_out;
}
+
/*
- * bump the flush iteration count, used to detect flushes which
- * postdate a log record during recovery.
+ * Inode item log recovery for v2 inodes are dependent on the
+ * di_flushiter count for correct sequencing. We bump the flush
+ * iteration count so we can detect flushes which postdate a log record
+ * during recovery. This is redundant as we now log every change and
+ * hence this can't happen but we need to still do it to ensure
+ * backwards compatibility with old kernels that predate logging all
+ * inode changes.
*/
-
- ip->i_d.di_flushiter++;
+ if (ip->i_d.di_version < 3)
+ ip->i_d.di_flushiter++;
/*
* Copy the dirty parts of the inode into the on-disk
@@ -2812,40 +3274,9 @@ xfs_iflush_int(
if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
ip->i_d.di_flushiter = 0;
- /*
- * If this is really an old format inode and the superblock version
- * has not been updated to support only new format inodes, then
- * convert back to the old inode format. If the superblock version
- * has been updated, then make the conversion permanent.
- */
- ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
- if (ip->i_d.di_version == 1) {
- if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
- /*
- * Convert it back.
- */
- ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
- dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
- } else {
- /*
- * The superblock version has already been bumped,
- * so just make the conversion to the new inode
- * format permanent.
- */
- ip->i_d.di_version = 2;
- dip->di_version = 2;
- ip->i_d.di_onlink = 0;
- dip->di_onlink = 0;
- memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
- memset(&(dip->di_pad[0]), 0,
- sizeof(dip->di_pad));
- ASSERT(xfs_get_projid(ip) == 0);
- }
- }
-
- xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
+ xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
if (XFS_IFORK_Q(ip))
- xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
+ xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
xfs_inobp_check(mp, bp);
/*
@@ -2873,1112 +3304,32 @@ xfs_iflush_int(
* need the AIL lock, because it is a 64 bit value that cannot be read
* atomically.
*/
- if (iip != NULL && iip->ili_fields != 0) {
- iip->ili_last_fields = iip->ili_fields;
- iip->ili_fields = 0;
- iip->ili_logged = 1;
-
- xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
- &iip->ili_item.li_lsn);
-
- /*
- * Attach the function xfs_iflush_done to the inode's
- * buffer. This will remove the inode from the AIL
- * and unlock the inode's flush lock when the inode is
- * completely written to disk.
- */
- xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
-
- ASSERT(bp->b_fspriv != NULL);
- ASSERT(bp->b_iodone != NULL);
- } else {
- /*
- * We're flushing an inode which is not in the AIL and has
- * not been logged. For this case we can immediately drop
- * the inode flush lock because we can avoid the whole
- * AIL state thing. It's OK to drop the flush lock now,
- * because we've already locked the buffer and to do anything
- * you really need both.
- */
- if (iip != NULL) {
- ASSERT(iip->ili_logged == 0);
- ASSERT(iip->ili_last_fields == 0);
- ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
- }
- xfs_ifunlock(ip);
- }
-
- return 0;
-
-corrupt_out:
- return XFS_ERROR(EFSCORRUPTED);
-}
-
-/*
- * Return a pointer to the extent record at file index idx.
- */
-xfs_bmbt_rec_host_t *
-xfs_iext_get_ext(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t idx) /* index of target extent */
-{
- ASSERT(idx >= 0);
- ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
-
- if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
- return ifp->if_u1.if_ext_irec->er_extbuf;
- } else if (ifp->if_flags & XFS_IFEXTIREC) {
- xfs_ext_irec_t *erp; /* irec pointer */
- int erp_idx = 0; /* irec index */
- xfs_extnum_t page_idx = idx; /* ext index in target list */
-
- erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
- return &erp->er_extbuf[page_idx];
- } else if (ifp->if_bytes) {
- return &ifp->if_u1.if_extents[idx];
- } else {
- return NULL;
- }
-}
-
-/*
- * Insert new item(s) into the extent records for incore inode
- * fork 'ifp'. 'count' new items are inserted at index 'idx'.
- */
-void
-xfs_iext_insert(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* starting index of new items */
- xfs_extnum_t count, /* number of inserted items */
- xfs_bmbt_irec_t *new, /* items to insert */
- int state) /* type of extent conversion */
-{
- xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
- xfs_extnum_t i; /* extent record index */
-
- trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
-
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- xfs_iext_add(ifp, idx, count);
- for (i = idx; i < idx + count; i++, new++)
- xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
-}
-
-/*
- * This is called when the amount of space required for incore file
- * extents needs to be increased. The ext_diff parameter stores the
- * number of new extents being added and the idx parameter contains
- * the extent index where the new extents will be added. If the new
- * extents are being appended, then we just need to (re)allocate and
- * initialize the space. Otherwise, if the new extents are being
- * inserted into the middle of the existing entries, a bit more work
- * is required to make room for the new extents to be inserted. The
- * caller is responsible for filling in the new extent entries upon
- * return.
- */
-void
-xfs_iext_add(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t idx, /* index to begin adding exts */
- int ext_diff) /* number of extents to add */
-{
- int byte_diff; /* new bytes being added */
- int new_size; /* size of extents after adding */
- xfs_extnum_t nextents; /* number of extents in file */
-
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- ASSERT((idx >= 0) && (idx <= nextents));
- byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
- new_size = ifp->if_bytes + byte_diff;
- /*
- * If the new number of extents (nextents + ext_diff)
- * fits inside the inode, then continue to use the inline
- * extent buffer.
- */
- if (nextents + ext_diff <= XFS_INLINE_EXTS) {
- if (idx < nextents) {
- memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
- &ifp->if_u2.if_inline_ext[idx],
- (nextents - idx) * sizeof(xfs_bmbt_rec_t));
- memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
- }
- ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
- ifp->if_real_bytes = 0;
- }
- /*
- * Otherwise use a linear (direct) extent list.
- * If the extents are currently inside the inode,
- * xfs_iext_realloc_direct will switch us from
- * inline to direct extent allocation mode.
- */
- else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
- xfs_iext_realloc_direct(ifp, new_size);
- if (idx < nextents) {
- memmove(&ifp->if_u1.if_extents[idx + ext_diff],
- &ifp->if_u1.if_extents[idx],
- (nextents - idx) * sizeof(xfs_bmbt_rec_t));
- memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
- }
- }
- /* Indirection array */
- else {
- xfs_ext_irec_t *erp;
- int erp_idx = 0;
- int page_idx = idx;
-
- ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
- if (ifp->if_flags & XFS_IFEXTIREC) {
- erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
- } else {
- xfs_iext_irec_init(ifp);
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- erp = ifp->if_u1.if_ext_irec;
- }
- /* Extents fit in target extent page */
- if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
- if (page_idx < erp->er_extcount) {
- memmove(&erp->er_extbuf[page_idx + ext_diff],
- &erp->er_extbuf[page_idx],
- (erp->er_extcount - page_idx) *
- sizeof(xfs_bmbt_rec_t));
- memset(&erp->er_extbuf[page_idx], 0, byte_diff);
- }
- erp->er_extcount += ext_diff;
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
- }
- /* Insert a new extent page */
- else if (erp) {
- xfs_iext_add_indirect_multi(ifp,
- erp_idx, page_idx, ext_diff);
- }
- /*
- * If extent(s) are being appended to the last page in
- * the indirection array and the new extent(s) don't fit
- * in the page, then erp is NULL and erp_idx is set to
- * the next index needed in the indirection array.
- */
- else {
- int count = ext_diff;
-
- while (count) {
- erp = xfs_iext_irec_new(ifp, erp_idx);
- erp->er_extcount = count;
- count -= MIN(count, (int)XFS_LINEAR_EXTS);
- if (count) {
- erp_idx++;
- }
- }
- }
- }
- ifp->if_bytes = new_size;
-}
+ iip->ili_last_fields = iip->ili_fields;
+ iip->ili_fields = 0;
+ iip->ili_logged = 1;
-/*
- * This is called when incore extents are being added to the indirection
- * array and the new extents do not fit in the target extent list. The
- * erp_idx parameter contains the irec index for the target extent list
- * in the indirection array, and the idx parameter contains the extent
- * index within the list. The number of extents being added is stored
- * in the count parameter.
- *
- * |-------| |-------|
- * | | | | idx - number of extents before idx
- * | idx | | count |
- * | | | | count - number of extents being inserted at idx
- * |-------| |-------|
- * | count | | nex2 | nex2 - number of extents after idx + count
- * |-------| |-------|
- */
-void
-xfs_iext_add_indirect_multi(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int erp_idx, /* target extent irec index */
- xfs_extnum_t idx, /* index within target list */
- int count) /* new extents being added */
-{
- int byte_diff; /* new bytes being added */
- xfs_ext_irec_t *erp; /* pointer to irec entry */
- xfs_extnum_t ext_diff; /* number of extents to add */
- xfs_extnum_t ext_cnt; /* new extents still needed */
- xfs_extnum_t nex2; /* extents after idx + count */
- xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */
- int nlists; /* number of irec's (lists) */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- nex2 = erp->er_extcount - idx;
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-
- /*
- * Save second part of target extent list
- * (all extents past */
- if (nex2) {
- byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
- nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
- memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
- erp->er_extcount -= nex2;
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
- memset(&erp->er_extbuf[idx], 0, byte_diff);
- }
-
- /*
- * Add the new extents to the end of the target
- * list, then allocate new irec record(s) and
- * extent buffer(s) as needed to store the rest
- * of the new extents.
- */
- ext_cnt = count;
- ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
- if (ext_diff) {
- erp->er_extcount += ext_diff;
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
- ext_cnt -= ext_diff;
- }
- while (ext_cnt) {
- erp_idx++;
- erp = xfs_iext_irec_new(ifp, erp_idx);
- ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
- erp->er_extcount = ext_diff;
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
- ext_cnt -= ext_diff;
- }
-
- /* Add nex2 extents back to indirection array */
- if (nex2) {
- xfs_extnum_t ext_avail;
- int i;
-
- byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
- ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
- i = 0;
- /*
- * If nex2 extents fit in the current page, append
- * nex2_ep after the new extents.
- */
- if (nex2 <= ext_avail) {
- i = erp->er_extcount;
- }
- /*
- * Otherwise, check if space is available in the
- * next page.
- */
- else if ((erp_idx < nlists - 1) &&
- (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
- ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
- erp_idx++;
- erp++;
- /* Create a hole for nex2 extents */
- memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
- erp->er_extcount * sizeof(xfs_bmbt_rec_t));
- }
- /*
- * Final choice, create a new extent page for
- * nex2 extents.
- */
- else {
- erp_idx++;
- erp = xfs_iext_irec_new(ifp, erp_idx);
- }
- memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
- kmem_free(nex2_ep);
- erp->er_extcount += nex2;
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
- }
-}
+ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
+ &iip->ili_item.li_lsn);
-/*
- * This is called when the amount of space required for incore file
- * extents needs to be decreased. The ext_diff parameter stores the
- * number of extents to be removed and the idx parameter contains
- * the extent index where the extents will be removed from.
- *
- * If the amount of space needed has decreased below the linear
- * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
- * extent array. Otherwise, use kmem_realloc() to adjust the
- * size to what is needed.
- */
-void
-xfs_iext_remove(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* index to begin removing exts */
- int ext_diff, /* number of extents to remove */
- int state) /* type of extent conversion */
-{
- xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
- xfs_extnum_t nextents; /* number of extents in file */
- int new_size; /* size of extents after removal */
-
- trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
-
- ASSERT(ext_diff > 0);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
-
- if (new_size == 0) {
- xfs_iext_destroy(ifp);
- } else if (ifp->if_flags & XFS_IFEXTIREC) {
- xfs_iext_remove_indirect(ifp, idx, ext_diff);
- } else if (ifp->if_real_bytes) {
- xfs_iext_remove_direct(ifp, idx, ext_diff);
- } else {
- xfs_iext_remove_inline(ifp, idx, ext_diff);
- }
- ifp->if_bytes = new_size;
-}
-
-/*
- * This removes ext_diff extents from the inline buffer, beginning
- * at extent index idx.
- */
-void
-xfs_iext_remove_inline(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t idx, /* index to begin removing exts */
- int ext_diff) /* number of extents to remove */
-{
- int nextents; /* number of extents in file */
-
- ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
- ASSERT(idx < XFS_INLINE_EXTS);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- ASSERT(((nextents - ext_diff) > 0) &&
- (nextents - ext_diff) < XFS_INLINE_EXTS);
-
- if (idx + ext_diff < nextents) {
- memmove(&ifp->if_u2.if_inline_ext[idx],
- &ifp->if_u2.if_inline_ext[idx + ext_diff],
- (nextents - (idx + ext_diff)) *
- sizeof(xfs_bmbt_rec_t));
- memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
- 0, ext_diff * sizeof(xfs_bmbt_rec_t));
- } else {
- memset(&ifp->if_u2.if_inline_ext[idx], 0,
- ext_diff * sizeof(xfs_bmbt_rec_t));
- }
-}
-
-/*
- * This removes ext_diff extents from a linear (direct) extent list,
- * beginning at extent index idx. If the extents are being removed
- * from the end of the list (ie. truncate) then we just need to re-
- * allocate the list to remove the extra space. Otherwise, if the
- * extents are being removed from the middle of the existing extent
- * entries, then we first need to move the extent records beginning
- * at idx + ext_diff up in the list to overwrite the records being
- * removed, then remove the extra space via kmem_realloc.
- */
-void
-xfs_iext_remove_direct(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t idx, /* index to begin removing exts */
- int ext_diff) /* number of extents to remove */
-{
- xfs_extnum_t nextents; /* number of extents in file */
- int new_size; /* size of extents after removal */
-
- ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
- new_size = ifp->if_bytes -
- (ext_diff * sizeof(xfs_bmbt_rec_t));
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-
- if (new_size == 0) {
- xfs_iext_destroy(ifp);
- return;
- }
- /* Move extents up in the list (if needed) */
- if (idx + ext_diff < nextents) {
- memmove(&ifp->if_u1.if_extents[idx],
- &ifp->if_u1.if_extents[idx + ext_diff],
- (nextents - (idx + ext_diff)) *
- sizeof(xfs_bmbt_rec_t));
- }
- memset(&ifp->if_u1.if_extents[nextents - ext_diff],
- 0, ext_diff * sizeof(xfs_bmbt_rec_t));
/*
- * Reallocate the direct extent list. If the extents
- * will fit inside the inode then xfs_iext_realloc_direct
- * will switch from direct to inline extent allocation
- * mode for us.
+ * Attach the function xfs_iflush_done to the inode's
+ * buffer. This will remove the inode from the AIL
+ * and unlock the inode's flush lock when the inode is
+ * completely written to disk.
*/
- xfs_iext_realloc_direct(ifp, new_size);
- ifp->if_bytes = new_size;
-}
+ xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
-/*
- * This is called when incore extents are being removed from the
- * indirection array and the extents being removed span multiple extent
- * buffers. The idx parameter contains the file extent index where we
- * want to begin removing extents, and the count parameter contains
- * how many extents need to be removed.
- *
- * |-------| |-------|
- * | nex1 | | | nex1 - number of extents before idx
- * |-------| | count |
- * | | | | count - number of extents being removed at idx
- * | count | |-------|
- * | | | nex2 | nex2 - number of extents after idx + count
- * |-------| |-------|
- */
-void
-xfs_iext_remove_indirect(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t idx, /* index to begin removing extents */
- int count) /* number of extents to remove */
-{
- xfs_ext_irec_t *erp; /* indirection array pointer */
- int erp_idx = 0; /* indirection array index */
- xfs_extnum_t ext_cnt; /* extents left to remove */
- xfs_extnum_t ext_diff; /* extents to remove in current list */
- xfs_extnum_t nex1; /* number of extents before idx */
- xfs_extnum_t nex2; /* extents after idx + count */
- int page_idx = idx; /* index in target extent list */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
- ASSERT(erp != NULL);
- nex1 = page_idx;
- ext_cnt = count;
- while (ext_cnt) {
- nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
- ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
- /*
- * Check for deletion of entire list;
- * xfs_iext_irec_remove() updates extent offsets.
- */
- if (ext_diff == erp->er_extcount) {
- xfs_iext_irec_remove(ifp, erp_idx);
- ext_cnt -= ext_diff;
- nex1 = 0;
- if (ext_cnt) {
- ASSERT(erp_idx < ifp->if_real_bytes /
- XFS_IEXT_BUFSZ);
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- nex1 = 0;
- continue;
- } else {
- break;
- }
- }
- /* Move extents up (if needed) */
- if (nex2) {
- memmove(&erp->er_extbuf[nex1],
- &erp->er_extbuf[nex1 + ext_diff],
- nex2 * sizeof(xfs_bmbt_rec_t));
- }
- /* Zero out rest of page */
- memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
- ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
- /* Update remaining counters */
- erp->er_extcount -= ext_diff;
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
- ext_cnt -= ext_diff;
- nex1 = 0;
- erp_idx++;
- erp++;
- }
- ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
- xfs_iext_irec_compact(ifp);
-}
+ /* update the lsn in the on disk inode if required */
+ if (ip->i_d.di_version == 3)
+ dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn);
-/*
- * Create, destroy, or resize a linear (direct) block of extents.
- */
-void
-xfs_iext_realloc_direct(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int new_size) /* new size of extents */
-{
- int rnew_size; /* real new size of extents */
-
- rnew_size = new_size;
-
- ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
- ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
- (new_size != ifp->if_real_bytes)));
-
- /* Free extent records */
- if (new_size == 0) {
- xfs_iext_destroy(ifp);
- }
- /* Resize direct extent list and zero any new bytes */
- else if (ifp->if_real_bytes) {
- /* Check if extents will fit inside the inode */
- if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
- xfs_iext_direct_to_inline(ifp, new_size /
- (uint)sizeof(xfs_bmbt_rec_t));
- ifp->if_bytes = new_size;
- return;
- }
- if (!is_power_of_2(new_size)){
- rnew_size = roundup_pow_of_two(new_size);
- }
- if (rnew_size != ifp->if_real_bytes) {
- ifp->if_u1.if_extents =
- kmem_realloc(ifp->if_u1.if_extents,
- rnew_size,
- ifp->if_real_bytes, KM_NOFS);
- }
- if (rnew_size > ifp->if_real_bytes) {
- memset(&ifp->if_u1.if_extents[ifp->if_bytes /
- (uint)sizeof(xfs_bmbt_rec_t)], 0,
- rnew_size - ifp->if_real_bytes);
- }
- }
- /*
- * Switch from the inline extent buffer to a direct
- * extent list. Be sure to include the inline extent
- * bytes in new_size.
- */
- else {
- new_size += ifp->if_bytes;
- if (!is_power_of_2(new_size)) {
- rnew_size = roundup_pow_of_two(new_size);
- }
- xfs_iext_inline_to_direct(ifp, rnew_size);
- }
- ifp->if_real_bytes = rnew_size;
- ifp->if_bytes = new_size;
-}
-
-/*
- * Switch from linear (direct) extent records to inline buffer.
- */
-void
-xfs_iext_direct_to_inline(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t nextents) /* number of extents in file */
-{
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- ASSERT(nextents <= XFS_INLINE_EXTS);
- /*
- * The inline buffer was zeroed when we switched
- * from inline to direct extent allocation mode,
- * so we don't need to clear it here.
- */
- memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
- nextents * sizeof(xfs_bmbt_rec_t));
- kmem_free(ifp->if_u1.if_extents);
- ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
- ifp->if_real_bytes = 0;
-}
-
-/*
- * Switch from inline buffer to linear (direct) extent records.
- * new_size should already be rounded up to the next power of 2
- * by the caller (when appropriate), so use new_size as it is.
- * However, since new_size may be rounded up, we can't update
- * if_bytes here. It is the caller's responsibility to update
- * if_bytes upon return.
- */
-void
-xfs_iext_inline_to_direct(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int new_size) /* number of extents in file */
-{
- ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
- memset(ifp->if_u1.if_extents, 0, new_size);
- if (ifp->if_bytes) {
- memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
- ifp->if_bytes);
- memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
- sizeof(xfs_bmbt_rec_t));
- }
- ifp->if_real_bytes = new_size;
-}
-
-/*
- * Resize an extent indirection array to new_size bytes.
- */
-STATIC void
-xfs_iext_realloc_indirect(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int new_size) /* new indirection array size */
-{
- int nlists; /* number of irec's (ex lists) */
- int size; /* current indirection array size */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- size = nlists * sizeof(xfs_ext_irec_t);
- ASSERT(ifp->if_real_bytes);
- ASSERT((new_size >= 0) && (new_size != size));
- if (new_size == 0) {
- xfs_iext_destroy(ifp);
- } else {
- ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
- kmem_realloc(ifp->if_u1.if_ext_irec,
- new_size, size, KM_NOFS);
- }
-}
-
-/*
- * Switch from indirection array to linear (direct) extent allocations.
- */
-STATIC void
-xfs_iext_indirect_to_direct(
- xfs_ifork_t *ifp) /* inode fork pointer */
-{
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
- xfs_extnum_t nextents; /* number of extents in file */
- int size; /* size of file extents */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- ASSERT(nextents <= XFS_LINEAR_EXTS);
- size = nextents * sizeof(xfs_bmbt_rec_t);
-
- xfs_iext_irec_compact_pages(ifp);
- ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
-
- ep = ifp->if_u1.if_ext_irec->er_extbuf;
- kmem_free(ifp->if_u1.if_ext_irec);
- ifp->if_flags &= ~XFS_IFEXTIREC;
- ifp->if_u1.if_extents = ep;
- ifp->if_bytes = size;
- if (nextents < XFS_LINEAR_EXTS) {
- xfs_iext_realloc_direct(ifp, size);
- }
-}
-
-/*
- * Free incore file extents.
- */
-void
-xfs_iext_destroy(
- xfs_ifork_t *ifp) /* inode fork pointer */
-{
- if (ifp->if_flags & XFS_IFEXTIREC) {
- int erp_idx;
- int nlists;
-
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
- xfs_iext_irec_remove(ifp, erp_idx);
- }
- ifp->if_flags &= ~XFS_IFEXTIREC;
- } else if (ifp->if_real_bytes) {
- kmem_free(ifp->if_u1.if_extents);
- } else if (ifp->if_bytes) {
- memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
- sizeof(xfs_bmbt_rec_t));
- }
- ifp->if_u1.if_extents = NULL;
- ifp->if_real_bytes = 0;
- ifp->if_bytes = 0;
-}
-
-/*
- * Return a pointer to the extent record for file system block bno.
- */
-xfs_bmbt_rec_host_t * /* pointer to found extent record */
-xfs_iext_bno_to_ext(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_fileoff_t bno, /* block number to search for */
- xfs_extnum_t *idxp) /* index of target extent */
-{
- xfs_bmbt_rec_host_t *base; /* pointer to first extent */
- xfs_filblks_t blockcount = 0; /* number of blocks in extent */
- xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
- xfs_ext_irec_t *erp = NULL; /* indirection array pointer */
- int high; /* upper boundary in search */
- xfs_extnum_t idx = 0; /* index of target extent */
- int low; /* lower boundary in search */
- xfs_extnum_t nextents; /* number of file extents */
- xfs_fileoff_t startoff = 0; /* start offset of extent */
-
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- if (nextents == 0) {
- *idxp = 0;
- return NULL;
- }
- low = 0;
- if (ifp->if_flags & XFS_IFEXTIREC) {
- /* Find target extent list */
- int erp_idx = 0;
- erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
- base = erp->er_extbuf;
- high = erp->er_extcount - 1;
- } else {
- base = ifp->if_u1.if_extents;
- high = nextents - 1;
- }
- /* Binary search extent records */
- while (low <= high) {
- idx = (low + high) >> 1;
- ep = base + idx;
- startoff = xfs_bmbt_get_startoff(ep);
- blockcount = xfs_bmbt_get_blockcount(ep);
- if (bno < startoff) {
- high = idx - 1;
- } else if (bno >= startoff + blockcount) {
- low = idx + 1;
- } else {
- /* Convert back to file-based extent index */
- if (ifp->if_flags & XFS_IFEXTIREC) {
- idx += erp->er_extoff;
- }
- *idxp = idx;
- return ep;
- }
- }
- /* Convert back to file-based extent index */
- if (ifp->if_flags & XFS_IFEXTIREC) {
- idx += erp->er_extoff;
- }
- if (bno >= startoff + blockcount) {
- if (++idx == nextents) {
- ep = NULL;
- } else {
- ep = xfs_iext_get_ext(ifp, idx);
- }
- }
- *idxp = idx;
- return ep;
-}
-
-/*
- * Return a pointer to the indirection array entry containing the
- * extent record for filesystem block bno. Store the index of the
- * target irec in *erp_idxp.
- */
-xfs_ext_irec_t * /* pointer to found extent record */
-xfs_iext_bno_to_irec(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_fileoff_t bno, /* block number to search for */
- int *erp_idxp) /* irec index of target ext list */
-{
- xfs_ext_irec_t *erp = NULL; /* indirection array pointer */
- xfs_ext_irec_t *erp_next; /* next indirection array entry */
- int erp_idx; /* indirection array index */
- int nlists; /* number of extent irec's (lists) */
- int high; /* binary search upper limit */
- int low; /* binary search lower limit */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- erp_idx = 0;
- low = 0;
- high = nlists - 1;
- while (low <= high) {
- erp_idx = (low + high) >> 1;
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
- if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
- high = erp_idx - 1;
- } else if (erp_next && bno >=
- xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
- low = erp_idx + 1;
- } else {
- break;
- }
- }
- *erp_idxp = erp_idx;
- return erp;
-}
+ /* generate the checksum. */
+ xfs_dinode_calc_crc(mp, dip);
-/*
- * Return a pointer to the indirection array entry containing the
- * extent record at file extent index *idxp. Store the index of the
- * target irec in *erp_idxp and store the page index of the target
- * extent record in *idxp.
- */
-xfs_ext_irec_t *
-xfs_iext_idx_to_irec(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t *idxp, /* extent index (file -> page) */
- int *erp_idxp, /* pointer to target irec */
- int realloc) /* new bytes were just added */
-{
- xfs_ext_irec_t *prev; /* pointer to previous irec */
- xfs_ext_irec_t *erp = NULL; /* pointer to current irec */
- int erp_idx; /* indirection array index */
- int nlists; /* number of irec's (ex lists) */
- int high; /* binary search upper limit */
- int low; /* binary search lower limit */
- xfs_extnum_t page_idx = *idxp; /* extent index in target list */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- ASSERT(page_idx >= 0);
- ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
- ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
-
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- erp_idx = 0;
- low = 0;
- high = nlists - 1;
-
- /* Binary search extent irec's */
- while (low <= high) {
- erp_idx = (low + high) >> 1;
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- prev = erp_idx > 0 ? erp - 1 : NULL;
- if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
- realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
- high = erp_idx - 1;
- } else if (page_idx > erp->er_extoff + erp->er_extcount ||
- (page_idx == erp->er_extoff + erp->er_extcount &&
- !realloc)) {
- low = erp_idx + 1;
- } else if (page_idx == erp->er_extoff + erp->er_extcount &&
- erp->er_extcount == XFS_LINEAR_EXTS) {
- ASSERT(realloc);
- page_idx = 0;
- erp_idx++;
- erp = erp_idx < nlists ? erp + 1 : NULL;
- break;
- } else {
- page_idx -= erp->er_extoff;
- break;
- }
- }
- *idxp = page_idx;
- *erp_idxp = erp_idx;
- return(erp);
-}
-
-/*
- * Allocate and initialize an indirection array once the space needed
- * for incore extents increases above XFS_IEXT_BUFSZ.
- */
-void
-xfs_iext_irec_init(
- xfs_ifork_t *ifp) /* inode fork pointer */
-{
- xfs_ext_irec_t *erp; /* indirection array pointer */
- xfs_extnum_t nextents; /* number of extents in file */
-
- ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- ASSERT(nextents <= XFS_LINEAR_EXTS);
-
- erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
-
- if (nextents == 0) {
- ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
- } else if (!ifp->if_real_bytes) {
- xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
- } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
- xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
- }
- erp->er_extbuf = ifp->if_u1.if_extents;
- erp->er_extcount = nextents;
- erp->er_extoff = 0;
-
- ifp->if_flags |= XFS_IFEXTIREC;
- ifp->if_real_bytes = XFS_IEXT_BUFSZ;
- ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
- ifp->if_u1.if_ext_irec = erp;
-
- return;
-}
-
-/*
- * Allocate and initialize a new entry in the indirection array.
- */
-xfs_ext_irec_t *
-xfs_iext_irec_new(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int erp_idx) /* index for new irec */
-{
- xfs_ext_irec_t *erp; /* indirection array pointer */
- int i; /* loop counter */
- int nlists; /* number of irec's (ex lists) */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-
- /* Resize indirection array */
- xfs_iext_realloc_indirect(ifp, ++nlists *
- sizeof(xfs_ext_irec_t));
- /*
- * Move records down in the array so the
- * new page can use erp_idx.
- */
- erp = ifp->if_u1.if_ext_irec;
- for (i = nlists - 1; i > erp_idx; i--) {
- memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
- }
- ASSERT(i == erp_idx);
-
- /* Initialize new extent record */
- erp = ifp->if_u1.if_ext_irec;
- erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
- ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
- memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
- erp[erp_idx].er_extcount = 0;
- erp[erp_idx].er_extoff = erp_idx > 0 ?
- erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
- return (&erp[erp_idx]);
-}
-
-/*
- * Remove a record from the indirection array.
- */
-void
-xfs_iext_irec_remove(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int erp_idx) /* irec index to remove */
-{
- xfs_ext_irec_t *erp; /* indirection array pointer */
- int i; /* loop counter */
- int nlists; /* number of irec's (ex lists) */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- if (erp->er_extbuf) {
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
- -erp->er_extcount);
- kmem_free(erp->er_extbuf);
- }
- /* Compact extent records */
- erp = ifp->if_u1.if_ext_irec;
- for (i = erp_idx; i < nlists - 1; i++) {
- memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
- }
- /*
- * Manually free the last extent record from the indirection
- * array. A call to xfs_iext_realloc_indirect() with a size
- * of zero would result in a call to xfs_iext_destroy() which
- * would in turn call this function again, creating a nasty
- * infinite loop.
- */
- if (--nlists) {
- xfs_iext_realloc_indirect(ifp,
- nlists * sizeof(xfs_ext_irec_t));
- } else {
- kmem_free(ifp->if_u1.if_ext_irec);
- }
- ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
-}
-
-/*
- * This is called to clean up large amounts of unused memory allocated
- * by the indirection array. Before compacting anything though, verify
- * that the indirection array is still needed and switch back to the
- * linear extent list (or even the inline buffer) if possible. The
- * compaction policy is as follows:
- *
- * Full Compaction: Extents fit into a single page (or inline buffer)
- * Partial Compaction: Extents occupy less than 50% of allocated space
- * No Compaction: Extents occupy at least 50% of allocated space
- */
-void
-xfs_iext_irec_compact(
- xfs_ifork_t *ifp) /* inode fork pointer */
-{
- xfs_extnum_t nextents; /* number of extents in file */
- int nlists; /* number of irec's (ex lists) */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-
- if (nextents == 0) {
- xfs_iext_destroy(ifp);
- } else if (nextents <= XFS_INLINE_EXTS) {
- xfs_iext_indirect_to_direct(ifp);
- xfs_iext_direct_to_inline(ifp, nextents);
- } else if (nextents <= XFS_LINEAR_EXTS) {
- xfs_iext_indirect_to_direct(ifp);
- } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
- xfs_iext_irec_compact_pages(ifp);
- }
-}
-
-/*
- * Combine extents from neighboring extent pages.
- */
-void
-xfs_iext_irec_compact_pages(
- xfs_ifork_t *ifp) /* inode fork pointer */
-{
- xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */
- int erp_idx = 0; /* indirection array index */
- int nlists; /* number of irec's (ex lists) */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- while (erp_idx < nlists - 1) {
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- erp_next = erp + 1;
- if (erp_next->er_extcount <=
- (XFS_LINEAR_EXTS - erp->er_extcount)) {
- memcpy(&erp->er_extbuf[erp->er_extcount],
- erp_next->er_extbuf, erp_next->er_extcount *
- sizeof(xfs_bmbt_rec_t));
- erp->er_extcount += erp_next->er_extcount;
- /*
- * Free page before removing extent record
- * so er_extoffs don't get modified in
- * xfs_iext_irec_remove.
- */
- kmem_free(erp_next->er_extbuf);
- erp_next->er_extbuf = NULL;
- xfs_iext_irec_remove(ifp, erp_idx + 1);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- } else {
- erp_idx++;
- }
- }
-}
-
-/*
- * This is called to update the er_extoff field in the indirection
- * array when extents have been added or removed from one of the
- * extent lists. erp_idx contains the irec index to begin updating
- * at and ext_diff contains the number of extents that were added
- * or removed.
- */
-void
-xfs_iext_irec_update_extoffs(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int erp_idx, /* irec index to update */
- int ext_diff) /* number of new extents */
-{
- int i; /* loop counter */
- int nlists; /* number of irec's (ex lists */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- for (i = erp_idx; i < nlists; i++) {
- ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
- }
-}
-
-/*
- * Test whether it is appropriate to check an inode for and free post EOF
- * blocks. The 'force' parameter determines whether we should also consider
- * regular files that are marked preallocated or append-only.
- */
-bool
-xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
-{
- /* prealloc/delalloc exists only on regular files */
- if (!S_ISREG(ip->i_d.di_mode))
- return false;
-
- /*
- * Zero sized files with no cached pages and delalloc blocks will not
- * have speculative prealloc/delalloc blocks to remove.
- */
- if (VFS_I(ip)->i_size == 0 &&
- VN_CACHED(VFS_I(ip)) == 0 &&
- ip->i_delayed_blks == 0)
- return false;
-
- /* If we haven't read in the extent list, then don't do it now. */
- if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
- return false;
-
- /*
- * Do not free real preallocated or append-only files unless the file
- * has delalloc blocks and we are forced to remove them.
- */
- if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
- if (!force || ip->i_delayed_blks == 0)
- return false;
+ ASSERT(bp->b_fspriv != NULL);
+ ASSERT(bp->b_iodone != NULL);
+ return 0;
- return true;
+corrupt_out:
+ return XFS_ERROR(EFSCORRUPTED);
}
-
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 22baf6ea4fa..f72bffa6726 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -18,199 +18,15 @@
#ifndef __XFS_INODE_H__
#define __XFS_INODE_H__
-struct posix_acl;
-struct xfs_dinode;
-struct xfs_inode;
-
-/*
- * Fork identifiers.
- */
-#define XFS_DATA_FORK 0
-#define XFS_ATTR_FORK 1
-
-/*
- * The following xfs_ext_irec_t struct introduces a second (top) level
- * to the in-core extent allocation scheme. These structs are allocated
- * in a contiguous block, creating an indirection array where each entry
- * (irec) contains a pointer to a buffer of in-core extent records which
- * it manages. Each extent buffer is 4k in size, since 4k is the system
- * page size on Linux i386 and systems with larger page sizes don't seem
- * to gain much, if anything, by using their native page size as the
- * extent buffer size. Also, using 4k extent buffers everywhere provides
- * a consistent interface for CXFS across different platforms.
- *
- * There is currently no limit on the number of irec's (extent lists)
- * allowed, so heavily fragmented files may require an indirection array
- * which spans multiple system pages of memory. The number of extents
- * which would require this amount of contiguous memory is very large
- * and should not cause problems in the foreseeable future. However,
- * if the memory needed for the contiguous array ever becomes a problem,
- * it is possible that a third level of indirection may be required.
- */
-typedef struct xfs_ext_irec {
- xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */
- xfs_extnum_t er_extoff; /* extent offset in file */
- xfs_extnum_t er_extcount; /* number of extents in page/block */
-} xfs_ext_irec_t;
-
-/*
- * File incore extent information, present for each of data & attr forks.
- */
-#define XFS_IEXT_BUFSZ 4096
-#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
-#define XFS_INLINE_EXTS 2
-#define XFS_INLINE_DATA 32
-typedef struct xfs_ifork {
- int if_bytes; /* bytes in if_u1 */
- int if_real_bytes; /* bytes allocated in if_u1 */
- struct xfs_btree_block *if_broot; /* file's incore btree root */
- short if_broot_bytes; /* bytes allocated for root */
- unsigned char if_flags; /* per-fork flags */
- union {
- xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
- xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
- char *if_data; /* inline file data */
- } if_u1;
- union {
- xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
- /* very small file extents */
- char if_inline_data[XFS_INLINE_DATA];
- /* very small file data */
- xfs_dev_t if_rdev; /* dev number if special */
- uuid_t if_uuid; /* mount point value */
- } if_u2;
-} xfs_ifork_t;
-
-/*
- * Inode location information. Stored in the inode and passed to
- * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
- */
-struct xfs_imap {
- xfs_daddr_t im_blkno; /* starting BB of inode chunk */
- ushort im_len; /* length in BBs of inode chunk */
- ushort im_boffset; /* inode offset in block in bytes */
-};
-
-/*
- * This is the xfs in-core inode structure.
- * Most of the on-disk inode is embedded in the i_d field.
- *
- * The extent pointers/inline file space, however, are managed
- * separately. The memory for this information is pointed to by
- * the if_u1 unions depending on the type of the data.
- * This is used to linearize the array of extents for fast in-core
- * access. This is used until the file's number of extents
- * surpasses XFS_MAX_INCORE_EXTENTS, at which point all extent pointers
- * are accessed through the buffer cache.
- *
- * Other state kept in the in-core inode is used for identification,
- * locking, transactional updating, etc of the inode.
- *
- * Generally, we do not want to hold the i_rlock while holding the
- * i_ilock. Hierarchy is i_iolock followed by i_rlock.
- *
- * xfs_iptr_t contains all the inode fields up to and including the
- * i_mnext and i_mprev fields, it is used as a marker in the inode
- * chain off the mount structure by xfs_sync calls.
- */
-
-typedef struct xfs_ictimestamp {
- __int32_t t_sec; /* timestamp seconds */
- __int32_t t_nsec; /* timestamp nanoseconds */
-} xfs_ictimestamp_t;
-
-/*
- * NOTE: This structure must be kept identical to struct xfs_dinode
- * in xfs_dinode.h except for the endianness annotations.
- */
-typedef struct xfs_icdinode {
- __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
- __uint16_t di_mode; /* mode and type of file */
- __int8_t di_version; /* inode version */
- __int8_t di_format; /* format of di_c data */
- __uint16_t di_onlink; /* old number of links to file */
- __uint32_t di_uid; /* owner's user id */
- __uint32_t di_gid; /* owner's group id */
- __uint32_t di_nlink; /* number of links to file */
- __uint16_t di_projid_lo; /* lower part of owner's project id */
- __uint16_t di_projid_hi; /* higher part of owner's project id */
- __uint8_t di_pad[6]; /* unused, zeroed space */
- __uint16_t di_flushiter; /* incremented on flush */
- xfs_ictimestamp_t di_atime; /* time last accessed */
- xfs_ictimestamp_t di_mtime; /* time last modified */
- xfs_ictimestamp_t di_ctime; /* time created/inode modified */
- xfs_fsize_t di_size; /* number of bytes in file */
- xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */
- xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
- xfs_extnum_t di_nextents; /* number of extents in data fork */
- xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
- __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
- __int8_t di_aformat; /* format of attr fork's data */
- __uint32_t di_dmevmask; /* DMIG event mask */
- __uint16_t di_dmstate; /* DMIG state info */
- __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
- __uint32_t di_gen; /* generation number */
-} xfs_icdinode_t;
-
-/*
- * Flags for xfs_ichgtime().
- */
-#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
-#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
+#include "xfs_dinode.h"
/*
- * Per-fork incore inode flags.
+ * Kernel only inode definitions
*/
-#define XFS_IFINLINE 0x01 /* Inline data is read in */
-#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
-#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
-#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
-
-/*
- * Fork handling.
- */
-
-#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0)
-#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3))
-
-#define XFS_IFORK_PTR(ip,w) \
- ((w) == XFS_DATA_FORK ? \
- &(ip)->i_df : \
- (ip)->i_afp)
-#define XFS_IFORK_DSIZE(ip) \
- (XFS_IFORK_Q(ip) ? \
- XFS_IFORK_BOFF(ip) : \
- XFS_LITINO((ip)->i_mount))
-#define XFS_IFORK_ASIZE(ip) \
- (XFS_IFORK_Q(ip) ? \
- XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
- 0)
-#define XFS_IFORK_SIZE(ip,w) \
- ((w) == XFS_DATA_FORK ? \
- XFS_IFORK_DSIZE(ip) : \
- XFS_IFORK_ASIZE(ip))
-#define XFS_IFORK_FORMAT(ip,w) \
- ((w) == XFS_DATA_FORK ? \
- (ip)->i_d.di_format : \
- (ip)->i_d.di_aformat)
-#define XFS_IFORK_FMT_SET(ip,w,n) \
- ((w) == XFS_DATA_FORK ? \
- ((ip)->i_d.di_format = (n)) : \
- ((ip)->i_d.di_aformat = (n)))
-#define XFS_IFORK_NEXTENTS(ip,w) \
- ((w) == XFS_DATA_FORK ? \
- (ip)->i_d.di_nextents : \
- (ip)->i_d.di_anextents)
-#define XFS_IFORK_NEXT_SET(ip,w,n) \
- ((w) == XFS_DATA_FORK ? \
- ((ip)->i_d.di_nextents = (n)) : \
- ((ip)->i_d.di_anextents = (n)))
-#define XFS_IFORK_MAXEXT(ip, w) \
- (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
-
-
-#ifdef __KERNEL__
-
+struct xfs_dinode;
+struct xfs_inode;
struct xfs_buf;
struct xfs_bmap_free;
struct xfs_bmbt_irec;
@@ -224,6 +40,7 @@ typedef struct xfs_inode {
struct xfs_mount *i_mount; /* fs mount struct ptr */
struct xfs_dquot *i_udquot; /* user dquot */
struct xfs_dquot *i_gdquot; /* group dquot */
+ struct xfs_dquot *i_pdquot; /* project dquot */
/* Inode location stuff */
xfs_ino_t i_ino; /* inode number (agno/agino)*/
@@ -233,6 +50,9 @@ typedef struct xfs_inode {
xfs_ifork_t *i_afp; /* attribute fork pointer */
xfs_ifork_t i_df; /* data fork */
+ /* operations vectors */
+ const struct xfs_dir_ops *d_ops; /* directory ops vector */
+
/* Transaction and locking information. */
struct xfs_inode_log_item *i_itemp; /* logging information */
mrlock_t i_lock; /* inode lock */
@@ -373,6 +193,15 @@ xfs_set_projid(struct xfs_inode *ip,
ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
}
+static inline prid_t
+xfs_get_initial_prid(struct xfs_inode *dp)
+{
+ if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+ return xfs_get_projid(dp);
+
+ return XFS_PROJID_DEFAULT;
+}
+
/*
* In-core inode flags.
*/
@@ -380,7 +209,6 @@ xfs_set_projid(struct xfs_inode *ip,
#define XFS_ISTALE (1 << 1) /* inode has been staled */
#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */
#define XFS_INEW (1 << 3) /* inode has just been allocated */
-#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */
#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */
@@ -396,8 +224,7 @@ xfs_set_projid(struct xfs_inode *ip,
*/
#define XFS_IRECLAIM_RESET_FLAGS \
(XFS_IRECLAIMABLE | XFS_IRECLAIM | \
- XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | \
- XFS_IFILESTREAM);
+ XFS_IDIRTY_RELEASE | XFS_ITRUNCATED)
/*
* Synchronize processes attempting to flush the in-core inode back to disk.
@@ -419,6 +246,7 @@ static inline void xfs_iflock(struct xfs_inode *ip)
static inline void xfs_ifunlock(struct xfs_inode *ip)
{
xfs_iflags_clear(ip, XFS_IFLOCK);
+ smp_mb();
wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
}
@@ -497,16 +325,30 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
((pip)->i_d.di_mode & S_ISGID))
-/*
- * xfs_inode.c prototypes.
- */
+int xfs_release(struct xfs_inode *ip);
+void xfs_inactive(struct xfs_inode *ip);
+int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
+ struct xfs_inode **ipp, struct xfs_name *ci_name);
+int xfs_create(struct xfs_inode *dp, struct xfs_name *name,
+ umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp);
+int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry,
+ umode_t mode, struct xfs_inode **ipp);
+int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
+ struct xfs_inode *ip);
+int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
+ struct xfs_name *target_name);
+int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
+ struct xfs_inode *src_ip, struct xfs_inode *target_dp,
+ struct xfs_name *target_name,
+ struct xfs_inode *target_ip);
+
void xfs_ilock(xfs_inode_t *, uint);
int xfs_ilock_nowait(xfs_inode_t *, uint);
void xfs_iunlock(xfs_inode_t *, uint);
void xfs_ilock_demote(xfs_inode_t *, uint);
int xfs_isilocked(xfs_inode_t *, uint);
-uint xfs_ilock_map_shared(xfs_inode_t *);
-void xfs_iunlock_map_shared(xfs_inode_t *, uint);
+uint xfs_ilock_data_map_shared(struct xfs_inode *);
+uint xfs_ilock_attr_map_shared(struct xfs_inode *);
int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
xfs_nlink_t, xfs_dev_t, prid_t, int,
struct xfs_buf **, xfs_inode_t **);
@@ -520,13 +362,27 @@ int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
void xfs_iext_realloc(xfs_inode_t *, int, int);
+
void xfs_iunpin_wait(xfs_inode_t *);
+#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
+
int xfs_iflush(struct xfs_inode *, struct xfs_buf **);
void xfs_lock_inodes(xfs_inode_t **, int, uint);
void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
+int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
+ xfs_nlink_t, xfs_dev_t, prid_t, int,
+ struct xfs_inode **, int *);
+int xfs_droplink(struct xfs_trans *, struct xfs_inode *);
+int xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
+
+/* from xfs_file.c */
+int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
+int xfs_iozero(struct xfs_inode *, loff_t, size_t);
+
+
#define IHOLD(ip) \
do { \
ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
@@ -540,64 +396,6 @@ do { \
iput(VFS_I(ip)); \
} while (0)
-#endif /* __KERNEL__ */
-
-/*
- * Flags for xfs_iget()
- */
-#define XFS_IGET_CREATE 0x1
-#define XFS_IGET_UNTRUSTED 0x2
-#define XFS_IGET_DONTCACHE 0x4
-
-int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
- struct xfs_imap *, struct xfs_dinode **,
- struct xfs_buf **, uint, uint);
-int xfs_iread(struct xfs_mount *, struct xfs_trans *,
- struct xfs_inode *, uint);
-void xfs_dinode_to_disk(struct xfs_dinode *,
- struct xfs_icdinode *);
-void xfs_idestroy_fork(struct xfs_inode *, int);
-void xfs_idata_realloc(struct xfs_inode *, int, int);
-void xfs_iroot_realloc(struct xfs_inode *, int, int);
-int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
-int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int);
-
-xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t);
-void xfs_iext_insert(xfs_inode_t *, xfs_extnum_t, xfs_extnum_t,
- xfs_bmbt_irec_t *, int);
-void xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int);
-void xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int);
-void xfs_iext_remove(xfs_inode_t *, xfs_extnum_t, int, int);
-void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
-void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
-void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
-void xfs_iext_realloc_direct(xfs_ifork_t *, int);
-void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t);
-void xfs_iext_inline_to_direct(xfs_ifork_t *, int);
-void xfs_iext_destroy(xfs_ifork_t *);
-xfs_bmbt_rec_host_t *xfs_iext_bno_to_ext(xfs_ifork_t *, xfs_fileoff_t, int *);
-xfs_ext_irec_t *xfs_iext_bno_to_irec(xfs_ifork_t *, xfs_fileoff_t, int *);
-xfs_ext_irec_t *xfs_iext_idx_to_irec(xfs_ifork_t *, xfs_extnum_t *, int *, int);
-void xfs_iext_irec_init(xfs_ifork_t *);
-xfs_ext_irec_t *xfs_iext_irec_new(xfs_ifork_t *, int);
-void xfs_iext_irec_remove(xfs_ifork_t *, int);
-void xfs_iext_irec_compact(xfs_ifork_t *);
-void xfs_iext_irec_compact_pages(xfs_ifork_t *);
-void xfs_iext_irec_compact_full(xfs_ifork_t *);
-void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
-bool xfs_can_free_eofblocks(struct xfs_inode *, bool);
-
-#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
-
-#if defined(DEBUG)
-void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
-#else
-#define xfs_inobp_check(mp, bp)
-#endif /* DEBUG */
-
-extern struct kmem_zone *xfs_ifork_zone;
extern struct kmem_zone *xfs_inode_zone;
-extern struct kmem_zone *xfs_ili_zone;
-extern const struct xfs_buf_ops xfs_inode_buf_ops;
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c
new file mode 100644
index 00000000000..cb35ae41d4a
--- /dev/null
+++ b/fs/xfs/xfs_inode_buf.c
@@ -0,0 +1,479 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_icache.h"
+#include "xfs_trans.h"
+#include "xfs_ialloc.h"
+#include "xfs_dinode.h"
+
+/*
+ * Check that none of the inode's in the buffer have a next
+ * unlinked field of 0.
+ */
+#if defined(DEBUG)
+void
+xfs_inobp_check(
+ xfs_mount_t *mp,
+ xfs_buf_t *bp)
+{
+ int i;
+ int j;
+ xfs_dinode_t *dip;
+
+ j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
+
+ for (i = 0; i < j; i++) {
+ dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+ i * mp->m_sb.sb_inodesize);
+ if (!dip->di_next_unlinked) {
+ xfs_alert(mp,
+ "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
+ i, (long long)bp->b_bn);
+ }
+ }
+}
+#endif
+
+/*
+ * If we are doing readahead on an inode buffer, we might be in log recovery
+ * reading an inode allocation buffer that hasn't yet been replayed, and hence
+ * has not had the inode cores stamped into it. Hence for readahead, the buffer
+ * may be potentially invalid.
+ *
+ * If the readahead buffer is invalid, we don't want to mark it with an error,
+ * but we do want to clear the DONE status of the buffer so that a followup read
+ * will re-read it from disk. This will ensure that we don't get an unnecessary
+ * warnings during log recovery and we don't get unnecssary panics on debug
+ * kernels.
+ */
+static void
+xfs_inode_buf_verify(
+ struct xfs_buf *bp,
+ bool readahead)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ int i;
+ int ni;
+
+ /*
+ * Validate the magic number and version of every inode in the buffer
+ */
+ ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+ for (i = 0; i < ni; i++) {
+ int di_ok;
+ xfs_dinode_t *dip;
+
+ dip = (struct xfs_dinode *)xfs_buf_offset(bp,
+ (i << mp->m_sb.sb_inodelog));
+ di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+ XFS_DINODE_GOOD_VERSION(dip->di_version);
+ if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+ XFS_ERRTAG_ITOBP_INOTOBP,
+ XFS_RANDOM_ITOBP_INOTOBP))) {
+ if (readahead) {
+ bp->b_flags &= ~XBF_DONE;
+ return;
+ }
+
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+#ifdef DEBUG
+ xfs_alert(mp,
+ "bad inode magic/vsn daddr %lld #%d (magic=%x)",
+ (unsigned long long)bp->b_bn, i,
+ be16_to_cpu(dip->di_magic));
+#endif
+ }
+ }
+ xfs_inobp_check(mp, bp);
+}
+
+
+static void
+xfs_inode_buf_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_inode_buf_verify(bp, false);
+}
+
+static void
+xfs_inode_buf_readahead_verify(
+ struct xfs_buf *bp)
+{
+ xfs_inode_buf_verify(bp, true);
+}
+
+static void
+xfs_inode_buf_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_inode_buf_verify(bp, false);
+}
+
+const struct xfs_buf_ops xfs_inode_buf_ops = {
+ .verify_read = xfs_inode_buf_read_verify,
+ .verify_write = xfs_inode_buf_write_verify,
+};
+
+const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
+ .verify_read = xfs_inode_buf_readahead_verify,
+ .verify_write = xfs_inode_buf_write_verify,
+};
+
+
+/*
+ * This routine is called to map an inode to the buffer containing the on-disk
+ * version of the inode. It returns a pointer to the buffer containing the
+ * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
+ * pointer to the on-disk inode within that buffer.
+ *
+ * If a non-zero error is returned, then the contents of bpp and dipp are
+ * undefined.
+ */
+int
+xfs_imap_to_bp(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_imap *imap,
+ struct xfs_dinode **dipp,
+ struct xfs_buf **bpp,
+ uint buf_flags,
+ uint iget_flags)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ buf_flags |= XBF_UNMAPPED;
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+ (int)imap->im_len, buf_flags, &bp,
+ &xfs_inode_buf_ops);
+ if (error) {
+ if (error == EAGAIN) {
+ ASSERT(buf_flags & XBF_TRYLOCK);
+ return error;
+ }
+
+ if (error == EFSCORRUPTED &&
+ (iget_flags & XFS_IGET_UNTRUSTED))
+ return XFS_ERROR(EINVAL);
+
+ xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
+ __func__, error);
+ return error;
+ }
+
+ *bpp = bp;
+ *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
+ return 0;
+}
+
+void
+xfs_dinode_from_disk(
+ xfs_icdinode_t *to,
+ xfs_dinode_t *from)
+{
+ to->di_magic = be16_to_cpu(from->di_magic);
+ to->di_mode = be16_to_cpu(from->di_mode);
+ to->di_version = from ->di_version;
+ to->di_format = from->di_format;
+ to->di_onlink = be16_to_cpu(from->di_onlink);
+ to->di_uid = be32_to_cpu(from->di_uid);
+ to->di_gid = be32_to_cpu(from->di_gid);
+ to->di_nlink = be32_to_cpu(from->di_nlink);
+ to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
+ to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
+ memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+ to->di_flushiter = be16_to_cpu(from->di_flushiter);
+ to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
+ to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
+ to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
+ to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
+ to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
+ to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
+ to->di_size = be64_to_cpu(from->di_size);
+ to->di_nblocks = be64_to_cpu(from->di_nblocks);
+ to->di_extsize = be32_to_cpu(from->di_extsize);
+ to->di_nextents = be32_to_cpu(from->di_nextents);
+ to->di_anextents = be16_to_cpu(from->di_anextents);
+ to->di_forkoff = from->di_forkoff;
+ to->di_aformat = from->di_aformat;
+ to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
+ to->di_dmstate = be16_to_cpu(from->di_dmstate);
+ to->di_flags = be16_to_cpu(from->di_flags);
+ to->di_gen = be32_to_cpu(from->di_gen);
+
+ if (to->di_version == 3) {
+ to->di_changecount = be64_to_cpu(from->di_changecount);
+ to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
+ to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
+ to->di_flags2 = be64_to_cpu(from->di_flags2);
+ to->di_ino = be64_to_cpu(from->di_ino);
+ to->di_lsn = be64_to_cpu(from->di_lsn);
+ memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+ uuid_copy(&to->di_uuid, &from->di_uuid);
+ }
+}
+
+void
+xfs_dinode_to_disk(
+ xfs_dinode_t *to,
+ xfs_icdinode_t *from)
+{
+ to->di_magic = cpu_to_be16(from->di_magic);
+ to->di_mode = cpu_to_be16(from->di_mode);
+ to->di_version = from ->di_version;
+ to->di_format = from->di_format;
+ to->di_onlink = cpu_to_be16(from->di_onlink);
+ to->di_uid = cpu_to_be32(from->di_uid);
+ to->di_gid = cpu_to_be32(from->di_gid);
+ to->di_nlink = cpu_to_be32(from->di_nlink);
+ to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+ to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+ memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+ to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
+ to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
+ to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
+ to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
+ to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
+ to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
+ to->di_size = cpu_to_be64(from->di_size);
+ to->di_nblocks = cpu_to_be64(from->di_nblocks);
+ to->di_extsize = cpu_to_be32(from->di_extsize);
+ to->di_nextents = cpu_to_be32(from->di_nextents);
+ to->di_anextents = cpu_to_be16(from->di_anextents);
+ to->di_forkoff = from->di_forkoff;
+ to->di_aformat = from->di_aformat;
+ to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
+ to->di_dmstate = cpu_to_be16(from->di_dmstate);
+ to->di_flags = cpu_to_be16(from->di_flags);
+ to->di_gen = cpu_to_be32(from->di_gen);
+
+ if (from->di_version == 3) {
+ to->di_changecount = cpu_to_be64(from->di_changecount);
+ to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
+ to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
+ to->di_flags2 = cpu_to_be64(from->di_flags2);
+ to->di_ino = cpu_to_be64(from->di_ino);
+ to->di_lsn = cpu_to_be64(from->di_lsn);
+ memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+ uuid_copy(&to->di_uuid, &from->di_uuid);
+ to->di_flushiter = 0;
+ } else {
+ to->di_flushiter = cpu_to_be16(from->di_flushiter);
+ }
+}
+
+static bool
+xfs_dinode_verify(
+ struct xfs_mount *mp,
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip)
+{
+ if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
+ return false;
+
+ /* only version 3 or greater inodes are extensively verified here */
+ if (dip->di_version < 3)
+ return true;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
+ XFS_DINODE_CRC_OFF))
+ return false;
+ if (be64_to_cpu(dip->di_ino) != ip->i_ino)
+ return false;
+ if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ return true;
+}
+
+void
+xfs_dinode_calc_crc(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip)
+{
+ __uint32_t crc;
+
+ if (dip->di_version < 3)
+ return;
+
+ ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
+ crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
+ XFS_DINODE_CRC_OFF);
+ dip->di_crc = xfs_end_cksum(crc);
+}
+
+/*
+ * Read the disk inode attributes into the in-core inode structure.
+ *
+ * For version 5 superblocks, if we are initialising a new inode and we are not
+ * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
+ * inode core with a random generation number. If we are keeping inodes around,
+ * we need to read the inode cluster to get the existing generation number off
+ * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
+ * format) then log recovery is dependent on the di_flushiter field being
+ * initialised from the current on-disk value and hence we must also read the
+ * inode off disk.
+ */
+int
+xfs_iread(
+ xfs_mount_t *mp,
+ xfs_trans_t *tp,
+ xfs_inode_t *ip,
+ uint iget_flags)
+{
+ xfs_buf_t *bp;
+ xfs_dinode_t *dip;
+ int error;
+
+ /*
+ * Fill in the location information in the in-core inode.
+ */
+ error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
+ if (error)
+ return error;
+
+ /* shortcut IO on inode allocation if possible */
+ if ((iget_flags & XFS_IGET_CREATE) &&
+ xfs_sb_version_hascrc(&mp->m_sb) &&
+ !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+ /* initialise the on-disk inode core */
+ memset(&ip->i_d, 0, sizeof(ip->i_d));
+ ip->i_d.di_magic = XFS_DINODE_MAGIC;
+ ip->i_d.di_gen = prandom_u32();
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ ip->i_d.di_version = 3;
+ ip->i_d.di_ino = ip->i_ino;
+ uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
+ } else
+ ip->i_d.di_version = 2;
+ return 0;
+ }
+
+ /*
+ * Get pointers to the on-disk inode and the buffer containing it.
+ */
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
+ if (error)
+ return error;
+
+ /* even unallocated inodes are verified */
+ if (!xfs_dinode_verify(mp, ip, dip)) {
+ xfs_alert(mp, "%s: validation failed for inode %lld failed",
+ __func__, ip->i_ino);
+
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
+ error = XFS_ERROR(EFSCORRUPTED);
+ goto out_brelse;
+ }
+
+ /*
+ * If the on-disk inode is already linked to a directory
+ * entry, copy all of the inode into the in-core inode.
+ * xfs_iformat_fork() handles copying in the inode format
+ * specific information.
+ * Otherwise, just get the truly permanent information.
+ */
+ if (dip->di_mode) {
+ xfs_dinode_from_disk(&ip->i_d, dip);
+ error = xfs_iformat_fork(ip, dip);
+ if (error) {
+#ifdef DEBUG
+ xfs_alert(mp, "%s: xfs_iformat() returned error %d",
+ __func__, error);
+#endif /* DEBUG */
+ goto out_brelse;
+ }
+ } else {
+ /*
+ * Partial initialisation of the in-core inode. Just the bits
+ * that xfs_ialloc won't overwrite or relies on being correct.
+ */
+ ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
+ ip->i_d.di_version = dip->di_version;
+ ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
+ ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
+
+ if (dip->di_version == 3) {
+ ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
+ uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
+ }
+
+ /*
+ * Make sure to pull in the mode here as well in
+ * case the inode is released without being used.
+ * This ensures that xfs_inactive() will see that
+ * the inode is already free and not try to mess
+ * with the uninitialized part of it.
+ */
+ ip->i_d.di_mode = 0;
+ }
+
+ /*
+ * Automatically convert version 1 inode formats in memory to version 2
+ * inode format. If the inode is modified, it will get logged and
+ * rewritten as a version 2 inode. We can do this because we set the
+ * superblock feature bit for v2 inodes unconditionally during mount
+ * and it means the reast of the code can assume the inode version is 2
+ * or higher.
+ */
+ if (ip->i_d.di_version == 1) {
+ ip->i_d.di_version = 2;
+ memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
+ ip->i_d.di_nlink = ip->i_d.di_onlink;
+ ip->i_d.di_onlink = 0;
+ xfs_set_projid(ip, 0);
+ }
+
+ ip->i_delayed_blks = 0;
+
+ /*
+ * Mark the buffer containing the inode as something to keep
+ * around for a while. This helps to keep recently accessed
+ * meta-data in-core longer.
+ */
+ xfs_buf_set_ref(bp, XFS_INO_REF);
+
+ /*
+ * Use xfs_trans_brelse() to release the buffer containing the on-disk
+ * inode, because it was acquired with xfs_trans_read_buf() in
+ * xfs_imap_to_bp() above. If tp is NULL, this is just a normal
+ * brelse(). If we're within a transaction, then xfs_trans_brelse()
+ * will only release the buffer if it is not dirty within the
+ * transaction. It will be OK to release the buffer in this case,
+ * because inodes on disk are never destroyed and we will be locking the
+ * new in-core inode before putting it in the cache where other
+ * processes can find it. Thus we don't have to worry about the inode
+ * being changed just because we released the buffer.
+ */
+ out_brelse:
+ xfs_trans_brelse(tp, bp);
+ return error;
+}
diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/xfs_inode_buf.h
new file mode 100644
index 00000000000..9308c47f2a5
--- /dev/null
+++ b/fs/xfs/xfs_inode_buf.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_INODE_BUF_H__
+#define __XFS_INODE_BUF_H__
+
+struct xfs_inode;
+struct xfs_dinode;
+struct xfs_icdinode;
+
+/*
+ * Inode location information. Stored in the inode and passed to
+ * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
+ */
+struct xfs_imap {
+ xfs_daddr_t im_blkno; /* starting BB of inode chunk */
+ ushort im_len; /* length in BBs of inode chunk */
+ ushort im_boffset; /* inode offset in block in bytes */
+};
+
+int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
+ struct xfs_imap *, struct xfs_dinode **,
+ struct xfs_buf **, uint, uint);
+int xfs_iread(struct xfs_mount *, struct xfs_trans *,
+ struct xfs_inode *, uint);
+void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
+void xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from);
+void xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from);
+
+#if defined(DEBUG)
+void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
+#else
+#define xfs_inobp_check(mp, bp)
+#endif /* DEBUG */
+
+#endif /* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c
new file mode 100644
index 00000000000..b031e8d0d92
--- /dev/null
+++ b/fs/xfs/xfs_inode_fork.c
@@ -0,0 +1,1906 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <linux/log2.h>
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_inum.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
+
+kmem_zone_t *xfs_ifork_zone;
+
+STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
+STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
+STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
+
+#ifdef DEBUG
+/*
+ * Make sure that the extents in the given memory buffer
+ * are valid.
+ */
+void
+xfs_validate_extents(
+ xfs_ifork_t *ifp,
+ int nrecs,
+ xfs_exntfmt_t fmt)
+{
+ xfs_bmbt_irec_t irec;
+ xfs_bmbt_rec_host_t rec;
+ int i;
+
+ for (i = 0; i < nrecs; i++) {
+ xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+ rec.l0 = get_unaligned(&ep->l0);
+ rec.l1 = get_unaligned(&ep->l1);
+ xfs_bmbt_get_all(&rec, &irec);
+ if (fmt == XFS_EXTFMT_NOSTATE)
+ ASSERT(irec.br_state == XFS_EXT_NORM);
+ }
+}
+#else /* DEBUG */
+#define xfs_validate_extents(ifp, nrecs, fmt)
+#endif /* DEBUG */
+
+
+/*
+ * Move inode type and inode format specific information from the
+ * on-disk inode to the in-core inode. For fifos, devs, and sockets
+ * this means set if_rdev to the proper value. For files, directories,
+ * and symlinks this means to bring in the in-line data or extent
+ * pointers. For a file in B-tree format, only the root is immediately
+ * brought in-core. The rest will be in-lined in if_extents when it
+ * is first referenced (see xfs_iread_extents()).
+ */
+int
+xfs_iformat_fork(
+ xfs_inode_t *ip,
+ xfs_dinode_t *dip)
+{
+ xfs_attr_shortform_t *atp;
+ int size;
+ int error = 0;
+ xfs_fsize_t di_size;
+
+ if (unlikely(be32_to_cpu(dip->di_nextents) +
+ be16_to_cpu(dip->di_anextents) >
+ be64_to_cpu(dip->di_nblocks))) {
+ xfs_warn(ip->i_mount,
+ "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
+ (unsigned long long)ip->i_ino,
+ (int)(be32_to_cpu(dip->di_nextents) +
+ be16_to_cpu(dip->di_anextents)),
+ (unsigned long long)
+ be64_to_cpu(dip->di_nblocks));
+ XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
+ ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
+ xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
+ (unsigned long long)ip->i_ino,
+ dip->di_forkoff);
+ XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
+ ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
+ !ip->i_mount->m_rtdev_targp)) {
+ xfs_warn(ip->i_mount,
+ "corrupt dinode %Lu, has realtime flag set.",
+ ip->i_ino);
+ XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
+ XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ switch (ip->i_d.di_mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
+ XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
+ ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ ip->i_d.di_size = 0;
+ ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
+ break;
+
+ case S_IFREG:
+ case S_IFLNK:
+ case S_IFDIR:
+ switch (dip->di_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ /*
+ * no local regular files yet
+ */
+ if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
+ xfs_warn(ip->i_mount,
+ "corrupt inode %Lu (local format for regular file).",
+ (unsigned long long) ip->i_ino);
+ XFS_CORRUPTION_ERROR("xfs_iformat(4)",
+ XFS_ERRLEVEL_LOW,
+ ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ di_size = be64_to_cpu(dip->di_size);
+ if (unlikely(di_size < 0 ||
+ di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
+ xfs_warn(ip->i_mount,
+ "corrupt inode %Lu (bad size %Ld for local inode).",
+ (unsigned long long) ip->i_ino,
+ (long long) di_size);
+ XFS_CORRUPTION_ERROR("xfs_iformat(5)",
+ XFS_ERRLEVEL_LOW,
+ ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ size = (int)di_size;
+ error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
+ break;
+ default:
+ XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
+ ip->i_mount);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ break;
+
+ default:
+ XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ if (error) {
+ return error;
+ }
+ if (!XFS_DFORK_Q(dip))
+ return 0;
+
+ ASSERT(ip->i_afp == NULL);
+ ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
+
+ switch (dip->di_aformat) {
+ case XFS_DINODE_FMT_LOCAL:
+ atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
+ size = be16_to_cpu(atp->hdr.totsize);
+
+ if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
+ xfs_warn(ip->i_mount,
+ "corrupt inode %Lu (bad attr fork size %Ld).",
+ (unsigned long long) ip->i_ino,
+ (long long) size);
+ XFS_CORRUPTION_ERROR("xfs_iformat(8)",
+ XFS_ERRLEVEL_LOW,
+ ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
+ break;
+ default:
+ error = XFS_ERROR(EFSCORRUPTED);
+ break;
+ }
+ if (error) {
+ kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+ ip->i_afp = NULL;
+ xfs_idestroy_fork(ip, XFS_DATA_FORK);
+ }
+ return error;
+}
+
+/*
+ * The file is in-lined in the on-disk inode.
+ * If it fits into if_inline_data, then copy
+ * it there, otherwise allocate a buffer for it
+ * and copy the data there. Either way, set
+ * if_data to point at the data.
+ * If we allocate a buffer for the data, make
+ * sure that its size is a multiple of 4 and
+ * record the real size in i_real_bytes.
+ */
+STATIC int
+xfs_iformat_local(
+ xfs_inode_t *ip,
+ xfs_dinode_t *dip,
+ int whichfork,
+ int size)
+{
+ xfs_ifork_t *ifp;
+ int real_size;
+
+ /*
+ * If the size is unreasonable, then something
+ * is wrong and we just bail out rather than crash in
+ * kmem_alloc() or memcpy() below.
+ */
+ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+ xfs_warn(ip->i_mount,
+ "corrupt inode %Lu (bad size %d for local fork, size = %d).",
+ (unsigned long long) ip->i_ino, size,
+ XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
+ XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
+ ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ real_size = 0;
+ if (size == 0)
+ ifp->if_u1.if_data = NULL;
+ else if (size <= sizeof(ifp->if_u2.if_inline_data))
+ ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+ else {
+ real_size = roundup(size, 4);
+ ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
+ }
+ ifp->if_bytes = size;
+ ifp->if_real_bytes = real_size;
+ if (size)
+ memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
+ ifp->if_flags &= ~XFS_IFEXTENTS;
+ ifp->if_flags |= XFS_IFINLINE;
+ return 0;
+}
+
+/*
+ * The file consists of a set of extents all
+ * of which fit into the on-disk inode.
+ * If there are few enough extents to fit into
+ * the if_inline_ext, then copy them there.
+ * Otherwise allocate a buffer for them and copy
+ * them into it. Either way, set if_extents
+ * to point at the extents.
+ */
+STATIC int
+xfs_iformat_extents(
+ xfs_inode_t *ip,
+ xfs_dinode_t *dip,
+ int whichfork)
+{
+ xfs_bmbt_rec_t *dp;
+ xfs_ifork_t *ifp;
+ int nex;
+ int size;
+ int i;
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+ size = nex * (uint)sizeof(xfs_bmbt_rec_t);
+
+ /*
+ * If the number of extents is unreasonable, then something
+ * is wrong and we just bail out rather than crash in
+ * kmem_alloc() or memcpy() below.
+ */
+ if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+ xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
+ (unsigned long long) ip->i_ino, nex);
+ XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
+ ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ ifp->if_real_bytes = 0;
+ if (nex == 0)
+ ifp->if_u1.if_extents = NULL;
+ else if (nex <= XFS_INLINE_EXTS)
+ ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+ else
+ xfs_iext_add(ifp, 0, nex);
+
+ ifp->if_bytes = size;
+ if (size) {
+ dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
+ xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
+ for (i = 0; i < nex; i++, dp++) {
+ xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+ ep->l0 = get_unaligned_be64(&dp->l0);
+ ep->l1 = get_unaligned_be64(&dp->l1);
+ }
+ XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
+ if (whichfork != XFS_DATA_FORK ||
+ XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
+ if (unlikely(xfs_check_nostate_extents(
+ ifp, 0, nex))) {
+ XFS_ERROR_REPORT("xfs_iformat_extents(2)",
+ XFS_ERRLEVEL_LOW,
+ ip->i_mount);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ }
+ ifp->if_flags |= XFS_IFEXTENTS;
+ return 0;
+}
+
+/*
+ * The file has too many extents to fit into
+ * the inode, so they are in B-tree format.
+ * Allocate a buffer for the root of the B-tree
+ * and copy the root into it. The i_extents
+ * field will remain NULL until all of the
+ * extents are read in (when they are needed).
+ */
+STATIC int
+xfs_iformat_btree(
+ xfs_inode_t *ip,
+ xfs_dinode_t *dip,
+ int whichfork)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_bmdr_block_t *dfp;
+ xfs_ifork_t *ifp;
+ /* REFERENCED */
+ int nrecs;
+ int size;
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
+ size = XFS_BMAP_BROOT_SPACE(mp, dfp);
+ nrecs = be16_to_cpu(dfp->bb_numrecs);
+
+ /*
+ * blow out if -- fork has less extents than can fit in
+ * fork (fork shouldn't be a btree format), root btree
+ * block has more records than can fit into the fork,
+ * or the number of extents is greater than the number of
+ * blocks.
+ */
+ if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
+ XFS_IFORK_MAXEXT(ip, whichfork) ||
+ XFS_BMDR_SPACE_CALC(nrecs) >
+ XFS_DFORK_SIZE(dip, mp, whichfork) ||
+ XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
+ xfs_warn(mp, "corrupt inode %Lu (btree).",
+ (unsigned long long) ip->i_ino);
+ XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
+ mp, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ ifp->if_broot_bytes = size;
+ ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
+ ASSERT(ifp->if_broot != NULL);
+ /*
+ * Copy and convert from the on-disk structure
+ * to the in-memory structure.
+ */
+ xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
+ ifp->if_broot, size);
+ ifp->if_flags &= ~XFS_IFEXTENTS;
+ ifp->if_flags |= XFS_IFBROOT;
+
+ return 0;
+}
+
+/*
+ * Read in extents from a btree-format inode.
+ * Allocate and fill in if_extents. Real work is done in xfs_bmap.c.
+ */
+int
+xfs_iread_extents(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip,
+ int whichfork)
+{
+ int error;
+ xfs_ifork_t *ifp;
+ xfs_extnum_t nextents;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
+ XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
+ ip->i_mount);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+
+ /*
+ * We know that the size is valid (it's checked in iformat_btree)
+ */
+ ifp->if_bytes = ifp->if_real_bytes = 0;
+ ifp->if_flags |= XFS_IFEXTENTS;
+ xfs_iext_add(ifp, 0, nextents);
+ error = xfs_bmap_read_extents(tp, ip, whichfork);
+ if (error) {
+ xfs_iext_destroy(ifp);
+ ifp->if_flags &= ~XFS_IFEXTENTS;
+ return error;
+ }
+ xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
+ return 0;
+}
+/*
+ * Reallocate the space for if_broot based on the number of records
+ * being added or deleted as indicated in rec_diff. Move the records
+ * and pointers in if_broot to fit the new size. When shrinking this
+ * will eliminate holes between the records and pointers created by
+ * the caller. When growing this will create holes to be filled in
+ * by the caller.
+ *
+ * The caller must not request to add more records than would fit in
+ * the on-disk inode root. If the if_broot is currently NULL, then
+ * if we are adding records, one will be allocated. The caller must also
+ * not request that the number of records go below zero, although
+ * it can go to zero.
+ *
+ * ip -- the inode whose if_broot area is changing
+ * ext_diff -- the change in the number of records, positive or negative,
+ * requested for the if_broot array.
+ */
+void
+xfs_iroot_realloc(
+ xfs_inode_t *ip,
+ int rec_diff,
+ int whichfork)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int cur_max;
+ xfs_ifork_t *ifp;
+ struct xfs_btree_block *new_broot;
+ int new_max;
+ size_t new_size;
+ char *np;
+ char *op;
+
+ /*
+ * Handle the degenerate case quietly.
+ */
+ if (rec_diff == 0) {
+ return;
+ }
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (rec_diff > 0) {
+ /*
+ * If there wasn't any memory allocated before, just
+ * allocate it now and get out.
+ */
+ if (ifp->if_broot_bytes == 0) {
+ new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
+ ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+ ifp->if_broot_bytes = (int)new_size;
+ return;
+ }
+
+ /*
+ * If there is already an existing if_broot, then we need
+ * to realloc() it and shift the pointers to their new
+ * location. The records don't change location because
+ * they are kept butted up against the btree block header.
+ */
+ cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+ new_max = cur_max + rec_diff;
+ new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+ ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
+ XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
+ KM_SLEEP | KM_NOFS);
+ op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+ ifp->if_broot_bytes);
+ np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+ (int)new_size);
+ ifp->if_broot_bytes = (int)new_size;
+ ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+ XFS_IFORK_SIZE(ip, whichfork));
+ memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
+ return;
+ }
+
+ /*
+ * rec_diff is less than 0. In this case, we are shrinking the
+ * if_broot buffer. It must already exist. If we go to zero
+ * records, just get rid of the root and clear the status bit.
+ */
+ ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
+ cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+ new_max = cur_max + rec_diff;
+ ASSERT(new_max >= 0);
+ if (new_max > 0)
+ new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+ else
+ new_size = 0;
+ if (new_size > 0) {
+ new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+ /*
+ * First copy over the btree block header.
+ */
+ memcpy(new_broot, ifp->if_broot,
+ XFS_BMBT_BLOCK_LEN(ip->i_mount));
+ } else {
+ new_broot = NULL;
+ ifp->if_flags &= ~XFS_IFBROOT;
+ }
+
+ /*
+ * Only copy the records and pointers if there are any.
+ */
+ if (new_max > 0) {
+ /*
+ * First copy the records.
+ */
+ op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
+ np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
+ memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
+
+ /*
+ * Then copy the pointers.
+ */
+ op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+ ifp->if_broot_bytes);
+ np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
+ (int)new_size);
+ memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
+ }
+ kmem_free(ifp->if_broot);
+ ifp->if_broot = new_broot;
+ ifp->if_broot_bytes = (int)new_size;
+ if (ifp->if_broot)
+ ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+ XFS_IFORK_SIZE(ip, whichfork));
+ return;
+}
+
+
+/*
+ * This is called when the amount of space needed for if_data
+ * is increased or decreased. The change in size is indicated by
+ * the number of bytes that need to be added or deleted in the
+ * byte_diff parameter.
+ *
+ * If the amount of space needed has decreased below the size of the
+ * inline buffer, then switch to using the inline buffer. Otherwise,
+ * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
+ * to what is needed.
+ *
+ * ip -- the inode whose if_data area is changing
+ * byte_diff -- the change in the number of bytes, positive or negative,
+ * requested for the if_data array.
+ */
+void
+xfs_idata_realloc(
+ xfs_inode_t *ip,
+ int byte_diff,
+ int whichfork)
+{
+ xfs_ifork_t *ifp;
+ int new_size;
+ int real_size;
+
+ if (byte_diff == 0) {
+ return;
+ }
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ new_size = (int)ifp->if_bytes + byte_diff;
+ ASSERT(new_size >= 0);
+
+ if (new_size == 0) {
+ if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+ kmem_free(ifp->if_u1.if_data);
+ }
+ ifp->if_u1.if_data = NULL;
+ real_size = 0;
+ } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
+ /*
+ * If the valid extents/data can fit in if_inline_ext/data,
+ * copy them from the malloc'd vector and free it.
+ */
+ if (ifp->if_u1.if_data == NULL) {
+ ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+ } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+ ASSERT(ifp->if_real_bytes != 0);
+ memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
+ new_size);
+ kmem_free(ifp->if_u1.if_data);
+ ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+ }
+ real_size = 0;
+ } else {
+ /*
+ * Stuck with malloc/realloc.
+ * For inline data, the underlying buffer must be
+ * a multiple of 4 bytes in size so that it can be
+ * logged and stay on word boundaries. We enforce
+ * that here.
+ */
+ real_size = roundup(new_size, 4);
+ if (ifp->if_u1.if_data == NULL) {
+ ASSERT(ifp->if_real_bytes == 0);
+ ifp->if_u1.if_data = kmem_alloc(real_size,
+ KM_SLEEP | KM_NOFS);
+ } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+ /*
+ * Only do the realloc if the underlying size
+ * is really changing.
+ */
+ if (ifp->if_real_bytes != real_size) {
+ ifp->if_u1.if_data =
+ kmem_realloc(ifp->if_u1.if_data,
+ real_size,
+ ifp->if_real_bytes,
+ KM_SLEEP | KM_NOFS);
+ }
+ } else {
+ ASSERT(ifp->if_real_bytes == 0);
+ ifp->if_u1.if_data = kmem_alloc(real_size,
+ KM_SLEEP | KM_NOFS);
+ memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
+ ifp->if_bytes);
+ }
+ }
+ ifp->if_real_bytes = real_size;
+ ifp->if_bytes = new_size;
+ ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+}
+
+void
+xfs_idestroy_fork(
+ xfs_inode_t *ip,
+ int whichfork)
+{
+ xfs_ifork_t *ifp;
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (ifp->if_broot != NULL) {
+ kmem_free(ifp->if_broot);
+ ifp->if_broot = NULL;
+ }
+
+ /*
+ * If the format is local, then we can't have an extents
+ * array so just look for an inline data array. If we're
+ * not local then we may or may not have an extents list,
+ * so check and free it up if we do.
+ */
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+ if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
+ (ifp->if_u1.if_data != NULL)) {
+ ASSERT(ifp->if_real_bytes != 0);
+ kmem_free(ifp->if_u1.if_data);
+ ifp->if_u1.if_data = NULL;
+ ifp->if_real_bytes = 0;
+ }
+ } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
+ ((ifp->if_flags & XFS_IFEXTIREC) ||
+ ((ifp->if_u1.if_extents != NULL) &&
+ (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
+ ASSERT(ifp->if_real_bytes != 0);
+ xfs_iext_destroy(ifp);
+ }
+ ASSERT(ifp->if_u1.if_extents == NULL ||
+ ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
+ ASSERT(ifp->if_real_bytes == 0);
+ if (whichfork == XFS_ATTR_FORK) {
+ kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+ ip->i_afp = NULL;
+ }
+}
+
+/*
+ * Convert in-core extents to on-disk form
+ *
+ * For either the data or attr fork in extent format, we need to endian convert
+ * the in-core extent as we place them into the on-disk inode.
+ *
+ * In the case of the data fork, the in-core and on-disk fork sizes can be
+ * different due to delayed allocation extents. We only copy on-disk extents
+ * here, so callers must always use the physical fork size to determine the
+ * size of the buffer passed to this routine. We will return the size actually
+ * used.
+ */
+int
+xfs_iextents_copy(
+ xfs_inode_t *ip,
+ xfs_bmbt_rec_t *dp,
+ int whichfork)
+{
+ int copied;
+ int i;
+ xfs_ifork_t *ifp;
+ int nrecs;
+ xfs_fsblock_t start_block;
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+ ASSERT(ifp->if_bytes > 0);
+
+ nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
+ ASSERT(nrecs > 0);
+
+ /*
+ * There are some delayed allocation extents in the
+ * inode, so copy the extents one at a time and skip
+ * the delayed ones. There must be at least one
+ * non-delayed extent.
+ */
+ copied = 0;
+ for (i = 0; i < nrecs; i++) {
+ xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+ start_block = xfs_bmbt_get_startblock(ep);
+ if (isnullstartblock(start_block)) {
+ /*
+ * It's a delayed allocation extent, so skip it.
+ */
+ continue;
+ }
+
+ /* Translate to on disk format */
+ put_unaligned_be64(ep->l0, &dp->l0);
+ put_unaligned_be64(ep->l1, &dp->l1);
+ dp++;
+ copied++;
+ }
+ ASSERT(copied != 0);
+ xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
+
+ return (copied * (uint)sizeof(xfs_bmbt_rec_t));
+}
+
+/*
+ * Each of the following cases stores data into the same region
+ * of the on-disk inode, so only one of them can be valid at
+ * any given time. While it is possible to have conflicting formats
+ * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
+ * in EXTENTS format, this can only happen when the fork has
+ * changed formats after being modified but before being flushed.
+ * In these cases, the format always takes precedence, because the
+ * format indicates the current state of the fork.
+ */
+void
+xfs_iflush_fork(
+ xfs_inode_t *ip,
+ xfs_dinode_t *dip,
+ xfs_inode_log_item_t *iip,
+ int whichfork)
+{
+ char *cp;
+ xfs_ifork_t *ifp;
+ xfs_mount_t *mp;
+ static const short brootflag[2] =
+ { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
+ static const short dataflag[2] =
+ { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
+ static const short extflag[2] =
+ { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
+
+ if (!iip)
+ return;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ /*
+ * This can happen if we gave up in iformat in an error path,
+ * for the attribute fork.
+ */
+ if (!ifp) {
+ ASSERT(whichfork == XFS_ATTR_FORK);
+ return;
+ }
+ cp = XFS_DFORK_PTR(dip, whichfork);
+ mp = ip->i_mount;
+ switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ case XFS_DINODE_FMT_LOCAL:
+ if ((iip->ili_fields & dataflag[whichfork]) &&
+ (ifp->if_bytes > 0)) {
+ ASSERT(ifp->if_u1.if_data != NULL);
+ ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+ memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
+ }
+ break;
+
+ case XFS_DINODE_FMT_EXTENTS:
+ ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
+ !(iip->ili_fields & extflag[whichfork]));
+ if ((iip->ili_fields & extflag[whichfork]) &&
+ (ifp->if_bytes > 0)) {
+ ASSERT(xfs_iext_get_ext(ifp, 0));
+ ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
+ (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
+ whichfork);
+ }
+ break;
+
+ case XFS_DINODE_FMT_BTREE:
+ if ((iip->ili_fields & brootflag[whichfork]) &&
+ (ifp->if_broot_bytes > 0)) {
+ ASSERT(ifp->if_broot != NULL);
+ ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+ XFS_IFORK_SIZE(ip, whichfork));
+ xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
+ (xfs_bmdr_block_t *)cp,
+ XFS_DFORK_SIZE(dip, mp, whichfork));
+ }
+ break;
+
+ case XFS_DINODE_FMT_DEV:
+ if (iip->ili_fields & XFS_ILOG_DEV) {
+ ASSERT(whichfork == XFS_DATA_FORK);
+ xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
+ }
+ break;
+
+ case XFS_DINODE_FMT_UUID:
+ if (iip->ili_fields & XFS_ILOG_UUID) {
+ ASSERT(whichfork == XFS_DATA_FORK);
+ memcpy(XFS_DFORK_DPTR(dip),
+ &ip->i_df.if_u2.if_uuid,
+ sizeof(uuid_t));
+ }
+ break;
+
+ default:
+ ASSERT(0);
+ break;
+ }
+}
+
+/*
+ * Return a pointer to the extent record at file index idx.
+ */
+xfs_bmbt_rec_host_t *
+xfs_iext_get_ext(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_extnum_t idx) /* index of target extent */
+{
+ ASSERT(idx >= 0);
+ ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+
+ if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
+ return ifp->if_u1.if_ext_irec->er_extbuf;
+ } else if (ifp->if_flags & XFS_IFEXTIREC) {
+ xfs_ext_irec_t *erp; /* irec pointer */
+ int erp_idx = 0; /* irec index */
+ xfs_extnum_t page_idx = idx; /* ext index in target list */
+
+ erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
+ return &erp->er_extbuf[page_idx];
+ } else if (ifp->if_bytes) {
+ return &ifp->if_u1.if_extents[idx];
+ } else {
+ return NULL;
+ }
+}
+
+/*
+ * Insert new item(s) into the extent records for incore inode
+ * fork 'ifp'. 'count' new items are inserted at index 'idx'.
+ */
+void
+xfs_iext_insert(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extnum_t idx, /* starting index of new items */
+ xfs_extnum_t count, /* number of inserted items */
+ xfs_bmbt_irec_t *new, /* items to insert */
+ int state) /* type of extent conversion */
+{
+ xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+ xfs_extnum_t i; /* extent record index */
+
+ trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
+
+ ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+ xfs_iext_add(ifp, idx, count);
+ for (i = idx; i < idx + count; i++, new++)
+ xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
+}
+
+/*
+ * This is called when the amount of space required for incore file
+ * extents needs to be increased. The ext_diff parameter stores the
+ * number of new extents being added and the idx parameter contains
+ * the extent index where the new extents will be added. If the new
+ * extents are being appended, then we just need to (re)allocate and
+ * initialize the space. Otherwise, if the new extents are being
+ * inserted into the middle of the existing entries, a bit more work
+ * is required to make room for the new extents to be inserted. The
+ * caller is responsible for filling in the new extent entries upon
+ * return.
+ */
+void
+xfs_iext_add(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_extnum_t idx, /* index to begin adding exts */
+ int ext_diff) /* number of extents to add */
+{
+ int byte_diff; /* new bytes being added */
+ int new_size; /* size of extents after adding */
+ xfs_extnum_t nextents; /* number of extents in file */
+
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ ASSERT((idx >= 0) && (idx <= nextents));
+ byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
+ new_size = ifp->if_bytes + byte_diff;
+ /*
+ * If the new number of extents (nextents + ext_diff)
+ * fits inside the inode, then continue to use the inline
+ * extent buffer.
+ */
+ if (nextents + ext_diff <= XFS_INLINE_EXTS) {
+ if (idx < nextents) {
+ memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
+ &ifp->if_u2.if_inline_ext[idx],
+ (nextents - idx) * sizeof(xfs_bmbt_rec_t));
+ memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
+ }
+ ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+ ifp->if_real_bytes = 0;
+ }
+ /*
+ * Otherwise use a linear (direct) extent list.
+ * If the extents are currently inside the inode,
+ * xfs_iext_realloc_direct will switch us from
+ * inline to direct extent allocation mode.
+ */
+ else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
+ xfs_iext_realloc_direct(ifp, new_size);
+ if (idx < nextents) {
+ memmove(&ifp->if_u1.if_extents[idx + ext_diff],
+ &ifp->if_u1.if_extents[idx],
+ (nextents - idx) * sizeof(xfs_bmbt_rec_t));
+ memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
+ }
+ }
+ /* Indirection array */
+ else {
+ xfs_ext_irec_t *erp;
+ int erp_idx = 0;
+ int page_idx = idx;
+
+ ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
+ if (ifp->if_flags & XFS_IFEXTIREC) {
+ erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
+ } else {
+ xfs_iext_irec_init(ifp);
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ erp = ifp->if_u1.if_ext_irec;
+ }
+ /* Extents fit in target extent page */
+ if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
+ if (page_idx < erp->er_extcount) {
+ memmove(&erp->er_extbuf[page_idx + ext_diff],
+ &erp->er_extbuf[page_idx],
+ (erp->er_extcount - page_idx) *
+ sizeof(xfs_bmbt_rec_t));
+ memset(&erp->er_extbuf[page_idx], 0, byte_diff);
+ }
+ erp->er_extcount += ext_diff;
+ xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+ }
+ /* Insert a new extent page */
+ else if (erp) {
+ xfs_iext_add_indirect_multi(ifp,
+ erp_idx, page_idx, ext_diff);
+ }
+ /*
+ * If extent(s) are being appended to the last page in
+ * the indirection array and the new extent(s) don't fit
+ * in the page, then erp is NULL and erp_idx is set to
+ * the next index needed in the indirection array.
+ */
+ else {
+ uint count = ext_diff;
+
+ while (count) {
+ erp = xfs_iext_irec_new(ifp, erp_idx);
+ erp->er_extcount = min(count, XFS_LINEAR_EXTS);
+ count -= erp->er_extcount;
+ if (count)
+ erp_idx++;
+ }
+ }
+ }
+ ifp->if_bytes = new_size;
+}
+
+/*
+ * This is called when incore extents are being added to the indirection
+ * array and the new extents do not fit in the target extent list. The
+ * erp_idx parameter contains the irec index for the target extent list
+ * in the indirection array, and the idx parameter contains the extent
+ * index within the list. The number of extents being added is stored
+ * in the count parameter.
+ *
+ * |-------| |-------|
+ * | | | | idx - number of extents before idx
+ * | idx | | count |
+ * | | | | count - number of extents being inserted at idx
+ * |-------| |-------|
+ * | count | | nex2 | nex2 - number of extents after idx + count
+ * |-------| |-------|
+ */
+void
+xfs_iext_add_indirect_multi(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ int erp_idx, /* target extent irec index */
+ xfs_extnum_t idx, /* index within target list */
+ int count) /* new extents being added */
+{
+ int byte_diff; /* new bytes being added */
+ xfs_ext_irec_t *erp; /* pointer to irec entry */
+ xfs_extnum_t ext_diff; /* number of extents to add */
+ xfs_extnum_t ext_cnt; /* new extents still needed */
+ xfs_extnum_t nex2; /* extents after idx + count */
+ xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */
+ int nlists; /* number of irec's (lists) */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ erp = &ifp->if_u1.if_ext_irec[erp_idx];
+ nex2 = erp->er_extcount - idx;
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+
+ /*
+ * Save second part of target extent list
+ * (all extents past */
+ if (nex2) {
+ byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
+ nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
+ memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
+ erp->er_extcount -= nex2;
+ xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
+ memset(&erp->er_extbuf[idx], 0, byte_diff);
+ }
+
+ /*
+ * Add the new extents to the end of the target
+ * list, then allocate new irec record(s) and
+ * extent buffer(s) as needed to store the rest
+ * of the new extents.
+ */
+ ext_cnt = count;
+ ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
+ if (ext_diff) {
+ erp->er_extcount += ext_diff;
+ xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+ ext_cnt -= ext_diff;
+ }
+ while (ext_cnt) {
+ erp_idx++;
+ erp = xfs_iext_irec_new(ifp, erp_idx);
+ ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
+ erp->er_extcount = ext_diff;
+ xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+ ext_cnt -= ext_diff;
+ }
+
+ /* Add nex2 extents back to indirection array */
+ if (nex2) {
+ xfs_extnum_t ext_avail;
+ int i;
+
+ byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
+ ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
+ i = 0;
+ /*
+ * If nex2 extents fit in the current page, append
+ * nex2_ep after the new extents.
+ */
+ if (nex2 <= ext_avail) {
+ i = erp->er_extcount;
+ }
+ /*
+ * Otherwise, check if space is available in the
+ * next page.
+ */
+ else if ((erp_idx < nlists - 1) &&
+ (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
+ ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
+ erp_idx++;
+ erp++;
+ /* Create a hole for nex2 extents */
+ memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
+ erp->er_extcount * sizeof(xfs_bmbt_rec_t));
+ }
+ /*
+ * Final choice, create a new extent page for
+ * nex2 extents.
+ */
+ else {
+ erp_idx++;
+ erp = xfs_iext_irec_new(ifp, erp_idx);
+ }
+ memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
+ kmem_free(nex2_ep);
+ erp->er_extcount += nex2;
+ xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
+ }
+}
+
+/*
+ * This is called when the amount of space required for incore file
+ * extents needs to be decreased. The ext_diff parameter stores the
+ * number of extents to be removed and the idx parameter contains
+ * the extent index where the extents will be removed from.
+ *
+ * If the amount of space needed has decreased below the linear
+ * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
+ * extent array. Otherwise, use kmem_realloc() to adjust the
+ * size to what is needed.
+ */
+void
+xfs_iext_remove(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extnum_t idx, /* index to begin removing exts */
+ int ext_diff, /* number of extents to remove */
+ int state) /* type of extent conversion */
+{
+ xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+ xfs_extnum_t nextents; /* number of extents in file */
+ int new_size; /* size of extents after removal */
+
+ trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
+
+ ASSERT(ext_diff > 0);
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
+
+ if (new_size == 0) {
+ xfs_iext_destroy(ifp);
+ } else if (ifp->if_flags & XFS_IFEXTIREC) {
+ xfs_iext_remove_indirect(ifp, idx, ext_diff);
+ } else if (ifp->if_real_bytes) {
+ xfs_iext_remove_direct(ifp, idx, ext_diff);
+ } else {
+ xfs_iext_remove_inline(ifp, idx, ext_diff);
+ }
+ ifp->if_bytes = new_size;
+}
+
+/*
+ * This removes ext_diff extents from the inline buffer, beginning
+ * at extent index idx.
+ */
+void
+xfs_iext_remove_inline(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_extnum_t idx, /* index to begin removing exts */
+ int ext_diff) /* number of extents to remove */
+{
+ int nextents; /* number of extents in file */
+
+ ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+ ASSERT(idx < XFS_INLINE_EXTS);
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ ASSERT(((nextents - ext_diff) > 0) &&
+ (nextents - ext_diff) < XFS_INLINE_EXTS);
+
+ if (idx + ext_diff < nextents) {
+ memmove(&ifp->if_u2.if_inline_ext[idx],
+ &ifp->if_u2.if_inline_ext[idx + ext_diff],
+ (nextents - (idx + ext_diff)) *
+ sizeof(xfs_bmbt_rec_t));
+ memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
+ 0, ext_diff * sizeof(xfs_bmbt_rec_t));
+ } else {
+ memset(&ifp->if_u2.if_inline_ext[idx], 0,
+ ext_diff * sizeof(xfs_bmbt_rec_t));
+ }
+}
+
+/*
+ * This removes ext_diff extents from a linear (direct) extent list,
+ * beginning at extent index idx. If the extents are being removed
+ * from the end of the list (ie. truncate) then we just need to re-
+ * allocate the list to remove the extra space. Otherwise, if the
+ * extents are being removed from the middle of the existing extent
+ * entries, then we first need to move the extent records beginning
+ * at idx + ext_diff up in the list to overwrite the records being
+ * removed, then remove the extra space via kmem_realloc.
+ */
+void
+xfs_iext_remove_direct(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_extnum_t idx, /* index to begin removing exts */
+ int ext_diff) /* number of extents to remove */
+{
+ xfs_extnum_t nextents; /* number of extents in file */
+ int new_size; /* size of extents after removal */
+
+ ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+ new_size = ifp->if_bytes -
+ (ext_diff * sizeof(xfs_bmbt_rec_t));
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+
+ if (new_size == 0) {
+ xfs_iext_destroy(ifp);
+ return;
+ }
+ /* Move extents up in the list (if needed) */
+ if (idx + ext_diff < nextents) {
+ memmove(&ifp->if_u1.if_extents[idx],
+ &ifp->if_u1.if_extents[idx + ext_diff],
+ (nextents - (idx + ext_diff)) *
+ sizeof(xfs_bmbt_rec_t));
+ }
+ memset(&ifp->if_u1.if_extents[nextents - ext_diff],
+ 0, ext_diff * sizeof(xfs_bmbt_rec_t));
+ /*
+ * Reallocate the direct extent list. If the extents
+ * will fit inside the inode then xfs_iext_realloc_direct
+ * will switch from direct to inline extent allocation
+ * mode for us.
+ */
+ xfs_iext_realloc_direct(ifp, new_size);
+ ifp->if_bytes = new_size;
+}
+
+/*
+ * This is called when incore extents are being removed from the
+ * indirection array and the extents being removed span multiple extent
+ * buffers. The idx parameter contains the file extent index where we
+ * want to begin removing extents, and the count parameter contains
+ * how many extents need to be removed.
+ *
+ * |-------| |-------|
+ * | nex1 | | | nex1 - number of extents before idx
+ * |-------| | count |
+ * | | | | count - number of extents being removed at idx
+ * | count | |-------|
+ * | | | nex2 | nex2 - number of extents after idx + count
+ * |-------| |-------|
+ */
+void
+xfs_iext_remove_indirect(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_extnum_t idx, /* index to begin removing extents */
+ int count) /* number of extents to remove */
+{
+ xfs_ext_irec_t *erp; /* indirection array pointer */
+ int erp_idx = 0; /* indirection array index */
+ xfs_extnum_t ext_cnt; /* extents left to remove */
+ xfs_extnum_t ext_diff; /* extents to remove in current list */
+ xfs_extnum_t nex1; /* number of extents before idx */
+ xfs_extnum_t nex2; /* extents after idx + count */
+ int page_idx = idx; /* index in target extent list */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
+ ASSERT(erp != NULL);
+ nex1 = page_idx;
+ ext_cnt = count;
+ while (ext_cnt) {
+ nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
+ ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
+ /*
+ * Check for deletion of entire list;
+ * xfs_iext_irec_remove() updates extent offsets.
+ */
+ if (ext_diff == erp->er_extcount) {
+ xfs_iext_irec_remove(ifp, erp_idx);
+ ext_cnt -= ext_diff;
+ nex1 = 0;
+ if (ext_cnt) {
+ ASSERT(erp_idx < ifp->if_real_bytes /
+ XFS_IEXT_BUFSZ);
+ erp = &ifp->if_u1.if_ext_irec[erp_idx];
+ nex1 = 0;
+ continue;
+ } else {
+ break;
+ }
+ }
+ /* Move extents up (if needed) */
+ if (nex2) {
+ memmove(&erp->er_extbuf[nex1],
+ &erp->er_extbuf[nex1 + ext_diff],
+ nex2 * sizeof(xfs_bmbt_rec_t));
+ }
+ /* Zero out rest of page */
+ memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
+ ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
+ /* Update remaining counters */
+ erp->er_extcount -= ext_diff;
+ xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
+ ext_cnt -= ext_diff;
+ nex1 = 0;
+ erp_idx++;
+ erp++;
+ }
+ ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
+ xfs_iext_irec_compact(ifp);
+}
+
+/*
+ * Create, destroy, or resize a linear (direct) block of extents.
+ */
+void
+xfs_iext_realloc_direct(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ int new_size) /* new size of extents after adding */
+{
+ int rnew_size; /* real new size of extents */
+
+ rnew_size = new_size;
+
+ ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
+ ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
+ (new_size != ifp->if_real_bytes)));
+
+ /* Free extent records */
+ if (new_size == 0) {
+ xfs_iext_destroy(ifp);
+ }
+ /* Resize direct extent list and zero any new bytes */
+ else if (ifp->if_real_bytes) {
+ /* Check if extents will fit inside the inode */
+ if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
+ xfs_iext_direct_to_inline(ifp, new_size /
+ (uint)sizeof(xfs_bmbt_rec_t));
+ ifp->if_bytes = new_size;
+ return;
+ }
+ if (!is_power_of_2(new_size)){
+ rnew_size = roundup_pow_of_two(new_size);
+ }
+ if (rnew_size != ifp->if_real_bytes) {
+ ifp->if_u1.if_extents =
+ kmem_realloc(ifp->if_u1.if_extents,
+ rnew_size,
+ ifp->if_real_bytes, KM_NOFS);
+ }
+ if (rnew_size > ifp->if_real_bytes) {
+ memset(&ifp->if_u1.if_extents[ifp->if_bytes /
+ (uint)sizeof(xfs_bmbt_rec_t)], 0,
+ rnew_size - ifp->if_real_bytes);
+ }
+ }
+ /* Switch from the inline extent buffer to a direct extent list */
+ else {
+ if (!is_power_of_2(new_size)) {
+ rnew_size = roundup_pow_of_two(new_size);
+ }
+ xfs_iext_inline_to_direct(ifp, rnew_size);
+ }
+ ifp->if_real_bytes = rnew_size;
+ ifp->if_bytes = new_size;
+}
+
+/*
+ * Switch from linear (direct) extent records to inline buffer.
+ */
+void
+xfs_iext_direct_to_inline(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_extnum_t nextents) /* number of extents in file */
+{
+ ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+ ASSERT(nextents <= XFS_INLINE_EXTS);
+ /*
+ * The inline buffer was zeroed when we switched
+ * from inline to direct extent allocation mode,
+ * so we don't need to clear it here.
+ */
+ memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
+ nextents * sizeof(xfs_bmbt_rec_t));
+ kmem_free(ifp->if_u1.if_extents);
+ ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+ ifp->if_real_bytes = 0;
+}
+
+/*
+ * Switch from inline buffer to linear (direct) extent records.
+ * new_size should already be rounded up to the next power of 2
+ * by the caller (when appropriate), so use new_size as it is.
+ * However, since new_size may be rounded up, we can't update
+ * if_bytes here. It is the caller's responsibility to update
+ * if_bytes upon return.
+ */
+void
+xfs_iext_inline_to_direct(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ int new_size) /* number of extents in file */
+{
+ ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
+ memset(ifp->if_u1.if_extents, 0, new_size);
+ if (ifp->if_bytes) {
+ memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
+ ifp->if_bytes);
+ memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
+ sizeof(xfs_bmbt_rec_t));
+ }
+ ifp->if_real_bytes = new_size;
+}
+
+/*
+ * Resize an extent indirection array to new_size bytes.
+ */
+STATIC void
+xfs_iext_realloc_indirect(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ int new_size) /* new indirection array size */
+{
+ int nlists; /* number of irec's (ex lists) */
+ int size; /* current indirection array size */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ size = nlists * sizeof(xfs_ext_irec_t);
+ ASSERT(ifp->if_real_bytes);
+ ASSERT((new_size >= 0) && (new_size != size));
+ if (new_size == 0) {
+ xfs_iext_destroy(ifp);
+ } else {
+ ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
+ kmem_realloc(ifp->if_u1.if_ext_irec,
+ new_size, size, KM_NOFS);
+ }
+}
+
+/*
+ * Switch from indirection array to linear (direct) extent allocations.
+ */
+STATIC void
+xfs_iext_indirect_to_direct(
+ xfs_ifork_t *ifp) /* inode fork pointer */
+{
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
+ xfs_extnum_t nextents; /* number of extents in file */
+ int size; /* size of file extents */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ ASSERT(nextents <= XFS_LINEAR_EXTS);
+ size = nextents * sizeof(xfs_bmbt_rec_t);
+
+ xfs_iext_irec_compact_pages(ifp);
+ ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
+
+ ep = ifp->if_u1.if_ext_irec->er_extbuf;
+ kmem_free(ifp->if_u1.if_ext_irec);
+ ifp->if_flags &= ~XFS_IFEXTIREC;
+ ifp->if_u1.if_extents = ep;
+ ifp->if_bytes = size;
+ if (nextents < XFS_LINEAR_EXTS) {
+ xfs_iext_realloc_direct(ifp, size);
+ }
+}
+
+/*
+ * Free incore file extents.
+ */
+void
+xfs_iext_destroy(
+ xfs_ifork_t *ifp) /* inode fork pointer */
+{
+ if (ifp->if_flags & XFS_IFEXTIREC) {
+ int erp_idx;
+ int nlists;
+
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
+ xfs_iext_irec_remove(ifp, erp_idx);
+ }
+ ifp->if_flags &= ~XFS_IFEXTIREC;
+ } else if (ifp->if_real_bytes) {
+ kmem_free(ifp->if_u1.if_extents);
+ } else if (ifp->if_bytes) {
+ memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
+ sizeof(xfs_bmbt_rec_t));
+ }
+ ifp->if_u1.if_extents = NULL;
+ ifp->if_real_bytes = 0;
+ ifp->if_bytes = 0;
+}
+
+/*
+ * Return a pointer to the extent record for file system block bno.
+ */
+xfs_bmbt_rec_host_t * /* pointer to found extent record */
+xfs_iext_bno_to_ext(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_fileoff_t bno, /* block number to search for */
+ xfs_extnum_t *idxp) /* index of target extent */
+{
+ xfs_bmbt_rec_host_t *base; /* pointer to first extent */
+ xfs_filblks_t blockcount = 0; /* number of blocks in extent */
+ xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
+ xfs_ext_irec_t *erp = NULL; /* indirection array pointer */
+ int high; /* upper boundary in search */
+ xfs_extnum_t idx = 0; /* index of target extent */
+ int low; /* lower boundary in search */
+ xfs_extnum_t nextents; /* number of file extents */
+ xfs_fileoff_t startoff = 0; /* start offset of extent */
+
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ if (nextents == 0) {
+ *idxp = 0;
+ return NULL;
+ }
+ low = 0;
+ if (ifp->if_flags & XFS_IFEXTIREC) {
+ /* Find target extent list */
+ int erp_idx = 0;
+ erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
+ base = erp->er_extbuf;
+ high = erp->er_extcount - 1;
+ } else {
+ base = ifp->if_u1.if_extents;
+ high = nextents - 1;
+ }
+ /* Binary search extent records */
+ while (low <= high) {
+ idx = (low + high) >> 1;
+ ep = base + idx;
+ startoff = xfs_bmbt_get_startoff(ep);
+ blockcount = xfs_bmbt_get_blockcount(ep);
+ if (bno < startoff) {
+ high = idx - 1;
+ } else if (bno >= startoff + blockcount) {
+ low = idx + 1;
+ } else {
+ /* Convert back to file-based extent index */
+ if (ifp->if_flags & XFS_IFEXTIREC) {
+ idx += erp->er_extoff;
+ }
+ *idxp = idx;
+ return ep;
+ }
+ }
+ /* Convert back to file-based extent index */
+ if (ifp->if_flags & XFS_IFEXTIREC) {
+ idx += erp->er_extoff;
+ }
+ if (bno >= startoff + blockcount) {
+ if (++idx == nextents) {
+ ep = NULL;
+ } else {
+ ep = xfs_iext_get_ext(ifp, idx);
+ }
+ }
+ *idxp = idx;
+ return ep;
+}
+
+/*
+ * Return a pointer to the indirection array entry containing the
+ * extent record for filesystem block bno. Store the index of the
+ * target irec in *erp_idxp.
+ */
+xfs_ext_irec_t * /* pointer to found extent record */
+xfs_iext_bno_to_irec(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_fileoff_t bno, /* block number to search for */
+ int *erp_idxp) /* irec index of target ext list */
+{
+ xfs_ext_irec_t *erp = NULL; /* indirection array pointer */
+ xfs_ext_irec_t *erp_next; /* next indirection array entry */
+ int erp_idx; /* indirection array index */
+ int nlists; /* number of extent irec's (lists) */
+ int high; /* binary search upper limit */
+ int low; /* binary search lower limit */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ erp_idx = 0;
+ low = 0;
+ high = nlists - 1;
+ while (low <= high) {
+ erp_idx = (low + high) >> 1;
+ erp = &ifp->if_u1.if_ext_irec[erp_idx];
+ erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
+ if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
+ high = erp_idx - 1;
+ } else if (erp_next && bno >=
+ xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
+ low = erp_idx + 1;
+ } else {
+ break;
+ }
+ }
+ *erp_idxp = erp_idx;
+ return erp;
+}
+
+/*
+ * Return a pointer to the indirection array entry containing the
+ * extent record at file extent index *idxp. Store the index of the
+ * target irec in *erp_idxp and store the page index of the target
+ * extent record in *idxp.
+ */
+xfs_ext_irec_t *
+xfs_iext_idx_to_irec(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_extnum_t *idxp, /* extent index (file -> page) */
+ int *erp_idxp, /* pointer to target irec */
+ int realloc) /* new bytes were just added */
+{
+ xfs_ext_irec_t *prev; /* pointer to previous irec */
+ xfs_ext_irec_t *erp = NULL; /* pointer to current irec */
+ int erp_idx; /* indirection array index */
+ int nlists; /* number of irec's (ex lists) */
+ int high; /* binary search upper limit */
+ int low; /* binary search lower limit */
+ xfs_extnum_t page_idx = *idxp; /* extent index in target list */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ ASSERT(page_idx >= 0);
+ ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+ ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
+
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ erp_idx = 0;
+ low = 0;
+ high = nlists - 1;
+
+ /* Binary search extent irec's */
+ while (low <= high) {
+ erp_idx = (low + high) >> 1;
+ erp = &ifp->if_u1.if_ext_irec[erp_idx];
+ prev = erp_idx > 0 ? erp - 1 : NULL;
+ if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
+ realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
+ high = erp_idx - 1;
+ } else if (page_idx > erp->er_extoff + erp->er_extcount ||
+ (page_idx == erp->er_extoff + erp->er_extcount &&
+ !realloc)) {
+ low = erp_idx + 1;
+ } else if (page_idx == erp->er_extoff + erp->er_extcount &&
+ erp->er_extcount == XFS_LINEAR_EXTS) {
+ ASSERT(realloc);
+ page_idx = 0;
+ erp_idx++;
+ erp = erp_idx < nlists ? erp + 1 : NULL;
+ break;
+ } else {
+ page_idx -= erp->er_extoff;
+ break;
+ }
+ }
+ *idxp = page_idx;
+ *erp_idxp = erp_idx;
+ return(erp);
+}
+
+/*
+ * Allocate and initialize an indirection array once the space needed
+ * for incore extents increases above XFS_IEXT_BUFSZ.
+ */
+void
+xfs_iext_irec_init(
+ xfs_ifork_t *ifp) /* inode fork pointer */
+{
+ xfs_ext_irec_t *erp; /* indirection array pointer */
+ xfs_extnum_t nextents; /* number of extents in file */
+
+ ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ ASSERT(nextents <= XFS_LINEAR_EXTS);
+
+ erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
+
+ if (nextents == 0) {
+ ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
+ } else if (!ifp->if_real_bytes) {
+ xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
+ } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
+ xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
+ }
+ erp->er_extbuf = ifp->if_u1.if_extents;
+ erp->er_extcount = nextents;
+ erp->er_extoff = 0;
+
+ ifp->if_flags |= XFS_IFEXTIREC;
+ ifp->if_real_bytes = XFS_IEXT_BUFSZ;
+ ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
+ ifp->if_u1.if_ext_irec = erp;
+
+ return;
+}
+
+/*
+ * Allocate and initialize a new entry in the indirection array.
+ */
+xfs_ext_irec_t *
+xfs_iext_irec_new(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ int erp_idx) /* index for new irec */
+{
+ xfs_ext_irec_t *erp; /* indirection array pointer */
+ int i; /* loop counter */
+ int nlists; /* number of irec's (ex lists) */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+
+ /* Resize indirection array */
+ xfs_iext_realloc_indirect(ifp, ++nlists *
+ sizeof(xfs_ext_irec_t));
+ /*
+ * Move records down in the array so the
+ * new page can use erp_idx.
+ */
+ erp = ifp->if_u1.if_ext_irec;
+ for (i = nlists - 1; i > erp_idx; i--) {
+ memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
+ }
+ ASSERT(i == erp_idx);
+
+ /* Initialize new extent record */
+ erp = ifp->if_u1.if_ext_irec;
+ erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
+ ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
+ memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
+ erp[erp_idx].er_extcount = 0;
+ erp[erp_idx].er_extoff = erp_idx > 0 ?
+ erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
+ return (&erp[erp_idx]);
+}
+
+/*
+ * Remove a record from the indirection array.
+ */
+void
+xfs_iext_irec_remove(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ int erp_idx) /* irec index to remove */
+{
+ xfs_ext_irec_t *erp; /* indirection array pointer */
+ int i; /* loop counter */
+ int nlists; /* number of irec's (ex lists) */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ erp = &ifp->if_u1.if_ext_irec[erp_idx];
+ if (erp->er_extbuf) {
+ xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
+ -erp->er_extcount);
+ kmem_free(erp->er_extbuf);
+ }
+ /* Compact extent records */
+ erp = ifp->if_u1.if_ext_irec;
+ for (i = erp_idx; i < nlists - 1; i++) {
+ memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
+ }
+ /*
+ * Manually free the last extent record from the indirection
+ * array. A call to xfs_iext_realloc_indirect() with a size
+ * of zero would result in a call to xfs_iext_destroy() which
+ * would in turn call this function again, creating a nasty
+ * infinite loop.
+ */
+ if (--nlists) {
+ xfs_iext_realloc_indirect(ifp,
+ nlists * sizeof(xfs_ext_irec_t));
+ } else {
+ kmem_free(ifp->if_u1.if_ext_irec);
+ }
+ ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
+}
+
+/*
+ * This is called to clean up large amounts of unused memory allocated
+ * by the indirection array. Before compacting anything though, verify
+ * that the indirection array is still needed and switch back to the
+ * linear extent list (or even the inline buffer) if possible. The
+ * compaction policy is as follows:
+ *
+ * Full Compaction: Extents fit into a single page (or inline buffer)
+ * Partial Compaction: Extents occupy less than 50% of allocated space
+ * No Compaction: Extents occupy at least 50% of allocated space
+ */
+void
+xfs_iext_irec_compact(
+ xfs_ifork_t *ifp) /* inode fork pointer */
+{
+ xfs_extnum_t nextents; /* number of extents in file */
+ int nlists; /* number of irec's (ex lists) */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+
+ if (nextents == 0) {
+ xfs_iext_destroy(ifp);
+ } else if (nextents <= XFS_INLINE_EXTS) {
+ xfs_iext_indirect_to_direct(ifp);
+ xfs_iext_direct_to_inline(ifp, nextents);
+ } else if (nextents <= XFS_LINEAR_EXTS) {
+ xfs_iext_indirect_to_direct(ifp);
+ } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
+ xfs_iext_irec_compact_pages(ifp);
+ }
+}
+
+/*
+ * Combine extents from neighboring extent pages.
+ */
+void
+xfs_iext_irec_compact_pages(
+ xfs_ifork_t *ifp) /* inode fork pointer */
+{
+ xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */
+ int erp_idx = 0; /* indirection array index */
+ int nlists; /* number of irec's (ex lists) */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ while (erp_idx < nlists - 1) {
+ erp = &ifp->if_u1.if_ext_irec[erp_idx];
+ erp_next = erp + 1;
+ if (erp_next->er_extcount <=
+ (XFS_LINEAR_EXTS - erp->er_extcount)) {
+ memcpy(&erp->er_extbuf[erp->er_extcount],
+ erp_next->er_extbuf, erp_next->er_extcount *
+ sizeof(xfs_bmbt_rec_t));
+ erp->er_extcount += erp_next->er_extcount;
+ /*
+ * Free page before removing extent record
+ * so er_extoffs don't get modified in
+ * xfs_iext_irec_remove.
+ */
+ kmem_free(erp_next->er_extbuf);
+ erp_next->er_extbuf = NULL;
+ xfs_iext_irec_remove(ifp, erp_idx + 1);
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ } else {
+ erp_idx++;
+ }
+ }
+}
+
+/*
+ * This is called to update the er_extoff field in the indirection
+ * array when extents have been added or removed from one of the
+ * extent lists. erp_idx contains the irec index to begin updating
+ * at and ext_diff contains the number of extents that were added
+ * or removed.
+ */
+void
+xfs_iext_irec_update_extoffs(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ int erp_idx, /* irec index to update */
+ int ext_diff) /* number of new extents */
+{
+ int i; /* loop counter */
+ int nlists; /* number of irec's (ex lists */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ for (i = erp_idx; i < nlists; i++) {
+ ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
+ }
+}
diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/xfs_inode_fork.h
new file mode 100644
index 00000000000..7d3b1ed6dcb
--- /dev/null
+++ b/fs/xfs/xfs_inode_fork.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_INODE_FORK_H__
+#define __XFS_INODE_FORK_H__
+
+struct xfs_inode_log_item;
+struct xfs_dinode;
+
+/*
+ * The following xfs_ext_irec_t struct introduces a second (top) level
+ * to the in-core extent allocation scheme. These structs are allocated
+ * in a contiguous block, creating an indirection array where each entry
+ * (irec) contains a pointer to a buffer of in-core extent records which
+ * it manages. Each extent buffer is 4k in size, since 4k is the system
+ * page size on Linux i386 and systems with larger page sizes don't seem
+ * to gain much, if anything, by using their native page size as the
+ * extent buffer size. Also, using 4k extent buffers everywhere provides
+ * a consistent interface for CXFS across different platforms.
+ *
+ * There is currently no limit on the number of irec's (extent lists)
+ * allowed, so heavily fragmented files may require an indirection array
+ * which spans multiple system pages of memory. The number of extents
+ * which would require this amount of contiguous memory is very large
+ * and should not cause problems in the foreseeable future. However,
+ * if the memory needed for the contiguous array ever becomes a problem,
+ * it is possible that a third level of indirection may be required.
+ */
+typedef struct xfs_ext_irec {
+ xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */
+ xfs_extnum_t er_extoff; /* extent offset in file */
+ xfs_extnum_t er_extcount; /* number of extents in page/block */
+} xfs_ext_irec_t;
+
+/*
+ * File incore extent information, present for each of data & attr forks.
+ */
+#define XFS_IEXT_BUFSZ 4096
+#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
+#define XFS_INLINE_EXTS 2
+#define XFS_INLINE_DATA 32
+typedef struct xfs_ifork {
+ int if_bytes; /* bytes in if_u1 */
+ int if_real_bytes; /* bytes allocated in if_u1 */
+ struct xfs_btree_block *if_broot; /* file's incore btree root */
+ short if_broot_bytes; /* bytes allocated for root */
+ unsigned char if_flags; /* per-fork flags */
+ union {
+ xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
+ xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
+ char *if_data; /* inline file data */
+ } if_u1;
+ union {
+ xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
+ /* very small file extents */
+ char if_inline_data[XFS_INLINE_DATA];
+ /* very small file data */
+ xfs_dev_t if_rdev; /* dev number if special */
+ uuid_t if_uuid; /* mount point value */
+ } if_u2;
+} xfs_ifork_t;
+
+/*
+ * Per-fork incore inode flags.
+ */
+#define XFS_IFINLINE 0x01 /* Inline data is read in */
+#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
+#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
+#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
+
+/*
+ * Fork handling.
+ */
+
+#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0)
+#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3))
+
+#define XFS_IFORK_PTR(ip,w) \
+ ((w) == XFS_DATA_FORK ? \
+ &(ip)->i_df : \
+ (ip)->i_afp)
+#define XFS_IFORK_DSIZE(ip) \
+ (XFS_IFORK_Q(ip) ? \
+ XFS_IFORK_BOFF(ip) : \
+ XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
+#define XFS_IFORK_ASIZE(ip) \
+ (XFS_IFORK_Q(ip) ? \
+ XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
+ XFS_IFORK_BOFF(ip) : \
+ 0)
+#define XFS_IFORK_SIZE(ip,w) \
+ ((w) == XFS_DATA_FORK ? \
+ XFS_IFORK_DSIZE(ip) : \
+ XFS_IFORK_ASIZE(ip))
+#define XFS_IFORK_FORMAT(ip,w) \
+ ((w) == XFS_DATA_FORK ? \
+ (ip)->i_d.di_format : \
+ (ip)->i_d.di_aformat)
+#define XFS_IFORK_FMT_SET(ip,w,n) \
+ ((w) == XFS_DATA_FORK ? \
+ ((ip)->i_d.di_format = (n)) : \
+ ((ip)->i_d.di_aformat = (n)))
+#define XFS_IFORK_NEXTENTS(ip,w) \
+ ((w) == XFS_DATA_FORK ? \
+ (ip)->i_d.di_nextents : \
+ (ip)->i_d.di_anextents)
+#define XFS_IFORK_NEXT_SET(ip,w,n) \
+ ((w) == XFS_DATA_FORK ? \
+ ((ip)->i_d.di_nextents = (n)) : \
+ ((ip)->i_d.di_anextents = (n)))
+#define XFS_IFORK_MAXEXT(ip, w) \
+ (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
+
+int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
+void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
+ struct xfs_inode_log_item *, int);
+void xfs_idestroy_fork(struct xfs_inode *, int);
+void xfs_idata_realloc(struct xfs_inode *, int, int);
+void xfs_iroot_realloc(struct xfs_inode *, int, int);
+int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
+int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
+ int);
+
+struct xfs_bmbt_rec_host *
+ xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
+void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t,
+ struct xfs_bmbt_irec *, int);
+void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int);
+void xfs_iext_add_indirect_multi(struct xfs_ifork *, int,
+ xfs_extnum_t, int);
+void xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int);
+void xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int);
+void xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int);
+void xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int);
+void xfs_iext_realloc_direct(struct xfs_ifork *, int);
+void xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t);
+void xfs_iext_inline_to_direct(struct xfs_ifork *, int);
+void xfs_iext_destroy(struct xfs_ifork *);
+struct xfs_bmbt_rec_host *
+ xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *);
+struct xfs_ext_irec *
+ xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *);
+struct xfs_ext_irec *
+ xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *,
+ int);
+void xfs_iext_irec_init(struct xfs_ifork *);
+struct xfs_ext_irec *
+ xfs_iext_irec_new(struct xfs_ifork *, int);
+void xfs_iext_irec_remove(struct xfs_ifork *, int);
+void xfs_iext_irec_compact(struct xfs_ifork *);
+void xfs_iext_irec_compact_pages(struct xfs_ifork *);
+void xfs_iext_irec_compact_full(struct xfs_ifork *);
+void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
+
+extern struct kmem_zone *xfs_ifork_zone;
+
+#endif /* __XFS_INODE_FORK_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index d041d47d9d8..a640137b357 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -17,19 +17,20 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_trans_priv.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_trans_priv.h"
+#include "xfs_dinode.h"
+#include "xfs_log.h"
kmem_zone_t *xfs_ili_zone; /* inode log item zone */
@@ -39,179 +40,120 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
return container_of(lip, struct xfs_inode_log_item, ili_item);
}
-
-/*
- * This returns the number of iovecs needed to log the given inode item.
- *
- * We need one iovec for the inode log format structure, one for the
- * inode core, and possibly one for the inode data/extents/b-tree root
- * and one for the inode attribute data/extents/b-tree root.
- */
-STATIC uint
-xfs_inode_item_size(
- struct xfs_log_item *lip)
+STATIC void
+xfs_inode_item_data_fork_size(
+ struct xfs_inode_log_item *iip,
+ int *nvecs,
+ int *nbytes)
{
- struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
- uint nvecs = 2;
switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
if ((iip->ili_fields & XFS_ILOG_DEXT) &&
ip->i_d.di_nextents > 0 &&
- ip->i_df.if_bytes > 0)
- nvecs++;
+ ip->i_df.if_bytes > 0) {
+ /* worst case, doesn't subtract delalloc extents */
+ *nbytes += XFS_IFORK_DSIZE(ip);
+ *nvecs += 1;
+ }
break;
-
case XFS_DINODE_FMT_BTREE:
if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
- ip->i_df.if_broot_bytes > 0)
- nvecs++;
+ ip->i_df.if_broot_bytes > 0) {
+ *nbytes += ip->i_df.if_broot_bytes;
+ *nvecs += 1;
+ }
break;
-
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
- ip->i_df.if_bytes > 0)
- nvecs++;
+ ip->i_df.if_bytes > 0) {
+ *nbytes += roundup(ip->i_df.if_bytes, 4);
+ *nvecs += 1;
+ }
break;
case XFS_DINODE_FMT_DEV:
case XFS_DINODE_FMT_UUID:
break;
-
default:
ASSERT(0);
break;
}
+}
- if (!XFS_IFORK_Q(ip))
- return nvecs;
-
+STATIC void
+xfs_inode_item_attr_fork_size(
+ struct xfs_inode_log_item *iip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_inode *ip = iip->ili_inode;
- /*
- * Log any necessary attribute data.
- */
switch (ip->i_d.di_aformat) {
case XFS_DINODE_FMT_EXTENTS:
if ((iip->ili_fields & XFS_ILOG_AEXT) &&
ip->i_d.di_anextents > 0 &&
- ip->i_afp->if_bytes > 0)
- nvecs++;
+ ip->i_afp->if_bytes > 0) {
+ /* worst case, doesn't subtract unused space */
+ *nbytes += XFS_IFORK_ASIZE(ip);
+ *nvecs += 1;
+ }
break;
-
case XFS_DINODE_FMT_BTREE:
if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
- ip->i_afp->if_broot_bytes > 0)
- nvecs++;
+ ip->i_afp->if_broot_bytes > 0) {
+ *nbytes += ip->i_afp->if_broot_bytes;
+ *nvecs += 1;
+ }
break;
-
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
- ip->i_afp->if_bytes > 0)
- nvecs++;
+ ip->i_afp->if_bytes > 0) {
+ *nbytes += roundup(ip->i_afp->if_bytes, 4);
+ *nvecs += 1;
+ }
break;
-
default:
ASSERT(0);
break;
}
-
- return nvecs;
}
/*
- * xfs_inode_item_format_extents - convert in-core extents to on-disk form
- *
- * For either the data or attr fork in extent format, we need to endian convert
- * the in-core extent as we place them into the on-disk inode. In this case, we
- * need to do this conversion before we write the extents into the log. Because
- * we don't have the disk inode to write into here, we allocate a buffer and
- * format the extents into it via xfs_iextents_copy(). We free the buffer in
- * the unlock routine after the copy for the log has been made.
+ * This returns the number of iovecs needed to log the given inode item.
*
- * In the case of the data fork, the in-core and on-disk fork sizes can be
- * different due to delayed allocation extents. We only log on-disk extents
- * here, so always use the physical fork size to determine the size of the
- * buffer we need to allocate.
+ * We need one iovec for the inode log format structure, one for the
+ * inode core, and possibly one for the inode data/extents/b-tree root
+ * and one for the inode attribute data/extents/b-tree root.
*/
STATIC void
-xfs_inode_item_format_extents(
- struct xfs_inode *ip,
- struct xfs_log_iovec *vecp,
- int whichfork,
- int type)
+xfs_inode_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
{
- xfs_bmbt_rec_t *ext_buffer;
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
- ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP);
- if (whichfork == XFS_DATA_FORK)
- ip->i_itemp->ili_extents_buf = ext_buffer;
- else
- ip->i_itemp->ili_aextents_buf = ext_buffer;
+ *nvecs += 2;
+ *nbytes += sizeof(struct xfs_inode_log_format) +
+ xfs_icdinode_size(ip->i_d.di_version);
- vecp->i_addr = ext_buffer;
- vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork);
- vecp->i_type = type;
+ xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
+ if (XFS_IFORK_Q(ip))
+ xfs_inode_item_attr_fork_size(iip, nvecs, nbytes);
}
-/*
- * This is called to fill in the vector of log iovecs for the
- * given inode log item. It fills the first item with an inode
- * log format structure, the second with the on-disk inode structure,
- * and a possible third and/or fourth with the inode data/extents/b-tree
- * root and inode attributes data/extents/b-tree root.
- */
STATIC void
-xfs_inode_item_format(
- struct xfs_log_item *lip,
- struct xfs_log_iovec *vecp)
+xfs_inode_item_format_data_fork(
+ struct xfs_inode_log_item *iip,
+ struct xfs_inode_log_format *ilf,
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp)
{
- struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
- uint nvecs;
size_t data_bytes;
- xfs_mount_t *mp;
-
- vecp->i_addr = &iip->ili_format;
- vecp->i_len = sizeof(xfs_inode_log_format_t);
- vecp->i_type = XLOG_REG_TYPE_IFORMAT;
- vecp++;
- nvecs = 1;
-
- vecp->i_addr = &ip->i_d;
- vecp->i_len = sizeof(struct xfs_icdinode);
- vecp->i_type = XLOG_REG_TYPE_ICORE;
- vecp++;
- nvecs++;
-
- /*
- * If this is really an old format inode, then we need to
- * log it as such. This means that we have to copy the link
- * count from the new field to the old. We don't have to worry
- * about the new fields, because nothing trusts them as long as
- * the old inode version number is there. If the superblock already
- * has a new version number, then we don't bother converting back.
- */
- mp = ip->i_mount;
- ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
- if (ip->i_d.di_version == 1) {
- if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
- /*
- * Convert it back.
- */
- ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
- ip->i_d.di_onlink = ip->i_d.di_nlink;
- } else {
- /*
- * The superblock version has already been bumped,
- * so just make the conversion to the new inode
- * format permanent.
- */
- ip->i_d.di_version = 2;
- ip->i_d.di_onlink = 0;
- memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
- }
- }
switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
@@ -222,36 +164,23 @@ xfs_inode_item_format(
if ((iip->ili_fields & XFS_ILOG_DEXT) &&
ip->i_d.di_nextents > 0 &&
ip->i_df.if_bytes > 0) {
+ struct xfs_bmbt_rec *p;
+
ASSERT(ip->i_df.if_u1.if_extents != NULL);
ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0);
- ASSERT(iip->ili_extents_buf == NULL);
-
-#ifdef XFS_NATIVE_HOST
- if (ip->i_d.di_nextents == ip->i_df.if_bytes /
- (uint)sizeof(xfs_bmbt_rec_t)) {
- /*
- * There are no delayed allocation
- * extents, so just point to the
- * real extents array.
- */
- vecp->i_addr = ip->i_df.if_u1.if_extents;
- vecp->i_len = ip->i_df.if_bytes;
- vecp->i_type = XLOG_REG_TYPE_IEXT;
- } else
-#endif
- {
- xfs_inode_item_format_extents(ip, vecp,
- XFS_DATA_FORK, XLOG_REG_TYPE_IEXT);
- }
- ASSERT(vecp->i_len <= ip->i_df.if_bytes);
- iip->ili_format.ilf_dsize = vecp->i_len;
- vecp++;
- nvecs++;
+
+ p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT);
+ data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK);
+ xlog_finish_iovec(lv, *vecp, data_bytes);
+
+ ASSERT(data_bytes <= ip->i_df.if_bytes);
+
+ ilf->ilf_dsize = data_bytes;
+ ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_DEXT;
}
break;
-
case XFS_DINODE_FMT_BTREE:
iip->ili_fields &=
~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
@@ -260,91 +189,70 @@ xfs_inode_item_format(
if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
ip->i_df.if_broot_bytes > 0) {
ASSERT(ip->i_df.if_broot != NULL);
- vecp->i_addr = ip->i_df.if_broot;
- vecp->i_len = ip->i_df.if_broot_bytes;
- vecp->i_type = XLOG_REG_TYPE_IBROOT;
- vecp++;
- nvecs++;
- iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
+ xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IBROOT,
+ ip->i_df.if_broot,
+ ip->i_df.if_broot_bytes);
+ ilf->ilf_dsize = ip->i_df.if_broot_bytes;
+ ilf->ilf_size++;
} else {
ASSERT(!(iip->ili_fields &
XFS_ILOG_DBROOT));
-#ifdef XFS_TRANS_DEBUG
- if (iip->ili_root_size > 0) {
- ASSERT(iip->ili_root_size ==
- ip->i_df.if_broot_bytes);
- ASSERT(memcmp(iip->ili_orig_root,
- ip->i_df.if_broot,
- iip->ili_root_size) == 0);
- } else {
- ASSERT(ip->i_df.if_broot_bytes == 0);
- }
-#endif
iip->ili_fields &= ~XFS_ILOG_DBROOT;
}
break;
-
case XFS_DINODE_FMT_LOCAL:
iip->ili_fields &=
~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
XFS_ILOG_DEV | XFS_ILOG_UUID);
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
- ASSERT(ip->i_df.if_u1.if_data != NULL);
- ASSERT(ip->i_d.di_size > 0);
-
- vecp->i_addr = ip->i_df.if_u1.if_data;
/*
* Round i_bytes up to a word boundary.
* The underlying memory is guaranteed to
* to be there by xfs_idata_realloc().
*/
data_bytes = roundup(ip->i_df.if_bytes, 4);
- ASSERT((ip->i_df.if_real_bytes == 0) ||
- (ip->i_df.if_real_bytes == data_bytes));
- vecp->i_len = (int)data_bytes;
- vecp->i_type = XLOG_REG_TYPE_ILOCAL;
- vecp++;
- nvecs++;
- iip->ili_format.ilf_dsize = (unsigned)data_bytes;
+ ASSERT(ip->i_df.if_real_bytes == 0 ||
+ ip->i_df.if_real_bytes == data_bytes);
+ ASSERT(ip->i_df.if_u1.if_data != NULL);
+ ASSERT(ip->i_d.di_size > 0);
+ xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
+ ip->i_df.if_u1.if_data, data_bytes);
+ ilf->ilf_dsize = (unsigned)data_bytes;
+ ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_DDATA;
}
break;
-
case XFS_DINODE_FMT_DEV:
iip->ili_fields &=
~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
XFS_ILOG_DEXT | XFS_ILOG_UUID);
- if (iip->ili_fields & XFS_ILOG_DEV) {
- iip->ili_format.ilf_u.ilfu_rdev =
- ip->i_df.if_u2.if_rdev;
- }
+ if (iip->ili_fields & XFS_ILOG_DEV)
+ ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev;
break;
-
case XFS_DINODE_FMT_UUID:
iip->ili_fields &=
~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
XFS_ILOG_DEXT | XFS_ILOG_DEV);
- if (iip->ili_fields & XFS_ILOG_UUID) {
- iip->ili_format.ilf_u.ilfu_uuid =
- ip->i_df.if_u2.if_uuid;
- }
+ if (iip->ili_fields & XFS_ILOG_UUID)
+ ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid;
break;
-
default:
ASSERT(0);
break;
}
+}
- /*
- * If there are no attributes associated with the file, then we're done.
- */
- if (!XFS_IFORK_Q(ip)) {
- iip->ili_fields &=
- ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
- goto out;
- }
+STATIC void
+xfs_inode_item_format_attr_fork(
+ struct xfs_inode_log_item *iip,
+ struct xfs_inode_log_format *ilf,
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp)
+{
+ struct xfs_inode *ip = iip->ili_inode;
+ size_t data_bytes;
switch (ip->i_d.di_aformat) {
case XFS_DINODE_FMT_EXTENTS:
@@ -354,30 +262,22 @@ xfs_inode_item_format(
if ((iip->ili_fields & XFS_ILOG_AEXT) &&
ip->i_d.di_anextents > 0 &&
ip->i_afp->if_bytes > 0) {
+ struct xfs_bmbt_rec *p;
+
ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) ==
ip->i_d.di_anextents);
ASSERT(ip->i_afp->if_u1.if_extents != NULL);
-#ifdef XFS_NATIVE_HOST
- /*
- * There are not delayed allocation extents
- * for attributes, so just point at the array.
- */
- vecp->i_addr = ip->i_afp->if_u1.if_extents;
- vecp->i_len = ip->i_afp->if_bytes;
- vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
-#else
- ASSERT(iip->ili_aextents_buf == NULL);
- xfs_inode_item_format_extents(ip, vecp,
- XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT);
-#endif
- iip->ili_format.ilf_asize = vecp->i_len;
- vecp++;
- nvecs++;
+
+ p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT);
+ data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK);
+ xlog_finish_iovec(lv, *vecp, data_bytes);
+
+ ilf->ilf_asize = data_bytes;
+ ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_AEXT;
}
break;
-
case XFS_DINODE_FMT_BTREE:
iip->ili_fields &=
~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
@@ -386,61 +286,89 @@ xfs_inode_item_format(
ip->i_afp->if_broot_bytes > 0) {
ASSERT(ip->i_afp->if_broot != NULL);
- vecp->i_addr = ip->i_afp->if_broot;
- vecp->i_len = ip->i_afp->if_broot_bytes;
- vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
- vecp++;
- nvecs++;
- iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
+ xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT,
+ ip->i_afp->if_broot,
+ ip->i_afp->if_broot_bytes);
+ ilf->ilf_asize = ip->i_afp->if_broot_bytes;
+ ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_ABROOT;
}
break;
-
case XFS_DINODE_FMT_LOCAL:
iip->ili_fields &=
~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
ip->i_afp->if_bytes > 0) {
- ASSERT(ip->i_afp->if_u1.if_data != NULL);
-
- vecp->i_addr = ip->i_afp->if_u1.if_data;
/*
* Round i_bytes up to a word boundary.
* The underlying memory is guaranteed to
* to be there by xfs_idata_realloc().
*/
data_bytes = roundup(ip->i_afp->if_bytes, 4);
- ASSERT((ip->i_afp->if_real_bytes == 0) ||
- (ip->i_afp->if_real_bytes == data_bytes));
- vecp->i_len = (int)data_bytes;
- vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL;
- vecp++;
- nvecs++;
- iip->ili_format.ilf_asize = (unsigned)data_bytes;
+ ASSERT(ip->i_afp->if_real_bytes == 0 ||
+ ip->i_afp->if_real_bytes == data_bytes);
+ ASSERT(ip->i_afp->if_u1.if_data != NULL);
+ xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
+ ip->i_afp->if_u1.if_data,
+ data_bytes);
+ ilf->ilf_asize = (unsigned)data_bytes;
+ ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_ADATA;
}
break;
-
default:
ASSERT(0);
break;
}
-
-out:
- /*
- * Now update the log format that goes out to disk from the in-core
- * values. We always write the inode core to make the arithmetic
- * games in recovery easier, which isn't a big deal as just about any
- * transaction would dirty it anyway.
- */
- iip->ili_format.ilf_fields = XFS_ILOG_CORE |
- (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
- iip->ili_format.ilf_size = nvecs;
}
+/*
+ * This is called to fill in the vector of log iovecs for the given inode
+ * log item. It fills the first item with an inode log format structure,
+ * the second with the on-disk inode structure, and a possible third and/or
+ * fourth with the inode data/extents/b-tree root and inode attributes
+ * data/extents/b-tree root.
+ */
+STATIC void
+xfs_inode_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+ struct xfs_inode_log_format *ilf;
+ struct xfs_log_iovec *vecp = NULL;
+
+ ASSERT(ip->i_d.di_version > 1);
+
+ ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT);
+ ilf->ilf_type = XFS_LI_INODE;
+ ilf->ilf_ino = ip->i_ino;
+ ilf->ilf_blkno = ip->i_imap.im_blkno;
+ ilf->ilf_len = ip->i_imap.im_len;
+ ilf->ilf_boffset = ip->i_imap.im_boffset;
+ ilf->ilf_fields = XFS_ILOG_CORE;
+ ilf->ilf_size = 2; /* format + core */
+ xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format));
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE,
+ &ip->i_d,
+ xfs_icdinode_size(ip->i_d.di_version));
+
+ xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
+ if (XFS_IFORK_Q(ip)) {
+ xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp);
+ } else {
+ iip->ili_fields &=
+ ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
+ }
+
+ /* update the format with the exact fields we actually logged */
+ ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
+}
/*
* This is called to pin the inode associated with the inode log
@@ -557,27 +485,6 @@ xfs_inode_item_unlock(
ASSERT(ip->i_itemp != NULL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- /*
- * If the inode needed a separate buffer with which to log
- * its extents, then free it now.
- */
- if (iip->ili_extents_buf != NULL) {
- ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS);
- ASSERT(ip->i_d.di_nextents > 0);
- ASSERT(iip->ili_fields & XFS_ILOG_DEXT);
- ASSERT(ip->i_df.if_bytes > 0);
- kmem_free(iip->ili_extents_buf);
- iip->ili_extents_buf = NULL;
- }
- if (iip->ili_aextents_buf != NULL) {
- ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS);
- ASSERT(ip->i_d.di_anextents > 0);
- ASSERT(iip->ili_fields & XFS_ILOG_AEXT);
- ASSERT(ip->i_afp->if_bytes > 0);
- kmem_free(iip->ili_aextents_buf);
- iip->ili_aextents_buf = NULL;
- }
-
lock_flags = iip->ili_lock_flags;
iip->ili_lock_flags = 0;
if (lock_flags)
@@ -664,11 +571,6 @@ xfs_inode_item_init(
iip->ili_inode = ip;
xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
&xfs_inode_item_ops);
- iip->ili_format.ilf_type = XFS_LI_INODE;
- iip->ili_format.ilf_ino = ip->i_ino;
- iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
- iip->ili_format.ilf_len = ip->i_imap.im_len;
- iip->ili_format.ilf_boffset = ip->i_imap.im_boffset;
}
/*
@@ -678,11 +580,6 @@ void
xfs_inode_item_destroy(
xfs_inode_t *ip)
{
-#ifdef XFS_TRANS_DEBUG
- if (ip->i_itemp->ili_root_size != 0) {
- kmem_free(ip->i_itemp->ili_orig_root);
- }
-#endif
kmem_zone_free(xfs_ili_zone, ip->i_itemp);
}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 376d4d0b263..488d81254e2 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -18,123 +18,13 @@
#ifndef __XFS_INODE_ITEM_H__
#define __XFS_INODE_ITEM_H__
-/*
- * This is the structure used to lay out an inode log item in the
- * log. The size of the inline data/extents/b-tree root to be logged
- * (if any) is indicated in the ilf_dsize field. Changes to this structure
- * must be added on to the end.
- */
-typedef struct xfs_inode_log_format {
- __uint16_t ilf_type; /* inode log item type */
- __uint16_t ilf_size; /* size of this item */
- __uint32_t ilf_fields; /* flags for fields logged */
- __uint16_t ilf_asize; /* size of attr d/ext/root */
- __uint16_t ilf_dsize; /* size of data/ext/root */
- __uint64_t ilf_ino; /* inode number */
- union {
- __uint32_t ilfu_rdev; /* rdev value for dev inode*/
- uuid_t ilfu_uuid; /* mount point value */
- } ilf_u;
- __int64_t ilf_blkno; /* blkno of inode buffer */
- __int32_t ilf_len; /* len of inode buffer */
- __int32_t ilf_boffset; /* off of inode in buffer */
-} xfs_inode_log_format_t;
-
-typedef struct xfs_inode_log_format_32 {
- __uint16_t ilf_type; /* inode log item type */
- __uint16_t ilf_size; /* size of this item */
- __uint32_t ilf_fields; /* flags for fields logged */
- __uint16_t ilf_asize; /* size of attr d/ext/root */
- __uint16_t ilf_dsize; /* size of data/ext/root */
- __uint64_t ilf_ino; /* inode number */
- union {
- __uint32_t ilfu_rdev; /* rdev value for dev inode*/
- uuid_t ilfu_uuid; /* mount point value */
- } ilf_u;
- __int64_t ilf_blkno; /* blkno of inode buffer */
- __int32_t ilf_len; /* len of inode buffer */
- __int32_t ilf_boffset; /* off of inode in buffer */
-} __attribute__((packed)) xfs_inode_log_format_32_t;
-
-typedef struct xfs_inode_log_format_64 {
- __uint16_t ilf_type; /* inode log item type */
- __uint16_t ilf_size; /* size of this item */
- __uint32_t ilf_fields; /* flags for fields logged */
- __uint16_t ilf_asize; /* size of attr d/ext/root */
- __uint16_t ilf_dsize; /* size of data/ext/root */
- __uint32_t ilf_pad; /* pad for 64 bit boundary */
- __uint64_t ilf_ino; /* inode number */
- union {
- __uint32_t ilfu_rdev; /* rdev value for dev inode*/
- uuid_t ilfu_uuid; /* mount point value */
- } ilf_u;
- __int64_t ilf_blkno; /* blkno of inode buffer */
- __int32_t ilf_len; /* len of inode buffer */
- __int32_t ilf_boffset; /* off of inode in buffer */
-} xfs_inode_log_format_64_t;
-
-/*
- * Flags for xfs_trans_log_inode flags field.
- */
-#define XFS_ILOG_CORE 0x001 /* log standard inode fields */
-#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */
-#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */
-#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */
-#define XFS_ILOG_DEV 0x010 /* log the dev field */
-#define XFS_ILOG_UUID 0x020 /* log the uuid field */
-#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */
-#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
-#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
-
-
-/*
- * The timestamps are dirty, but not necessarily anything else in the inode
- * core. Unlike the other fields above this one must never make it to disk
- * in the ilf_fields of the inode_log_format, but is purely store in-memory in
- * ili_fields in the inode_log_item.
- */
-#define XFS_ILOG_TIMESTAMP 0x4000
-
-#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
- XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
- XFS_ILOG_UUID | XFS_ILOG_ADATA | \
- XFS_ILOG_AEXT | XFS_ILOG_ABROOT)
-
-#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
- XFS_ILOG_DBROOT)
-
-#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
- XFS_ILOG_ABROOT)
-
-#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
- XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
- XFS_ILOG_DEV | XFS_ILOG_UUID | \
- XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
- XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP)
-
-static inline int xfs_ilog_fbroot(int w)
-{
- return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
-}
-
-static inline int xfs_ilog_fext(int w)
-{
- return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
-}
-
-static inline int xfs_ilog_fdata(int w)
-{
- return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
-}
-
-#ifdef __KERNEL__
+/* kernel only definitions */
struct xfs_buf;
struct xfs_bmbt_rec;
struct xfs_inode;
struct xfs_mount;
-
typedef struct xfs_inode_log_item {
xfs_log_item_t ili_item; /* common portion */
struct xfs_inode *ili_inode; /* inode ptr */
@@ -144,18 +34,8 @@ typedef struct xfs_inode_log_item {
unsigned short ili_logged; /* flushed logged data */
unsigned int ili_last_fields; /* fields when flushed */
unsigned int ili_fields; /* fields to be logged */
- struct xfs_bmbt_rec *ili_extents_buf; /* array of logged
- data exts */
- struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged
- attr exts */
-#ifdef XFS_TRANS_DEBUG
- int ili_root_size;
- char *ili_orig_root;
-#endif
- xfs_inode_log_format_t ili_format; /* logged structure */
} xfs_inode_log_item_t;
-
static inline int xfs_inode_clean(xfs_inode_t *ip)
{
return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL);
@@ -169,6 +49,6 @@ extern void xfs_iflush_abort(struct xfs_inode *, bool);
extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
xfs_inode_log_format_t *);
-#endif /* __KERNEL__ */
+extern struct kmem_zone *xfs_ili_zone;
#endif /* __XFS_INODE_ITEM_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index c1c3ef88a26..8bc1bbce745 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -17,32 +17,31 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_ioctl.h"
+#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
#include "xfs_itable.h"
#include "xfs_error.h"
#include "xfs_attr.h"
#include "xfs_bmap.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_dfrag.h"
+#include "xfs_bmap_util.h"
#include "xfs_fsops.h"
-#include "xfs_vnodeops.h"
#include "xfs_discard.h"
#include "xfs_quota.h"
-#include "xfs_inode_item.h"
#include "xfs_export.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_symlink.h"
+#include "xfs_dinode.h"
+#include "xfs_trans.h"
#include <linux/capability.h>
#include <linux/dcache.h>
@@ -71,7 +70,7 @@ xfs_find_handle(
int hsize;
xfs_handle_t handle;
struct inode *inode;
- struct fd f = {0};
+ struct fd f = {NULL};
struct path path;
int error;
struct xfs_inode *ip;
@@ -80,7 +79,7 @@ xfs_find_handle(
f = fdget(hreq->fd);
if (!f.file)
return -EBADF;
- inode = f.file->f_path.dentry->d_inode;
+ inode = file_inode(f.file);
} else {
error = user_lpath((const char __user *)hreq->path, &path);
if (error)
@@ -113,15 +112,11 @@ xfs_find_handle(
memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
hsize = sizeof(xfs_fsid_t);
} else {
- int lock_mode;
-
- lock_mode = xfs_ilock_map_shared(ip);
handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
sizeof(handle.ha_fid.fid_len);
handle.ha_fid.fid_pad = 0;
handle.ha_fid.fid_gen = ip->i_d.di_gen;
handle.ha_fid.fid_ino = ip->i_ino;
- xfs_iunlock_map_shared(ip, lock_mode);
hsize = XFS_HSIZE(handle);
}
@@ -168,7 +163,7 @@ xfs_handle_to_dentry(
/*
* Only allow handle opens under a directory.
*/
- if (!S_ISDIR(parfilp->f_path.dentry->d_inode->i_mode))
+ if (!S_ISDIR(file_inode(parfilp)->i_mode))
return ERR_PTR(-ENOTDIR);
if (hlen != sizeof(xfs_handle_t))
@@ -248,7 +243,7 @@ xfs_open_by_handle(
goto out_dput;
}
- fd = get_unused_fd();
+ fd = get_unused_fd_flags(0);
if (fd < 0) {
error = fd;
goto out_dput;
@@ -276,32 +271,6 @@ xfs_open_by_handle(
return error;
}
-/*
- * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's
- * unused first argument.
- */
-STATIC int
-do_readlink(
- char __user *buffer,
- int buflen,
- const char *link)
-{
- int len;
-
- len = PTR_ERR(link);
- if (IS_ERR(link))
- goto out;
-
- len = strlen(link);
- if (len > (unsigned) buflen)
- len = buflen;
- if (copy_to_user(buffer, link, len))
- len = -EFAULT;
- out:
- return len;
-}
-
-
int
xfs_readlink_by_handle(
struct file *parfilp,
@@ -339,7 +308,7 @@ xfs_readlink_by_handle(
error = -xfs_readlink(XFS_I(dentry->d_inode), link);
if (error)
goto out_kfree;
- error = do_readlink(hreq->ohandle, olen, link);
+ error = readlink_copy(hreq->ohandle, olen, link);
if (error)
goto out_kfree;
@@ -350,6 +319,40 @@ xfs_readlink_by_handle(
return error;
}
+int
+xfs_set_dmattrs(
+ xfs_inode_t *ip,
+ u_int evmask,
+ u_int16_t state)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_trans_t *tp;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return XFS_ERROR(EPERM);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ ip->i_d.di_dmevmask = evmask;
+ ip->i_d.di_dmstate = state;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ error = xfs_trans_commit(tp, 0);
+
+ return error;
+}
+
STATIC int
xfs_fssetdm_by_handle(
struct file *parfilp,
@@ -409,7 +412,8 @@ xfs_attrlist_by_handle(
return -XFS_ERROR(EPERM);
if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
return -XFS_ERROR(EFAULT);
- if (al_hreq.buflen > XATTR_LIST_MAX)
+ if (al_hreq.buflen < sizeof(struct attrlist) ||
+ al_hreq.buflen > XATTR_LIST_MAX)
return -XFS_ERROR(EINVAL);
/*
@@ -422,7 +426,7 @@ xfs_attrlist_by_handle(
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
+ kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
if (!kbuf)
goto out_dput;
@@ -435,9 +439,9 @@ xfs_attrlist_by_handle(
if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen))
error = -EFAULT;
- out_kfree:
- kfree(kbuf);
- out_dput:
+out_kfree:
+ kmem_free(kbuf);
+out_dput:
dput(dentry);
return error;
}
@@ -455,12 +459,9 @@ xfs_attrmulti_attr_get(
if (*len > XATTR_SIZE_MAX)
return EINVAL;
- kbuf = kmem_zalloc(*len, KM_SLEEP | KM_MAYFAIL);
- if (!kbuf) {
- kbuf = kmem_zalloc_large(*len);
- if (!kbuf)
- return ENOMEM;
- }
+ kbuf = kmem_zalloc_large(*len, KM_SLEEP);
+ if (!kbuf)
+ return ENOMEM;
error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
if (error)
@@ -469,11 +470,8 @@ xfs_attrmulti_attr_get(
if (copy_to_user(ubuf, kbuf, *len))
error = EFAULT;
- out_kfree:
- if (is_vmalloc_addr(kbuf))
- kmem_free_large(kbuf);
- else
- kmem_free(kbuf);
+out_kfree:
+ kmem_free(kbuf);
return error;
}
@@ -545,10 +543,11 @@ xfs_attrmulti_by_handle(
ops = memdup_user(am_hreq.ops, size);
if (IS_ERR(ops)) {
- error = PTR_ERR(ops);
+ error = -PTR_ERR(ops);
goto out_dput;
}
+ error = ENOMEM;
attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
if (!attr_name)
goto out_kfree_ops;
@@ -558,7 +557,7 @@ xfs_attrmulti_by_handle(
ops[i].am_error = strncpy_from_user((char *)attr_name,
ops[i].am_attrname, MAXNAMELEN);
if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
- error = -ERANGE;
+ error = ERANGE;
if (ops[i].am_error < 0)
break;
@@ -613,7 +612,11 @@ xfs_ioc_space(
unsigned int cmd,
xfs_flock64_t *bf)
{
- int attr_flags = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ struct iattr iattr;
+ bool setprealloc = false;
+ bool clrprealloc = false;
int error;
/*
@@ -633,19 +636,128 @@ xfs_ioc_space(
if (!S_ISREG(inode->i_mode))
return -XFS_ERROR(EINVAL);
- if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
- attr_flags |= XFS_ATTR_NONBLOCK;
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
- if (filp->f_flags & O_DSYNC)
- attr_flags |= XFS_ATTR_SYNC;
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+ switch (bf->l_whence) {
+ case 0: /*SEEK_SET*/
+ break;
+ case 1: /*SEEK_CUR*/
+ bf->l_start += filp->f_pos;
+ break;
+ case 2: /*SEEK_END*/
+ bf->l_start += XFS_ISIZE(ip);
+ break;
+ default:
+ error = XFS_ERROR(EINVAL);
+ goto out_unlock;
+ }
- if (ioflags & IO_INVIS)
- attr_flags |= XFS_ATTR_DMI;
+ /*
+ * length of <= 0 for resv/unresv/zero is invalid. length for
+ * alloc/free is ignored completely and we have no idea what userspace
+ * might have set it to, so set it to zero to allow range
+ * checks to pass.
+ */
+ switch (cmd) {
+ case XFS_IOC_ZERO_RANGE:
+ case XFS_IOC_RESVSP:
+ case XFS_IOC_RESVSP64:
+ case XFS_IOC_UNRESVSP:
+ case XFS_IOC_UNRESVSP64:
+ if (bf->l_len <= 0) {
+ error = XFS_ERROR(EINVAL);
+ goto out_unlock;
+ }
+ break;
+ default:
+ bf->l_len = 0;
+ break;
+ }
+
+ if (bf->l_start < 0 ||
+ bf->l_start > mp->m_super->s_maxbytes ||
+ bf->l_start + bf->l_len < 0 ||
+ bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) {
+ error = XFS_ERROR(EINVAL);
+ goto out_unlock;
+ }
+
+ switch (cmd) {
+ case XFS_IOC_ZERO_RANGE:
+ error = xfs_zero_file_space(ip, bf->l_start, bf->l_len);
+ if (!error)
+ setprealloc = true;
+ break;
+ case XFS_IOC_RESVSP:
+ case XFS_IOC_RESVSP64:
+ error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len,
+ XFS_BMAPI_PREALLOC);
+ if (!error)
+ setprealloc = true;
+ break;
+ case XFS_IOC_UNRESVSP:
+ case XFS_IOC_UNRESVSP64:
+ error = xfs_free_file_space(ip, bf->l_start, bf->l_len);
+ break;
+ case XFS_IOC_ALLOCSP:
+ case XFS_IOC_ALLOCSP64:
+ case XFS_IOC_FREESP:
+ case XFS_IOC_FREESP64:
+ if (bf->l_start > XFS_ISIZE(ip)) {
+ error = xfs_alloc_file_space(ip, XFS_ISIZE(ip),
+ bf->l_start - XFS_ISIZE(ip), 0);
+ if (error)
+ goto out_unlock;
+ }
+
+ iattr.ia_valid = ATTR_SIZE;
+ iattr.ia_size = bf->l_start;
+
+ error = xfs_setattr_size(ip, &iattr);
+ if (!error)
+ clrprealloc = true;
+ break;
+ default:
+ ASSERT(0);
+ error = XFS_ERROR(EINVAL);
+ }
- error = mnt_want_write_file(filp);
if (error)
- return error;
- error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags);
+ goto out_unlock;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto out_unlock;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ if (!(ioflags & IO_INVIS)) {
+ ip->i_d.di_mode &= ~S_ISUID;
+ if (ip->i_d.di_mode & S_IXGRP)
+ ip->i_d.di_mode &= ~S_ISGID;
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ }
+
+ if (setprealloc)
+ ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+ else if (clrprealloc)
+ ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ if (filp->f_flags & O_DSYNC)
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp, 0);
+
+out_unlock:
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
mnt_drop_write_file(filp);
return -error;
}
@@ -922,7 +1034,7 @@ xfs_ioctl_setattr(
struct xfs_trans *tp;
unsigned int lock_flags = 0;
struct xfs_dquot *udqp = NULL;
- struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
struct xfs_dquot *olddquot = NULL;
int code;
@@ -951,7 +1063,7 @@ xfs_ioctl_setattr(
if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
ip->i_d.di_gid, fa->fsx_projid,
- XFS_QMOPT_PQUOTA, &udqp, &gdqp);
+ XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp);
if (code)
return code;
}
@@ -961,7 +1073,7 @@ xfs_ioctl_setattr(
* first do an error checking pass.
*/
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
- code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+ code = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
if (code)
goto error_return;
@@ -975,21 +1087,28 @@ xfs_ioctl_setattr(
* to the file owner ID, except in cases where the
* CAP_FSETID capability is applicable.
*/
- if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
+ if (!inode_owner_or_capable(VFS_I(ip))) {
code = XFS_ERROR(EPERM);
goto error_return;
}
/*
* Do a quota reservation only if projid is actually going to change.
+ * Only allow changing of projid from init_user_ns since it is a
+ * non user namespace aware identifier.
*/
if (mask & FSX_PROJID) {
+ if (current_user_ns() != &init_user_ns) {
+ code = XFS_ERROR(EINVAL);
+ goto error_return;
+ }
+
if (XFS_IS_QUOTA_RUNNING(mp) &&
XFS_IS_PQUOTA_ON(mp) &&
xfs_get_projid(ip) != fa->fsx_projid) {
ASSERT(tp);
- code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
- capable(CAP_FOWNER) ?
+ code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL,
+ pdqp, capable(CAP_FOWNER) ?
XFS_QMOPT_FORCE_RES : 0);
if (code) /* out of quota */
goto error_return;
@@ -1097,7 +1216,7 @@ xfs_ioctl_setattr(
* cleared upon successful return from chown()
*/
if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
- !capable(CAP_FSETID))
+ !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
/*
@@ -1107,17 +1226,10 @@ xfs_ioctl_setattr(
if (xfs_get_projid(ip) != fa->fsx_projid) {
if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
olddquot = xfs_qm_vop_chown(tp, ip,
- &ip->i_gdquot, gdqp);
+ &ip->i_pdquot, pdqp);
}
+ ASSERT(ip->i_d.di_version > 1);
xfs_set_projid(ip, fa->fsx_projid);
-
- /*
- * We may have to rev the inode as well as
- * the superblock version number since projids didn't
- * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
- */
- if (ip->i_d.di_version == 1)
- xfs_bump_ino_vers2(tp, ip);
}
}
@@ -1154,13 +1266,13 @@ xfs_ioctl_setattr(
*/
xfs_qm_dqrele(olddquot);
xfs_qm_dqrele(udqp);
- xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
return code;
error_return:
xfs_qm_dqrele(udqp);
- xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
xfs_trans_cancel(tp, 0);
if (lock_flags)
xfs_iunlock(ip, lock_flags);
@@ -1322,6 +1434,75 @@ xfs_ioc_getbmapx(
return 0;
}
+int
+xfs_ioc_swapext(
+ xfs_swapext_t *sxp)
+{
+ xfs_inode_t *ip, *tip;
+ struct fd f, tmp;
+ int error = 0;
+
+ /* Pull information for the target fd */
+ f = fdget((int)sxp->sx_fdtarget);
+ if (!f.file) {
+ error = XFS_ERROR(EINVAL);
+ goto out;
+ }
+
+ if (!(f.file->f_mode & FMODE_WRITE) ||
+ !(f.file->f_mode & FMODE_READ) ||
+ (f.file->f_flags & O_APPEND)) {
+ error = XFS_ERROR(EBADF);
+ goto out_put_file;
+ }
+
+ tmp = fdget((int)sxp->sx_fdtmp);
+ if (!tmp.file) {
+ error = XFS_ERROR(EINVAL);
+ goto out_put_file;
+ }
+
+ if (!(tmp.file->f_mode & FMODE_WRITE) ||
+ !(tmp.file->f_mode & FMODE_READ) ||
+ (tmp.file->f_flags & O_APPEND)) {
+ error = XFS_ERROR(EBADF);
+ goto out_put_tmp_file;
+ }
+
+ if (IS_SWAPFILE(file_inode(f.file)) ||
+ IS_SWAPFILE(file_inode(tmp.file))) {
+ error = XFS_ERROR(EINVAL);
+ goto out_put_tmp_file;
+ }
+
+ ip = XFS_I(file_inode(f.file));
+ tip = XFS_I(file_inode(tmp.file));
+
+ if (ip->i_mount != tip->i_mount) {
+ error = XFS_ERROR(EINVAL);
+ goto out_put_tmp_file;
+ }
+
+ if (ip->i_ino == tip->i_ino) {
+ error = XFS_ERROR(EINVAL);
+ goto out_put_tmp_file;
+ }
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ error = XFS_ERROR(EIO);
+ goto out_put_tmp_file;
+ }
+
+ error = xfs_swap_extents(ip, tip, sxp);
+
+ out_put_tmp_file:
+ fdput(tmp);
+ out_put_file:
+ fdput(f);
+ out:
+ return error;
+}
+
/*
* Note: some of the ioctl's return positive numbers as a
* byte count indicating success, such as readlink_by_handle.
@@ -1334,7 +1515,7 @@ xfs_file_ioctl(
unsigned int cmd,
unsigned long p)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
void __user *arg = (void __user *)p;
@@ -1370,7 +1551,7 @@ xfs_file_ioctl(
XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
- da.d_mem = da.d_miniosz = 1 << target->bt_sshift;
+ da.d_mem = da.d_miniosz = target->bt_logical_sectorsize;
da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
if (copy_to_user(arg, &da, sizeof(da)))
@@ -1466,7 +1647,7 @@ xfs_file_ioctl(
error = mnt_want_write_file(filp);
if (error)
return error;
- error = xfs_swapext(&sxp);
+ error = xfs_ioc_swapext(&sxp);
mnt_drop_write_file(filp);
return -error;
}
@@ -1604,23 +1785,23 @@ xfs_file_ioctl(
return -error;
case XFS_IOC_FREE_EOFBLOCKS: {
- struct xfs_eofblocks eofb;
+ struct xfs_fs_eofblocks eofb;
+ struct xfs_eofblocks keofb;
- if (copy_from_user(&eofb, arg, sizeof(eofb)))
- return -XFS_ERROR(EFAULT);
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
- if (eofb.eof_version != XFS_EOFBLOCKS_VERSION)
- return -XFS_ERROR(EINVAL);
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ return -XFS_ERROR(EROFS);
- if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID)
- return -XFS_ERROR(EINVAL);
+ if (copy_from_user(&eofb, arg, sizeof(eofb)))
+ return -XFS_ERROR(EFAULT);
- if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) ||
- memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64)))
- return -XFS_ERROR(EINVAL);
+ error = xfs_fs_eofblocks_from_user(&eofb, &keofb);
+ if (error)
+ return -error;
- error = xfs_icache_free_eofblocks(mp, &eofb);
- return -error;
+ return -xfs_icache_free_eofblocks(mp, &keofb);
}
default:
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index d56173b34a2..77c02c7900b 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -27,6 +27,10 @@ xfs_ioc_space(
unsigned int cmd,
xfs_flock64_t *bf);
+int
+xfs_ioc_swapext(
+ xfs_swapext_t *sxp);
+
extern int
xfs_find_handle(
unsigned int cmd,
@@ -82,4 +86,10 @@ xfs_file_compat_ioctl(
unsigned int cmd,
unsigned long arg);
+extern int
+xfs_set_dmattrs(
+ struct xfs_inode *ip,
+ u_int evmask,
+ u_int16_t state);
+
#endif
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 1244274a567..944d5baa710 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -22,19 +22,16 @@
#include <asm/uaccess.h>
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
#include "xfs_vnode.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_itable.h"
#include "xfs_error.h"
-#include "xfs_dfrag.h"
-#include "xfs_vnodeops.h"
#include "xfs_fsops.h"
#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
@@ -359,7 +356,8 @@ xfs_compat_attrlist_by_handle(
if (copy_from_user(&al_hreq, arg,
sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
return -XFS_ERROR(EFAULT);
- if (al_hreq.buflen > XATTR_LIST_MAX)
+ if (al_hreq.buflen < sizeof(struct attrlist) ||
+ al_hreq.buflen > XATTR_LIST_MAX)
return -XFS_ERROR(EINVAL);
/*
@@ -373,7 +371,7 @@ xfs_compat_attrlist_by_handle(
return PTR_ERR(dentry);
error = -ENOMEM;
- kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
+ kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
if (!kbuf)
goto out_dput;
@@ -386,9 +384,9 @@ xfs_compat_attrlist_by_handle(
if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
error = -EFAULT;
- out_kfree:
- kfree(kbuf);
- out_dput:
+out_kfree:
+ kmem_free(kbuf);
+out_dput:
dput(dentry);
return error;
}
@@ -426,10 +424,11 @@ xfs_compat_attrmulti_by_handle(
ops = memdup_user(compat_ptr(am_hreq.ops), size);
if (IS_ERR(ops)) {
- error = PTR_ERR(ops);
+ error = -PTR_ERR(ops);
goto out_dput;
}
+ error = ENOMEM;
attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
if (!attr_name)
goto out_kfree_ops;
@@ -440,7 +439,7 @@ xfs_compat_attrmulti_by_handle(
compat_ptr(ops[i].am_attrname),
MAXNAMELEN);
if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
- error = -ERANGE;
+ error = ERANGE;
if (ops[i].am_error < 0)
break;
@@ -530,7 +529,7 @@ xfs_file_compat_ioctl(
unsigned cmd,
unsigned long p)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
void __user *arg = (void __user *)p;
@@ -638,7 +637,7 @@ xfs_file_compat_ioctl(
error = mnt_want_write_file(filp);
if (error)
return error;
- error = xfs_swapext(&sxp);
+ error = xfs_ioc_swapext(&sxp);
mnt_drop_write_file(filp);
return -error;
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index add06b4e9a6..6d3ec2b6ee2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -17,31 +17,28 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_inode_item.h"
#include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
+#include "xfs_bmap_util.h"
#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
+#include "xfs_trans.h"
#include "xfs_trans_space.h"
-#include "xfs_utils.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_quota.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
+#include "xfs_dinode.h"
#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
@@ -107,7 +104,7 @@ xfs_alert_fsblock_zero(
xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
"Access to block zero in inode %llu "
"start_block: %llx start_off: %llx "
- "blkcnt: %llx extent-state: %x\n",
+ "blkcnt: %llx extent-state: %x",
(unsigned long long)ip->i_ino,
(unsigned long long)imap->br_startblock,
(unsigned long long)imap->br_startoff,
@@ -131,7 +128,6 @@ xfs_iomap_write_direct(
xfs_fsblock_t firstfsb;
xfs_extlen_t extsz, temp;
int nimaps;
- int bmapi_flag;
int quota_flag;
int rt;
xfs_trans_t *tp;
@@ -185,10 +181,8 @@ xfs_iomap_write_direct(
* Allocate and setup the transaction
*/
tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
- error = xfs_trans_reserve(tp, resblks,
- XFS_WRITE_LOG_RES(mp), resrtextents,
- XFS_TRANS_PERM_LOG_RES,
- XFS_WRITE_LOG_COUNT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+ resblks, resrtextents);
/*
* Check for running out of space, note: need lock to return
*/
@@ -205,18 +199,15 @@ xfs_iomap_write_direct(
xfs_trans_ijoin(tp, ip, 0);
- bmapi_flag = 0;
- if (offset < XFS_ISIZE(ip) || extsz)
- bmapi_flag |= XFS_BMAPI_PREALLOC;
-
/*
* From this point onwards we overwrite the imap pointer that the
* caller gave to us.
*/
xfs_bmap_init(&free_list, &firstfsb);
nimaps = 1;
- error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag,
- &firstfsb, 0, imap, &nimaps, &free_list);
+ error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
+ XFS_BMAPI_PREALLOC, &firstfsb, 0,
+ imap, &nimaps, &free_list);
if (error)
goto out_bmap_cancel;
@@ -282,6 +273,15 @@ xfs_iomap_eof_want_preallocate(
return 0;
/*
+ * If the file is smaller than the minimum prealloc and we are using
+ * dynamic preallocation, don't do any preallocation at all as it is
+ * likely this is the only write to the file that is going to be done.
+ */
+ if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
+ XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
+ return 0;
+
+ /*
* If there are any real blocks past eof, then don't
* do any speculative allocation.
*/
@@ -311,6 +311,121 @@ xfs_iomap_eof_want_preallocate(
}
/*
+ * Determine the initial size of the preallocation. We are beyond the current
+ * EOF here, but we need to take into account whether this is a sparse write or
+ * an extending write when determining the preallocation size. Hence we need to
+ * look up the extent that ends at the current write offset and use the result
+ * to determine the preallocation size.
+ *
+ * If the extent is a hole, then preallocation is essentially disabled.
+ * Otherwise we take the size of the preceeding data extent as the basis for the
+ * preallocation size. If the size of the extent is greater than half the
+ * maximum extent length, then use the current offset as the basis. This ensures
+ * that for large files the preallocation size always extends to MAXEXTLEN
+ * rather than falling short due to things like stripe unit/width alignment of
+ * real extents.
+ */
+STATIC xfs_fsblock_t
+xfs_iomap_eof_prealloc_initial_size(
+ struct xfs_mount *mp,
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_bmbt_irec_t *imap,
+ int nimaps)
+{
+ xfs_fileoff_t start_fsb;
+ int imaps = 1;
+ int error;
+
+ ASSERT(nimaps >= imaps);
+
+ /* if we are using a specific prealloc size, return now */
+ if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
+ return 0;
+
+ /* If the file is small, then use the minimum prealloc */
+ if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
+ return 0;
+
+ /*
+ * As we write multiple pages, the offset will always align to the
+ * start of a page and hence point to a hole at EOF. i.e. if the size is
+ * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096)
+ * will return FSB 1. Hence if there are blocks in the file, we want to
+ * point to the block prior to the EOF block and not the hole that maps
+ * directly at @offset.
+ */
+ start_fsb = XFS_B_TO_FSB(mp, offset);
+ if (start_fsb)
+ start_fsb--;
+ error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE);
+ if (error)
+ return 0;
+
+ ASSERT(imaps == 1);
+ if (imap[0].br_startblock == HOLESTARTBLOCK)
+ return 0;
+ if (imap[0].br_blockcount <= (MAXEXTLEN >> 1))
+ return imap[0].br_blockcount << 1;
+ return XFS_B_TO_FSB(mp, offset);
+}
+
+STATIC bool
+xfs_quota_need_throttle(
+ struct xfs_inode *ip,
+ int type,
+ xfs_fsblock_t alloc_blocks)
+{
+ struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+
+ if (!dq || !xfs_this_quota_on(ip->i_mount, type))
+ return false;
+
+ /* no hi watermark, no throttle */
+ if (!dq->q_prealloc_hi_wmark)
+ return false;
+
+ /* under the lo watermark, no throttle */
+ if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark)
+ return false;
+
+ return true;
+}
+
+STATIC void
+xfs_quota_calc_throttle(
+ struct xfs_inode *ip,
+ int type,
+ xfs_fsblock_t *qblocks,
+ int *qshift)
+{
+ int64_t freesp;
+ int shift = 0;
+ struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+
+ /* over hi wmark, squash the prealloc completely */
+ if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
+ *qblocks = 0;
+ return;
+ }
+
+ freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount;
+ if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) {
+ shift = 2;
+ if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT])
+ shift += 2;
+ if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT])
+ shift += 2;
+ }
+
+ /* only overwrite the throttle values if we are more aggressive */
+ if ((freesp >> shift) < (*qblocks >> *qshift)) {
+ *qblocks = freesp;
+ *qshift = shift;
+ }
+}
+
+/*
* If we don't have a user specified preallocation size, dynamically increase
* the preallocation size as the size of the file grows. Cap the maximum size
* at a single extent or less if the filesystem is near full. The closer the
@@ -319,43 +434,95 @@ xfs_iomap_eof_want_preallocate(
STATIC xfs_fsblock_t
xfs_iomap_prealloc_size(
struct xfs_mount *mp,
- struct xfs_inode *ip)
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ struct xfs_bmbt_irec *imap,
+ int nimaps)
{
xfs_fsblock_t alloc_blocks = 0;
+ int shift = 0;
+ int64_t freesp;
+ xfs_fsblock_t qblocks;
+ int qshift = 0;
- if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
- int shift = 0;
- int64_t freesp;
+ alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset,
+ imap, nimaps);
+ if (!alloc_blocks)
+ goto check_writeio;
+ qblocks = alloc_blocks;
- /*
- * rounddown_pow_of_two() returns an undefined result
- * if we pass in alloc_blocks = 0. Hence the "+ 1" to
- * ensure we always pass in a non-zero value.
- */
- alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1;
- alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
- rounddown_pow_of_two(alloc_blocks));
-
- xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
- freesp = mp->m_sb.sb_fdblocks;
- if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
- shift = 2;
- if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
- shift++;
- if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
- shift++;
- if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
- shift++;
- if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
- shift++;
- }
- if (shift)
- alloc_blocks >>= shift;
+ /*
+ * MAXEXTLEN is not a power of two value but we round the prealloc down
+ * to the nearest power of two value after throttling. To prevent the
+ * round down from unconditionally reducing the maximum supported prealloc
+ * size, we round up first, apply appropriate throttling, round down and
+ * cap the value to MAXEXTLEN.
+ */
+ alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
+ alloc_blocks);
+
+ xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+ freesp = mp->m_sb.sb_fdblocks;
+ if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
+ shift = 2;
+ if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
+ shift++;
+ if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
+ shift++;
+ if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
+ shift++;
+ if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
+ shift++;
}
+ /*
+ * Check each quota to cap the prealloc size and provide a shift
+ * value to throttle with.
+ */
+ if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks))
+ xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift);
+ if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks))
+ xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift);
+ if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks))
+ xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift);
+
+ /*
+ * The final prealloc size is set to the minimum of free space available
+ * in each of the quotas and the overall filesystem.
+ *
+ * The shift throttle value is set to the maximum value as determined by
+ * the global low free space values and per-quota low free space values.
+ */
+ alloc_blocks = MIN(alloc_blocks, qblocks);
+ shift = MAX(shift, qshift);
+
+ if (shift)
+ alloc_blocks >>= shift;
+ /*
+ * rounddown_pow_of_two() returns an undefined result if we pass in
+ * alloc_blocks = 0.
+ */
+ if (alloc_blocks)
+ alloc_blocks = rounddown_pow_of_two(alloc_blocks);
+ if (alloc_blocks > MAXEXTLEN)
+ alloc_blocks = MAXEXTLEN;
+
+ /*
+ * If we are still trying to allocate more space than is
+ * available, squash the prealloc hard. This can happen if we
+ * have a large file on a small filesystem and the above
+ * lowspace thresholds are smaller than MAXEXTLEN.
+ */
+ while (alloc_blocks && alloc_blocks >= freesp)
+ alloc_blocks >>= 4;
+
+check_writeio:
if (alloc_blocks < mp->m_writeio_blocks)
alloc_blocks = mp->m_writeio_blocks;
+ trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
+ mp->m_writeio_blocks);
+
return alloc_blocks;
}
@@ -390,7 +557,6 @@ xfs_iomap_write_delay(
extsz = xfs_get_extsz_hint(ip);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
-
error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
imap, XFS_WRITE_IMAPS, &prealloc);
if (error)
@@ -398,7 +564,10 @@ xfs_iomap_write_delay(
retry:
if (prealloc) {
- xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
+ xfs_fsblock_t alloc_blocks;
+
+ alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap,
+ XFS_WRITE_IMAPS);
aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
@@ -476,7 +645,6 @@ int
xfs_iomap_write_allocate(
xfs_inode_t *ip,
xfs_off_t offset,
- size_t count,
xfs_bmbt_irec_t *imap)
{
xfs_mount_t *mp = ip->i_mount;
@@ -518,10 +686,8 @@ xfs_iomap_write_allocate(
tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
tp->t_flags |= XFS_TRANS_RESERVE;
nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- error = xfs_trans_reserve(tp, nres,
- XFS_WRITE_LOG_RES(mp),
- 0, XFS_TRANS_PERM_LOG_RES,
- XFS_WRITE_LOG_COUNT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+ nres, 0);
if (error) {
xfs_trans_cancel(tp, 0);
return XFS_ERROR(error);
@@ -564,7 +730,7 @@ xfs_iomap_write_allocate(
*/
nimaps = 1;
end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
- error = xfs_bmap_last_offset(NULL, ip, &last_block,
+ error = xfs_bmap_last_offset(ip, &last_block,
XFS_DATA_FORK);
if (error)
goto trans_cancel;
@@ -583,8 +749,7 @@ xfs_iomap_write_allocate(
* pointer that the caller gave to us.
*/
error = xfs_bmapi_write(tp, ip, map_start_fsb,
- count_fsb,
- XFS_BMAPI_STACK_SWITCH,
+ count_fsb, 0,
&first_block, 1,
imap, &nimaps, &free_list);
if (error)
@@ -684,10 +849,8 @@ xfs_iomap_write_unwritten(
sb_start_intwrite(mp->m_super);
tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;
- error = xfs_trans_reserve(tp, resblks,
- XFS_WRITE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_WRITE_LOG_COUNT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+ resblks, 0);
if (error) {
xfs_trans_cancel(tp, 0);
return XFS_ERROR(error);
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 80615760959..411fbb8919e 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -21,12 +21,12 @@
struct xfs_inode;
struct xfs_bmbt_irec;
-extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
+int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *, int);
-extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
+int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *);
-extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
+int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
struct xfs_bmbt_irec *);
-extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
+int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index d82efaa2ac7..205613a0606 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -17,28 +17,29 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_acl.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
+#include "xfs_da_format.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
+#include "xfs_bmap_util.h"
+#include "xfs_acl.h"
+#include "xfs_quota.h"
#include "xfs_error.h"
-#include "xfs_itable.h"
#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_vnodeops.h"
-#include "xfs_inode_item.h"
+#include "xfs_trans.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_symlink.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_dinode.h"
+#include "xfs_trans_space.h"
#include <linux/capability.h>
#include <linux/xattr.h>
@@ -48,6 +49,18 @@
#include <linux/fiemap.h>
#include <linux/slab.h>
+/*
+ * Directories have different lock order w.r.t. mmap_sem compared to regular
+ * files. This is due to readdir potentially triggering page faults on a user
+ * buffer inside filldir(), and this happens with the ilock on the directory
+ * held. For regular files, the lock order is the other way around - the
+ * mmap_sem is taken during the page fault, and then we lock the ilock to do
+ * block mapping. Hence we need a different class for the directory ilock so
+ * that lockdep can tell them apart.
+ */
+static struct lock_class_key xfs_nondir_ilock_class;
+static struct lock_class_key xfs_dir_ilock_class;
+
static int
xfs_initxattrs(
struct inode *inode,
@@ -59,8 +72,8 @@ xfs_initxattrs(
int error = 0;
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
- error = xfs_attr_set(ip, xattr->name, xattr->value,
- xattr->value_len, ATTR_SECURE);
+ error = -xfs_attr_set(ip, xattr->name, xattr->value,
+ xattr->value_len, ATTR_SECURE);
if (error < 0)
break;
}
@@ -80,17 +93,19 @@ xfs_init_security(
struct inode *dir,
const struct qstr *qstr)
{
- return security_inode_init_security(inode, dir, qstr,
- &xfs_initxattrs, NULL);
+ return -security_inode_init_security(inode, dir, qstr,
+ &xfs_initxattrs, NULL);
}
static void
xfs_dentry_to_name(
struct xfs_name *namep,
- struct dentry *dentry)
+ struct dentry *dentry,
+ int mode)
{
namep->name = dentry->d_name.name;
namep->len = dentry->d_name.len;
+ namep->type = xfs_mode_to_ftype[(mode & S_IFMT) >> S_SHIFT];
}
STATIC void
@@ -106,22 +121,22 @@ xfs_cleanup_inode(
* xfs_init_security we must back out.
* ENOSPC can hit here, among other things.
*/
- xfs_dentry_to_name(&teardown, dentry);
+ xfs_dentry_to_name(&teardown, dentry, 0);
xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
- iput(inode);
}
STATIC int
-xfs_vn_mknod(
+xfs_generic_create(
struct inode *dir,
struct dentry *dentry,
umode_t mode,
- dev_t rdev)
+ dev_t rdev,
+ bool tmpfile) /* unnamed file */
{
struct inode *inode;
struct xfs_inode *ip = NULL;
- struct posix_acl *default_acl = NULL;
+ struct posix_acl *default_acl, *acl;
struct xfs_name name;
int error;
@@ -137,17 +152,16 @@ xfs_vn_mknod(
rdev = 0;
}
- if (IS_POSIXACL(dir)) {
- default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(default_acl))
- return PTR_ERR(default_acl);
+ error = posix_acl_create(dir, &mode, &default_acl, &acl);
+ if (error)
+ return error;
- if (!default_acl)
- mode &= ~current_umask();
+ if (!tmpfile) {
+ xfs_dentry_to_name(&name, dentry, mode);
+ error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
+ } else {
+ error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip);
}
-
- xfs_dentry_to_name(&name, dentry);
- error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
if (unlikely(error))
goto out_free_acl;
@@ -157,22 +171,46 @@ xfs_vn_mknod(
if (unlikely(error))
goto out_cleanup_inode;
+#ifdef CONFIG_XFS_POSIX_ACL
if (default_acl) {
- error = -xfs_inherit_acl(inode, default_acl);
- default_acl = NULL;
- if (unlikely(error))
+ error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ if (error)
+ goto out_cleanup_inode;
+ }
+ if (acl) {
+ error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ if (error)
goto out_cleanup_inode;
}
+#endif
+ if (tmpfile)
+ d_tmpfile(dentry, inode);
+ else
+ d_instantiate(dentry, inode);
- d_instantiate(dentry, inode);
+ out_free_acl:
+ if (default_acl)
+ posix_acl_release(default_acl);
+ if (acl)
+ posix_acl_release(acl);
return -error;
out_cleanup_inode:
- xfs_cleanup_inode(dir, inode, dentry);
- out_free_acl:
- posix_acl_release(default_acl);
- return -error;
+ if (!tmpfile)
+ xfs_cleanup_inode(dir, inode, dentry);
+ iput(inode);
+ goto out_free_acl;
+}
+
+STATIC int
+xfs_vn_mknod(
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode,
+ dev_t rdev)
+{
+ return xfs_generic_create(dir, dentry, mode, rdev, false);
}
STATIC int
@@ -207,7 +245,7 @@ xfs_vn_lookup(
if (dentry->d_name.len >= MAXNAMELEN)
return ERR_PTR(-ENAMETOOLONG);
- xfs_dentry_to_name(&name, dentry);
+ xfs_dentry_to_name(&name, dentry, 0);
error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
if (unlikely(error)) {
if (unlikely(error != ENOENT))
@@ -234,7 +272,7 @@ xfs_vn_ci_lookup(
if (dentry->d_name.len >= MAXNAMELEN)
return ERR_PTR(-ENAMETOOLONG);
- xfs_dentry_to_name(&xname, dentry);
+ xfs_dentry_to_name(&xname, dentry, 0);
error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
if (unlikely(error)) {
if (unlikely(error != ENOENT))
@@ -269,7 +307,7 @@ xfs_vn_link(
struct xfs_name name;
int error;
- xfs_dentry_to_name(&name, dentry);
+ xfs_dentry_to_name(&name, dentry, inode->i_mode);
error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
if (unlikely(error))
@@ -288,7 +326,7 @@ xfs_vn_unlink(
struct xfs_name name;
int error;
- xfs_dentry_to_name(&name, dentry);
+ xfs_dentry_to_name(&name, dentry, 0);
error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
if (error)
@@ -318,7 +356,7 @@ xfs_vn_symlink(
mode = S_IFLNK |
(irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
- xfs_dentry_to_name(&name, dentry);
+ xfs_dentry_to_name(&name, dentry, mode);
error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
if (unlikely(error))
@@ -335,6 +373,7 @@ xfs_vn_symlink(
out_cleanup_inode:
xfs_cleanup_inode(dir, inode, dentry);
+ iput(inode);
out:
return -error;
}
@@ -350,12 +389,12 @@ xfs_vn_rename(
struct xfs_name oname;
struct xfs_name nname;
- xfs_dentry_to_name(&oname, odentry);
- xfs_dentry_to_name(&nname, ndentry);
+ xfs_dentry_to_name(&oname, odentry, 0);
+ xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode);
return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
XFS_I(ndir), &nname, new_inode ?
- XFS_I(new_inode) : NULL);
+ XFS_I(new_inode) : NULL);
}
/*
@@ -389,18 +428,6 @@ xfs_vn_follow_link(
return NULL;
}
-STATIC void
-xfs_vn_put_link(
- struct dentry *dentry,
- struct nameidata *nd,
- void *p)
-{
- char *s = nd_get_link(nd);
-
- if (!IS_ERR(s))
- kfree(s);
-}
-
STATIC int
xfs_vn_getattr(
struct vfsmount *mnt,
@@ -420,8 +447,8 @@ xfs_vn_getattr(
stat->dev = inode->i_sb->s_dev;
stat->mode = ip->i_d.di_mode;
stat->nlink = ip->i_d.di_nlink;
- stat->uid = ip->i_d.di_uid;
- stat->gid = ip->i_d.di_gid;
+ stat->uid = inode->i_uid;
+ stat->gid = inode->i_gid;
stat->ino = ip->i_ino;
stat->atime = inode->i_atime;
stat->mtime = inode->i_mtime;
@@ -455,6 +482,49 @@ xfs_vn_getattr(
return 0;
}
+static void
+xfs_setattr_mode(
+ struct xfs_inode *ip,
+ struct iattr *iattr)
+{
+ struct inode *inode = VFS_I(ip);
+ umode_t mode = iattr->ia_mode;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ ip->i_d.di_mode &= S_IFMT;
+ ip->i_d.di_mode |= mode & ~S_IFMT;
+
+ inode->i_mode &= S_IFMT;
+ inode->i_mode |= mode & ~S_IFMT;
+}
+
+static void
+xfs_setattr_time(
+ struct xfs_inode *ip,
+ struct iattr *iattr)
+{
+ struct inode *inode = VFS_I(ip);
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (iattr->ia_valid & ATTR_ATIME) {
+ inode->i_atime = iattr->ia_atime;
+ ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
+ ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
+ }
+ if (iattr->ia_valid & ATTR_CTIME) {
+ inode->i_ctime = iattr->ia_ctime;
+ ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
+ ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
+ }
+ if (iattr->ia_valid & ATTR_MTIME) {
+ inode->i_mtime = iattr->ia_mtime;
+ ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
+ ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
+ }
+}
+
int
xfs_setattr_nonsize(
struct xfs_inode *ip,
@@ -466,22 +536,25 @@ xfs_setattr_nonsize(
int mask = iattr->ia_valid;
xfs_trans_t *tp;
int error;
- uid_t uid = 0, iuid = 0;
- gid_t gid = 0, igid = 0;
+ kuid_t uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID;
+ kgid_t gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID;
struct xfs_dquot *udqp = NULL, *gdqp = NULL;
struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL;
trace_xfs_setattr(ip);
- if (mp->m_flags & XFS_MOUNT_RDONLY)
- return XFS_ERROR(EROFS);
+ /* If acls are being inherited, we already have this checked */
+ if (!(flags & XFS_ATTR_NOACL)) {
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ return XFS_ERROR(EROFS);
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
- error = -inode_change_ok(inode, iattr);
- if (error)
- return XFS_ERROR(error);
+ error = -inode_change_ok(inode, iattr);
+ if (error)
+ return XFS_ERROR(error);
+ }
ASSERT((mask & ATTR_SIZE) == 0);
@@ -500,13 +573,13 @@ xfs_setattr_nonsize(
uid = iattr->ia_uid;
qflags |= XFS_QMOPT_UQUOTA;
} else {
- uid = ip->i_d.di_uid;
+ uid = inode->i_uid;
}
if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
gid = iattr->ia_gid;
qflags |= XFS_QMOPT_GQUOTA;
} else {
- gid = ip->i_d.di_gid;
+ gid = inode->i_gid;
}
/*
@@ -516,14 +589,16 @@ xfs_setattr_nonsize(
*/
ASSERT(udqp == NULL);
ASSERT(gdqp == NULL);
- error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
- qflags, &udqp, &gdqp);
+ error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid),
+ xfs_kgid_to_gid(gid),
+ xfs_get_projid(ip),
+ qflags, &udqp, &gdqp, NULL);
if (error)
return error;
}
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
- error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
if (error)
goto out_dqrele;
@@ -539,8 +614,8 @@ xfs_setattr_nonsize(
* while we didn't have the inode locked, inode's dquot(s)
* would have changed also.
*/
- iuid = ip->i_d.di_uid;
- igid = ip->i_d.di_gid;
+ iuid = inode->i_uid;
+ igid = inode->i_gid;
gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
@@ -549,11 +624,11 @@ xfs_setattr_nonsize(
* going to change.
*/
if (XFS_IS_QUOTA_RUNNING(mp) &&
- ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
- (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
+ ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) ||
+ (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) {
ASSERT(tp);
error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
- capable(CAP_FOWNER) ?
+ NULL, capable(CAP_FOWNER) ?
XFS_QMOPT_FORCE_RES : 0);
if (error) /* out of quota */
goto out_trans_cancel;
@@ -580,63 +655,34 @@ xfs_setattr_nonsize(
* Change the ownerships and register quota modifications
* in the transaction.
*/
- if (iuid != uid) {
+ if (!uid_eq(iuid, uid)) {
if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
ASSERT(mask & ATTR_UID);
ASSERT(udqp);
olddquot1 = xfs_qm_vop_chown(tp, ip,
&ip->i_udquot, udqp);
}
- ip->i_d.di_uid = uid;
+ ip->i_d.di_uid = xfs_kuid_to_uid(uid);
inode->i_uid = uid;
}
- if (igid != gid) {
+ if (!gid_eq(igid, gid)) {
if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
- ASSERT(!XFS_IS_PQUOTA_ON(mp));
+ ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) ||
+ !XFS_IS_PQUOTA_ON(mp));
ASSERT(mask & ATTR_GID);
ASSERT(gdqp);
olddquot2 = xfs_qm_vop_chown(tp, ip,
&ip->i_gdquot, gdqp);
}
- ip->i_d.di_gid = gid;
+ ip->i_d.di_gid = xfs_kgid_to_gid(gid);
inode->i_gid = gid;
}
}
- /*
- * Change file access modes.
- */
- if (mask & ATTR_MODE) {
- umode_t mode = iattr->ia_mode;
-
- if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
- mode &= ~S_ISGID;
-
- ip->i_d.di_mode &= S_IFMT;
- ip->i_d.di_mode |= mode & ~S_IFMT;
-
- inode->i_mode &= S_IFMT;
- inode->i_mode |= mode & ~S_IFMT;
- }
-
- /*
- * Change file access or modified times.
- */
- if (mask & ATTR_ATIME) {
- inode->i_atime = iattr->ia_atime;
- ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
- ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
- }
- if (mask & ATTR_CTIME) {
- inode->i_ctime = iattr->ia_ctime;
- ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
- ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
- }
- if (mask & ATTR_MTIME) {
- inode->i_mtime = iattr->ia_mtime;
- ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
- ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
- }
+ if (mask & ATTR_MODE)
+ xfs_setattr_mode(ip, iattr);
+ if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
+ xfs_setattr_time(ip, iattr);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -667,7 +713,7 @@ xfs_setattr_nonsize(
* Posix ACL code seems to care about this issue either.
*/
if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
- error = -xfs_acl_chmod(inode);
+ error = -posix_acl_chmod(inode, inode->i_mode);
if (error)
return XFS_ERROR(error);
}
@@ -689,12 +735,10 @@ out_dqrele:
int
xfs_setattr_size(
struct xfs_inode *ip,
- struct iattr *iattr,
- int flags)
+ struct iattr *iattr)
{
struct xfs_mount *mp = ip->i_mount;
struct inode *inode = VFS_I(ip);
- int mask = iattr->ia_valid;
xfs_off_t oldsize, newsize;
struct xfs_trans *tp;
int error;
@@ -713,15 +757,10 @@ xfs_setattr_size(
if (error)
return XFS_ERROR(error);
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(S_ISREG(ip->i_d.di_mode));
- ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
- ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID|
- ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
-
- if (!(flags & XFS_ATTR_NOLOCK)) {
- lock_flags |= XFS_IOLOCK_EXCL;
- xfs_ilock(ip, lock_flags);
- }
+ ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
+ ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
oldsize = inode->i_size;
newsize = iattr->ia_size;
@@ -730,13 +769,12 @@ xfs_setattr_size(
* Short circuit the truncate case for zero length files.
*/
if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
- if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
- goto out_unlock;
+ if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME)))
+ return 0;
/*
* Use the regular setattr path to update the timestamps.
*/
- xfs_iunlock(ip, lock_flags);
iattr->ia_valid &= ~ATTR_SIZE;
return xfs_setattr_nonsize(ip, iattr, 0);
}
@@ -746,7 +784,7 @@ xfs_setattr_size(
*/
error = xfs_qm_dqattach(ip, 0);
if (error)
- goto out_unlock;
+ return error;
/*
* Now we can make the changes. Before we join the inode to the
@@ -764,7 +802,7 @@ xfs_setattr_size(
*/
error = xfs_zero_eof(ip, newsize, oldsize);
if (error)
- goto out_unlock;
+ return error;
}
/*
@@ -783,7 +821,7 @@ xfs_setattr_size(
error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
ip->i_d.di_size, newsize);
if (error)
- goto out_unlock;
+ return error;
}
/*
@@ -791,24 +829,34 @@ xfs_setattr_size(
*/
inode_dio_wait(inode);
+ /*
+ * Do all the page cache truncate work outside the transaction context
+ * as the "lock" order is page lock->log space reservation. i.e.
+ * locking pages inside the transaction can ABBA deadlock with
+ * writeback. We have to do the VFS inode size update before we truncate
+ * the pagecache, however, to avoid racing with page faults beyond the
+ * new EOF they are not serialised against truncate operations except by
+ * page locks and size updates.
+ *
+ * Hence we are in a situation where a truncate can fail with ENOMEM
+ * from xfs_trans_reserve(), but having already truncated the in-memory
+ * version of the file (i.e. made user visible changes). There's not
+ * much we can do about this, except to hope that the caller sees ENOMEM
+ * and retries the truncate operation.
+ */
error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
if (error)
- goto out_unlock;
+ return error;
+ truncate_setsize(inode, newsize);
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
- error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_ITRUNCATE_LOG_COUNT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error)
goto out_trans_cancel;
- truncate_setsize(inode, newsize);
-
commit_flags = XFS_TRANS_RELEASE_LOG_RES;
lock_flags |= XFS_ILOCK_EXCL;
-
xfs_ilock(ip, XFS_ILOCK_EXCL);
-
xfs_trans_ijoin(tp, ip, 0);
/*
@@ -821,10 +869,11 @@ xfs_setattr_size(
* these flags set. For all other operations the VFS set these flags
* explicitly if it wants a timestamp update.
*/
- if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
+ if (newsize != oldsize &&
+ !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
iattr->ia_ctime = iattr->ia_mtime =
current_fs_time(inode->i_sb);
- mask |= ATTR_CTIME | ATTR_MTIME;
+ iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
}
/*
@@ -860,16 +909,10 @@ xfs_setattr_size(
xfs_inode_clear_eofblocks_tag(ip);
}
- if (mask & ATTR_CTIME) {
- inode->i_ctime = iattr->ia_ctime;
- ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
- ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
- }
- if (mask & ATTR_MTIME) {
- inode->i_mtime = iattr->ia_mtime;
- ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
- ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
- }
+ if (iattr->ia_valid & ATTR_MODE)
+ xfs_setattr_mode(ip, iattr);
+ if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
+ xfs_setattr_time(ip, iattr);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -893,12 +936,21 @@ out_trans_cancel:
STATIC int
xfs_vn_setattr(
- struct dentry *dentry,
- struct iattr *iattr)
+ struct dentry *dentry,
+ struct iattr *iattr)
{
- if (iattr->ia_valid & ATTR_SIZE)
- return -xfs_setattr_size(XFS_I(dentry->d_inode), iattr, 0);
- return -xfs_setattr_nonsize(XFS_I(dentry->d_inode), iattr, 0);
+ struct xfs_inode *ip = XFS_I(dentry->d_inode);
+ int error;
+
+ if (iattr->ia_valid & ATTR_SIZE) {
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ error = xfs_setattr_size(ip, iattr);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ } else {
+ error = xfs_setattr_nonsize(ip, iattr, 0);
+ }
+
+ return -error;
}
STATIC int
@@ -915,7 +967,7 @@ xfs_vn_update_time(
trace_xfs_update_time(ip);
tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
if (error) {
xfs_trans_cancel(tp, 0);
return -error;
@@ -970,7 +1022,8 @@ xfs_fiemap_format(
if (bmv->bmv_oflags & BMV_OF_PREALLOC)
fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
- fiemap_flags |= FIEMAP_EXTENT_DELALLOC;
+ fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
physical = 0; /* no block yet */
}
if (bmv->bmv_oflags & BMV_OF_LAST)
@@ -1027,8 +1080,18 @@ xfs_vn_fiemap(
return 0;
}
+STATIC int
+xfs_vn_tmpfile(
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode)
+{
+ return xfs_generic_create(dir, dentry, mode, 0, true);
+}
+
static const struct inode_operations xfs_inode_operations = {
.get_acl = xfs_get_acl,
+ .set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.setxattr = generic_setxattr,
@@ -1056,6 +1119,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
.mknod = xfs_vn_mknod,
.rename = xfs_vn_rename,
.get_acl = xfs_get_acl,
+ .set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.setxattr = generic_setxattr,
@@ -1063,6 +1127,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
.removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
.update_time = xfs_vn_update_time,
+ .tmpfile = xfs_vn_tmpfile,
};
static const struct inode_operations xfs_dir_ci_inode_operations = {
@@ -1082,6 +1147,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
.mknod = xfs_vn_mknod,
.rename = xfs_vn_rename,
.get_acl = xfs_get_acl,
+ .set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.setxattr = generic_setxattr,
@@ -1089,13 +1155,13 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
.removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
.update_time = xfs_vn_update_time,
+ .tmpfile = xfs_vn_tmpfile,
};
static const struct inode_operations xfs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = xfs_vn_follow_link,
- .put_link = xfs_vn_put_link,
- .get_acl = xfs_get_acl,
+ .put_link = kfree_put_link,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.setxattr = generic_setxattr,
@@ -1145,6 +1211,7 @@ xfs_setup_inode(
struct xfs_inode *ip)
{
struct inode *inode = &ip->i_vnode;
+ gfp_t gfp_mask;
inode->i_ino = ip->i_ino;
inode->i_state = I_NEW;
@@ -1155,8 +1222,8 @@ xfs_setup_inode(
inode->i_mode = ip->i_d.di_mode;
set_nlink(inode, ip->i_d.di_nlink);
- inode->i_uid = ip->i_d.di_uid;
- inode->i_gid = ip->i_d.di_gid;
+ inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid);
+ inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid);
switch (inode->i_mode & S_IFMT) {
case S_IFBLK:
@@ -1180,6 +1247,8 @@ xfs_setup_inode(
inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
xfs_diflags_to_iflags(inode, ip);
+ ip->d_ops = ip->i_mount->m_nondir_inode_ops;
+ lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = &xfs_inode_operations;
@@ -1187,11 +1256,13 @@ xfs_setup_inode(
inode->i_mapping->a_ops = &xfs_address_space_operations;
break;
case S_IFDIR:
+ lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
inode->i_op = &xfs_dir_ci_inode_operations;
else
inode->i_op = &xfs_dir_inode_operations;
inode->i_fop = &xfs_dir_file_operations;
+ ip->d_ops = ip->i_mount->m_dir_inode_ops;
break;
case S_IFLNK:
inode->i_op = &xfs_symlink_inode_operations;
@@ -1205,6 +1276,14 @@ xfs_setup_inode(
}
/*
+ * Ensure all page cache allocations are done from GFP_NOFS context to
+ * prevent direct reclaim recursion back into the filesystem and blowing
+ * stacks or deadlocking.
+ */
+ gfp_mask = mapping_gfp_mask(inode->i_mapping);
+ mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS)));
+
+ /*
* If there is no attribute fork no ACL can exist on this inode,
* and it can't have any file capabilities attached to it either.
*/
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index ef41c92ce66..1c34e433592 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -27,4 +27,13 @@ extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
extern void xfs_setup_inode(struct xfs_inode *);
+/*
+ * Internal setattr interfaces.
+ */
+#define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */
+
+extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
+ int flags);
+extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap);
+
#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 2ea7d402188..cb64f222d60 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -17,24 +17,23 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_btree.h"
#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
#include "xfs_itable.h"
#include "xfs_error.h"
-#include "xfs_btree.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_dinode.h"
STATIC int
xfs_internal_inum(
@@ -43,7 +42,7 @@ xfs_internal_inum(
{
return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
(xfs_sb_version_hasquota(&mp->m_sb) &&
- (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino)));
+ xfs_is_quota_inode(&mp->m_sb, ino)));
}
/*
@@ -210,9 +209,8 @@ xfs_bulkstat(
xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */
xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */
xfs_ino_t lastino; /* last inode number returned */
- int nbcluster; /* # of blocks in a cluster */
- int nicluster; /* # of inodes in a cluster */
- int nimask; /* mask for inode clusters */
+ int blks_per_cluster; /* # of blocks per cluster */
+ int inodes_per_cluster;/* # of inodes per cluster */
int nirbuf; /* size of irbuf */
int rval; /* return value error code */
int tmp; /* result value from btree calls */
@@ -221,7 +219,6 @@ xfs_bulkstat(
char __user *ubufp; /* pointer into user's buffer */
int ubelem; /* spaces used in user's buffer */
int ubused; /* bytes used by formatter */
- xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */
/*
* Get the last inode value, see if there's nothing to do.
@@ -245,11 +242,8 @@ xfs_bulkstat(
*done = 0;
fmterror = 0;
ubufp = ubuffer;
- nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ?
- mp->m_sb.sb_inopblock :
- (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
- nimask = ~(nicluster - 1);
- nbcluster = nicluster >> mp->m_sb.sb_inopblog;
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
+ inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
if (!irbuf)
return ENOMEM;
@@ -263,7 +257,6 @@ xfs_bulkstat(
rval = 0;
while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
cond_resched();
- bp = NULL;
error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
if (error) {
/*
@@ -277,7 +270,8 @@ xfs_bulkstat(
/*
* Allocate and initialize a btree cursor for ialloc btree.
*/
- cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
+ cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
+ XFS_BTNUM_INO);
irbp = irbuf;
irbufend = irbuf + nirbuf;
end_of_ag = 0;
@@ -383,22 +377,25 @@ xfs_bulkstat(
* Also start read-ahead now for this chunk.
*/
if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
+ struct blk_plug plug;
/*
* Loop over all clusters in the next chunk.
* Do a readahead if there are any allocated
* inodes in that cluster.
*/
+ blk_start_plug(&plug);
agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
for (chunkidx = 0;
chunkidx < XFS_INODES_PER_CHUNK;
- chunkidx += nicluster,
- agbno += nbcluster) {
- if (xfs_inobt_maskn(chunkidx, nicluster)
- & ~r.ir_free)
+ chunkidx += inodes_per_cluster,
+ agbno += blks_per_cluster) {
+ if (xfs_inobt_maskn(chunkidx,
+ inodes_per_cluster) & ~r.ir_free)
xfs_btree_reada_bufs(mp, agno,
- agbno, nbcluster,
+ agbno, blks_per_cluster,
&xfs_inode_buf_ops);
}
+ blk_finish_plug(&plug);
irbp->ir_startino = r.ir_startino;
irbp->ir_freecount = r.ir_freecount;
irbp->ir_free = r.ir_free;
@@ -433,27 +430,7 @@ xfs_bulkstat(
irbp->ir_freecount < XFS_INODES_PER_CHUNK;
chunkidx++, clustidx++, agino++) {
ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
- /*
- * Recompute agbno if this is the
- * first inode of the cluster.
- *
- * Careful with clustidx. There can be
- * multiple clusters per chunk, a single
- * cluster per chunk or a cluster that has
- * inodes represented from several different
- * chunks (if blocksize is large).
- *
- * Because of this, the starting clustidx is
- * initialized to zero in this loop but must
- * later be reset after reading in the cluster
- * buffer.
- */
- if ((chunkidx & (nicluster - 1)) == 0) {
- agbno = XFS_AGINO_TO_AGBNO(mp,
- irbp->ir_startino) +
- ((chunkidx & nimask) >>
- mp->m_sb.sb_inopblog);
- }
+
ino = XFS_AGINO_TO_INO(mp, agno, agino);
/*
* Skip if this inode is free.
@@ -499,10 +476,6 @@ xfs_bulkstat(
cond_resched();
}
-
- if (bp)
- xfs_buf_relse(bp);
-
/*
* Set up for the next loop iteration.
*/
@@ -518,7 +491,7 @@ xfs_bulkstat(
/*
* Done, we're either out of filesystem or space to put the data.
*/
- kmem_free_large(irbuf);
+ kmem_free(irbuf);
*ubcountp = ubelem;
/*
* Found some inodes, return them now and return the error next time.
@@ -564,8 +537,9 @@ xfs_bulkstat_single(
* at the expense of the error case.
*/
- ino = (xfs_ino_t)*lastinop;
- error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), 0, &res);
+ ino = *lastinop;
+ error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t),
+ NULL, &res);
if (error) {
/*
* Special case way failed, do it the "long" way
@@ -648,7 +622,8 @@ xfs_inumbers(
agino = 0;
continue;
}
- cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
+ cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
+ XFS_BTNUM_INO);
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
&tmp);
if (error) {
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index fe7e4df85a7..825249d2dfc 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -32,6 +32,38 @@
# define XFS_BIG_INUMS 0
#endif
+/*
+ * Kernel specific type declarations for XFS
+ */
+typedef signed char __int8_t;
+typedef unsigned char __uint8_t;
+typedef signed short int __int16_t;
+typedef unsigned short int __uint16_t;
+typedef signed int __int32_t;
+typedef unsigned int __uint32_t;
+typedef signed long long int __int64_t;
+typedef unsigned long long int __uint64_t;
+
+typedef __uint32_t inst_t; /* an instruction */
+
+typedef __s64 xfs_off_t; /* <file offset> type */
+typedef unsigned long long xfs_ino_t; /* <inode> type */
+typedef __s64 xfs_daddr_t; /* <disk address> type */
+typedef char * xfs_caddr_t; /* <core address> type */
+typedef __u32 xfs_dev_t;
+typedef __u32 xfs_nlink_t;
+
+/* __psint_t is the same size as a pointer */
+#if (BITS_PER_LONG == 32)
+typedef __int32_t __psint_t;
+typedef __uint32_t __psunsigned_t;
+#elif (BITS_PER_LONG == 64)
+typedef __int64_t __psint_t;
+typedef __uint64_t __psunsigned_t;
+#else
+#error BITS_PER_LONG must be 32 or 64
+#endif
+
#include "xfs_types.h"
#include "kmem.h"
@@ -72,6 +104,7 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/list_sort.h>
+#include <linux/ratelimit.h>
#include <asm/page.h>
#include <asm/div64.h>
@@ -86,6 +119,7 @@
#include "xfs_iops.h"
#include "xfs_aops.h"
#include "xfs_super.h"
+#include "xfs_cksum.h"
#include "xfs_buf.h"
#include "xfs_message.h"
@@ -113,8 +147,6 @@
#define xfs_inherit_sync xfs_params.inherit_sync.val
#define xfs_inherit_nodump xfs_params.inherit_nodump.val
#define xfs_inherit_noatime xfs_params.inherit_noatim.val
-#define xfs_buf_timer_centisecs xfs_params.xfs_buf_timer.val
-#define xfs_buf_age_centisecs xfs_params.xfs_buf_age.val
#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val
#define xfs_rotorstep xfs_params.rotorstep.val
#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val
@@ -147,6 +179,7 @@
#define ENOATTR ENODATA /* Attribute not found */
#define EWRONGFS EINVAL /* Mount with wrong filesystem type */
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
+#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define SYNCHRONIZE() barrier()
#define __return_address __builtin_return_address(0)
@@ -158,6 +191,32 @@
#define MAX(a,b) (max(a,b))
#define howmany(x, y) (((x)+((y)-1))/(y))
+/* Kernel uid/gid conversion. These are used to convert to/from the on disk
+ * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
+ * The conversion here is type only, the value will remain the same since we
+ * are converting to the init_user_ns. The uid is later mapped to a particular
+ * user namespace value when crossing the kernel/user boundary.
+ */
+static inline __uint32_t xfs_kuid_to_uid(kuid_t uid)
+{
+ return from_kuid(&init_user_ns, uid);
+}
+
+static inline kuid_t xfs_uid_to_kuid(__uint32_t uid)
+{
+ return make_kuid(&init_user_ns, uid);
+}
+
+static inline __uint32_t xfs_kgid_to_gid(kgid_t gid)
+{
+ return from_kgid(&init_user_ns, gid);
+}
+
+static inline kgid_t xfs_gid_to_kgid(__uint32_t gid)
+{
+ return make_kgid(&init_user_ns, gid);
+}
+
/*
* Various platform dependent calls that don't fit anywhere else
*/
@@ -292,22 +351,34 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
#define ASSERT_ALWAYS(expr) \
(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
-#ifndef DEBUG
-#define ASSERT(expr) ((void)0)
+#ifdef DEBUG
+#define ASSERT(expr) \
+ (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
#ifndef STATIC
-# define STATIC static noinline
+# define STATIC noinline
#endif
-#else /* DEBUG */
+#else /* !DEBUG */
+
+#ifdef XFS_WARN
#define ASSERT(expr) \
- (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+ (unlikely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__))
#ifndef STATIC
-# define STATIC noinline
+# define STATIC static noinline
+#endif
+
+#else /* !DEBUG && !XFS_WARN */
+
+#define ASSERT(expr) ((void)0)
+
+#ifndef STATIC
+# define STATIC static noinline
#endif
+#endif /* XFS_WARN */
#endif /* DEBUG */
#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 46bd9d52ab5..292308dede6 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -17,21 +17,19 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
#include "xfs_log_priv.h"
-#include "xfs_buf_item.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
#include "xfs_log_recover.h"
-#include "xfs_trans_priv.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_trace.h"
#include "xfs_fsops.h"
@@ -120,7 +118,7 @@ xlog_verify_iclog(
struct xlog *log,
struct xlog_in_core *iclog,
int count,
- boolean_t syncing);
+ bool syncing);
STATIC void
xlog_verify_tail_lsn(
struct xlog *log,
@@ -257,7 +255,8 @@ xlog_grant_head_wait(
struct xlog *log,
struct xlog_grant_head *head,
struct xlog_ticket *tic,
- int need_bytes)
+ int need_bytes) __releases(&head->lock)
+ __acquires(&head->lock)
{
list_add_tail(&tic->t_queue, &head->waiters);
@@ -614,13 +613,16 @@ xfs_log_mount(
xfs_daddr_t blk_offset,
int num_bblks)
{
- int error;
+ int error = 0;
+ int min_logfsbs;
- if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
- xfs_notice(mp, "Mounting Filesystem");
- else {
+ if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
+ xfs_notice(mp, "Mounting V%d Filesystem",
+ XFS_SB_VERSION_NUM(&mp->m_sb));
+ } else {
xfs_notice(mp,
-"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent.");
+"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.",
+ XFS_SB_VERSION_NUM(&mp->m_sb));
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
}
@@ -631,6 +633,50 @@ xfs_log_mount(
}
/*
+ * Validate the given log space and drop a critical message via syslog
+ * if the log size is too small that would lead to some unexpected
+ * situations in transaction log space reservation stage.
+ *
+ * Note: we can't just reject the mount if the validation fails. This
+ * would mean that people would have to downgrade their kernel just to
+ * remedy the situation as there is no way to grow the log (short of
+ * black magic surgery with xfs_db).
+ *
+ * We can, however, reject mounts for CRC format filesystems, as the
+ * mkfs binary being used to make the filesystem should never create a
+ * filesystem with a log that is too small.
+ */
+ min_logfsbs = xfs_log_calc_minimum_size(mp);
+
+ if (mp->m_sb.sb_logblocks < min_logfsbs) {
+ xfs_warn(mp,
+ "Log size %d blocks too small, minimum size is %d blocks",
+ mp->m_sb.sb_logblocks, min_logfsbs);
+ error = EINVAL;
+ } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) {
+ xfs_warn(mp,
+ "Log size %d blocks too large, maximum size is %lld blocks",
+ mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS);
+ error = EINVAL;
+ } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) {
+ xfs_warn(mp,
+ "log size %lld bytes too large, maximum size is %lld bytes",
+ XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks),
+ XFS_MAX_LOG_BYTES);
+ error = EINVAL;
+ }
+ if (error) {
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!");
+ ASSERT(0);
+ goto out_free_log;
+ }
+ xfs_crit(mp,
+"Log size out of supported range. Continuing onwards, but if log hangs are\n"
+"experienced then please report this message in the bug report.");
+ }
+
+ /*
* Initialize the AIL now we have a log.
*/
error = xfs_trans_ail_init(mp);
@@ -720,7 +766,7 @@ xfs_log_mount_finish(xfs_mount_t *mp)
* Unmount record used to have a string "Unmount filesystem--" in the
* data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
* We just write the magic number now since that particular field isn't
- * currently architecture converted and "nUmount" is a bit foo.
+ * currently architecture converted and "Unmount" is a bit foo.
* As far as I know, there weren't any dependencies on the old behaviour.
*/
@@ -954,27 +1000,34 @@ xfs_log_space_wake(
}
/*
- * Determine if we have a transaction that has gone to disk
- * that needs to be covered. To begin the transition to the idle state
- * firstly the log needs to be idle (no AIL and nothing in the iclogs).
- * If we are then in a state where covering is needed, the caller is informed
- * that dummy transactions are required to move the log into the idle state.
+ * Determine if we have a transaction that has gone to disk that needs to be
+ * covered. To begin the transition to the idle state firstly the log needs to
+ * be idle. That means the CIL, the AIL and the iclogs needs to be empty before
+ * we start attempting to cover the log.
*
- * Because this is called as part of the sync process, we should also indicate
- * that dummy transactions should be issued in anything but the covered or
- * idle states. This ensures that the log tail is accurately reflected in
- * the log at the end of the sync, hence if a crash occurrs avoids replay
- * of transactions where the metadata is already on disk.
+ * Only if we are then in a state where covering is needed, the caller is
+ * informed that dummy transactions are required to move the log into the idle
+ * state.
+ *
+ * If there are any items in the AIl or CIL, then we do not want to attempt to
+ * cover the log as we may be in a situation where there isn't log space
+ * available to run a dummy transaction and this can lead to deadlocks when the
+ * tail of the log is pinned by an item that is modified in the CIL. Hence
+ * there's no point in running a dummy transaction at this point because we
+ * can't start trying to idle the log until both the CIL and AIL are empty.
*/
int
xfs_log_need_covered(xfs_mount_t *mp)
{
- int needed = 0;
struct xlog *log = mp->m_log;
+ int needed = 0;
if (!xfs_fs_writable(mp))
return 0;
+ if (!xlog_cil_empty(log))
+ return 0;
+
spin_lock(&log->l_icloglock);
switch (log->l_covered_state) {
case XLOG_STATE_COVER_DONE:
@@ -983,14 +1036,17 @@ xfs_log_need_covered(xfs_mount_t *mp)
break;
case XLOG_STATE_COVER_NEED:
case XLOG_STATE_COVER_NEED2:
- if (!xfs_ail_min_lsn(log->l_ailp) &&
- xlog_iclogs_empty(log)) {
- if (log->l_covered_state == XLOG_STATE_COVER_NEED)
- log->l_covered_state = XLOG_STATE_COVER_DONE;
- else
- log->l_covered_state = XLOG_STATE_COVER_DONE2;
- }
- /* FALLTHRU */
+ if (xfs_ail_min_lsn(log->l_ailp))
+ break;
+ if (!xlog_iclogs_empty(log))
+ break;
+
+ needed = 1;
+ if (log->l_covered_state == XLOG_STATE_COVER_NEED)
+ log->l_covered_state = XLOG_STATE_COVER_DONE;
+ else
+ log->l_covered_state = XLOG_STATE_COVER_DONE2;
+ break;
default:
needed = 1;
break;
@@ -1022,6 +1078,7 @@ xlog_assign_tail_lsn_locked(
tail_lsn = lip->li_lsn;
else
tail_lsn = atomic64_read(&log->l_last_sync_lsn);
+ trace_xfs_log_assign_tail_lsn(log, tail_lsn);
atomic64_set(&log->l_tail_lsn, tail_lsn);
return tail_lsn;
}
@@ -1108,7 +1165,7 @@ xlog_iodone(xfs_buf_t *bp)
/*
* Race to shutdown the filesystem if we see an error.
*/
- if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp,
+ if (XFS_TEST_ERROR(bp->b_error, l->l_mp,
XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
xfs_buf_ioerror_alert(bp, __func__);
xfs_buf_stale(bp);
@@ -1126,11 +1183,14 @@ xlog_iodone(xfs_buf_t *bp)
/* log I/O is always issued ASYNC */
ASSERT(XFS_BUF_ISASYNC(bp));
xlog_state_done_syncing(iclog, aborted);
+
/*
- * do not reference the buffer (bp) here as we could race
- * with it being freed after writing the unmount record to the
- * log.
+ * drop the buffer lock now that we are done. Nothing references
+ * the buffer after this, so an unmount waiting on this lock can now
+ * tear it down safely. As such, it is unsafe to reference the buffer
+ * (bp) after the unlock as we could race with it being freed.
*/
+ xfs_buf_unlock(bp);
}
/*
@@ -1313,8 +1373,16 @@ xlog_alloc_log(
bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0);
if (!bp)
goto out_free_log;
- bp->b_iodone = xlog_iodone;
+
+ /*
+ * The iclogbuf buffer locks are held over IO but we are not going to do
+ * IO yet. Hence unlock the buffer so that the log IO path can grab it
+ * when appropriately.
+ */
ASSERT(xfs_buf_islocked(bp));
+ xfs_buf_unlock(bp);
+
+ bp->b_iodone = xlog_iodone;
log->l_xbuf = bp;
spin_lock_init(&log->l_icloglock);
@@ -1343,6 +1411,9 @@ xlog_alloc_log(
if (!bp)
goto out_free_iclog;
+ ASSERT(xfs_buf_islocked(bp));
+ xfs_buf_unlock(bp);
+
bp->b_iodone = xlog_iodone;
iclog->ic_bp = bp;
iclog->ic_data = bp->b_addr;
@@ -1367,7 +1438,6 @@ xlog_alloc_log(
iclog->ic_callback_tail = &(iclog->ic_callback);
iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
- ASSERT(xfs_buf_islocked(iclog->ic_bp));
init_waitqueue_head(&iclog->ic_force_wait);
init_waitqueue_head(&iclog->ic_write_wait);
@@ -1576,6 +1646,12 @@ xlog_cksum(
* we transition the iclogs to IOERROR state *after* flushing all existing
* iclogs to disk. This is because we don't want anymore new transactions to be
* started or completed afterwards.
+ *
+ * We lock the iclogbufs here so that we can serialise against IO completion
+ * during unmount. We might be processing a shutdown triggered during unmount,
+ * and that can occur asynchronously to the unmount thread, and hence we need to
+ * ensure that completes before tearing down the iclogbufs. Hence we need to
+ * hold the buffer lock across the log IO to acheive that.
*/
STATIC int
xlog_bdstrat(
@@ -1583,6 +1659,7 @@ xlog_bdstrat(
{
struct xlog_in_core *iclog = bp->b_fspriv;
+ xfs_buf_lock(bp);
if (iclog->ic_state & XLOG_STATE_IOERROR) {
xfs_buf_ioerror(bp, EIO);
xfs_buf_stale(bp);
@@ -1590,7 +1667,8 @@ xlog_bdstrat(
/*
* It would seem logical to return EIO here, but we rely on
* the log state machine to propagate I/O errors instead of
- * doing it here.
+ * doing it here. Similarly, IO completion will unlock the
+ * buffer, so we don't do it here.
*/
return 0;
}
@@ -1737,7 +1815,7 @@ xlog_sync(
ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
- xlog_verify_iclog(log, iclog, count, B_TRUE);
+ xlog_verify_iclog(log, iclog, count, true);
/* account for log which doesn't start at block #0 */
XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
@@ -1792,14 +1870,28 @@ xlog_dealloc_log(
xlog_cil_destroy(log);
/*
- * always need to ensure that the extra buffer does not point to memory
- * owned by another log buffer before we free it.
+ * Cycle all the iclogbuf locks to make sure all log IO completion
+ * is done before we tear down these buffers.
+ */
+ iclog = log->l_iclog;
+ for (i = 0; i < log->l_iclog_bufs; i++) {
+ xfs_buf_lock(iclog->ic_bp);
+ xfs_buf_unlock(iclog->ic_bp);
+ iclog = iclog->ic_next;
+ }
+
+ /*
+ * Always need to ensure that the extra buffer does not point to memory
+ * owned by another log buffer before we free it. Also, cycle the lock
+ * first to ensure we've completed IO on it.
*/
+ xfs_buf_lock(log->l_xbuf);
+ xfs_buf_unlock(log->l_xbuf);
xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
xfs_buf_free(log->l_xbuf);
iclog = log->l_iclog;
- for (i=0; i<log->l_iclog_bufs; i++) {
+ for (i = 0; i < log->l_iclog_bufs; i++) {
xfs_buf_free(iclog->ic_bp);
next_iclog = iclog->ic_next;
kmem_free(iclog);
@@ -1933,7 +2025,7 @@ xlog_print_tic_res(
for (i = 0; i < ticket->t_res_num; i++) {
uint r_type = ticket->t_res_arr[i].r_type;
- xfs_warn(mp, "region[%u]: %s - %u bytes\n", i,
+ xfs_warn(mp, "region[%u]: %s - %u bytes", i,
((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
"bad-rtype" : res_type_str[r_type-1]),
ticket->t_res_arr[i].r_len);
@@ -1941,7 +2033,7 @@ xlog_print_tic_res(
xfs_alert_tag(mp, XFS_PTAG_LOGRES,
"xlog_write: reservation ran out. Need to up reservation");
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
}
/*
@@ -1963,6 +2055,10 @@ xlog_write_calc_vec_length(
headers++;
for (lv = log_vector; lv; lv = lv->lv_next) {
+ /* we don't write ordered log vectors */
+ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
+ continue;
+
headers += lv->lv_niovecs;
for (i = 0; i < lv->lv_niovecs; i++) {
@@ -2040,7 +2136,7 @@ xlog_write_setup_ophdr(
* Set up the parameters of the region copy into the log. This has
* to handle region write split across multiple log buffers - this
* state is kept external to this function so that this code can
- * can be written in an obvious, self documenting manner.
+ * be written in an obvious, self documenting manner.
*/
static int
xlog_write_setup_copy(
@@ -2216,7 +2312,7 @@ xlog_write(
index = 0;
lv = log_vector;
vecp = lv->lv_iovecp;
- while (lv && index < lv->lv_niovecs) {
+ while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
void *ptr;
int log_offset;
@@ -2236,13 +2332,22 @@ xlog_write(
* This loop writes out as many regions as can fit in the amount
* of space which was allocated by xlog_state_get_iclog_space().
*/
- while (lv && index < lv->lv_niovecs) {
- struct xfs_log_iovec *reg = &vecp[index];
+ while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
+ struct xfs_log_iovec *reg;
struct xlog_op_header *ophdr;
int start_rec_copy;
int copy_len;
int copy_off;
+ bool ordered = false;
+ /* ordered log vectors have no regions to write */
+ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
+ ASSERT(lv->lv_niovecs == 0);
+ ordered = true;
+ goto next_lv;
+ }
+
+ reg = &vecp[index];
ASSERT(reg->i_len % sizeof(__int32_t) == 0);
ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
@@ -2302,12 +2407,13 @@ xlog_write(
break;
if (++index == lv->lv_niovecs) {
+next_lv:
lv = lv->lv_next;
index = 0;
if (lv)
vecp = lv->lv_iovecp;
}
- if (record_cnt == 0) {
+ if (record_cnt == 0 && ordered == false) {
if (!lv)
return 0;
break;
@@ -3377,24 +3483,17 @@ xfs_log_ticket_get(
}
/*
- * Allocate and initialise a new log ticket.
+ * Figure out the total log space unit (in bytes) that would be
+ * required for a log ticket.
*/
-struct xlog_ticket *
-xlog_ticket_alloc(
- struct xlog *log,
- int unit_bytes,
- int cnt,
- char client,
- bool permanent,
- xfs_km_flags_t alloc_flags)
+int
+xfs_log_calc_unit_res(
+ struct xfs_mount *mp,
+ int unit_bytes)
{
- struct xlog_ticket *tic;
- uint num_headers;
- int iclog_space;
-
- tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
- if (!tic)
- return NULL;
+ struct xlog *log = mp->m_log;
+ int iclog_space;
+ uint num_headers;
/*
* Permanent reservations have up to 'cnt'-1 active log operations
@@ -3469,23 +3568,46 @@ xlog_ticket_alloc(
unit_bytes += log->l_iclog_hsize;
/* for roundoff padding for transaction data and one for commit record */
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
- log->l_mp->m_sb.sb_logsunit > 1) {
+ if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) {
/* log su roundoff */
- unit_bytes += 2*log->l_mp->m_sb.sb_logsunit;
+ unit_bytes += 2 * mp->m_sb.sb_logsunit;
} else {
/* BB roundoff */
- unit_bytes += 2*BBSIZE;
+ unit_bytes += 2 * BBSIZE;
}
+ return unit_bytes;
+}
+
+/*
+ * Allocate and initialise a new log ticket.
+ */
+struct xlog_ticket *
+xlog_ticket_alloc(
+ struct xlog *log,
+ int unit_bytes,
+ int cnt,
+ char client,
+ bool permanent,
+ xfs_km_flags_t alloc_flags)
+{
+ struct xlog_ticket *tic;
+ int unit_res;
+
+ tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
+ if (!tic)
+ return NULL;
+
+ unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
+
atomic_set(&tic->t_ref, 1);
tic->t_task = current;
INIT_LIST_HEAD(&tic->t_queue);
- tic->t_unit_res = unit_bytes;
- tic->t_curr_res = unit_bytes;
+ tic->t_unit_res = unit_res;
+ tic->t_curr_res = unit_res;
tic->t_cnt = cnt;
tic->t_ocnt = cnt;
- tic->t_tid = random32();
+ tic->t_tid = prandom_u32();
tic->t_clientid = client;
tic->t_flags = XLOG_TIC_INITED;
tic->t_trans_type = 0;
@@ -3611,7 +3733,7 @@ xlog_verify_iclog(
struct xlog *log,
struct xlog_in_core *iclog,
int count,
- boolean_t syncing)
+ bool syncing)
{
xlog_op_header_t *ophead;
xlog_in_core_t *icptr;
@@ -3626,11 +3748,9 @@ xlog_verify_iclog(
/* check validity of iclog pointers */
spin_lock(&log->l_icloglock);
icptr = log->l_iclog;
- for (i=0; i < log->l_iclog_bufs; i++) {
- if (icptr == NULL)
- xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
- icptr = icptr->ic_next;
- }
+ for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next)
+ ASSERT(icptr);
+
if (icptr != log->l_iclog)
xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
spin_unlock(&log->l_icloglock);
@@ -3659,7 +3779,7 @@ xlog_verify_iclog(
/* clientid is only 1 byte */
field_offset = (__psint_t)
((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
- if (syncing == B_FALSE || (field_offset & 0x1ff)) {
+ if (!syncing || (field_offset & 0x1ff)) {
clientid = ophead->oh_clientid;
} else {
idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
@@ -3682,7 +3802,7 @@ xlog_verify_iclog(
/* check length */
field_offset = (__psint_t)
((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
- if (syncing == B_FALSE || (field_offset & 0x1ff)) {
+ if (!syncing || (field_offset & 0x1ff)) {
op_len = be32_to_cpu(ophead->oh_len);
} else {
idx = BTOBBT((__psint_t)&ophead->oh_len -
@@ -3832,11 +3952,14 @@ xfs_log_force_umount(
retval = xlog_state_ioerror(log);
spin_unlock(&log->l_icloglock);
}
+
/*
- * Wake up everybody waiting on xfs_log_force.
- * Callback all log item committed functions as if the
- * log writes were completed.
+ * Wake up everybody waiting on xfs_log_force. Wake the CIL push first
+ * as if the log writes were completed. The abort handling in the log
+ * item committed callback functions will do this again under lock to
+ * avoid races.
*/
+ wake_up_all(&log->l_cilp->xc_commit_wait);
xlog_state_do_callback(log, XFS_LI_ABORTED, NULL);
#ifdef XFSERRORDEBUG
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 5caee96059d..84e0deb95ab 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -18,14 +18,81 @@
#ifndef __XFS_LOG_H__
#define __XFS_LOG_H__
-/* get lsn fields */
-#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
-#define BLOCK_LSN(lsn) ((uint)(lsn))
+struct xfs_log_vec {
+ struct xfs_log_vec *lv_next; /* next lv in build list */
+ int lv_niovecs; /* number of iovecs in lv */
+ struct xfs_log_iovec *lv_iovecp; /* iovec array */
+ struct xfs_log_item *lv_item; /* owner */
+ char *lv_buf; /* formatted buffer */
+ int lv_bytes; /* accounted space in buffer */
+ int lv_buf_len; /* aligned size of buffer */
+ int lv_size; /* size of allocated lv */
+};
+
+#define XFS_LOG_VEC_ORDERED (-1)
+
+static inline void *
+xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+ uint type)
+{
+ struct xfs_log_iovec *vec = *vecp;
+
+ if (vec) {
+ ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
+ vec++;
+ } else {
+ vec = &lv->lv_iovecp[0];
+ }
+
+ vec->i_type = type;
+ vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+
+ ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t)));
+
+ *vecp = vec;
+ return vec->i_addr;
+}
+
+/*
+ * We need to make sure the next buffer is naturally aligned for the biggest
+ * basic data type we put into it. We already accounted for this padding when
+ * sizing the buffer.
+ *
+ * However, this padding does not get written into the log, and hence we have to
+ * track the space used by the log vectors separately to prevent log space hangs
+ * due to inaccurate accounting (i.e. a leak) of the used log space through the
+ * CIL context ticket.
+ */
+static inline void
+xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len)
+{
+ lv->lv_buf_len += round_up(len, sizeof(uint64_t));
+ lv->lv_bytes += len;
+ vec->i_len = len;
+}
-/* this is used in a spot where we might otherwise double-endian-flip */
-#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0])
+static inline void *
+xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+ uint type, void *data, int len)
+{
+ void *buf;
+
+ buf = xlog_prepare_iovec(lv, vecp, type);
+ memcpy(buf, data, len);
+ xlog_finish_iovec(lv, *vecp, len);
+ return buf;
+}
+
+/*
+ * Structure used to pass callback function and the function's argument
+ * to the log manager.
+ */
+typedef struct xfs_log_callback {
+ struct xfs_log_callback *cb_next;
+ void (*cb_func)(void *, int);
+ void *cb_arg;
+} xfs_log_callback_t;
-#ifdef __KERNEL__
/*
* By comparing each component, we don't have to worry about extra
* endian issues in treating two 32 bit numbers as one 64 bit number
@@ -59,64 +126,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
*/
#define XFS_LOG_SYNC 0x1
-#endif /* __KERNEL__ */
-
-
-/* Log Clients */
-#define XFS_TRANSACTION 0x69
-#define XFS_VOLUME 0x2
-#define XFS_LOG 0xaa
-
-
-/* Region types for iovec's i_type */
-#define XLOG_REG_TYPE_BFORMAT 1
-#define XLOG_REG_TYPE_BCHUNK 2
-#define XLOG_REG_TYPE_EFI_FORMAT 3
-#define XLOG_REG_TYPE_EFD_FORMAT 4
-#define XLOG_REG_TYPE_IFORMAT 5
-#define XLOG_REG_TYPE_ICORE 6
-#define XLOG_REG_TYPE_IEXT 7
-#define XLOG_REG_TYPE_IBROOT 8
-#define XLOG_REG_TYPE_ILOCAL 9
-#define XLOG_REG_TYPE_IATTR_EXT 10
-#define XLOG_REG_TYPE_IATTR_BROOT 11
-#define XLOG_REG_TYPE_IATTR_LOCAL 12
-#define XLOG_REG_TYPE_QFORMAT 13
-#define XLOG_REG_TYPE_DQUOT 14
-#define XLOG_REG_TYPE_QUOTAOFF 15
-#define XLOG_REG_TYPE_LRHEADER 16
-#define XLOG_REG_TYPE_UNMOUNT 17
-#define XLOG_REG_TYPE_COMMIT 18
-#define XLOG_REG_TYPE_TRANSHDR 19
-#define XLOG_REG_TYPE_MAX 19
-
-typedef struct xfs_log_iovec {
- void *i_addr; /* beginning address of region */
- int i_len; /* length in bytes of region */
- uint i_type; /* type of region */
-} xfs_log_iovec_t;
-
-struct xfs_log_vec {
- struct xfs_log_vec *lv_next; /* next lv in build list */
- int lv_niovecs; /* number of iovecs in lv */
- struct xfs_log_iovec *lv_iovecp; /* iovec array */
- struct xfs_log_item *lv_item; /* owner */
- char *lv_buf; /* formatted buffer */
- int lv_buf_len; /* size of formatted buffer */
-};
-
-/*
- * Structure used to pass callback function and the function's argument
- * to the log manager.
- */
-typedef struct xfs_log_callback {
- struct xfs_log_callback *cb_next;
- void (*cb_func)(void *, int);
- void *cb_arg;
-} xfs_log_callback_t;
-
-
-#ifdef __KERNEL__
/* Log manager interfaces */
struct xfs_mount;
struct xlog_in_core;
@@ -124,11 +133,7 @@ struct xlog_ticket;
struct xfs_log_item;
struct xfs_item_ops;
struct xfs_trans;
-
-void xfs_log_item_init(struct xfs_mount *mp,
- struct xfs_log_item *item,
- int type,
- const struct xfs_item_ops *ops);
+struct xfs_log_callback;
xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
struct xlog_ticket *ticket,
@@ -156,7 +161,7 @@ xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
void xfs_log_space_wake(struct xfs_mount *mp);
int xfs_log_notify(struct xfs_mount *mp,
struct xlog_in_core *iclog,
- xfs_log_callback_t *callback_entry);
+ struct xfs_log_callback *callback_entry);
int xfs_log_release_iclog(struct xfs_mount *mp,
struct xlog_in_core *iclog);
int xfs_log_reserve(struct xfs_mount *mp,
@@ -177,7 +182,7 @@ void xlog_iodone(struct xfs_buf *);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
void xfs_log_ticket_put(struct xlog_ticket *ticket);
-int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
+void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_lsn_t *commit_lsn, int flags);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
@@ -185,5 +190,4 @@ void xfs_log_work_queue(struct xfs_mount *mp);
void xfs_log_worker(struct work_struct *work);
void xfs_log_quiesce(struct xfs_mount *mp);
-#endif
#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index ddc4529d07d..b3425b34e3d 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -17,11 +17,9 @@
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_log_priv.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
@@ -29,6 +27,10 @@
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_discard.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
/*
* Allocate a new ticket. Failing to get a new ticket makes it really hard to
@@ -81,6 +83,53 @@ xlog_cil_init_post_recovery(
}
/*
+ * Prepare the log item for insertion into the CIL. Calculate the difference in
+ * log space and vectors it will consume, and if it is a new item pin it as
+ * well.
+ */
+STATIC void
+xfs_cil_prepare_item(
+ struct xlog *log,
+ struct xfs_log_vec *lv,
+ struct xfs_log_vec *old_lv,
+ int *diff_len,
+ int *diff_iovecs)
+{
+ /* Account for the new LV being passed in */
+ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
+ *diff_len += lv->lv_bytes;
+ *diff_iovecs += lv->lv_niovecs;
+ }
+
+ /*
+ * If there is no old LV, this is the first time we've seen the item in
+ * this CIL context and so we need to pin it. If we are replacing the
+ * old_lv, then remove the space it accounts for and free it.
+ */
+ if (!old_lv)
+ lv->lv_item->li_ops->iop_pin(lv->lv_item);
+ else if (old_lv != lv) {
+ ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
+
+ *diff_len -= old_lv->lv_bytes;
+ *diff_iovecs -= old_lv->lv_niovecs;
+ kmem_free(old_lv);
+ }
+
+ /* attach new log vector to log item */
+ lv->lv_item->li_lv = lv;
+
+ /*
+ * If this is the first time the item is being committed to the
+ * CIL, store the sequence number on the log item so we can
+ * tell in future commits whether this is the first checkpoint
+ * the item is being committed into.
+ */
+ if (!lv->lv_item->li_seq)
+ lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
+}
+
+/*
* Format log item into a flat buffers
*
* For delayed logging, we need to hold a formatted buffer containing all the
@@ -106,120 +155,119 @@ xlog_cil_init_post_recovery(
* format the regions into the iclog as though they are being formatted
* directly out of the objects themselves.
*/
-static struct xfs_log_vec *
-xlog_cil_prepare_log_vecs(
- struct xfs_trans *tp)
+static void
+xlog_cil_insert_format_items(
+ struct xlog *log,
+ struct xfs_trans *tp,
+ int *diff_len,
+ int *diff_iovecs)
{
struct xfs_log_item_desc *lidp;
- struct xfs_log_vec *lv = NULL;
- struct xfs_log_vec *ret_lv = NULL;
/* Bail out if we didn't find a log item. */
if (list_empty(&tp->t_items)) {
ASSERT(0);
- return NULL;
+ return;
}
list_for_each_entry(lidp, &tp->t_items, lid_trans) {
- struct xfs_log_vec *new_lv;
- void *ptr;
- int index;
- int len = 0;
- uint niovecs;
+ struct xfs_log_item *lip = lidp->lid_item;
+ struct xfs_log_vec *lv;
+ struct xfs_log_vec *old_lv;
+ int niovecs = 0;
+ int nbytes = 0;
+ int buf_size;
+ bool ordered = false;
/* Skip items which aren't dirty in this transaction. */
if (!(lidp->lid_flags & XFS_LID_DIRTY))
continue;
+ /* get number of vecs and size of data to be stored */
+ lip->li_ops->iop_size(lip, &niovecs, &nbytes);
+
/* Skip items that do not have any vectors for writing */
- niovecs = IOP_SIZE(lidp->lid_item);
if (!niovecs)
continue;
- new_lv = kmem_zalloc(sizeof(*new_lv) +
- niovecs * sizeof(struct xfs_log_iovec),
- KM_SLEEP);
+ /*
+ * Ordered items need to be tracked but we do not wish to write
+ * them. We need a logvec to track the object, but we do not
+ * need an iovec or buffer to be allocated for copying data.
+ */
+ if (niovecs == XFS_LOG_VEC_ORDERED) {
+ ordered = true;
+ niovecs = 0;
+ nbytes = 0;
+ }
- /* The allocated iovec region lies beyond the log vector. */
- new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
- new_lv->lv_niovecs = niovecs;
- new_lv->lv_item = lidp->lid_item;
+ /*
+ * We 64-bit align the length of each iovec so that the start
+ * of the next one is naturally aligned. We'll need to
+ * account for that slack space here. Then round nbytes up
+ * to 64-bit alignment so that the initial buffer alignment is
+ * easy to calculate and verify.
+ */
+ nbytes += niovecs * sizeof(uint64_t);
+ nbytes = round_up(nbytes, sizeof(uint64_t));
- /* build the vector array and calculate it's length */
- IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp);
- for (index = 0; index < new_lv->lv_niovecs; index++)
- len += new_lv->lv_iovecp[index].i_len;
+ /* grab the old item if it exists for reservation accounting */
+ old_lv = lip->li_lv;
- new_lv->lv_buf_len = len;
- new_lv->lv_buf = kmem_alloc(new_lv->lv_buf_len,
- KM_SLEEP|KM_NOFS);
- ptr = new_lv->lv_buf;
+ /*
+ * The data buffer needs to start 64-bit aligned, so round up
+ * that space to ensure we can align it appropriately and not
+ * overrun the buffer.
+ */
+ buf_size = nbytes +
+ round_up((sizeof(struct xfs_log_vec) +
+ niovecs * sizeof(struct xfs_log_iovec)),
+ sizeof(uint64_t));
- for (index = 0; index < new_lv->lv_niovecs; index++) {
- struct xfs_log_iovec *vec = &new_lv->lv_iovecp[index];
+ /* compare to existing item size */
+ if (lip->li_lv && buf_size <= lip->li_lv->lv_size) {
+ /* same or smaller, optimise common overwrite case */
+ lv = lip->li_lv;
+ lv->lv_next = NULL;
- memcpy(ptr, vec->i_addr, vec->i_len);
- vec->i_addr = ptr;
- ptr += vec->i_len;
- }
- ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len);
+ if (ordered)
+ goto insert;
- if (!ret_lv)
- ret_lv = new_lv;
- else
- lv->lv_next = new_lv;
- lv = new_lv;
- }
+ /*
+ * set the item up as though it is a new insertion so
+ * that the space reservation accounting is correct.
+ */
+ *diff_iovecs -= lv->lv_niovecs;
+ *diff_len -= lv->lv_bytes;
+ } else {
+ /* allocate new data chunk */
+ lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
+ lv->lv_item = lip;
+ lv->lv_size = buf_size;
+ if (ordered) {
+ /* track as an ordered logvec */
+ ASSERT(lip->li_lv == NULL);
+ lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+ goto insert;
+ }
+ lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
+ }
- return ret_lv;
-}
+ /* Ensure the lv is set up according to ->iop_size */
+ lv->lv_niovecs = niovecs;
-/*
- * Prepare the log item for insertion into the CIL. Calculate the difference in
- * log space and vectors it will consume, and if it is a new item pin it as
- * well.
- */
-STATIC void
-xfs_cil_prepare_item(
- struct xlog *log,
- struct xfs_log_vec *lv,
- int *len,
- int *diff_iovecs)
-{
- struct xfs_log_vec *old = lv->lv_item->li_lv;
-
- if (old) {
- /* existing lv on log item, space used is a delta */
- ASSERT(!list_empty(&lv->lv_item->li_cil));
- ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
-
- *len += lv->lv_buf_len - old->lv_buf_len;
- *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
- kmem_free(old->lv_buf);
- kmem_free(old);
- } else {
- /* new lv, must pin the log item */
- ASSERT(!lv->lv_item->li_lv);
- ASSERT(list_empty(&lv->lv_item->li_cil));
-
- *len += lv->lv_buf_len;
- *diff_iovecs += lv->lv_niovecs;
- IOP_PIN(lv->lv_item);
+ /* The allocated data region lies beyond the iovec region */
+ lv->lv_buf_len = 0;
+ lv->lv_bytes = 0;
+ lv->lv_buf = (char *)lv + buf_size - nbytes;
+ ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
+ lip->li_ops->iop_format(lip, lv);
+insert:
+ ASSERT(lv->lv_buf_len <= nbytes);
+ xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
}
-
- /* attach new log vector to log item */
- lv->lv_item->li_lv = lv;
-
- /*
- * If this is the first time the item is being committed to the
- * CIL, store the sequence number on the log item so we can
- * tell in future commits whether this is the first checkpoint
- * the item is being committed into.
- */
- if (!lv->lv_item->li_seq)
- lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
}
/*
@@ -232,47 +280,47 @@ xfs_cil_prepare_item(
static void
xlog_cil_insert_items(
struct xlog *log,
- struct xfs_log_vec *log_vector,
- struct xlog_ticket *ticket)
+ struct xfs_trans *tp)
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_cil_ctx *ctx = cil->xc_ctx;
- struct xfs_log_vec *lv;
+ struct xfs_log_item_desc *lidp;
int len = 0;
int diff_iovecs = 0;
int iclog_space;
- ASSERT(log_vector);
+ ASSERT(tp);
/*
- * Do all the accounting aggregation and switching of log vectors
- * around in a separate loop to the insertion of items into the CIL.
- * Then we can do a separate loop to update the CIL within a single
- * lock/unlock pair. This reduces the number of round trips on the CIL
- * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
- * hold time for the transaction commit.
- *
- * If this is the first time the item is being placed into the CIL in
- * this context, pin it so it can't be written to disk until the CIL is
- * flushed to the iclog and the iclog written to disk.
- *
* We can do this safely because the context can't checkpoint until we
* are done so it doesn't matter exactly how we update the CIL.
*/
- for (lv = log_vector; lv; lv = lv->lv_next)
- xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
-
- /* account for space used by new iovec headers */
- len += diff_iovecs * sizeof(xlog_op_header_t);
+ xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs);
+ /*
+ * Now (re-)position everything modified at the tail of the CIL.
+ * We do this here so we only need to take the CIL lock once during
+ * the transaction commit.
+ */
spin_lock(&cil->xc_cil_lock);
+ list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+ struct xfs_log_item *lip = lidp->lid_item;
+
+ /* Skip items which aren't dirty in this transaction. */
+ if (!(lidp->lid_flags & XFS_LID_DIRTY))
+ continue;
- /* move the items to the tail of the CIL */
- for (lv = log_vector; lv; lv = lv->lv_next)
- list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
+ list_move_tail(&lip->li_cil, &cil->xc_cil);
+ }
+ /* account for space used by new iovec headers */
+ len += diff_iovecs * sizeof(xlog_op_header_t);
ctx->nvecs += diff_iovecs;
+ /* attach the transaction to the CIL if it has any busy extents */
+ if (!list_empty(&tp->t_busy))
+ list_splice_init(&tp->t_busy, &ctx->busy_extents);
+
/*
* Now transfer enough transaction reservation to the context ticket
* for the checkpoint. The context ticket is special - the unit
@@ -281,10 +329,8 @@ xlog_cil_insert_items(
* during the transaction commit.
*/
if (ctx->ticket->t_curr_res == 0) {
- /* first commit in checkpoint, steal the header reservation */
- ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
- ticket->t_curr_res -= ctx->ticket->t_unit_res;
+ tp->t_ticket->t_curr_res -= ctx->ticket->t_unit_res;
}
/* do we need space for more log record headers? */
@@ -298,10 +344,10 @@ xlog_cil_insert_items(
hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
ctx->ticket->t_unit_res += hdrs;
ctx->ticket->t_curr_res += hdrs;
- ticket->t_curr_res -= hdrs;
- ASSERT(ticket->t_curr_res >= len);
+ tp->t_ticket->t_curr_res -= hdrs;
+ ASSERT(tp->t_ticket->t_curr_res >= len);
}
- ticket->t_curr_res -= len;
+ tp->t_ticket->t_curr_res -= len;
ctx->space_used += len;
spin_unlock(&cil->xc_cil_lock);
@@ -315,7 +361,6 @@ xlog_cil_free_logvec(
for (lv = log_vector; lv; ) {
struct xfs_log_vec *next = lv->lv_next;
- kmem_free(lv->lv_buf);
kmem_free(lv);
lv = next;
}
@@ -341,9 +386,17 @@ xlog_cil_committed(
xfs_extent_busy_clear(mp, &ctx->busy_extents,
(mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
- spin_lock(&ctx->cil->xc_cil_lock);
+ /*
+ * If we are aborting the commit, wake up anyone waiting on the
+ * committing list. If we don't, then a shutdown we can leave processes
+ * waiting in xlog_cil_force_lsn() waiting on a sequence commit that
+ * will never happen because we aborted it.
+ */
+ spin_lock(&ctx->cil->xc_push_lock);
+ if (abort)
+ wake_up_all(&ctx->cil->xc_commit_wait);
list_del(&ctx->committing);
- spin_unlock(&ctx->cil->xc_cil_lock);
+ spin_unlock(&ctx->cil->xc_push_lock);
xlog_cil_free_logvec(ctx->lv_chain);
@@ -381,9 +434,7 @@ xlog_cil_push(
struct xfs_cil_ctx *new_ctx;
struct xlog_in_core *commit_iclog;
struct xlog_ticket *tic;
- int num_lv;
int num_iovecs;
- int len;
int error = 0;
struct xfs_trans_header thdr;
struct xfs_log_iovec lhdr;
@@ -400,7 +451,7 @@ xlog_cil_push(
down_write(&cil->xc_ctx_lock);
ctx = cil->xc_ctx;
- spin_lock(&cil->xc_cil_lock);
+ spin_lock(&cil->xc_push_lock);
push_seq = cil->xc_push_seq;
ASSERT(push_seq <= ctx->sequence);
@@ -411,10 +462,10 @@ xlog_cil_push(
*/
if (list_empty(&cil->xc_cil)) {
cil->xc_push_seq = 0;
- spin_unlock(&cil->xc_cil_lock);
+ spin_unlock(&cil->xc_push_lock);
goto out_skip;
}
- spin_unlock(&cil->xc_cil_lock);
+ spin_unlock(&cil->xc_push_lock);
/* check for a previously pushed seqeunce */
@@ -428,12 +479,9 @@ xlog_cil_push(
* side which is currently locked out by the flush lock.
*/
lv = NULL;
- num_lv = 0;
num_iovecs = 0;
- len = 0;
while (!list_empty(&cil->xc_cil)) {
struct xfs_log_item *item;
- int i;
item = list_first_entry(&cil->xc_cil,
struct xfs_log_item, li_cil);
@@ -444,11 +492,7 @@ xlog_cil_push(
lv->lv_next = item->li_lv;
lv = item->li_lv;
item->li_lv = NULL;
-
- num_lv++;
num_iovecs += lv->lv_niovecs;
- for (i = 0; i < lv->lv_niovecs; i++)
- len += lv->lv_iovecp[i].i_len;
}
/*
@@ -464,13 +508,6 @@ xlog_cil_push(
cil->xc_ctx = new_ctx;
/*
- * mirror the new sequence into the cil structure so that we can do
- * unlocked checks against the current sequence in log forces without
- * risking deferencing a freed context pointer.
- */
- cil->xc_current_sequence = new_ctx->sequence;
-
- /*
* The switch is now done, so we can drop the context lock and move out
* of a shared context. We can't just go straight to the commit record,
* though - we need to synchronise with previous and future commits so
@@ -488,10 +525,17 @@ xlog_cil_push(
* Hence we need to add this context to the committing context list so
* that higher sequences will wait for us to write out a commit record
* before they do.
+ *
+ * xfs_log_force_lsn requires us to mirror the new sequence into the cil
+ * structure atomically with the addition of this sequence to the
+ * committing list. This also ensures that we can do unlocked checks
+ * against the current sequence in log forces without risking
+ * deferencing a freed context pointer.
*/
- spin_lock(&cil->xc_cil_lock);
+ spin_lock(&cil->xc_push_lock);
+ cil->xc_current_sequence = new_ctx->sequence;
list_add(&ctx->committing, &cil->xc_committing);
- spin_unlock(&cil->xc_cil_lock);
+ spin_unlock(&cil->xc_push_lock);
up_write(&cil->xc_ctx_lock);
/*
@@ -526,11 +570,21 @@ xlog_cil_push(
* order the commit records so replay will get them in the right order.
*/
restart:
- spin_lock(&cil->xc_cil_lock);
+ spin_lock(&cil->xc_push_lock);
list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
/*
+ * Avoid getting stuck in this loop because we were woken by the
+ * shutdown, but then went back to sleep once already in the
+ * shutdown state.
+ */
+ if (XLOG_FORCED_SHUTDOWN(log)) {
+ spin_unlock(&cil->xc_push_lock);
+ goto out_abort_free_ticket;
+ }
+
+ /*
* Higher sequences will wait for this one so skip them.
- * Don't wait for own own sequence, either.
+ * Don't wait for our own sequence, either.
*/
if (new_ctx->sequence >= ctx->sequence)
continue;
@@ -539,11 +593,11 @@ restart:
* It is still being pushed! Wait for the push to
* complete, then start again from the beginning.
*/
- xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
+ xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
goto restart;
}
}
- spin_unlock(&cil->xc_cil_lock);
+ spin_unlock(&cil->xc_push_lock);
/* xfs_log_done always frees the ticket on error. */
commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
@@ -562,10 +616,10 @@ restart:
* callbacks to the iclog we can assign the commit LSN to the context
* and wake up anyone who is waiting for the commit to complete.
*/
- spin_lock(&cil->xc_cil_lock);
+ spin_lock(&cil->xc_push_lock);
ctx->commit_lsn = commit_lsn;
wake_up_all(&cil->xc_commit_wait);
- spin_unlock(&cil->xc_cil_lock);
+ spin_unlock(&cil->xc_push_lock);
/* release the hounds! */
return xfs_log_release_iclog(log->l_mp, commit_iclog);
@@ -618,17 +672,23 @@ xlog_cil_push_background(
if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
return;
- spin_lock(&cil->xc_cil_lock);
+ spin_lock(&cil->xc_push_lock);
if (cil->xc_push_seq < cil->xc_current_sequence) {
cil->xc_push_seq = cil->xc_current_sequence;
queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
}
- spin_unlock(&cil->xc_cil_lock);
+ spin_unlock(&cil->xc_push_lock);
}
+/*
+ * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence
+ * number that is passed. When it returns, the work will be queued for
+ * @push_seq, but it won't be completed. The caller is expected to do any
+ * waiting for push_seq to complete if it is required.
+ */
static void
-xlog_cil_push_foreground(
+xlog_cil_push_now(
struct xlog *log,
xfs_lsn_t push_seq)
{
@@ -646,17 +706,29 @@ xlog_cil_push_foreground(
* If the CIL is empty or we've already pushed the sequence then
* there's no work we need to do.
*/
- spin_lock(&cil->xc_cil_lock);
+ spin_lock(&cil->xc_push_lock);
if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
- spin_unlock(&cil->xc_cil_lock);
+ spin_unlock(&cil->xc_push_lock);
return;
}
cil->xc_push_seq = push_seq;
- spin_unlock(&cil->xc_cil_lock);
+ queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
+ spin_unlock(&cil->xc_push_lock);
+}
- /* do the push now */
- xlog_cil_push(log);
+bool
+xlog_cil_empty(
+ struct xlog *log)
+{
+ struct xfs_cil *cil = log->l_cilp;
+ bool empty = false;
+
+ spin_lock(&cil->xc_push_lock);
+ if (list_empty(&cil->xc_cil))
+ empty = true;
+ spin_unlock(&cil->xc_push_lock);
+ return empty;
}
/*
@@ -668,15 +740,11 @@ xlog_cil_push_foreground(
* transaction to the checkpoint context so we carry the busy extents through
* to checkpoint completion, and then unlock all the items in the transaction.
*
- * For more specific information about the order of operations in
- * xfs_log_commit_cil() please refer to the comments in
- * xfs_trans_commit_iclog().
- *
* Called with the context lock already held in read mode to lock out
* background commit, returns without it held once background commits are
* allowed again.
*/
-int
+void
xfs_log_commit_cil(
struct xfs_mount *mp,
struct xfs_trans *tp,
@@ -684,42 +752,25 @@ xfs_log_commit_cil(
int flags)
{
struct xlog *log = mp->m_log;
+ struct xfs_cil *cil = log->l_cilp;
int log_flags = 0;
- struct xfs_log_vec *log_vector;
if (flags & XFS_TRANS_RELEASE_LOG_RES)
log_flags = XFS_LOG_REL_PERM_RESERV;
- /*
- * Do all the hard work of formatting items (including memory
- * allocation) outside the CIL context lock. This prevents stalling CIL
- * pushes when we are low on memory and a transaction commit spends a
- * lot of time in memory reclaim.
- */
- log_vector = xlog_cil_prepare_log_vecs(tp);
- if (!log_vector)
- return ENOMEM;
-
/* lock out background commit */
- down_read(&log->l_cilp->xc_ctx_lock);
- if (commit_lsn)
- *commit_lsn = log->l_cilp->xc_ctx->sequence;
+ down_read(&cil->xc_ctx_lock);
- xlog_cil_insert_items(log, log_vector, tp->t_ticket);
+ xlog_cil_insert_items(log, tp);
/* check we didn't blow the reservation */
if (tp->t_ticket->t_curr_res < 0)
- xlog_print_tic_res(log->l_mp, tp->t_ticket);
+ xlog_print_tic_res(mp, tp->t_ticket);
- /* attach the transaction to the CIL if it has any busy extents */
- if (!list_empty(&tp->t_busy)) {
- spin_lock(&log->l_cilp->xc_cil_lock);
- list_splice_init(&tp->t_busy,
- &log->l_cilp->xc_ctx->busy_extents);
- spin_unlock(&log->l_cilp->xc_cil_lock);
- }
+ tp->t_commit_lsn = cil->xc_ctx->sequence;
+ if (commit_lsn)
+ *commit_lsn = tp->t_commit_lsn;
- tp->t_commit_lsn = *commit_lsn;
xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
xfs_trans_unreserve_and_mod_sb(tp);
@@ -734,12 +785,11 @@ xfs_log_commit_cil(
* the log items. This affects (at least) processing of stale buffers,
* inodes and EFIs.
*/
- xfs_trans_free_items(tp, *commit_lsn, 0);
+ xfs_trans_free_items(tp, tp->t_commit_lsn, 0);
xlog_cil_push_background(log);
- up_read(&log->l_cilp->xc_ctx_lock);
- return 0;
+ up_read(&cil->xc_ctx_lock);
}
/*
@@ -768,7 +818,8 @@ xlog_cil_force_lsn(
* xlog_cil_push() handles racing pushes for the same sequence,
* so no need to deal with it here.
*/
- xlog_cil_push_foreground(log, sequence);
+restart:
+ xlog_cil_push_now(log, sequence);
/*
* See if we can find a previous sequence still committing.
@@ -776,9 +827,15 @@ xlog_cil_force_lsn(
* before allowing the force of push_seq to go ahead. Hence block
* on commits for those as well.
*/
-restart:
- spin_lock(&cil->xc_cil_lock);
+ spin_lock(&cil->xc_push_lock);
list_for_each_entry(ctx, &cil->xc_committing, committing) {
+ /*
+ * Avoid getting stuck in this loop because we were woken by the
+ * shutdown, but then went back to sleep once already in the
+ * shutdown state.
+ */
+ if (XLOG_FORCED_SHUTDOWN(log))
+ goto out_shutdown;
if (ctx->sequence > sequence)
continue;
if (!ctx->commit_lsn) {
@@ -786,7 +843,7 @@ restart:
* It is still being pushed! Wait for the push to
* complete, then start again from the beginning.
*/
- xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
+ xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
goto restart;
}
if (ctx->sequence != sequence)
@@ -794,8 +851,39 @@ restart:
/* found it! */
commit_lsn = ctx->commit_lsn;
}
- spin_unlock(&cil->xc_cil_lock);
+
+ /*
+ * The call to xlog_cil_push_now() executes the push in the background.
+ * Hence by the time we have got here it our sequence may not have been
+ * pushed yet. This is true if the current sequence still matches the
+ * push sequence after the above wait loop and the CIL still contains
+ * dirty objects.
+ *
+ * When the push occurs, it will empty the CIL and atomically increment
+ * the currect sequence past the push sequence and move it into the
+ * committing list. Of course, if the CIL is clean at the time of the
+ * push, it won't have pushed the CIL at all, so in that case we should
+ * try the push for this sequence again from the start just in case.
+ */
+ if (sequence == cil->xc_current_sequence &&
+ !list_empty(&cil->xc_cil)) {
+ spin_unlock(&cil->xc_push_lock);
+ goto restart;
+ }
+
+ spin_unlock(&cil->xc_push_lock);
return commit_lsn;
+
+ /*
+ * We detected a shutdown in progress. We need to trigger the log force
+ * to pass through it's iclog state machine error handling, even though
+ * we are already in a shutdown state. Hence we can't return
+ * NULLCOMMITLSN here as that has special meaning to log forces (i.e.
+ * LSN is already stable), so we return a zero LSN instead.
+ */
+out_shutdown:
+ spin_unlock(&cil->xc_push_lock);
+ return 0;
}
/*
@@ -852,6 +940,7 @@ xlog_cil_init(
INIT_LIST_HEAD(&cil->xc_cil);
INIT_LIST_HEAD(&cil->xc_committing);
spin_lock_init(&cil->xc_cil_lock);
+ spin_lock_init(&cil->xc_push_lock);
init_rwsem(&cil->xc_ctx_lock);
init_waitqueue_head(&cil->xc_commit_wait);
diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h
new file mode 100644
index 00000000000..f0969c77bdb
--- /dev/null
+++ b/fs/xfs/xfs_log_format.h
@@ -0,0 +1,679 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_LOG_FORMAT_H__
+#define __XFS_LOG_FORMAT_H__
+
+struct xfs_mount;
+struct xfs_trans_res;
+
+/*
+ * On-disk Log Format definitions.
+ *
+ * This file contains all the on-disk format definitions used within the log. It
+ * includes the physical log structure itself, as well as all the log item
+ * format structures that are written into the log and intepreted by log
+ * recovery. We start with the physical log format definitions, and then work
+ * through all the log items definitions and everything they encode into the
+ * log.
+ */
+typedef __uint32_t xlog_tid_t;
+
+#define XLOG_MIN_ICLOGS 2
+#define XLOG_MAX_ICLOGS 8
+#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */
+#define XLOG_VERSION_1 1
+#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */
+#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2)
+#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */
+#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */
+#define XLOG_MAX_RECORD_BSIZE (256*1024)
+#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */
+#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */
+#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */
+#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */
+#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
+ (log)->l_mp->m_sb.sb_logsunit)
+#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
+
+#define XLOG_HEADER_SIZE 512
+
+/* Minimum number of transactions that must fit in the log (defined by mkfs) */
+#define XFS_MIN_LOG_FACTOR 3
+
+#define XLOG_REC_SHIFT(log) \
+ BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
+ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+#define XLOG_TOTAL_REC_SHIFT(log) \
+ BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
+ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+
+/* get lsn fields */
+#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
+#define BLOCK_LSN(lsn) ((uint)(lsn))
+
+/* this is used in a spot where we might otherwise double-endian-flip */
+#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0])
+
+static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
+{
+ return ((xfs_lsn_t)cycle << 32) | block;
+}
+
+static inline uint xlog_get_cycle(char *ptr)
+{
+ if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
+ return be32_to_cpu(*((__be32 *)ptr + 1));
+ else
+ return be32_to_cpu(*(__be32 *)ptr);
+}
+
+/* Log Clients */
+#define XFS_TRANSACTION 0x69
+#define XFS_VOLUME 0x2
+#define XFS_LOG 0xaa
+
+#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
+
+/* Region types for iovec's i_type */
+#define XLOG_REG_TYPE_BFORMAT 1
+#define XLOG_REG_TYPE_BCHUNK 2
+#define XLOG_REG_TYPE_EFI_FORMAT 3
+#define XLOG_REG_TYPE_EFD_FORMAT 4
+#define XLOG_REG_TYPE_IFORMAT 5
+#define XLOG_REG_TYPE_ICORE 6
+#define XLOG_REG_TYPE_IEXT 7
+#define XLOG_REG_TYPE_IBROOT 8
+#define XLOG_REG_TYPE_ILOCAL 9
+#define XLOG_REG_TYPE_IATTR_EXT 10
+#define XLOG_REG_TYPE_IATTR_BROOT 11
+#define XLOG_REG_TYPE_IATTR_LOCAL 12
+#define XLOG_REG_TYPE_QFORMAT 13
+#define XLOG_REG_TYPE_DQUOT 14
+#define XLOG_REG_TYPE_QUOTAOFF 15
+#define XLOG_REG_TYPE_LRHEADER 16
+#define XLOG_REG_TYPE_UNMOUNT 17
+#define XLOG_REG_TYPE_COMMIT 18
+#define XLOG_REG_TYPE_TRANSHDR 19
+#define XLOG_REG_TYPE_ICREATE 20
+#define XLOG_REG_TYPE_MAX 20
+
+/*
+ * Flags to log operation header
+ *
+ * The first write of a new transaction will be preceded with a start
+ * record, XLOG_START_TRANS. Once a transaction is committed, a commit
+ * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into
+ * the remainder of the current active in-core log, it is split up into
+ * multiple regions. Each partial region will be marked with a
+ * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
+ *
+ */
+#define XLOG_START_TRANS 0x01 /* Start a new transaction */
+#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */
+#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */
+#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */
+#define XLOG_END_TRANS 0x10 /* End a continued transaction */
+#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */
+
+
+typedef struct xlog_op_header {
+ __be32 oh_tid; /* transaction id of operation : 4 b */
+ __be32 oh_len; /* bytes in data region : 4 b */
+ __u8 oh_clientid; /* who sent me this : 1 b */
+ __u8 oh_flags; /* : 1 b */
+ __u16 oh_res2; /* 32 bit align : 2 b */
+} xlog_op_header_t;
+
+/* valid values for h_fmt */
+#define XLOG_FMT_UNKNOWN 0
+#define XLOG_FMT_LINUX_LE 1
+#define XLOG_FMT_LINUX_BE 2
+#define XLOG_FMT_IRIX_BE 3
+
+/* our fmt */
+#ifdef XFS_NATIVE_HOST
+#define XLOG_FMT XLOG_FMT_LINUX_BE
+#else
+#define XLOG_FMT XLOG_FMT_LINUX_LE
+#endif
+
+typedef struct xlog_rec_header {
+ __be32 h_magicno; /* log record (LR) identifier : 4 */
+ __be32 h_cycle; /* write cycle of log : 4 */
+ __be32 h_version; /* LR version : 4 */
+ __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */
+ __be64 h_lsn; /* lsn of this LR : 8 */
+ __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
+ __le32 h_crc; /* crc of log record : 4 */
+ __be32 h_prev_block; /* block number to previous LR : 4 */
+ __be32 h_num_logops; /* number of log operations in this LR : 4 */
+ __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
+ /* new fields */
+ __be32 h_fmt; /* format of log record : 4 */
+ uuid_t h_fs_uuid; /* uuid of FS : 16 */
+ __be32 h_size; /* iclog size : 4 */
+} xlog_rec_header_t;
+
+typedef struct xlog_rec_ext_header {
+ __be32 xh_cycle; /* write cycle of log : 4 */
+ __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */
+} xlog_rec_ext_header_t;
+
+/*
+ * Quite misnamed, because this union lays out the actual on-disk log buffer.
+ */
+typedef union xlog_in_core2 {
+ xlog_rec_header_t hic_header;
+ xlog_rec_ext_header_t hic_xheader;
+ char hic_sector[XLOG_HEADER_SIZE];
+} xlog_in_core_2_t;
+
+/* not an on-disk structure, but needed by log recovery in userspace */
+typedef struct xfs_log_iovec {
+ void *i_addr; /* beginning address of region */
+ int i_len; /* length in bytes of region */
+ uint i_type; /* type of region */
+} xfs_log_iovec_t;
+
+
+/*
+ * Transaction Header definitions.
+ *
+ * This is the structure written in the log at the head of every transaction. It
+ * identifies the type and id of the transaction, and contains the number of
+ * items logged by the transaction so we know how many to expect during
+ * recovery.
+ *
+ * Do not change the below structure without redoing the code in
+ * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
+ */
+typedef struct xfs_trans_header {
+ uint th_magic; /* magic number */
+ uint th_type; /* transaction type */
+ __int32_t th_tid; /* transaction id (unused) */
+ uint th_num_items; /* num items logged by trans */
+} xfs_trans_header_t;
+
+#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */
+
+/*
+ * Log item types.
+ */
+#define XFS_LI_EFI 0x1236
+#define XFS_LI_EFD 0x1237
+#define XFS_LI_IUNLINK 0x1238
+#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */
+#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */
+#define XFS_LI_DQUOT 0x123d
+#define XFS_LI_QUOTAOFF 0x123e
+#define XFS_LI_ICREATE 0x123f
+
+#define XFS_LI_TYPE_DESC \
+ { XFS_LI_EFI, "XFS_LI_EFI" }, \
+ { XFS_LI_EFD, "XFS_LI_EFD" }, \
+ { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \
+ { XFS_LI_INODE, "XFS_LI_INODE" }, \
+ { XFS_LI_BUF, "XFS_LI_BUF" }, \
+ { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \
+ { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \
+ { XFS_LI_ICREATE, "XFS_LI_ICREATE" }
+
+/*
+ * Inode Log Item Format definitions.
+ *
+ * This is the structure used to lay out an inode log item in the
+ * log. The size of the inline data/extents/b-tree root to be logged
+ * (if any) is indicated in the ilf_dsize field. Changes to this structure
+ * must be added on to the end.
+ */
+typedef struct xfs_inode_log_format {
+ __uint16_t ilf_type; /* inode log item type */
+ __uint16_t ilf_size; /* size of this item */
+ __uint32_t ilf_fields; /* flags for fields logged */
+ __uint16_t ilf_asize; /* size of attr d/ext/root */
+ __uint16_t ilf_dsize; /* size of data/ext/root */
+ __uint64_t ilf_ino; /* inode number */
+ union {
+ __uint32_t ilfu_rdev; /* rdev value for dev inode*/
+ uuid_t ilfu_uuid; /* mount point value */
+ } ilf_u;
+ __int64_t ilf_blkno; /* blkno of inode buffer */
+ __int32_t ilf_len; /* len of inode buffer */
+ __int32_t ilf_boffset; /* off of inode in buffer */
+} xfs_inode_log_format_t;
+
+typedef struct xfs_inode_log_format_32 {
+ __uint16_t ilf_type; /* inode log item type */
+ __uint16_t ilf_size; /* size of this item */
+ __uint32_t ilf_fields; /* flags for fields logged */
+ __uint16_t ilf_asize; /* size of attr d/ext/root */
+ __uint16_t ilf_dsize; /* size of data/ext/root */
+ __uint64_t ilf_ino; /* inode number */
+ union {
+ __uint32_t ilfu_rdev; /* rdev value for dev inode*/
+ uuid_t ilfu_uuid; /* mount point value */
+ } ilf_u;
+ __int64_t ilf_blkno; /* blkno of inode buffer */
+ __int32_t ilf_len; /* len of inode buffer */
+ __int32_t ilf_boffset; /* off of inode in buffer */
+} __attribute__((packed)) xfs_inode_log_format_32_t;
+
+typedef struct xfs_inode_log_format_64 {
+ __uint16_t ilf_type; /* inode log item type */
+ __uint16_t ilf_size; /* size of this item */
+ __uint32_t ilf_fields; /* flags for fields logged */
+ __uint16_t ilf_asize; /* size of attr d/ext/root */
+ __uint16_t ilf_dsize; /* size of data/ext/root */
+ __uint32_t ilf_pad; /* pad for 64 bit boundary */
+ __uint64_t ilf_ino; /* inode number */
+ union {
+ __uint32_t ilfu_rdev; /* rdev value for dev inode*/
+ uuid_t ilfu_uuid; /* mount point value */
+ } ilf_u;
+ __int64_t ilf_blkno; /* blkno of inode buffer */
+ __int32_t ilf_len; /* len of inode buffer */
+ __int32_t ilf_boffset; /* off of inode in buffer */
+} xfs_inode_log_format_64_t;
+
+/*
+ * Flags for xfs_trans_log_inode flags field.
+ */
+#define XFS_ILOG_CORE 0x001 /* log standard inode fields */
+#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */
+#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */
+#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */
+#define XFS_ILOG_DEV 0x010 /* log the dev field */
+#define XFS_ILOG_UUID 0x020 /* log the uuid field */
+#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */
+#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
+#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
+#define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */
+#define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */
+
+
+/*
+ * The timestamps are dirty, but not necessarily anything else in the inode
+ * core. Unlike the other fields above this one must never make it to disk
+ * in the ilf_fields of the inode_log_format, but is purely store in-memory in
+ * ili_fields in the inode_log_item.
+ */
+#define XFS_ILOG_TIMESTAMP 0x4000
+
+#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+ XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
+ XFS_ILOG_UUID | XFS_ILOG_ADATA | \
+ XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
+ XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
+
+#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+ XFS_ILOG_DBROOT)
+
+#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+ XFS_ILOG_ABROOT)
+
+#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
+ XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
+ XFS_ILOG_DEV | XFS_ILOG_UUID | \
+ XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+ XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \
+ XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
+
+static inline int xfs_ilog_fbroot(int w)
+{
+ return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
+}
+
+static inline int xfs_ilog_fext(int w)
+{
+ return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
+}
+
+static inline int xfs_ilog_fdata(int w)
+{
+ return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
+}
+
+/*
+ * Incore version of the on-disk inode core structures. We log this directly
+ * into the journal in host CPU format (for better or worse) and as such
+ * directly mirrors the xfs_dinode structure as it must contain all the same
+ * information.
+ */
+typedef struct xfs_ictimestamp {
+ __int32_t t_sec; /* timestamp seconds */
+ __int32_t t_nsec; /* timestamp nanoseconds */
+} xfs_ictimestamp_t;
+
+/*
+ * NOTE: This structure must be kept identical to struct xfs_dinode
+ * in xfs_dinode.h except for the endianness annotations.
+ */
+typedef struct xfs_icdinode {
+ __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
+ __uint16_t di_mode; /* mode and type of file */
+ __int8_t di_version; /* inode version */
+ __int8_t di_format; /* format of di_c data */
+ __uint16_t di_onlink; /* old number of links to file */
+ __uint32_t di_uid; /* owner's user id */
+ __uint32_t di_gid; /* owner's group id */
+ __uint32_t di_nlink; /* number of links to file */
+ __uint16_t di_projid_lo; /* lower part of owner's project id */
+ __uint16_t di_projid_hi; /* higher part of owner's project id */
+ __uint8_t di_pad[6]; /* unused, zeroed space */
+ __uint16_t di_flushiter; /* incremented on flush */
+ xfs_ictimestamp_t di_atime; /* time last accessed */
+ xfs_ictimestamp_t di_mtime; /* time last modified */
+ xfs_ictimestamp_t di_ctime; /* time created/inode modified */
+ xfs_fsize_t di_size; /* number of bytes in file */
+ xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */
+ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
+ xfs_extnum_t di_nextents; /* number of extents in data fork */
+ xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
+ __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
+ __int8_t di_aformat; /* format of attr fork's data */
+ __uint32_t di_dmevmask; /* DMIG event mask */
+ __uint16_t di_dmstate; /* DMIG state info */
+ __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
+ __uint32_t di_gen; /* generation number */
+
+ /* di_next_unlinked is the only non-core field in the old dinode */
+ xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */
+
+ /* start of the extended dinode, writable fields */
+ __uint32_t di_crc; /* CRC of the inode */
+ __uint64_t di_changecount; /* number of attribute changes */
+ xfs_lsn_t di_lsn; /* flush sequence */
+ __uint64_t di_flags2; /* more random flags */
+ __uint8_t di_pad2[16]; /* more padding for future expansion */
+
+ /* fields only written to during inode creation */
+ xfs_ictimestamp_t di_crtime; /* time created */
+ xfs_ino_t di_ino; /* inode number */
+ uuid_t di_uuid; /* UUID of the filesystem */
+
+ /* structure must be padded to 64 bit alignment */
+} xfs_icdinode_t;
+
+static inline uint xfs_icdinode_size(int version)
+{
+ if (version == 3)
+ return sizeof(struct xfs_icdinode);
+ return offsetof(struct xfs_icdinode, di_next_unlinked);
+}
+
+/*
+ * Buffer Log Format defintions
+ *
+ * These are the physical dirty bitmap defintions for the log format structure.
+ */
+#define XFS_BLF_CHUNK 128
+#define XFS_BLF_SHIFT 7
+#define BIT_TO_WORD_SHIFT 5
+#define NBWORD (NBBY * sizeof(unsigned int))
+
+/*
+ * This flag indicates that the buffer contains on disk inodes
+ * and requires special recovery handling.
+ */
+#define XFS_BLF_INODE_BUF (1<<0)
+
+/*
+ * This flag indicates that the buffer should not be replayed
+ * during recovery because its blocks are being freed.
+ */
+#define XFS_BLF_CANCEL (1<<1)
+
+/*
+ * This flag indicates that the buffer contains on disk
+ * user or group dquots and may require special recovery handling.
+ */
+#define XFS_BLF_UDQUOT_BUF (1<<2)
+#define XFS_BLF_PDQUOT_BUF (1<<3)
+#define XFS_BLF_GDQUOT_BUF (1<<4)
+
+/*
+ * This is the structure used to lay out a buf log item in the
+ * log. The data map describes which 128 byte chunks of the buffer
+ * have been logged.
+ */
+#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
+
+typedef struct xfs_buf_log_format {
+ unsigned short blf_type; /* buf log item type indicator */
+ unsigned short blf_size; /* size of this item */
+ ushort blf_flags; /* misc state */
+ ushort blf_len; /* number of blocks in this buf */
+ __int64_t blf_blkno; /* starting blkno of this buf */
+ unsigned int blf_map_size; /* used size of data bitmap in words */
+ unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
+} xfs_buf_log_format_t;
+
+/*
+ * All buffers now need to tell recovery where the magic number
+ * is so that it can verify and calculate the CRCs on the buffer correctly
+ * once the changes have been replayed into the buffer.
+ *
+ * The type value is held in the upper 5 bits of the blf_flags field, which is
+ * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down.
+ */
+#define XFS_BLFT_BITS 5
+#define XFS_BLFT_SHIFT 11
+#define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT)
+
+enum xfs_blft {
+ XFS_BLFT_UNKNOWN_BUF = 0,
+ XFS_BLFT_UDQUOT_BUF,
+ XFS_BLFT_PDQUOT_BUF,
+ XFS_BLFT_GDQUOT_BUF,
+ XFS_BLFT_BTREE_BUF,
+ XFS_BLFT_AGF_BUF,
+ XFS_BLFT_AGFL_BUF,
+ XFS_BLFT_AGI_BUF,
+ XFS_BLFT_DINO_BUF,
+ XFS_BLFT_SYMLINK_BUF,
+ XFS_BLFT_DIR_BLOCK_BUF,
+ XFS_BLFT_DIR_DATA_BUF,
+ XFS_BLFT_DIR_FREE_BUF,
+ XFS_BLFT_DIR_LEAF1_BUF,
+ XFS_BLFT_DIR_LEAFN_BUF,
+ XFS_BLFT_DA_NODE_BUF,
+ XFS_BLFT_ATTR_LEAF_BUF,
+ XFS_BLFT_ATTR_RMT_BUF,
+ XFS_BLFT_SB_BUF,
+ XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
+};
+
+static inline void
+xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
+{
+ ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF);
+ blf->blf_flags &= ~XFS_BLFT_MASK;
+ blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
+}
+
+static inline __uint16_t
+xfs_blft_from_flags(struct xfs_buf_log_format *blf)
+{
+ return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT;
+}
+
+/*
+ * EFI/EFD log format definitions
+ */
+typedef struct xfs_extent {
+ xfs_dfsbno_t ext_start;
+ xfs_extlen_t ext_len;
+} xfs_extent_t;
+
+/*
+ * Since an xfs_extent_t has types (start:64, len: 32)
+ * there are different alignments on 32 bit and 64 bit kernels.
+ * So we provide the different variants for use by a
+ * conversion routine.
+ */
+typedef struct xfs_extent_32 {
+ __uint64_t ext_start;
+ __uint32_t ext_len;
+} __attribute__((packed)) xfs_extent_32_t;
+
+typedef struct xfs_extent_64 {
+ __uint64_t ext_start;
+ __uint32_t ext_len;
+ __uint32_t ext_pad;
+} xfs_extent_64_t;
+
+/*
+ * This is the structure used to lay out an efi log item in the
+ * log. The efi_extents field is a variable size array whose
+ * size is given by efi_nextents.
+ */
+typedef struct xfs_efi_log_format {
+ __uint16_t efi_type; /* efi log item type */
+ __uint16_t efi_size; /* size of this item */
+ __uint32_t efi_nextents; /* # extents to free */
+ __uint64_t efi_id; /* efi identifier */
+ xfs_extent_t efi_extents[1]; /* array of extents to free */
+} xfs_efi_log_format_t;
+
+typedef struct xfs_efi_log_format_32 {
+ __uint16_t efi_type; /* efi log item type */
+ __uint16_t efi_size; /* size of this item */
+ __uint32_t efi_nextents; /* # extents to free */
+ __uint64_t efi_id; /* efi identifier */
+ xfs_extent_32_t efi_extents[1]; /* array of extents to free */
+} __attribute__((packed)) xfs_efi_log_format_32_t;
+
+typedef struct xfs_efi_log_format_64 {
+ __uint16_t efi_type; /* efi log item type */
+ __uint16_t efi_size; /* size of this item */
+ __uint32_t efi_nextents; /* # extents to free */
+ __uint64_t efi_id; /* efi identifier */
+ xfs_extent_64_t efi_extents[1]; /* array of extents to free */
+} xfs_efi_log_format_64_t;
+
+/*
+ * This is the structure used to lay out an efd log item in the
+ * log. The efd_extents array is a variable size array whose
+ * size is given by efd_nextents;
+ */
+typedef struct xfs_efd_log_format {
+ __uint16_t efd_type; /* efd log item type */
+ __uint16_t efd_size; /* size of this item */
+ __uint32_t efd_nextents; /* # of extents freed */
+ __uint64_t efd_efi_id; /* id of corresponding efi */
+ xfs_extent_t efd_extents[1]; /* array of extents freed */
+} xfs_efd_log_format_t;
+
+typedef struct xfs_efd_log_format_32 {
+ __uint16_t efd_type; /* efd log item type */
+ __uint16_t efd_size; /* size of this item */
+ __uint32_t efd_nextents; /* # of extents freed */
+ __uint64_t efd_efi_id; /* id of corresponding efi */
+ xfs_extent_32_t efd_extents[1]; /* array of extents freed */
+} __attribute__((packed)) xfs_efd_log_format_32_t;
+
+typedef struct xfs_efd_log_format_64 {
+ __uint16_t efd_type; /* efd log item type */
+ __uint16_t efd_size; /* size of this item */
+ __uint32_t efd_nextents; /* # of extents freed */
+ __uint64_t efd_efi_id; /* id of corresponding efi */
+ xfs_extent_64_t efd_extents[1]; /* array of extents freed */
+} xfs_efd_log_format_64_t;
+
+/*
+ * Dquot Log format definitions.
+ *
+ * The first two fields must be the type and size fitting into
+ * 32 bits : log_recovery code assumes that.
+ */
+typedef struct xfs_dq_logformat {
+ __uint16_t qlf_type; /* dquot log item type */
+ __uint16_t qlf_size; /* size of this item */
+ xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */
+ __int64_t qlf_blkno; /* blkno of dquot buffer */
+ __int32_t qlf_len; /* len of dquot buffer */
+ __uint32_t qlf_boffset; /* off of dquot in buffer */
+} xfs_dq_logformat_t;
+
+/*
+ * log format struct for QUOTAOFF records.
+ * The first two fields must be the type and size fitting into
+ * 32 bits : log_recovery code assumes that.
+ * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer
+ * to the first and ensures that the first logitem is taken out of the AIL
+ * only when the last one is securely committed.
+ */
+typedef struct xfs_qoff_logformat {
+ unsigned short qf_type; /* quotaoff log item type */
+ unsigned short qf_size; /* size of this item */
+ unsigned int qf_flags; /* USR and/or GRP */
+ char qf_pad[12]; /* padding for future */
+} xfs_qoff_logformat_t;
+
+/*
+ * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
+ */
+#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */
+#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */
+#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */
+#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */
+#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */
+#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */
+#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
+
+/*
+ * Conversion to and from the combined OQUOTA flag (if necessary)
+ * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
+ */
+#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */
+#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */
+#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */
+#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */
+
+#define XFS_ALL_QUOTA_ACCT \
+ (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
+#define XFS_ALL_QUOTA_ENFD \
+ (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
+#define XFS_ALL_QUOTA_CHKD \
+ (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
+
+#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
+ XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
+ XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
+ XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
+ XFS_PQUOTA_CHKD)
+
+/*
+ * Inode create log item structure
+ *
+ * Log recovery assumes the first two entries are the type and size and they fit
+ * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
+ * decoding can be done correctly.
+ */
+struct xfs_icreate_log {
+ __uint16_t icl_type; /* type of log format structure */
+ __uint16_t icl_size; /* size of log format structure */
+ __be32 icl_ag; /* ag being allocated in */
+ __be32 icl_agbno; /* start block of inode range */
+ __be32 icl_count; /* number of inodes to initialise */
+ __be32 icl_isize; /* size of inodes */
+ __be32 icl_length; /* length of extent to initialise */
+ __be32 icl_gen; /* inode generation number to use */
+};
+
+#endif /* __XFS_LOG_FORMAT_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 16d8d12ea3b..9bc403a9e54 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -22,53 +22,16 @@ struct xfs_buf;
struct xlog;
struct xlog_ticket;
struct xfs_mount;
+struct xfs_log_callback;
/*
- * Macros, structures, prototypes for internal log manager use.
+ * Flags for log structure
*/
-
-#define XLOG_MIN_ICLOGS 2
-#define XLOG_MAX_ICLOGS 8
-#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */
-#define XLOG_VERSION_1 1
-#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */
-#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2)
-#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */
-#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */
-#define XLOG_MAX_RECORD_BSIZE (256*1024)
-#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */
-#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */
-#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */
-#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */
-#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
- (log)->l_mp->m_sb.sb_logsunit)
-#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
-
-#define XLOG_HEADER_SIZE 512
-
-#define XLOG_REC_SHIFT(log) \
- BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
- XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
-#define XLOG_TOTAL_REC_SHIFT(log) \
- BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
- XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
-
-static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
-{
- return ((xfs_lsn_t)cycle << 32) | block;
-}
-
-static inline uint xlog_get_cycle(char *ptr)
-{
- if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
- return be32_to_cpu(*((__be32 *)ptr + 1));
- else
- return be32_to_cpu(*(__be32 *)ptr);
-}
-
-#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
-
-#ifdef __KERNEL__
+#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */
+#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
+#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
+ shutdown */
+#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */
/*
* get client id from packed copy.
@@ -101,28 +64,8 @@ static inline uint xlog_get_client_id(__be32 i)
#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */
#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */
#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */
-#endif /* __KERNEL__ */
/*
- * Flags to log operation header
- *
- * The first write of a new transaction will be preceded with a start
- * record, XLOG_START_TRANS. Once a transaction is committed, a commit
- * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into
- * the remainder of the current active in-core log, it is split up into
- * multiple regions. Each partial region will be marked with a
- * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
- *
- */
-#define XLOG_START_TRANS 0x01 /* Start a new transaction */
-#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */
-#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */
-#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */
-#define XLOG_END_TRANS 0x10 /* End a continued transaction */
-#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */
-
-#ifdef __KERNEL__
-/*
* Flags to log ticket
*/
#define XLOG_TIC_INITED 0x1 /* has been initialized */
@@ -132,22 +75,6 @@ static inline uint xlog_get_client_id(__be32 i)
{ XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
{ XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
-#endif /* __KERNEL__ */
-
-#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
-
-/*
- * Flags for log structure
- */
-#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */
-#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
-#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
- shutdown */
-#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */
-
-typedef __uint32_t xlog_tid_t;
-
-#ifdef __KERNEL__
/*
* Below are states for covering allocation transactions.
* By covering, we mean changing the h_tail_lsn in the last on-disk
@@ -223,7 +150,6 @@ typedef __uint32_t xlog_tid_t;
#define XLOG_COVER_OPS 5
-
/* Ticket reservation region accounting */
#define XLOG_TIC_LEN_MAX 15
@@ -258,64 +184,6 @@ typedef struct xlog_ticket {
xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */
} xlog_ticket_t;
-#endif
-
-
-typedef struct xlog_op_header {
- __be32 oh_tid; /* transaction id of operation : 4 b */
- __be32 oh_len; /* bytes in data region : 4 b */
- __u8 oh_clientid; /* who sent me this : 1 b */
- __u8 oh_flags; /* : 1 b */
- __u16 oh_res2; /* 32 bit align : 2 b */
-} xlog_op_header_t;
-
-
-/* valid values for h_fmt */
-#define XLOG_FMT_UNKNOWN 0
-#define XLOG_FMT_LINUX_LE 1
-#define XLOG_FMT_LINUX_BE 2
-#define XLOG_FMT_IRIX_BE 3
-
-/* our fmt */
-#ifdef XFS_NATIVE_HOST
-#define XLOG_FMT XLOG_FMT_LINUX_BE
-#else
-#define XLOG_FMT XLOG_FMT_LINUX_LE
-#endif
-
-typedef struct xlog_rec_header {
- __be32 h_magicno; /* log record (LR) identifier : 4 */
- __be32 h_cycle; /* write cycle of log : 4 */
- __be32 h_version; /* LR version : 4 */
- __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */
- __be64 h_lsn; /* lsn of this LR : 8 */
- __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
- __le32 h_crc; /* crc of log record : 4 */
- __be32 h_prev_block; /* block number to previous LR : 4 */
- __be32 h_num_logops; /* number of log operations in this LR : 4 */
- __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
- /* new fields */
- __be32 h_fmt; /* format of log record : 4 */
- uuid_t h_fs_uuid; /* uuid of FS : 16 */
- __be32 h_size; /* iclog size : 4 */
-} xlog_rec_header_t;
-
-typedef struct xlog_rec_ext_header {
- __be32 xh_cycle; /* write cycle of log : 4 */
- __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */
-} xlog_rec_ext_header_t;
-
-#ifdef __KERNEL__
-
-/*
- * Quite misnamed, because this union lays out the actual on-disk log buffer.
- */
-typedef union xlog_in_core2 {
- xlog_rec_header_t hic_header;
- xlog_rec_ext_header_t hic_xheader;
- char hic_sector[XLOG_HEADER_SIZE];
-} xlog_in_core_2_t;
-
/*
* - A log record header is 512 bytes. There is plenty of room to grow the
* xlog_rec_header_t into the reserved space.
@@ -360,8 +228,8 @@ typedef struct xlog_in_core {
/* Callback structures need their own cacheline */
spinlock_t ic_callback_lock ____cacheline_aligned_in_smp;
- xfs_log_callback_t *ic_callback;
- xfs_log_callback_t **ic_callback_tail;
+ struct xfs_log_callback *ic_callback;
+ struct xfs_log_callback **ic_callback_tail;
/* reference counts need their own cacheline */
atomic_t ic_refcnt ____cacheline_aligned_in_smp;
@@ -387,7 +255,7 @@ struct xfs_cil_ctx {
int space_used; /* aggregate size of regions */
struct list_head busy_extents; /* busy extents in chkpt */
struct xfs_log_vec *lv_chain; /* logvecs being pushed */
- xfs_log_callback_t log_cb; /* completion callback hook. */
+ struct xfs_log_callback log_cb; /* completion callback hook. */
struct list_head committing; /* ctx committing list */
};
@@ -411,14 +279,17 @@ struct xfs_cil {
struct xlog *xc_log;
struct list_head xc_cil;
spinlock_t xc_cil_lock;
+
+ struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp;
struct xfs_cil_ctx *xc_ctx;
- struct rw_semaphore xc_ctx_lock;
+
+ spinlock_t xc_push_lock ____cacheline_aligned_in_smp;
+ xfs_lsn_t xc_push_seq;
struct list_head xc_committing;
wait_queue_head_t xc_commit_wait;
xfs_lsn_t xc_current_sequence;
struct work_struct xc_push_work;
- xfs_lsn_t xc_push_seq;
-};
+} ____cacheline_aligned_in_smp;
/*
* The amount of log space we allow the CIL to aggregate is difficult to size.
@@ -468,7 +339,6 @@ struct xfs_cil {
* threshold, yet give us plenty of space for aggregation on large logs.
*/
#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3)
-#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4))
/*
* ticket grant locks, queues and accounting have their own cachlines
@@ -645,12 +515,10 @@ xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
/*
* Committed Item List interfaces
*/
-int
-xlog_cil_init(struct xlog *log);
-void
-xlog_cil_init_post_recovery(struct xlog *log);
-void
-xlog_cil_destroy(struct xlog *log);
+int xlog_cil_init(struct xlog *log);
+void xlog_cil_init_post_recovery(struct xlog *log);
+void xlog_cil_destroy(struct xlog *log);
+bool xlog_cil_empty(struct xlog *log);
/*
* CIL force routines
@@ -687,6 +555,5 @@ static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
schedule();
remove_wait_queue(wq, &wait);
}
-#endif /* __KERNEL__ */
#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 96fcbb85ff8..981af0f6504 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -17,33 +17,36 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_error.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
+#include "xfs_da_format.h"
#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
-#include "xfs_ialloc.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
#include "xfs_log_priv.h"
-#include "xfs_buf_item.h"
#include "xfs_log_recover.h"
+#include "xfs_inode_item.h"
#include "xfs_extfree_item.h"
#include "xfs_trans_priv.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
#include "xfs_quota.h"
-#include "xfs_utils.h"
#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_error.h"
+#include "xfs_dir2.h"
+
+#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
STATIC int
xlog_find_zeroed(
@@ -190,7 +193,10 @@ xlog_bread_noalign(
bp->b_io_length = nbblks;
bp->b_error = 0;
- xfsbdstrat(log->l_mp, bp);
+ if (XFS_FORCED_SHUTDOWN(log->l_mp))
+ return XFS_ERROR(EIO);
+
+ xfs_buf_iorequest(bp);
error = xfs_buf_iowait(bp);
if (error)
xfs_buf_ioerror_alert(bp, __func__);
@@ -294,9 +300,9 @@ xlog_header_check_dump(
xfs_mount_t *mp,
xlog_rec_header_t *head)
{
- xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n",
+ xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d",
__func__, &mp->m_sb.sb_uuid, XLOG_FMT);
- xfs_debug(mp, " log : uuid = %pU, fmt = %d\n",
+ xfs_debug(mp, " log : uuid = %pU, fmt = %d",
&head->h_fs_uuid, be32_to_cpu(head->h_fmt));
}
#else
@@ -597,7 +603,7 @@ out:
/*
* Head is defined to be the point of the log where the next log write
- * write could go. This means that incomplete LR writes at the end are
+ * could go. This means that incomplete LR writes at the end are
* eliminated when calculating the head. We aren't guaranteed that previous
* LR have complete transactions. We only know that a cycle number of
* current cycle number -1 won't be present in the log if we start writing
@@ -953,6 +959,7 @@ xlog_find_tail(
}
if (!found) {
xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+ xlog_put_bp(bp);
ASSERT(0);
return XFS_ERROR(EIO);
}
@@ -1134,7 +1141,8 @@ xlog_find_zeroed(
*/
xfs_warn(log->l_mp,
"Log inconsistent or not a log (last==0, first!=1)");
- return XFS_ERROR(EINVAL);
+ error = XFS_ERROR(EINVAL);
+ goto bp_err;
}
/* we have a partially zeroed log */
@@ -1442,9 +1450,8 @@ xlog_recover_find_tid(
xlog_tid_t tid)
{
xlog_recover_t *trans;
- struct hlist_node *n;
- hlist_for_each_entry(trans, n, head, r_list) {
+ hlist_for_each_entry(trans, head, r_list) {
if (trans->r_log_tid == tid)
return trans;
}
@@ -1573,6 +1580,7 @@ xlog_recover_add_to_trans(
"bad number of regions (%d) in inode log format",
in_f->ilf_size);
ASSERT(0);
+ kmem_free(ptr);
return XFS_ERROR(EIO);
}
@@ -1591,10 +1599,53 @@ xlog_recover_add_to_trans(
}
/*
- * Sort the log items in the transaction. Cancelled buffers need
- * to be put first so they are processed before any items that might
- * modify the buffers. If they are cancelled, then the modifications
- * don't need to be replayed.
+ * Sort the log items in the transaction.
+ *
+ * The ordering constraints are defined by the inode allocation and unlink
+ * behaviour. The rules are:
+ *
+ * 1. Every item is only logged once in a given transaction. Hence it
+ * represents the last logged state of the item. Hence ordering is
+ * dependent on the order in which operations need to be performed so
+ * required initial conditions are always met.
+ *
+ * 2. Cancelled buffers are recorded in pass 1 in a separate table and
+ * there's nothing to replay from them so we can simply cull them
+ * from the transaction. However, we can't do that until after we've
+ * replayed all the other items because they may be dependent on the
+ * cancelled buffer and replaying the cancelled buffer can remove it
+ * form the cancelled buffer table. Hence they have tobe done last.
+ *
+ * 3. Inode allocation buffers must be replayed before inode items that
+ * read the buffer and replay changes into it. For filesystems using the
+ * ICREATE transactions, this means XFS_LI_ICREATE objects need to get
+ * treated the same as inode allocation buffers as they create and
+ * initialise the buffers directly.
+ *
+ * 4. Inode unlink buffers must be replayed after inode items are replayed.
+ * This ensures that inodes are completely flushed to the inode buffer
+ * in a "free" state before we remove the unlinked inode list pointer.
+ *
+ * Hence the ordering needs to be inode allocation buffers first, inode items
+ * second, inode unlink buffers third and cancelled buffers last.
+ *
+ * But there's a problem with that - we can't tell an inode allocation buffer
+ * apart from a regular buffer, so we can't separate them. We can, however,
+ * tell an inode unlink buffer from the others, and so we can separate them out
+ * from all the other buffers and move them to last.
+ *
+ * Hence, 4 lists, in order from head to tail:
+ * - buffer_list for all buffers except cancelled/inode unlink buffers
+ * - item_list for all non-buffer items
+ * - inode_buffer_list for inode unlink buffers
+ * - cancel_list for the cancelled buffers
+ *
+ * Note that we add objects to the tail of the lists so that first-to-last
+ * ordering is preserved within the lists. Adding objects to the head of the
+ * list means when we traverse from the head we walk them in last-to-first
+ * order. For cancelled buffers and inode unlink buffers this doesn't matter,
+ * but for all other items there may be specific ordering that we need to
+ * preserve.
*/
STATIC int
xlog_recover_reorder_trans(
@@ -1603,20 +1654,34 @@ xlog_recover_reorder_trans(
int pass)
{
xlog_recover_item_t *item, *n;
+ int error = 0;
LIST_HEAD(sort_list);
+ LIST_HEAD(cancel_list);
+ LIST_HEAD(buffer_list);
+ LIST_HEAD(inode_buffer_list);
+ LIST_HEAD(inode_list);
list_splice_init(&trans->r_itemq, &sort_list);
list_for_each_entry_safe(item, n, &sort_list, ri_list) {
xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
switch (ITEM_TYPE(item)) {
+ case XFS_LI_ICREATE:
+ list_move_tail(&item->ri_list, &buffer_list);
+ break;
case XFS_LI_BUF:
- if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
+ if (buf_f->blf_flags & XFS_BLF_CANCEL) {
trace_xfs_log_recover_item_reorder_head(log,
trans, item, pass);
- list_move(&item->ri_list, &trans->r_itemq);
+ list_move(&item->ri_list, &cancel_list);
break;
}
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
+ list_move(&item->ri_list, &inode_buffer_list);
+ break;
+ }
+ list_move_tail(&item->ri_list, &buffer_list);
+ break;
case XFS_LI_INODE:
case XFS_LI_DQUOT:
case XFS_LI_QUOTAOFF:
@@ -1624,18 +1689,34 @@ xlog_recover_reorder_trans(
case XFS_LI_EFI:
trace_xfs_log_recover_item_reorder_tail(log,
trans, item, pass);
- list_move_tail(&item->ri_list, &trans->r_itemq);
+ list_move_tail(&item->ri_list, &inode_list);
break;
default:
xfs_warn(log->l_mp,
"%s: unrecognized type of log operation",
__func__);
ASSERT(0);
- return XFS_ERROR(EIO);
+ /*
+ * return the remaining items back to the transaction
+ * item list so they can be freed in caller.
+ */
+ if (!list_empty(&sort_list))
+ list_splice_init(&sort_list, &trans->r_itemq);
+ error = XFS_ERROR(EIO);
+ goto out;
}
}
+out:
ASSERT(list_empty(&sort_list));
- return 0;
+ if (!list_empty(&buffer_list))
+ list_splice(&buffer_list, &trans->r_itemq);
+ if (!list_empty(&inode_list))
+ list_splice_tail(&inode_list, &trans->r_itemq);
+ if (!list_empty(&inode_buffer_list))
+ list_splice_tail(&inode_buffer_list, &trans->r_itemq);
+ if (!list_empty(&cancel_list))
+ list_splice_tail(&cancel_list, &trans->r_itemq);
+ return error;
}
/*
@@ -1693,19 +1774,11 @@ xlog_recover_buffer_pass1(
/*
* Check to see whether the buffer being recovered has a corresponding
- * entry in the buffer cancel record table. If it does then return 1
- * so that it will be cancelled, otherwise return 0. If the buffer is
- * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
- * the refcount on the entry in the table and remove it from the table
- * if this is the last reference.
- *
- * We remove the cancel record from the table when we encounter its
- * last occurrence in the log so that if the same buffer is re-used
- * again after its last cancellation we actually replay the changes
- * made at that point.
+ * entry in the buffer cancel record table. If it is, return the cancel
+ * buffer structure to the caller.
*/
-STATIC int
-xlog_check_buffer_cancelled(
+STATIC struct xfs_buf_cancel *
+xlog_peek_buffer_cancelled(
struct xlog *log,
xfs_daddr_t blkno,
uint len,
@@ -1714,22 +1787,16 @@ xlog_check_buffer_cancelled(
struct list_head *bucket;
struct xfs_buf_cancel *bcp;
- if (log->l_buf_cancel_table == NULL) {
- /*
- * There is nothing in the table built in pass one,
- * so this buffer must not be cancelled.
- */
+ if (!log->l_buf_cancel_table) {
+ /* empty table means no cancelled buffers in the log */
ASSERT(!(flags & XFS_BLF_CANCEL));
- return 0;
+ return NULL;
}
- /*
- * Search for an entry in the cancel table that matches our buffer.
- */
bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
list_for_each_entry(bcp, bucket, bc_list) {
if (bcp->bc_blkno == blkno && bcp->bc_len == len)
- goto found;
+ return bcp;
}
/*
@@ -1737,9 +1804,32 @@ xlog_check_buffer_cancelled(
* that the buffer is NOT cancelled.
*/
ASSERT(!(flags & XFS_BLF_CANCEL));
- return 0;
+ return NULL;
+}
+
+/*
+ * If the buffer is being cancelled then return 1 so that it will be cancelled,
+ * otherwise return 0. If the buffer is actually a buffer cancel item
+ * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
+ * table and remove it from the table if this is the last reference.
+ *
+ * We remove the cancel record from the table when we encounter its last
+ * occurrence in the log so that if the same buffer is re-used again after its
+ * last cancellation we actually replay the changes made at that point.
+ */
+STATIC int
+xlog_check_buffer_cancelled(
+ struct xlog *log,
+ xfs_daddr_t blkno,
+ uint len,
+ ushort flags)
+{
+ struct xfs_buf_cancel *bcp;
+
+ bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
+ if (!bcp)
+ return 0;
-found:
/*
* We've go a match, so return 1 so that the recovery of this buffer
* is cancelled. If this buffer is actually a buffer cancel log
@@ -1787,6 +1877,13 @@ xlog_recover_do_inode_buffer(
trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
+ /*
+ * Post recovery validation only works properly on CRC enabled
+ * filesystems.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ bp->b_ops = &xfs_inode_buf_ops;
+
inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
for (i = 0; i < inodes_per_buf; i++) {
next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
@@ -1852,12 +1949,361 @@ xlog_recover_do_inode_buffer(
buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
next_unlinked_offset);
*buffer_nextp = *logged_nextp;
+
+ /*
+ * If necessary, recalculate the CRC in the on-disk inode. We
+ * have to leave the inode in a consistent state for whoever
+ * reads it next....
+ */
+ xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
+ xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
+
}
return 0;
}
/*
+ * V5 filesystems know the age of the buffer on disk being recovered. We can
+ * have newer objects on disk than we are replaying, and so for these cases we
+ * don't want to replay the current change as that will make the buffer contents
+ * temporarily invalid on disk.
+ *
+ * The magic number might not match the buffer type we are going to recover
+ * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence
+ * extract the LSN of the existing object in the buffer based on it's current
+ * magic number. If we don't recognise the magic number in the buffer, then
+ * return a LSN of -1 so that the caller knows it was an unrecognised block and
+ * so can recover the buffer.
+ *
+ * Note: we cannot rely solely on magic number matches to determine that the
+ * buffer has a valid LSN - we also need to verify that it belongs to this
+ * filesystem, so we need to extract the object's LSN and compare it to that
+ * which we read from the superblock. If the UUIDs don't match, then we've got a
+ * stale metadata block from an old filesystem instance that we need to recover
+ * over the top of.
+ */
+static xfs_lsn_t
+xlog_recover_get_buf_lsn(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp)
+{
+ __uint32_t magic32;
+ __uint16_t magic16;
+ __uint16_t magicda;
+ void *blk = bp->b_addr;
+ uuid_t *uuid;
+ xfs_lsn_t lsn = -1;
+
+ /* v4 filesystems always recover immediately */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ goto recover_immediately;
+
+ magic32 = be32_to_cpu(*(__be32 *)blk);
+ switch (magic32) {
+ case XFS_ABTB_CRC_MAGIC:
+ case XFS_ABTC_CRC_MAGIC:
+ case XFS_ABTB_MAGIC:
+ case XFS_ABTC_MAGIC:
+ case XFS_IBT_CRC_MAGIC:
+ case XFS_IBT_MAGIC: {
+ struct xfs_btree_block *btb = blk;
+
+ lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
+ uuid = &btb->bb_u.s.bb_uuid;
+ break;
+ }
+ case XFS_BMAP_CRC_MAGIC:
+ case XFS_BMAP_MAGIC: {
+ struct xfs_btree_block *btb = blk;
+
+ lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
+ uuid = &btb->bb_u.l.bb_uuid;
+ break;
+ }
+ case XFS_AGF_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
+ uuid = &((struct xfs_agf *)blk)->agf_uuid;
+ break;
+ case XFS_AGFL_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
+ uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
+ break;
+ case XFS_AGI_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
+ uuid = &((struct xfs_agi *)blk)->agi_uuid;
+ break;
+ case XFS_SYMLINK_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
+ uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
+ break;
+ case XFS_DIR3_BLOCK_MAGIC:
+ case XFS_DIR3_DATA_MAGIC:
+ case XFS_DIR3_FREE_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
+ uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
+ break;
+ case XFS_ATTR3_RMT_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
+ uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid;
+ break;
+ case XFS_SB_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
+ uuid = &((struct xfs_dsb *)blk)->sb_uuid;
+ break;
+ default:
+ break;
+ }
+
+ if (lsn != (xfs_lsn_t)-1) {
+ if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+ goto recover_immediately;
+ return lsn;
+ }
+
+ magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
+ switch (magicda) {
+ case XFS_DIR3_LEAF1_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
+ uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
+ break;
+ default:
+ break;
+ }
+
+ if (lsn != (xfs_lsn_t)-1) {
+ if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+ goto recover_immediately;
+ return lsn;
+ }
+
+ /*
+ * We do individual object checks on dquot and inode buffers as they
+ * have their own individual LSN records. Also, we could have a stale
+ * buffer here, so we have to at least recognise these buffer types.
+ *
+ * A notd complexity here is inode unlinked list processing - it logs
+ * the inode directly in the buffer, but we don't know which inodes have
+ * been modified, and there is no global buffer LSN. Hence we need to
+ * recover all inode buffer types immediately. This problem will be
+ * fixed by logical logging of the unlinked list modifications.
+ */
+ magic16 = be16_to_cpu(*(__be16 *)blk);
+ switch (magic16) {
+ case XFS_DQUOT_MAGIC:
+ case XFS_DINODE_MAGIC:
+ goto recover_immediately;
+ default:
+ break;
+ }
+
+ /* unknown buffer contents, recover immediately */
+
+recover_immediately:
+ return (xfs_lsn_t)-1;
+
+}
+
+/*
+ * Validate the recovered buffer is of the correct type and attach the
+ * appropriate buffer operations to them for writeback. Magic numbers are in a
+ * few places:
+ * the first 16 bits of the buffer (inode buffer, dquot buffer),
+ * the first 32 bits of the buffer (most blocks),
+ * inside a struct xfs_da_blkinfo at the start of the buffer.
+ */
+static void
+xlog_recover_validate_buf_type(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_buf_log_format_t *buf_f)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+ __uint32_t magic32;
+ __uint16_t magic16;
+ __uint16_t magicda;
+
+ magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
+ magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
+ magicda = be16_to_cpu(info->magic);
+ switch (xfs_blft_from_flags(buf_f)) {
+ case XFS_BLFT_BTREE_BUF:
+ switch (magic32) {
+ case XFS_ABTB_CRC_MAGIC:
+ case XFS_ABTC_CRC_MAGIC:
+ case XFS_ABTB_MAGIC:
+ case XFS_ABTC_MAGIC:
+ bp->b_ops = &xfs_allocbt_buf_ops;
+ break;
+ case XFS_IBT_CRC_MAGIC:
+ case XFS_FIBT_CRC_MAGIC:
+ case XFS_IBT_MAGIC:
+ case XFS_FIBT_MAGIC:
+ bp->b_ops = &xfs_inobt_buf_ops;
+ break;
+ case XFS_BMAP_CRC_MAGIC:
+ case XFS_BMAP_MAGIC:
+ bp->b_ops = &xfs_bmbt_buf_ops;
+ break;
+ default:
+ xfs_warn(mp, "Bad btree block magic!");
+ ASSERT(0);
+ break;
+ }
+ break;
+ case XFS_BLFT_AGF_BUF:
+ if (magic32 != XFS_AGF_MAGIC) {
+ xfs_warn(mp, "Bad AGF block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_agf_buf_ops;
+ break;
+ case XFS_BLFT_AGFL_BUF:
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ break;
+ if (magic32 != XFS_AGFL_MAGIC) {
+ xfs_warn(mp, "Bad AGFL block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_agfl_buf_ops;
+ break;
+ case XFS_BLFT_AGI_BUF:
+ if (magic32 != XFS_AGI_MAGIC) {
+ xfs_warn(mp, "Bad AGI block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_agi_buf_ops;
+ break;
+ case XFS_BLFT_UDQUOT_BUF:
+ case XFS_BLFT_PDQUOT_BUF:
+ case XFS_BLFT_GDQUOT_BUF:
+#ifdef CONFIG_XFS_QUOTA
+ if (magic16 != XFS_DQUOT_MAGIC) {
+ xfs_warn(mp, "Bad DQUOT block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dquot_buf_ops;
+#else
+ xfs_alert(mp,
+ "Trying to recover dquots without QUOTA support built in!");
+ ASSERT(0);
+#endif
+ break;
+ case XFS_BLFT_DINO_BUF:
+ /*
+ * we get here with inode allocation buffers, not buffers that
+ * track unlinked list changes.
+ */
+ if (magic16 != XFS_DINODE_MAGIC) {
+ xfs_warn(mp, "Bad INODE block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_inode_buf_ops;
+ break;
+ case XFS_BLFT_SYMLINK_BUF:
+ if (magic32 != XFS_SYMLINK_MAGIC) {
+ xfs_warn(mp, "Bad symlink block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_symlink_buf_ops;
+ break;
+ case XFS_BLFT_DIR_BLOCK_BUF:
+ if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
+ magic32 != XFS_DIR3_BLOCK_MAGIC) {
+ xfs_warn(mp, "Bad dir block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_block_buf_ops;
+ break;
+ case XFS_BLFT_DIR_DATA_BUF:
+ if (magic32 != XFS_DIR2_DATA_MAGIC &&
+ magic32 != XFS_DIR3_DATA_MAGIC) {
+ xfs_warn(mp, "Bad dir data magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_data_buf_ops;
+ break;
+ case XFS_BLFT_DIR_FREE_BUF:
+ if (magic32 != XFS_DIR2_FREE_MAGIC &&
+ magic32 != XFS_DIR3_FREE_MAGIC) {
+ xfs_warn(mp, "Bad dir3 free magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_free_buf_ops;
+ break;
+ case XFS_BLFT_DIR_LEAF1_BUF:
+ if (magicda != XFS_DIR2_LEAF1_MAGIC &&
+ magicda != XFS_DIR3_LEAF1_MAGIC) {
+ xfs_warn(mp, "Bad dir leaf1 magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ break;
+ case XFS_BLFT_DIR_LEAFN_BUF:
+ if (magicda != XFS_DIR2_LEAFN_MAGIC &&
+ magicda != XFS_DIR3_LEAFN_MAGIC) {
+ xfs_warn(mp, "Bad dir leafn magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
+ break;
+ case XFS_BLFT_DA_NODE_BUF:
+ if (magicda != XFS_DA_NODE_MAGIC &&
+ magicda != XFS_DA3_NODE_MAGIC) {
+ xfs_warn(mp, "Bad da node magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ break;
+ case XFS_BLFT_ATTR_LEAF_BUF:
+ if (magicda != XFS_ATTR_LEAF_MAGIC &&
+ magicda != XFS_ATTR3_LEAF_MAGIC) {
+ xfs_warn(mp, "Bad attr leaf magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
+ break;
+ case XFS_BLFT_ATTR_RMT_BUF:
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ break;
+ if (magic32 != XFS_ATTR3_RMT_MAGIC) {
+ xfs_warn(mp, "Bad attr remote magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_attr3_rmt_buf_ops;
+ break;
+ case XFS_BLFT_SB_BUF:
+ if (magic32 != XFS_SB_MAGIC) {
+ xfs_warn(mp, "Bad SB block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_sb_buf_ops;
+ break;
+ default:
+ xfs_warn(mp, "Unknown buffer type %d!",
+ xfs_blft_from_flags(buf_f));
+ break;
+ }
+}
+
+/*
* Perform a 'normal' buffer recovery. Each logged region of the
* buffer should be copied over the corresponding region in the
* given buffer. The bitmap in the buf log format structure indicates
@@ -1893,6 +2339,17 @@ xlog_recover_do_reg_buffer(
((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
/*
+ * The dirty regions logged in the buffer, even though
+ * contiguous, may span multiple chunks. This is because the
+ * dirty region may span a physical page boundary in a buffer
+ * and hence be split into two separate vectors for writing into
+ * the log. Hence we need to trim nbits back to the length of
+ * the current region being copied out of the log.
+ */
+ if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
+ nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
+
+ /*
* Do a sanity check if this is a dquot buffer. Just checking
* the first dquot in the buffer should do. XXXThis is
* probably a good thing to do for other buf types also.
@@ -1911,7 +2368,7 @@ xlog_recover_do_reg_buffer(
item->ri_buf[i].i_len, __func__);
goto next;
}
- error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
+ error = xfs_dqcheck(mp, item->ri_buf[i].i_addr,
-1, 0, XFS_QMOPT_DOWARN,
"dquot_buf_recover");
if (error)
@@ -1929,132 +2386,22 @@ xlog_recover_do_reg_buffer(
/* Shouldn't be any more regions */
ASSERT(i == item->ri_total);
-}
-
-/*
- * Do some primitive error checking on ondisk dquot data structures.
- */
-int
-xfs_qm_dqcheck(
- struct xfs_mount *mp,
- xfs_disk_dquot_t *ddq,
- xfs_dqid_t id,
- uint type, /* used only when IO_dorepair is true */
- uint flags,
- char *str)
-{
- xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
- int errs = 0;
-
- /*
- * We can encounter an uninitialized dquot buffer for 2 reasons:
- * 1. If we crash while deleting the quotainode(s), and those blks got
- * used for user data. This is because we take the path of regular
- * file deletion; however, the size field of quotainodes is never
- * updated, so all the tricks that we play in itruncate_finish
- * don't quite matter.
- *
- * 2. We don't play the quota buffers when there's a quotaoff logitem.
- * But the allocation will be replayed so we'll end up with an
- * uninitialized quota block.
- *
- * This is all fine; things are still consistent, and we haven't lost
- * any quota information. Just don't complain about bad dquot blks.
- */
- if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
- str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
- errs++;
- }
- if (ddq->d_version != XFS_DQUOT_VERSION) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
- str, id, ddq->d_version, XFS_DQUOT_VERSION);
- errs++;
- }
-
- if (ddq->d_flags != XFS_DQ_USER &&
- ddq->d_flags != XFS_DQ_PROJ &&
- ddq->d_flags != XFS_DQ_GROUP) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
- str, id, ddq->d_flags);
- errs++;
- }
-
- if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : ondisk-dquot 0x%p, ID mismatch: "
- "0x%x expected, found id 0x%x",
- str, ddq, id, be32_to_cpu(ddq->d_id));
- errs++;
- }
-
- if (!errs && ddq->d_id) {
- if (ddq->d_blk_softlimit &&
- be64_to_cpu(ddq->d_bcount) >
- be64_to_cpu(ddq->d_blk_softlimit)) {
- if (!ddq->d_btimer) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
- str, (int)be32_to_cpu(ddq->d_id), ddq);
- errs++;
- }
- }
- if (ddq->d_ino_softlimit &&
- be64_to_cpu(ddq->d_icount) >
- be64_to_cpu(ddq->d_ino_softlimit)) {
- if (!ddq->d_itimer) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
- str, (int)be32_to_cpu(ddq->d_id), ddq);
- errs++;
- }
- }
- if (ddq->d_rtb_softlimit &&
- be64_to_cpu(ddq->d_rtbcount) >
- be64_to_cpu(ddq->d_rtb_softlimit)) {
- if (!ddq->d_rtbtimer) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
- str, (int)be32_to_cpu(ddq->d_id), ddq);
- errs++;
- }
- }
- }
-
- if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
- return errs;
-
- if (flags & XFS_QMOPT_DOWARN)
- xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
/*
- * Typically, a repair is only requested by quotacheck.
+ * We can only do post recovery validation on items on CRC enabled
+ * fielsystems as we need to know when the buffer was written to be able
+ * to determine if we should have replayed the item. If we replay old
+ * metadata over a newer buffer, then it will enter a temporarily
+ * inconsistent state resulting in verification failures. Hence for now
+ * just avoid the verification stage for non-crc filesystems
*/
- ASSERT(id != -1);
- ASSERT(flags & XFS_QMOPT_DQREPAIR);
- memset(d, 0, sizeof(xfs_dqblk_t));
-
- d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
- d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
- d->dd_diskdq.d_flags = type;
- d->dd_diskdq.d_id = cpu_to_be32(id);
-
- return errs;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xlog_recover_validate_buf_type(mp, bp, buf_f);
}
/*
* Perform a dquot buffer recovery.
- * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
+ * Simple algorithm: if we have found a QUOTAOFF log item of the same type
* (ie. USR or GRP), then just toss this buffer away; don't recover it.
* Else, treat it as a regular buffer and do recovery.
*/
@@ -2113,20 +2460,22 @@ xlog_recover_do_dquot_buffer(
* over the log during recovery. During the first we build a table of
* those buffers which have been cancelled, and during the second we
* only replay those buffers which do not have corresponding cancel
- * records in the table. See xlog_recover_do_buffer_pass[1,2] above
+ * records in the table. See xlog_recover_buffer_pass[1,2] above
* for more details on the implementation of the table of cancel records.
*/
STATIC int
xlog_recover_buffer_pass2(
struct xlog *log,
struct list_head *buffer_list,
- struct xlog_recover_item *item)
+ struct xlog_recover_item *item,
+ xfs_lsn_t current_lsn)
{
xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
xfs_mount_t *mp = log->l_mp;
xfs_buf_t *bp;
int error;
uint buf_flags;
+ xfs_lsn_t lsn;
/*
* In this pass we only want to recover all the buffers which have
@@ -2151,10 +2500,17 @@ xlog_recover_buffer_pass2(
error = bp->b_error;
if (error) {
xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
- xfs_buf_relse(bp);
- return error;
+ goto out_release;
}
+ /*
+ * recover the buffer only if we get an LSN from it and it's less than
+ * the lsn of the transaction we are replaying.
+ */
+ lsn = xlog_recover_get_buf_lsn(mp, bp);
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0)
+ goto out_release;
+
if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
} else if (buf_f->blf_flags &
@@ -2164,7 +2520,7 @@ xlog_recover_buffer_pass2(
xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
}
if (error)
- return XFS_ERROR(error);
+ goto out_release;
/*
* Perform delayed write on the buffer. Asynchronous writes will be
@@ -2172,19 +2528,19 @@ xlog_recover_buffer_pass2(
*
* Also make sure that only inode buffers with good sizes stay in
* the buffer cache. The kernel moves inodes in buffers of 1 block
- * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode
+ * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode
* buffers in the log can be a different size if the log was generated
* by an older kernel using unclustered inode buffers or a newer kernel
* running with a different inode cluster size. Regardless, if the
- * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
- * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
+ * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size)
+ * for *our* value of mp->m_inode_cluster_size, then we need to keep
* the buffer out of the buffer cache so that the buffer won't
* overlap with future reads of those inodes.
*/
if (XFS_DINODE_MAGIC ==
be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
(BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
- (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
+ (__uint32_t)log->l_mp->m_inode_cluster_size))) {
xfs_buf_stale(bp);
error = xfs_bwrite(bp);
} else {
@@ -2193,15 +2549,93 @@ xlog_recover_buffer_pass2(
xfs_buf_delwri_queue(bp, buffer_list);
}
+out_release:
xfs_buf_relse(bp);
return error;
}
+/*
+ * Inode fork owner changes
+ *
+ * If we have been told that we have to reparent the inode fork, it's because an
+ * extent swap operation on a CRC enabled filesystem has been done and we are
+ * replaying it. We need to walk the BMBT of the appropriate fork and change the
+ * owners of it.
+ *
+ * The complexity here is that we don't have an inode context to work with, so
+ * after we've replayed the inode we need to instantiate one. This is where the
+ * fun begins.
+ *
+ * We are in the middle of log recovery, so we can't run transactions. That
+ * means we cannot use cache coherent inode instantiation via xfs_iget(), as
+ * that will result in the corresponding iput() running the inode through
+ * xfs_inactive(). If we've just replayed an inode core that changes the link
+ * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
+ * transactions (bad!).
+ *
+ * So, to avoid this, we instantiate an inode directly from the inode core we've
+ * just recovered. We have the buffer still locked, and all we really need to
+ * instantiate is the inode core and the forks being modified. We can do this
+ * manually, then run the inode btree owner change, and then tear down the
+ * xfs_inode without having to run any transactions at all.
+ *
+ * Also, because we don't have a transaction context available here but need to
+ * gather all the buffers we modify for writeback so we pass the buffer_list
+ * instead for the operation to use.
+ */
+
+STATIC int
+xfs_recover_inode_owner_change(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip,
+ struct xfs_inode_log_format *in_f,
+ struct list_head *buffer_list)
+{
+ struct xfs_inode *ip;
+ int error;
+
+ ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
+
+ ip = xfs_inode_alloc(mp, in_f->ilf_ino);
+ if (!ip)
+ return ENOMEM;
+
+ /* instantiate the inode */
+ xfs_dinode_from_disk(&ip->i_d, dip);
+ ASSERT(ip->i_d.di_version >= 3);
+
+ error = xfs_iformat_fork(ip, dip);
+ if (error)
+ goto out_free_ip;
+
+
+ if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
+ ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
+ error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
+ ip->i_ino, buffer_list);
+ if (error)
+ goto out_free_ip;
+ }
+
+ if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
+ ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
+ error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
+ ip->i_ino, buffer_list);
+ if (error)
+ goto out_free_ip;
+ }
+
+out_free_ip:
+ xfs_inode_free(ip);
+ return error;
+}
+
STATIC int
xlog_recover_inode_pass2(
struct xlog *log,
struct list_head *buffer_list,
- struct xlog_recover_item *item)
+ struct xlog_recover_item *item,
+ xfs_lsn_t current_lsn)
{
xfs_inode_log_format_t *in_f;
xfs_mount_t *mp = log->l_mp;
@@ -2214,6 +2648,7 @@ xlog_recover_inode_pass2(
int attr_index;
uint fields;
xfs_icdinode_t *dicp;
+ uint isize;
int need_free = 0;
if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
@@ -2239,7 +2674,7 @@ xlog_recover_inode_pass2(
trace_xfs_log_recover_inode_recover(log, in_f);
bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
- NULL);
+ &xfs_inode_buf_ops);
if (!bp) {
error = ENOMEM;
goto error;
@@ -2247,8 +2682,7 @@ xlog_recover_inode_pass2(
error = bp->b_error;
if (error) {
xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
- xfs_buf_relse(bp);
- goto error;
+ goto out_release;
}
ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
@@ -2258,29 +2692,52 @@ xlog_recover_inode_pass2(
* like an inode!
*/
if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
__func__, dip, bp, in_f->ilf_ino);
XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
XFS_ERRLEVEL_LOW, mp);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
dicp = item->ri_buf[1].i_addr;
if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
__func__, item, in_f->ilf_ino);
XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
XFS_ERRLEVEL_LOW, mp);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
+ }
+
+ /*
+ * If the inode has an LSN in it, recover the inode only if it's less
+ * than the lsn of the transaction we are replaying. Note: we still
+ * need to replay an owner change even though the inode is more recent
+ * than the transaction as there is no guarantee that all the btree
+ * blocks are more recent than this transaction, too.
+ */
+ if (dip->di_version >= 3) {
+ xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
+
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+ trace_xfs_log_recover_inode_skip(log, in_f);
+ error = 0;
+ goto out_owner_change;
+ }
}
- /* Skip replay when the on disk inode is newer than the log one */
- if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+ /*
+ * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
+ * are transactional and if ordering is necessary we can determine that
+ * more accurately by the LSN field in the V3 inode core. Don't trust
+ * the inode versions we might be changing them here - use the
+ * superblock flag to determine whether we need to look at di_flushiter
+ * to skip replay when the on disk inode is newer than the log one
+ */
+ if (!xfs_sb_version_hascrc(&mp->m_sb) &&
+ dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
/*
* Deal with the wrap case, DI_MAX_FLUSH is less
* than smaller numbers
@@ -2289,12 +2746,12 @@ xlog_recover_inode_pass2(
dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
/* do nothing */
} else {
- xfs_buf_relse(bp);
trace_xfs_log_recover_inode_skip(log, in_f);
error = 0;
- goto error;
+ goto out_release;
}
}
+
/* Take the opportunity to reset the flush iteration count */
dicp->di_flushiter = 0;
@@ -2303,13 +2760,12 @@ xlog_recover_inode_pass2(
(dicp->di_format != XFS_DINODE_FMT_BTREE)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad regular inode log record, rec ptr 0x%p, "
"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
__func__, item, dip, bp, in_f->ilf_ino);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
} else if (unlikely(S_ISDIR(dicp->di_mode))) {
if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
@@ -2317,19 +2773,17 @@ xlog_recover_inode_pass2(
(dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad dir inode log record, rec ptr 0x%p, "
"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
__func__, item, dip, bp, in_f->ilf_ino);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
}
if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
"dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
@@ -2337,38 +2791,37 @@ xlog_recover_inode_pass2(
dicp->di_nextents + dicp->di_anextents,
dicp->di_nblocks);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
"dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
- if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
+ isize = xfs_icdinode_size(dicp->di_version);
+ if (unlikely(item->ri_buf[1].i_len > isize)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad inode log record length %d, rec ptr 0x%p",
__func__, item->ri_buf[1].i_len, item);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
/* The core is in in-core format */
- xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr);
+ xfs_dinode_to_disk(dip, dicp);
/* the rest is in on-disk format */
- if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
- memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
- item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
- item->ri_buf[1].i_len - sizeof(struct xfs_icdinode));
+ if (item->ri_buf[1].i_len > isize) {
+ memcpy((char *)dip + isize,
+ item->ri_buf[1].i_addr + isize,
+ item->ri_buf[1].i_len - isize);
}
fields = in_f->ilf_fields;
@@ -2384,7 +2837,7 @@ xlog_recover_inode_pass2(
}
if (in_f->ilf_size == 2)
- goto write_inode_buffer;
+ goto out_owner_change;
len = item->ri_buf[2].i_len;
src = item->ri_buf[2].i_addr;
ASSERT(in_f->ilf_size <= 4);
@@ -2445,16 +2898,23 @@ xlog_recover_inode_pass2(
default:
xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
ASSERT(0);
- xfs_buf_relse(bp);
error = EIO;
- goto error;
+ goto out_release;
}
}
-write_inode_buffer:
+out_owner_change:
+ if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER))
+ error = xfs_recover_inode_owner_change(mp, dip, in_f,
+ buffer_list);
+ /* re-generate the checksum. */
+ xfs_dinode_calc_crc(log->l_mp, dip);
+
ASSERT(bp->b_target->bt_mount == mp);
bp->b_iodone = xlog_recover_iodone;
xfs_buf_delwri_queue(bp, buffer_list);
+
+out_release:
xfs_buf_relse(bp);
error:
if (need_free)
@@ -2496,7 +2956,8 @@ STATIC int
xlog_recover_dquot_pass2(
struct xlog *log,
struct list_head *buffer_list,
- struct xlog_recover_item *item)
+ struct xlog_recover_item *item,
+ xfs_lsn_t current_lsn)
{
xfs_mount_t *mp = log->l_mp;
xfs_buf_t *bp;
@@ -2543,7 +3004,7 @@ xlog_recover_dquot_pass2(
*/
dq_f = item->ri_buf[0].i_addr;
ASSERT(dq_f);
- error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
+ error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
"xlog_recover_dquot_pass2 (log copy)");
if (error)
return XFS_ERROR(EIO);
@@ -2563,22 +3024,40 @@ xlog_recover_dquot_pass2(
* was among a chunk of dquots created earlier, and we did some
* minimal initialization then.
*/
- error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
+ error = xfs_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
"xlog_recover_dquot_pass2");
if (error) {
xfs_buf_relse(bp);
return XFS_ERROR(EIO);
}
+ /*
+ * If the dquot has an LSN in it, recover the dquot only if it's less
+ * than the lsn of the transaction we are replaying.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
+ xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn);
+
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+ goto out_release;
+ }
+ }
+
memcpy(ddq, recddq, item->ri_buf[1].i_len);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
ASSERT(dq_f->qlf_size == 2);
ASSERT(bp->b_target->bt_mount == mp);
bp->b_iodone = xlog_recover_iodone;
xfs_buf_delwri_queue(bp, buffer_list);
- xfs_buf_relse(bp);
- return (0);
+out_release:
+ xfs_buf_relse(bp);
+ return 0;
}
/*
@@ -2668,13 +3147,100 @@ xlog_recover_efd_pass2(
}
lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
return 0;
}
/*
+ * This routine is called when an inode create format structure is found in a
+ * committed transaction in the log. It's purpose is to initialise the inodes
+ * being allocated on disk. This requires us to get inode cluster buffers that
+ * match the range to be intialised, stamped with inode templates and written
+ * by delayed write so that subsequent modifications will hit the cached buffer
+ * and only need writing out at the end of recovery.
+ */
+STATIC int
+xlog_recover_do_icreate_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ xlog_recover_item_t *item)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_icreate_log *icl;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ unsigned int count;
+ unsigned int isize;
+ xfs_agblock_t length;
+
+ icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
+ if (icl->icl_type != XFS_LI_ICREATE) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
+ return EINVAL;
+ }
+
+ if (icl->icl_size != 1) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
+ return EINVAL;
+ }
+
+ agno = be32_to_cpu(icl->icl_ag);
+ if (agno >= mp->m_sb.sb_agcount) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
+ return EINVAL;
+ }
+ agbno = be32_to_cpu(icl->icl_agbno);
+ if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
+ return EINVAL;
+ }
+ isize = be32_to_cpu(icl->icl_isize);
+ if (isize != mp->m_sb.sb_inodesize) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
+ return EINVAL;
+ }
+ count = be32_to_cpu(icl->icl_count);
+ if (!count) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
+ return EINVAL;
+ }
+ length = be32_to_cpu(icl->icl_length);
+ if (!length || length >= mp->m_sb.sb_agblocks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
+ return EINVAL;
+ }
+
+ /* existing allocation is fixed value */
+ ASSERT(count == mp->m_ialloc_inos);
+ ASSERT(length == mp->m_ialloc_blks);
+ if (count != mp->m_ialloc_inos ||
+ length != mp->m_ialloc_blks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
+ return EINVAL;
+ }
+
+ /*
+ * Inode buffers can be freed. Do not replay the inode initialisation as
+ * we could be overwriting something written after this inode buffer was
+ * cancelled.
+ *
+ * XXX: we need to iterate all buffers and only init those that are not
+ * cancelled. I think that a more fine grained factoring of
+ * xfs_ialloc_inode_init may be appropriate here to enable this to be
+ * done easily.
+ */
+ if (xlog_check_buffer_cancelled(log,
+ XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
+ return 0;
+
+ xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
+ be32_to_cpu(icl->icl_gen));
+ return 0;
+}
+
+/*
* Free up any resources allocated by the transaction
*
* Remember that EFIs, EFDs, and IUNLINKs are handled later.
@@ -2699,6 +3265,106 @@ xlog_recover_free_trans(
kmem_free(trans);
}
+STATIC void
+xlog_recover_buffer_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+ struct xfs_mount *mp = log->l_mp;
+
+ if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
+ buf_f->blf_len, buf_f->blf_flags)) {
+ return;
+ }
+
+ xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
+ buf_f->blf_len, NULL);
+}
+
+STATIC void
+xlog_recover_inode_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_inode_log_format ilf_buf;
+ struct xfs_inode_log_format *ilfp;
+ struct xfs_mount *mp = log->l_mp;
+ int error;
+
+ if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
+ ilfp = item->ri_buf[0].i_addr;
+ } else {
+ ilfp = &ilf_buf;
+ memset(ilfp, 0, sizeof(*ilfp));
+ error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
+ if (error)
+ return;
+ }
+
+ if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
+ return;
+
+ xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
+ ilfp->ilf_len, &xfs_inode_buf_ra_ops);
+}
+
+STATIC void
+xlog_recover_dquot_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_disk_dquot *recddq;
+ struct xfs_dq_logformat *dq_f;
+ uint type;
+
+
+ if (mp->m_qflags == 0)
+ return;
+
+ recddq = item->ri_buf[1].i_addr;
+ if (recddq == NULL)
+ return;
+ if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
+ return;
+
+ type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
+ ASSERT(type);
+ if (log->l_quotaoffs_flag & type)
+ return;
+
+ dq_f = item->ri_buf[0].i_addr;
+ ASSERT(dq_f);
+ ASSERT(dq_f->qlf_len == 1);
+
+ xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno,
+ XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL);
+}
+
+STATIC void
+xlog_recover_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ switch (ITEM_TYPE(item)) {
+ case XFS_LI_BUF:
+ xlog_recover_buffer_ra_pass2(log, item);
+ break;
+ case XFS_LI_INODE:
+ xlog_recover_inode_ra_pass2(log, item);
+ break;
+ case XFS_LI_DQUOT:
+ xlog_recover_dquot_ra_pass2(log, item);
+ break;
+ case XFS_LI_EFI:
+ case XFS_LI_EFD:
+ case XFS_LI_QUOTAOFF:
+ default:
+ break;
+ }
+}
+
STATIC int
xlog_recover_commit_pass1(
struct xlog *log,
@@ -2716,6 +3382,7 @@ xlog_recover_commit_pass1(
case XFS_LI_EFI:
case XFS_LI_EFD:
case XFS_LI_DQUOT:
+ case XFS_LI_ICREATE:
/* nothing to do in pass 1 */
return 0;
default:
@@ -2737,15 +3404,20 @@ xlog_recover_commit_pass2(
switch (ITEM_TYPE(item)) {
case XFS_LI_BUF:
- return xlog_recover_buffer_pass2(log, buffer_list, item);
+ return xlog_recover_buffer_pass2(log, buffer_list, item,
+ trans->r_lsn);
case XFS_LI_INODE:
- return xlog_recover_inode_pass2(log, buffer_list, item);
+ return xlog_recover_inode_pass2(log, buffer_list, item,
+ trans->r_lsn);
case XFS_LI_EFI:
return xlog_recover_efi_pass2(log, item, trans->r_lsn);
case XFS_LI_EFD:
return xlog_recover_efd_pass2(log, item);
case XFS_LI_DQUOT:
- return xlog_recover_dquot_pass2(log, buffer_list, item);
+ return xlog_recover_dquot_pass2(log, buffer_list, item,
+ trans->r_lsn);
+ case XFS_LI_ICREATE:
+ return xlog_recover_do_icreate_pass2(log, buffer_list, item);
case XFS_LI_QUOTAOFF:
/* nothing to do in pass2 */
return 0;
@@ -2757,6 +3429,26 @@ xlog_recover_commit_pass2(
}
}
+STATIC int
+xlog_recover_items_pass2(
+ struct xlog *log,
+ struct xlog_recover *trans,
+ struct list_head *buffer_list,
+ struct list_head *item_list)
+{
+ struct xlog_recover_item *item;
+ int error = 0;
+
+ list_for_each_entry(item, item_list, ri_list) {
+ error = xlog_recover_commit_pass2(log, trans,
+ buffer_list, item);
+ if (error)
+ return error;
+ }
+
+ return error;
+}
+
/*
* Perform the transaction.
*
@@ -2769,9 +3461,16 @@ xlog_recover_commit_trans(
struct xlog_recover *trans,
int pass)
{
- int error = 0, error2;
- xlog_recover_item_t *item;
- LIST_HEAD (buffer_list);
+ int error = 0;
+ int error2;
+ int items_queued = 0;
+ struct xlog_recover_item *item;
+ struct xlog_recover_item *next;
+ LIST_HEAD (buffer_list);
+ LIST_HEAD (ra_list);
+ LIST_HEAD (done_list);
+
+ #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
hlist_del(&trans->r_list);
@@ -2779,14 +3478,22 @@ xlog_recover_commit_trans(
if (error)
return error;
- list_for_each_entry(item, &trans->r_itemq, ri_list) {
+ list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
switch (pass) {
case XLOG_RECOVER_PASS1:
error = xlog_recover_commit_pass1(log, trans, item);
break;
case XLOG_RECOVER_PASS2:
- error = xlog_recover_commit_pass2(log, trans,
- &buffer_list, item);
+ xlog_recover_ra_pass2(log, item);
+ list_move_tail(&item->ri_list, &ra_list);
+ items_queued++;
+ if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
+ error = xlog_recover_items_pass2(log, trans,
+ &buffer_list, &ra_list);
+ list_splice_tail_init(&ra_list, &done_list);
+ items_queued = 0;
+ }
+
break;
default:
ASSERT(0);
@@ -2796,17 +3503,26 @@ xlog_recover_commit_trans(
goto out;
}
+out:
+ if (!list_empty(&ra_list)) {
+ if (!error)
+ error = xlog_recover_items_pass2(log, trans,
+ &buffer_list, &ra_list);
+ list_splice_tail_init(&ra_list, &done_list);
+ }
+
+ if (!list_empty(&done_list))
+ list_splice_init(&done_list, &trans->r_itemq);
+
xlog_recover_free_trans(trans);
-out:
error2 = xfs_buf_delwri_submit(&buffer_list);
return error ? error : error2;
}
STATIC int
xlog_recover_unmount_trans(
- struct xlog *log,
- struct xlog_recover *trans)
+ struct xlog *log)
{
/* Do nothing now */
xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
@@ -2880,7 +3596,7 @@ xlog_recover_process_data(
trans, pass);
break;
case XLOG_UNMOUNT_TRANS:
- error = xlog_recover_unmount_trans(log, trans);
+ error = xlog_recover_unmount_trans(log);
break;
case XLOG_WAS_CONT_TRANS:
error = xlog_recover_add_to_cont_trans(log,
@@ -2905,8 +3621,10 @@ xlog_recover_process_data(
error = XFS_ERROR(EIO);
break;
}
- if (error)
+ if (error) {
+ xlog_recover_free_trans(trans);
return error;
+ }
}
dp += be32_to_cpu(ohead->oh_len);
num_logops--;
@@ -2949,13 +3667,14 @@ xlog_recover_process_efi(
* This will pull the EFI from the AIL and
* free the memory associated with it.
*/
+ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
xfs_efi_release(efip, efip->efi_format.efi_nextents);
return XFS_ERROR(EIO);
}
}
tp = xfs_trans_alloc(mp, 0);
- error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error)
goto abort_error;
efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
@@ -3039,7 +3758,7 @@ xlog_recover_process_efis(
lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
out:
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
return error;
}
@@ -3061,8 +3780,7 @@ xlog_recover_clear_agi_bucket(
int error;
tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
- error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
- 0, 0, 0);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
if (error)
goto out_abort;
@@ -3239,7 +3957,7 @@ xlog_unpack_data_crc(
if (crc != rhead->h_crc) {
if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
xfs_alert(log->l_mp,
- "log record CRC mismatch: found 0x%x, expected 0x%x.\n",
+ "log record CRC mismatch: found 0x%x, expected 0x%x.",
le32_to_cpu(rhead->h_crc),
le32_to_cpu(crc));
xfs_hex_dump(dp, 32);
@@ -3694,7 +4412,13 @@ xlog_do_recover(
XFS_BUF_READ(bp);
XFS_BUF_UNASYNC(bp);
bp->b_ops = &xfs_sb_buf_ops;
- xfsbdstrat(log->l_mp, bp);
+
+ if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
+ xfs_buf_relse(bp);
+ return XFS_ERROR(EIO);
+ }
+
+ xfs_buf_iorequest(bp);
error = xfs_buf_iowait(bp);
if (error) {
xfs_buf_ioerror_alert(bp, __func__);
@@ -3752,6 +4476,25 @@ xlog_recover(
return error;
}
+ /*
+ * Version 5 superblock log feature mask validation. We know the
+ * log is dirty so check if there are any unknown log features
+ * in what we need to recover. If there are unknown features
+ * (e.g. unsupported transactions, then simply reject the
+ * attempt at recovery before touching anything.
+ */
+ if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
+ xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
+ XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
+ xfs_warn(log->l_mp,
+"Superblock has unknown incompatible log features (0x%x) enabled.\n"
+"The log can not be fully and/or safely recovered by this kernel.\n"
+"Please recover the log on a kernel that supports the unknown features.",
+ (log->l_mp->m_sb.sb_features_log_incompat &
+ XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
+ return EINVAL;
+ }
+
xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
log->l_mp->m_logname ? log->l_mp->m_logname
: "internal");
diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/xfs_log_rlimit.c
new file mode 100644
index 00000000000..ee7e0e80246
--- /dev/null
+++ b/fs/xfs/xfs_log_rlimit.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2013 Jie Liu.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_ag.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_trans_space.h"
+#include "xfs_inode.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_bmap_btree.h"
+
+/*
+ * Calculate the maximum length in bytes that would be required for a local
+ * attribute value as large attributes out of line are not logged.
+ */
+STATIC int
+xfs_log_calc_max_attrsetm_res(
+ struct xfs_mount *mp)
+{
+ int size;
+ int nblks;
+
+ size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) -
+ MAXNAMELEN - 1;
+ nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+ nblks += XFS_B_TO_FSB(mp, size);
+ nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK);
+
+ return M_RES(mp)->tr_attrsetm.tr_logres +
+ M_RES(mp)->tr_attrsetrt.tr_logres * nblks;
+}
+
+/*
+ * Iterate over the log space reservation table to figure out and return
+ * the maximum one in terms of the pre-calculated values which were done
+ * at mount time.
+ */
+STATIC void
+xfs_log_get_max_trans_res(
+ struct xfs_mount *mp,
+ struct xfs_trans_res *max_resp)
+{
+ struct xfs_trans_res *resp;
+ struct xfs_trans_res *end_resp;
+ int log_space = 0;
+ int attr_space;
+
+ attr_space = xfs_log_calc_max_attrsetm_res(mp);
+
+ resp = (struct xfs_trans_res *)M_RES(mp);
+ end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1);
+ for (; resp < end_resp; resp++) {
+ int tmp = resp->tr_logcount > 1 ?
+ resp->tr_logres * resp->tr_logcount :
+ resp->tr_logres;
+ if (log_space < tmp) {
+ log_space = tmp;
+ *max_resp = *resp; /* struct copy */
+ }
+ }
+
+ if (attr_space > log_space) {
+ *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */
+ max_resp->tr_logres = attr_space;
+ }
+}
+
+/*
+ * Calculate the minimum valid log size for the given superblock configuration.
+ * Used to calculate the minimum log size at mkfs time, and to determine if
+ * the log is large enough or not at mount time. Returns the minimum size in
+ * filesystem block size units.
+ */
+int
+xfs_log_calc_minimum_size(
+ struct xfs_mount *mp)
+{
+ struct xfs_trans_res tres = {0};
+ int max_logres;
+ int min_logblks = 0;
+ int lsunit = 0;
+
+ xfs_log_get_max_trans_res(mp, &tres);
+
+ max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres);
+ if (tres.tr_logcount > 1)
+ max_logres *= tres.tr_logcount;
+
+ if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
+ lsunit = BTOBB(mp->m_sb.sb_logsunit);
+
+ /*
+ * Two factors should be taken into account for calculating the minimum
+ * log space.
+ * 1) The fundamental limitation is that no single transaction can be
+ * larger than half size of the log.
+ *
+ * From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR
+ * define, which is set to 3. That means we can definitely fit
+ * maximally sized 2 transactions in the log. We'll use this same
+ * value here.
+ *
+ * 2) If the lsunit option is specified, a transaction requires 2 LSU
+ * for the reservation because there are two log writes that can
+ * require padding - the transaction data and the commit record which
+ * are written separately and both can require padding to the LSU.
+ * Consider that we can have an active CIL reservation holding 2*LSU,
+ * but the CIL is not over a push threshold, in this case, if we
+ * don't have enough log space for at one new transaction, which
+ * includes another 2*LSU in the reservation, we will run into dead
+ * loop situation in log space grant procedure. i.e.
+ * xlog_grant_head_wait().
+ *
+ * Hence the log size needs to be able to contain two maximally sized
+ * and padded transactions, which is (2 * (2 * LSU + maxlres)).
+ *
+ * Also, the log size should be a multiple of the log stripe unit, round
+ * it up to lsunit boundary if lsunit is specified.
+ */
+ if (lsunit) {
+ min_logblks = roundup_64(BTOBB(max_logres), lsunit) +
+ 2 * lsunit;
+ } else
+ min_logblks = BTOBB(max_logres) + 2 * BBSIZE;
+ min_logblks *= XFS_MIN_LOG_FACTOR;
+
+ return XFS_BB_TO_FSB(mp, min_logblks);
+}
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 331cd9f83a7..63ca2f0420b 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -17,9 +17,8 @@
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
@@ -93,6 +92,14 @@ xfs_alert_tag(
}
void
+asswarn(char *expr, char *file, int line)
+{
+ xfs_warn(NULL, "Assertion failed: %s, file: %s, line: %d",
+ expr, file, line);
+ WARN_ON(1);
+}
+
+void
assfail(char *expr, char *file, int line)
{
xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index 56dc0c17f16..85401155750 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -30,7 +30,34 @@ void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
}
#endif
+#define xfs_printk_ratelimited(func, dev, fmt, ...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ if (__ratelimit(&_rs)) \
+ func(dev, fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define xfs_emerg_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__)
+#define xfs_alert_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_alert, dev, fmt, ##__VA_ARGS__)
+#define xfs_crit_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_crit, dev, fmt, ##__VA_ARGS__)
+#define xfs_err_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_err, dev, fmt, ##__VA_ARGS__)
+#define xfs_warn_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_warn, dev, fmt, ##__VA_ARGS__)
+#define xfs_notice_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_notice, dev, fmt, ##__VA_ARGS__)
+#define xfs_info_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_info, dev, fmt, ##__VA_ARGS__)
+#define xfs_debug_ratelimited(dev, fmt, ...) \
+ xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__)
+
extern void assfail(char *expr, char *f, int l);
+extern void asswarn(char *expr, char *f, int l);
extern void xfs_hex_dump(void *p, int length);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index da508463ff1..3507cd0ec40 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -17,32 +17,31 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
+#include "xfs_da_format.h"
#include "xfs_inode.h"
-#include "xfs_btree.h"
+#include "xfs_dir2.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
#include "xfs_bmap.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_fsops.h"
-#include "xfs_utils.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_dinode.h"
#ifdef HAVE_PERCPU_SB
@@ -57,61 +56,6 @@ STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0)
#endif
-static const struct {
- short offset;
- short type; /* 0 = integer
- * 1 = binary / string (no translation)
- */
-} xfs_sb_info[] = {
- { offsetof(xfs_sb_t, sb_magicnum), 0 },
- { offsetof(xfs_sb_t, sb_blocksize), 0 },
- { offsetof(xfs_sb_t, sb_dblocks), 0 },
- { offsetof(xfs_sb_t, sb_rblocks), 0 },
- { offsetof(xfs_sb_t, sb_rextents), 0 },
- { offsetof(xfs_sb_t, sb_uuid), 1 },
- { offsetof(xfs_sb_t, sb_logstart), 0 },
- { offsetof(xfs_sb_t, sb_rootino), 0 },
- { offsetof(xfs_sb_t, sb_rbmino), 0 },
- { offsetof(xfs_sb_t, sb_rsumino), 0 },
- { offsetof(xfs_sb_t, sb_rextsize), 0 },
- { offsetof(xfs_sb_t, sb_agblocks), 0 },
- { offsetof(xfs_sb_t, sb_agcount), 0 },
- { offsetof(xfs_sb_t, sb_rbmblocks), 0 },
- { offsetof(xfs_sb_t, sb_logblocks), 0 },
- { offsetof(xfs_sb_t, sb_versionnum), 0 },
- { offsetof(xfs_sb_t, sb_sectsize), 0 },
- { offsetof(xfs_sb_t, sb_inodesize), 0 },
- { offsetof(xfs_sb_t, sb_inopblock), 0 },
- { offsetof(xfs_sb_t, sb_fname[0]), 1 },
- { offsetof(xfs_sb_t, sb_blocklog), 0 },
- { offsetof(xfs_sb_t, sb_sectlog), 0 },
- { offsetof(xfs_sb_t, sb_inodelog), 0 },
- { offsetof(xfs_sb_t, sb_inopblog), 0 },
- { offsetof(xfs_sb_t, sb_agblklog), 0 },
- { offsetof(xfs_sb_t, sb_rextslog), 0 },
- { offsetof(xfs_sb_t, sb_inprogress), 0 },
- { offsetof(xfs_sb_t, sb_imax_pct), 0 },
- { offsetof(xfs_sb_t, sb_icount), 0 },
- { offsetof(xfs_sb_t, sb_ifree), 0 },
- { offsetof(xfs_sb_t, sb_fdblocks), 0 },
- { offsetof(xfs_sb_t, sb_frextents), 0 },
- { offsetof(xfs_sb_t, sb_uquotino), 0 },
- { offsetof(xfs_sb_t, sb_gquotino), 0 },
- { offsetof(xfs_sb_t, sb_qflags), 0 },
- { offsetof(xfs_sb_t, sb_flags), 0 },
- { offsetof(xfs_sb_t, sb_shared_vn), 0 },
- { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
- { offsetof(xfs_sb_t, sb_unit), 0 },
- { offsetof(xfs_sb_t, sb_width), 0 },
- { offsetof(xfs_sb_t, sb_dirblklog), 0 },
- { offsetof(xfs_sb_t, sb_logsectlog), 0 },
- { offsetof(xfs_sb_t, sb_logsectsize),0 },
- { offsetof(xfs_sb_t, sb_logsunit), 0 },
- { offsetof(xfs_sb_t, sb_features2), 0 },
- { offsetof(xfs_sb_t, sb_bad_features2), 0 },
- { sizeof(xfs_sb_t), 0 }
-};
-
static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
static uuid_t *xfs_uuid_table;
@@ -187,64 +131,6 @@ xfs_uuid_unmount(
}
-/*
- * Reference counting access wrappers to the perag structures.
- * Because we never free per-ag structures, the only thing we
- * have to protect against changes is the tree structure itself.
- */
-struct xfs_perag *
-xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
-{
- struct xfs_perag *pag;
- int ref = 0;
-
- rcu_read_lock();
- pag = radix_tree_lookup(&mp->m_perag_tree, agno);
- if (pag) {
- ASSERT(atomic_read(&pag->pag_ref) >= 0);
- ref = atomic_inc_return(&pag->pag_ref);
- }
- rcu_read_unlock();
- trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
- return pag;
-}
-
-/*
- * search from @first to find the next perag with the given tag set.
- */
-struct xfs_perag *
-xfs_perag_get_tag(
- struct xfs_mount *mp,
- xfs_agnumber_t first,
- int tag)
-{
- struct xfs_perag *pag;
- int found;
- int ref;
-
- rcu_read_lock();
- found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
- (void **)&pag, first, 1, tag);
- if (found <= 0) {
- rcu_read_unlock();
- return NULL;
- }
- ref = atomic_inc_return(&pag->pag_ref);
- rcu_read_unlock();
- trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
- return pag;
-}
-
-void
-xfs_perag_put(struct xfs_perag *pag)
-{
- int ref;
-
- ASSERT(atomic_read(&pag->pag_ref) > 0);
- ref = atomic_dec_return(&pag->pag_ref);
- trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
-}
-
STATIC void
__xfs_free_perag(
struct rcu_head *head)
@@ -297,131 +183,6 @@ xfs_sb_validate_fsb_count(
return 0;
}
-/*
- * Check the validity of the SB found.
- */
-STATIC int
-xfs_mount_validate_sb(
- xfs_mount_t *mp,
- xfs_sb_t *sbp,
- bool check_inprogress)
-{
-
- /*
- * If the log device and data device have the
- * same device number, the log is internal.
- * Consequently, the sb_logstart should be non-zero. If
- * we have a zero sb_logstart in this case, we may be trying to mount
- * a volume filesystem in a non-volume manner.
- */
- if (sbp->sb_magicnum != XFS_SB_MAGIC) {
- xfs_warn(mp, "bad magic number");
- return XFS_ERROR(EWRONGFS);
- }
-
- if (!xfs_sb_good_version(sbp)) {
- xfs_warn(mp, "bad version");
- return XFS_ERROR(EWRONGFS);
- }
-
- if (unlikely(
- sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
- xfs_warn(mp,
- "filesystem is marked as having an external log; "
- "specify logdev on the mount command line.");
- return XFS_ERROR(EINVAL);
- }
-
- if (unlikely(
- sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
- xfs_warn(mp,
- "filesystem is marked as having an internal log; "
- "do not specify logdev on the mount command line.");
- return XFS_ERROR(EINVAL);
- }
-
- /*
- * More sanity checking. Most of these were stolen directly from
- * xfs_repair.
- */
- if (unlikely(
- sbp->sb_agcount <= 0 ||
- sbp->sb_sectsize < XFS_MIN_SECTORSIZE ||
- sbp->sb_sectsize > XFS_MAX_SECTORSIZE ||
- sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG ||
- sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG ||
- sbp->sb_sectsize != (1 << sbp->sb_sectlog) ||
- sbp->sb_blocksize < XFS_MIN_BLOCKSIZE ||
- sbp->sb_blocksize > XFS_MAX_BLOCKSIZE ||
- sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
- sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
- sbp->sb_blocksize != (1 << sbp->sb_blocklog) ||
- sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
- sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
- sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
- sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
- sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
- (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
- (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
- (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
- (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) ||
- sbp->sb_dblocks == 0 ||
- sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
- sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
- XFS_CORRUPTION_ERROR("SB sanity check failed",
- XFS_ERRLEVEL_LOW, mp, sbp);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- /*
- * Until this is fixed only page-sized or smaller data blocks work.
- */
- if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
- xfs_warn(mp,
- "File system with blocksize %d bytes. "
- "Only pagesize (%ld) or less will currently work.",
- sbp->sb_blocksize, PAGE_SIZE);
- return XFS_ERROR(ENOSYS);
- }
-
- /*
- * Currently only very few inode sizes are supported.
- */
- switch (sbp->sb_inodesize) {
- case 256:
- case 512:
- case 1024:
- case 2048:
- break;
- default:
- xfs_warn(mp, "inode size of %d bytes not supported",
- sbp->sb_inodesize);
- return XFS_ERROR(ENOSYS);
- }
-
- if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
- xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
- xfs_warn(mp,
- "file system too large to be mounted on this system.");
- return XFS_ERROR(EFBIG);
- }
-
- if (check_inprogress && sbp->sb_inprogress) {
- xfs_warn(mp, "Offline file system operation in progress!");
- return XFS_ERROR(EFSCORRUPTED);
- }
-
- /*
- * Version 1 directory format has never worked on Linux.
- */
- if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
- xfs_warn(mp, "file system using version 1 directory format");
- return XFS_ERROR(ENOSYS);
- }
-
- return 0;
-}
-
int
xfs_initialize_perag(
xfs_mount_t *mp,
@@ -506,206 +267,44 @@ out_unwind:
return error;
}
-void
-xfs_sb_from_disk(
- struct xfs_sb *to,
- xfs_dsb_t *from)
-{
- to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
- to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
- to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
- to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
- to->sb_rextents = be64_to_cpu(from->sb_rextents);
- memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
- to->sb_logstart = be64_to_cpu(from->sb_logstart);
- to->sb_rootino = be64_to_cpu(from->sb_rootino);
- to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
- to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
- to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
- to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
- to->sb_agcount = be32_to_cpu(from->sb_agcount);
- to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
- to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
- to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
- to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
- to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
- to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
- memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
- to->sb_blocklog = from->sb_blocklog;
- to->sb_sectlog = from->sb_sectlog;
- to->sb_inodelog = from->sb_inodelog;
- to->sb_inopblog = from->sb_inopblog;
- to->sb_agblklog = from->sb_agblklog;
- to->sb_rextslog = from->sb_rextslog;
- to->sb_inprogress = from->sb_inprogress;
- to->sb_imax_pct = from->sb_imax_pct;
- to->sb_icount = be64_to_cpu(from->sb_icount);
- to->sb_ifree = be64_to_cpu(from->sb_ifree);
- to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
- to->sb_frextents = be64_to_cpu(from->sb_frextents);
- to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
- to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
- to->sb_qflags = be16_to_cpu(from->sb_qflags);
- to->sb_flags = from->sb_flags;
- to->sb_shared_vn = from->sb_shared_vn;
- to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
- to->sb_unit = be32_to_cpu(from->sb_unit);
- to->sb_width = be32_to_cpu(from->sb_width);
- to->sb_dirblklog = from->sb_dirblklog;
- to->sb_logsectlog = from->sb_logsectlog;
- to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
- to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
- to->sb_features2 = be32_to_cpu(from->sb_features2);
- to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
-}
-
-/*
- * Copy in core superblock to ondisk one.
- *
- * The fields argument is mask of superblock fields to copy.
- */
-void
-xfs_sb_to_disk(
- xfs_dsb_t *to,
- xfs_sb_t *from,
- __int64_t fields)
-{
- xfs_caddr_t to_ptr = (xfs_caddr_t)to;
- xfs_caddr_t from_ptr = (xfs_caddr_t)from;
- xfs_sb_field_t f;
- int first;
- int size;
-
- ASSERT(fields);
- if (!fields)
- return;
-
- while (fields) {
- f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
- first = xfs_sb_info[f].offset;
- size = xfs_sb_info[f + 1].offset - first;
-
- ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
-
- if (size == 1 || xfs_sb_info[f].type == 1) {
- memcpy(to_ptr + first, from_ptr + first, size);
- } else {
- switch (size) {
- case 2:
- *(__be16 *)(to_ptr + first) =
- cpu_to_be16(*(__u16 *)(from_ptr + first));
- break;
- case 4:
- *(__be32 *)(to_ptr + first) =
- cpu_to_be32(*(__u32 *)(from_ptr + first));
- break;
- case 8:
- *(__be64 *)(to_ptr + first) =
- cpu_to_be64(*(__u64 *)(from_ptr + first));
- break;
- default:
- ASSERT(0);
- }
- }
-
- fields &= ~(1LL << f);
- }
-}
-
-static void
-xfs_sb_verify(
- struct xfs_buf *bp)
-{
- struct xfs_mount *mp = bp->b_target->bt_mount;
- struct xfs_sb sb;
- int error;
-
- xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
-
- /*
- * Only check the in progress field for the primary superblock as
- * mkfs.xfs doesn't clear it from secondary superblocks.
- */
- error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR);
- if (error)
- xfs_buf_ioerror(bp, error);
-}
-
-static void
-xfs_sb_read_verify(
- struct xfs_buf *bp)
-{
- xfs_sb_verify(bp);
-}
-
-/*
- * We may be probed for a filesystem match, so we may not want to emit
- * messages when the superblock buffer is not actually an XFS superblock.
- * If we find an XFS superblock, the run a normal, noisy mount because we are
- * really going to mount it and want to know about errors.
- */
-static void
-xfs_sb_quiet_read_verify(
- struct xfs_buf *bp)
-{
- struct xfs_sb sb;
-
- xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
-
- if (sb.sb_magicnum == XFS_SB_MAGIC) {
- /* XFS filesystem, verify noisily! */
- xfs_sb_read_verify(bp);
- return;
- }
- /* quietly fail */
- xfs_buf_ioerror(bp, EFSCORRUPTED);
-}
-
-static void
-xfs_sb_write_verify(
- struct xfs_buf *bp)
-{
- xfs_sb_verify(bp);
-}
-
-const struct xfs_buf_ops xfs_sb_buf_ops = {
- .verify_read = xfs_sb_read_verify,
- .verify_write = xfs_sb_write_verify,
-};
-
-static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
- .verify_read = xfs_sb_quiet_read_verify,
- .verify_write = xfs_sb_write_verify,
-};
-
/*
* xfs_readsb
*
* Does the initial read of the superblock.
*/
int
-xfs_readsb(xfs_mount_t *mp, int flags)
+xfs_readsb(
+ struct xfs_mount *mp,
+ int flags)
{
unsigned int sector_size;
- xfs_buf_t *bp;
+ struct xfs_buf *bp;
+ struct xfs_sb *sbp = &mp->m_sb;
int error;
int loud = !(flags & XFS_MFSI_QUIET);
+ const struct xfs_buf_ops *buf_ops;
ASSERT(mp->m_sb_bp == NULL);
ASSERT(mp->m_ddev_targp != NULL);
/*
+ * For the initial read, we must guess at the sector
+ * size based on the block device. It's enough to
+ * get the sb_sectsize out of the superblock and
+ * then reread with the proper length.
+ * We don't verify it yet, because it may not be complete.
+ */
+ sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
+ buf_ops = NULL;
+
+ /*
* Allocate a (locked) buffer to hold the superblock.
* This will be kept around at all times to optimize
* access to the superblock.
*/
- sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
-
reread:
bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
- BTOBB(sector_size), 0,
- loud ? &xfs_sb_buf_ops
- : &xfs_sb_quiet_buf_ops);
+ BTOBB(sector_size), 0, buf_ops);
if (!bp) {
if (loud)
xfs_warn(mp, "SB buffer read failed");
@@ -714,39 +313,58 @@ reread:
if (bp->b_error) {
error = bp->b_error;
if (loud)
- xfs_warn(mp, "SB validate failed");
+ xfs_warn(mp, "SB validate failed with error %d.", error);
+ /* bad CRC means corrupted metadata */
+ if (error == EFSBADCRC)
+ error = EFSCORRUPTED;
goto release_buf;
}
/*
* Initialize the mount structure from the superblock.
*/
- xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
+ xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+ xfs_sb_quota_from_disk(sbp);
+
+ /*
+ * If we haven't validated the superblock, do so now before we try
+ * to check the sector size and reread the superblock appropriately.
+ */
+ if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+ if (loud)
+ xfs_warn(mp, "Invalid superblock magic number");
+ error = EINVAL;
+ goto release_buf;
+ }
/*
* We must be able to do sector-sized and sector-aligned IO.
*/
- if (sector_size > mp->m_sb.sb_sectsize) {
+ if (sector_size > sbp->sb_sectsize) {
if (loud)
xfs_warn(mp, "device supports %u byte sectors (not %u)",
- sector_size, mp->m_sb.sb_sectsize);
+ sector_size, sbp->sb_sectsize);
error = ENOSYS;
goto release_buf;
}
- /*
- * If device sector size is smaller than the superblock size,
- * re-read the superblock so the buffer is correctly sized.
- */
- if (sector_size < mp->m_sb.sb_sectsize) {
+ if (buf_ops == NULL) {
+ /*
+ * Re-read the superblock so the buffer is correctly sized,
+ * and properly verified.
+ */
xfs_buf_relse(bp);
- sector_size = mp->m_sb.sb_sectsize;
+ sector_size = sbp->sb_sectsize;
+ buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops;
goto reread;
}
/* Initialize per-cpu counters */
xfs_icsb_reinit_counters(mp);
+ /* no need to be quiet anymore, so reset the buf ops */
+ bp->b_ops = &xfs_sb_buf_ops;
+
mp->m_sb_bp = bp;
xfs_buf_unlock(bp);
return 0;
@@ -756,107 +374,6 @@ release_buf:
return error;
}
-
-/*
- * xfs_mount_common
- *
- * Mount initialization code establishing various mount
- * fields from the superblock associated with the given
- * mount structure
- */
-STATIC void
-xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
-{
- mp->m_agfrotor = mp->m_agirotor = 0;
- spin_lock_init(&mp->m_agirotor_lock);
- mp->m_maxagi = mp->m_sb.sb_agcount;
- mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
- mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
- mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
- mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
- mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
- mp->m_blockmask = sbp->sb_blocksize - 1;
- mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
- mp->m_blockwmask = mp->m_blockwsize - 1;
-
- mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
- mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
- mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
- mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
-
- mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
- mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
- mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
- mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
-
- mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
- mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
- mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
- mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
-
- mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
- mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
- sbp->sb_inopblock);
- mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
-}
-
-/*
- * xfs_initialize_perag_data
- *
- * Read in each per-ag structure so we can count up the number of
- * allocated inodes, free inodes and used filesystem blocks as this
- * information is no longer persistent in the superblock. Once we have
- * this information, write it into the in-core superblock structure.
- */
-STATIC int
-xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
-{
- xfs_agnumber_t index;
- xfs_perag_t *pag;
- xfs_sb_t *sbp = &mp->m_sb;
- uint64_t ifree = 0;
- uint64_t ialloc = 0;
- uint64_t bfree = 0;
- uint64_t bfreelst = 0;
- uint64_t btree = 0;
- int error;
-
- for (index = 0; index < agcount; index++) {
- /*
- * read the agf, then the agi. This gets us
- * all the information we need and populates the
- * per-ag structures for us.
- */
- error = xfs_alloc_pagf_init(mp, NULL, index, 0);
- if (error)
- return error;
-
- error = xfs_ialloc_pagi_init(mp, NULL, index);
- if (error)
- return error;
- pag = xfs_perag_get(mp, index);
- ifree += pag->pagi_freecount;
- ialloc += pag->pagi_count;
- bfree += pag->pagf_freeblks;
- bfreelst += pag->pagf_flcount;
- btree += pag->pagf_btreeblks;
- xfs_perag_put(pag);
- }
- /*
- * Overwrite incore superblock counters with just-read data
- */
- spin_lock(&mp->m_sb_lock);
- sbp->sb_ifree = ifree;
- sbp->sb_icount = ialloc;
- sbp->sb_fdblocks = bfree + bfreelst + btree;
- spin_unlock(&mp->m_sb_lock);
-
- /* Fixup the per-cpu counters as well. */
- xfs_icsb_reinit_counters(mp);
-
- return 0;
-}
-
/*
* Update alignment values based on mount options and sb values
*/
@@ -872,42 +389,27 @@ xfs_update_alignment(xfs_mount_t *mp)
*/
if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
(BBTOB(mp->m_swidth) & mp->m_blockmask)) {
- if (mp->m_flags & XFS_MOUNT_RETERR) {
- xfs_warn(mp, "alignment check failed: "
- "(sunit/swidth vs. blocksize)");
- return XFS_ERROR(EINVAL);
- }
- mp->m_dalign = mp->m_swidth = 0;
+ xfs_warn(mp,
+ "alignment check failed: sunit/swidth vs. blocksize(%d)",
+ sbp->sb_blocksize);
+ return XFS_ERROR(EINVAL);
} else {
/*
* Convert the stripe unit and width to FSBs.
*/
mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
- if (mp->m_flags & XFS_MOUNT_RETERR) {
- xfs_warn(mp, "alignment check failed: "
- "(sunit/swidth vs. ag size)");
- return XFS_ERROR(EINVAL);
- }
xfs_warn(mp,
- "stripe alignment turned off: sunit(%d)/swidth(%d) "
- "incompatible with agsize(%d)",
- mp->m_dalign, mp->m_swidth,
- sbp->sb_agblocks);
-
- mp->m_dalign = 0;
- mp->m_swidth = 0;
+ "alignment check failed: sunit/swidth vs. agsize(%d)",
+ sbp->sb_agblocks);
+ return XFS_ERROR(EINVAL);
} else if (mp->m_dalign) {
mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
} else {
- if (mp->m_flags & XFS_MOUNT_RETERR) {
- xfs_warn(mp, "alignment check failed: "
- "sunit(%d) less than bsize(%d)",
- mp->m_dalign,
- mp->m_blockmask +1);
- return XFS_ERROR(EINVAL);
- }
- mp->m_swidth = 0;
+ xfs_warn(mp,
+ "alignment check failed: sunit(%d) less than bsize(%d)",
+ mp->m_dalign, sbp->sb_blocksize);
+ return XFS_ERROR(EINVAL);
}
}
@@ -924,6 +426,10 @@ xfs_update_alignment(xfs_mount_t *mp)
sbp->sb_width = mp->m_swidth;
mp->m_update_flags |= XFS_SB_WIDTH;
}
+ } else {
+ xfs_warn(mp,
+ "cannot change alignment: superblock does not support data alignment");
+ return XFS_ERROR(EINVAL);
}
} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
xfs_sb_version_hasdalign(&mp->m_sb)) {
@@ -1039,7 +545,7 @@ xfs_set_inoalignment(xfs_mount_t *mp)
}
/*
- * Check that the data (and log if separate) are an ok size.
+ * Check that the data (and log if separate) is an ok size.
*/
STATIC int
xfs_check_sizes(xfs_mount_t *mp)
@@ -1109,8 +615,7 @@ xfs_mount_reset_sbqflags(
return 0;
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
- error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
- XFS_DEFAULT_LOG_COUNT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
if (error) {
xfs_trans_cancel(tp, 0);
xfs_alert(mp, "%s: Superblock update failed!", __func__);
@@ -1160,7 +665,7 @@ xfs_mountfs(
uint quotaflags = 0;
int error = 0;
- xfs_mount_common(mp, sbp);
+ xfs_sb_mount_common(mp, sbp);
/*
* Check for a mismatched features2 values. Older kernels
@@ -1203,6 +708,12 @@ xfs_mountfs(
mp->m_update_flags |= XFS_SB_VERSIONNUM;
}
+ /* always use v2 inodes by default now */
+ if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) {
+ mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
+ mp->m_update_flags |= XFS_SB_VERSIONNUM;
+ }
+
/*
* Check if sb_agblocks is aligned at stripe boundary
* If sb_agblocks is NOT aligned turn off m_dalign since
@@ -1236,8 +747,20 @@ xfs_mountfs(
* Set the inode cluster size.
* This may still be overridden by the file system
* block size if it is larger than the chosen cluster size.
+ *
+ * For v5 filesystems, scale the cluster size with the inode size to
+ * keep a constant ratio of inode per cluster buffer, but only if mkfs
+ * has set the inode alignment value appropriately for larger cluster
+ * sizes.
*/
mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ int new_size = mp->m_inode_cluster_size;
+
+ new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
+ if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
+ mp->m_inode_cluster_size = new_size;
+ }
/*
* Set inode alignment fields
@@ -1245,7 +768,7 @@ xfs_mountfs(
xfs_set_inoalignment(mp);
/*
- * Check that the data (and log if separate) are an ok size.
+ * Check that the data (and log if separate) is an ok size.
*/
error = xfs_check_sizes(mp);
if (error)
@@ -1268,12 +791,11 @@ xfs_mountfs(
mp->m_dmevmask = 0; /* not persistent; set after each mount */
- xfs_dir_mount(mp);
-
- /*
- * Initialize the attribute manager's entries.
- */
- mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100;
+ error = xfs_da_mount(mp);
+ if (error) {
+ xfs_warn(mp, "Failed dir/attr init: %d", error);
+ goto out_remove_uuid;
+ }
/*
* Initialize the precomputed transaction reservations values.
@@ -1288,7 +810,7 @@ xfs_mountfs(
error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
if (error) {
xfs_warn(mp, "Failed per-ag init: %d", error);
- goto out_remove_uuid;
+ goto out_free_dir;
}
if (!sbp->sb_logblocks) {
@@ -1463,6 +985,8 @@ xfs_mountfs(
xfs_wait_buftarg(mp->m_ddev_targp);
out_free_perag:
xfs_free_perag(mp);
+ out_free_dir:
+ xfs_da_unmount(mp);
out_remove_uuid:
xfs_uuid_unmount(mp);
out:
@@ -1540,6 +1064,7 @@ xfs_unmountfs(
"Freespace may not be correct on next mount.");
xfs_log_unmount(mp);
+ xfs_da_unmount(mp);
xfs_uuid_unmount(mp);
#if defined(DEBUG)
@@ -1583,8 +1108,7 @@ xfs_log_sbcount(xfs_mount_t *mp)
return 0;
tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
- error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
- XFS_DEFAULT_LOG_COUNT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
if (error) {
xfs_trans_cancel(tp, 0);
return error;
@@ -1597,48 +1121,7 @@ xfs_log_sbcount(xfs_mount_t *mp)
}
/*
- * xfs_mod_sb() can be used to copy arbitrary changes to the
- * in-core superblock into the superblock buffer to be logged.
- * It does not provide the higher level of locking that is
- * needed to protect the in-core superblock from concurrent
- * access.
- */
-void
-xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
-{
- xfs_buf_t *bp;
- int first;
- int last;
- xfs_mount_t *mp;
- xfs_sb_field_t f;
-
- ASSERT(fields);
- if (!fields)
- return;
- mp = tp->t_mountp;
- bp = xfs_trans_getsb(tp, mp, 0);
- first = sizeof(xfs_sb_t);
- last = 0;
-
- /* translate/copy */
-
- xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
-
- /* find modified range */
- f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
- ASSERT((1LL << f) & XFS_SB_MOD_BITS);
- last = xfs_sb_info[f + 1].offset - 1;
-
- f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
- ASSERT((1LL << f) & XFS_SB_MOD_BITS);
- first = xfs_sb_info[f].offset;
-
- xfs_trans_log_buf(tp, bp, first, last);
-}
-
-
-/*
- * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply
+ * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply
* a delta to a specified field in the in-core superblock. Simply
* switch on the field indicated and apply the delta to that field.
* Fields are not allowed to dip below zero, so if the delta would
@@ -1945,8 +1428,7 @@ xfs_mount_log_sb(
XFS_SB_VERSIONNUM));
tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
- error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
- XFS_DEFAULT_LOG_COUNT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
if (error) {
xfs_trans_cancel(tp, 0);
return error;
@@ -2104,12 +1586,6 @@ xfs_icsb_init_counters(
if (mp->m_sb_cnts == NULL)
return -ENOMEM;
-#ifdef CONFIG_HOTPLUG_CPU
- mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
- mp->m_icsb_notifier.priority = 0;
- register_hotcpu_notifier(&mp->m_icsb_notifier);
-#endif /* CONFIG_HOTPLUG_CPU */
-
for_each_online_cpu(i) {
cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
@@ -2122,6 +1598,13 @@ xfs_icsb_init_counters(
* initial balance kicks us off correctly
*/
mp->m_icsb_counters = -1;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
+ mp->m_icsb_notifier.priority = 0;
+ register_hotcpu_notifier(&mp->m_icsb_notifier);
+#endif /* CONFIG_HOTPLUG_CPU */
+
return 0;
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index bab8314507e..7295a0b7c34 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -18,38 +18,7 @@
#ifndef __XFS_MOUNT_H__
#define __XFS_MOUNT_H__
-typedef struct xfs_trans_reservations {
- uint tr_write; /* extent alloc trans */
- uint tr_itruncate; /* truncate trans */
- uint tr_rename; /* rename trans */
- uint tr_link; /* link trans */
- uint tr_remove; /* unlink trans */
- uint tr_symlink; /* symlink trans */
- uint tr_create; /* create trans */
- uint tr_mkdir; /* mkdir trans */
- uint tr_ifree; /* inode free trans */
- uint tr_ichange; /* inode update trans */
- uint tr_growdata; /* fs data section grow trans */
- uint tr_swrite; /* sync write inode trans */
- uint tr_addafork; /* cvt inode to attributed trans */
- uint tr_writeid; /* write setuid/setgid file */
- uint tr_attrinval; /* attr fork buffer invalidation */
- uint tr_attrset; /* set/create an attribute */
- uint tr_attrrm; /* remove an attribute */
- uint tr_clearagi; /* clear bad agi unlinked ino bucket */
- uint tr_growrtalloc; /* grow realtime allocations */
- uint tr_growrtzero; /* grow realtime zeroing */
- uint tr_growrtfree; /* grow realtime freeing */
-} xfs_trans_reservations_t;
-
-#ifndef __KERNEL__
-
-#define xfs_daddr_to_agno(mp,d) \
- ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
-#define xfs_daddr_to_agbno(mp,d) \
- ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
-
-#else /* __KERNEL__ */
+#ifdef __KERNEL__
struct xlog;
struct xfs_inode;
@@ -57,6 +26,8 @@ struct xfs_mru_cache;
struct xfs_nameops;
struct xfs_ail;
struct xfs_quotainfo;
+struct xfs_dir_ops;
+struct xfs_da_geometry;
#ifdef HAVE_PERCPU_SB
@@ -126,6 +97,8 @@ typedef struct xfs_mount {
uint m_readio_blocks; /* min read size blocks */
uint m_writeio_log; /* min write size log bytes */
uint m_writeio_blocks; /* min write size blocks */
+ struct xfs_da_geometry *m_dir_geo; /* directory block geometry */
+ struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */
struct xlog *m_log; /* log specific stuff */
int m_logbufs; /* number of log buffers */
int m_logbsize; /* size of each log buffer */
@@ -142,7 +115,7 @@ typedef struct xfs_mount {
__uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
__uint8_t m_agno_log; /* log #ag's */
__uint8_t m_agino_log; /* #bits for agino in inum */
- __uint16_t m_inode_cluster_size;/* min inode buf size */
+ uint m_inode_cluster_size;/* min inode buf size */
uint m_blockmask; /* sb_blocksize-1 */
uint m_blockwsize; /* sb_blocksize in words */
uint m_blockwmask; /* blockwsize-1 */
@@ -161,13 +134,11 @@ typedef struct xfs_mount {
int m_fixedfsid[2]; /* unchanged for life of FS */
uint m_dmevmask; /* DMI events for this FS */
__uint64_t m_flags; /* global mount flags */
- uint m_dir_node_ents; /* #entries in a dir danode */
- uint m_attr_node_ents; /* #entries in attr danode */
int m_ialloc_inos; /* inodes in inode allocation */
int m_ialloc_blks; /* blocks in inode allocation */
int m_inoalign_mask;/* mask sb_inoalignmt if used */
uint m_qflags; /* quota status flags */
- xfs_trans_reservations_t m_reservations;/* precomputed res values */
+ struct xfs_trans_resv m_resv; /* precomputed res values */
__uint64_t m_maxicount; /* maximum inode count */
__uint64_t m_resblks; /* total reserved blocks */
__uint64_t m_resblks_avail;/* available reserved blocks */
@@ -175,18 +146,11 @@ typedef struct xfs_mount {
int m_dalign; /* stripe unit */
int m_swidth; /* stripe width */
int m_sinoalign; /* stripe unit inode alignment */
- int m_attr_magicpct;/* 37% of the blocksize */
- int m_dir_magicpct; /* 37% of the dir blocksize */
__uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */
- int m_dirblksize; /* directory block sz--bytes */
- int m_dirblkfsbs; /* directory block sz--fsbs */
- xfs_dablk_t m_dirdatablk; /* blockno of dir data v2 */
- xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */
- xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */
+ const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */
+ const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */
uint m_chsize; /* size of next field */
- struct xfs_chash *m_chash; /* fs private inode per-cluster
- * hash table */
atomic_t m_active_trans; /* number trans frozen */
#ifdef HAVE_PERCPU_SB
xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
@@ -200,7 +164,6 @@ typedef struct xfs_mount {
trimming */
__int64_t m_update_flags; /* sb flags we need to update
on the next remount,rw */
- struct shrinker m_inode_shrink; /* inode reclaim shrinker */
int64_t m_low_space[XFS_LOWSP_MAX];
/* low free space thresholds */
@@ -223,8 +186,6 @@ typedef struct xfs_mount {
operations, typically for
disk errors in metadata */
#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */
-#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to
- user */
#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
allocations */
#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
@@ -328,14 +289,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
}
/*
- * perag get/put wrappers for ref counting
- */
-struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
-struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
- int tag);
-void xfs_perag_put(struct xfs_perag *pag);
-
-/*
* Per-cpu superblock locking functions
*/
#ifdef HAVE_PERCPU_SB
@@ -364,9 +317,63 @@ typedef struct xfs_mod_sb {
int64_t msb_delta; /* Change to make to specified field */
} xfs_mod_sb_t;
+/*
+ * Per-ag incore structure, copies of information in agf and agi, to improve the
+ * performance of allocation group selection. This is defined for the kernel
+ * only, and hence is defined here instead of in xfs_ag.h. You need the struct
+ * xfs_mount to be defined to look up a xfs_perag anyway (via mp->m_perag_tree),
+ * so this doesn't introduce any strange header file dependencies.
+ */
+typedef struct xfs_perag {
+ struct xfs_mount *pag_mount; /* owner filesystem */
+ xfs_agnumber_t pag_agno; /* AG this structure belongs to */
+ atomic_t pag_ref; /* perag reference count */
+ char pagf_init; /* this agf's entry is initialized */
+ char pagi_init; /* this agi's entry is initialized */
+ char pagf_metadata; /* the agf is preferred to be metadata */
+ char pagi_inodeok; /* The agi is ok for inodes */
+ __uint8_t pagf_levels[XFS_BTNUM_AGF];
+ /* # of levels in bno & cnt btree */
+ __uint32_t pagf_flcount; /* count of blocks in freelist */
+ xfs_extlen_t pagf_freeblks; /* total free blocks */
+ xfs_extlen_t pagf_longest; /* longest free space */
+ __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
+ xfs_agino_t pagi_freecount; /* number of free inodes */
+ xfs_agino_t pagi_count; /* number of allocated inodes */
+
+ /*
+ * Inode allocation search lookup optimisation.
+ * If the pagino matches, the search for new inodes
+ * doesn't need to search the near ones again straight away
+ */
+ xfs_agino_t pagl_pagino;
+ xfs_agino_t pagl_leftrec;
+ xfs_agino_t pagl_rightrec;
+ spinlock_t pagb_lock; /* lock for pagb_tree */
+ struct rb_root pagb_tree; /* ordered tree of busy extents */
+
+ atomic_t pagf_fstrms; /* # of filestreams active in this AG */
+
+ spinlock_t pag_ici_lock; /* incore inode cache lock */
+ struct radix_tree_root pag_ici_root; /* incore inode cache root */
+ int pag_ici_reclaimable; /* reclaimable inodes */
+ struct mutex pag_ici_reclaim_lock; /* serialisation point */
+ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
+
+ /* buffer cache index */
+ spinlock_t pag_buf_lock; /* lock for pag_buf_tree */
+ struct rb_root pag_buf_tree; /* ordered tree of active buffers */
+
+ /* for rcu-safe freeing */
+ struct rcu_head rcu_head;
+ int pagb_count; /* pagb slots in use */
+} xfs_perag_t;
+
extern int xfs_log_sbcount(xfs_mount_t *);
extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
extern int xfs_mountfs(xfs_mount_t *mp);
+extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount,
+ xfs_agnumber_t *maxagi);
extern void xfs_unmountfs(xfs_mount_t *);
extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
@@ -385,12 +392,4 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
#endif /* __KERNEL__ */
-extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
-extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
- xfs_agnumber_t *);
-extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
-extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
-
-extern const struct xfs_buf_ops xfs_sb_buf_ops;
-
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 4aff5639573..f99b4933dc2 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -100,14 +100,20 @@
* likely result in a loop in one of the lists. That's a sure-fire recipe for
* an infinite loop in the code.
*/
-typedef struct xfs_mru_cache_elem
-{
- struct list_head list_node;
- unsigned long key;
- void *value;
-} xfs_mru_cache_elem_t;
+struct xfs_mru_cache {
+ struct radix_tree_root store; /* Core storage data structure. */
+ struct list_head *lists; /* Array of lists, one per grp. */
+ struct list_head reap_list; /* Elements overdue for reaping. */
+ spinlock_t lock; /* Lock to protect this struct. */
+ unsigned int grp_count; /* Number of discrete groups. */
+ unsigned int grp_time; /* Time period spanned by grps. */
+ unsigned int lru_grp; /* Group containing time zero. */
+ unsigned long time_zero; /* Time first element was added. */
+ xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */
+ struct delayed_work work; /* Workqueue data for reaping. */
+ unsigned int queued; /* work has been queued */
+};
-static kmem_zone_t *xfs_mru_elem_zone;
static struct workqueue_struct *xfs_mru_reap_wq;
/*
@@ -129,12 +135,12 @@ static struct workqueue_struct *xfs_mru_reap_wq;
*/
STATIC unsigned long
_xfs_mru_cache_migrate(
- xfs_mru_cache_t *mru,
- unsigned long now)
+ struct xfs_mru_cache *mru,
+ unsigned long now)
{
- unsigned int grp;
- unsigned int migrated = 0;
- struct list_head *lru_list;
+ unsigned int grp;
+ unsigned int migrated = 0;
+ struct list_head *lru_list;
/* Nothing to do if the data store is empty. */
if (!mru->time_zero)
@@ -193,11 +199,11 @@ _xfs_mru_cache_migrate(
*/
STATIC void
_xfs_mru_cache_list_insert(
- xfs_mru_cache_t *mru,
- xfs_mru_cache_elem_t *elem)
+ struct xfs_mru_cache *mru,
+ struct xfs_mru_cache_elem *elem)
{
- unsigned int grp = 0;
- unsigned long now = jiffies;
+ unsigned int grp = 0;
+ unsigned long now = jiffies;
/*
* If the data store is empty, initialise time zero, leave grp set to
@@ -231,10 +237,10 @@ _xfs_mru_cache_list_insert(
*/
STATIC void
_xfs_mru_cache_clear_reap_list(
- xfs_mru_cache_t *mru) __releases(mru->lock) __acquires(mru->lock)
-
+ struct xfs_mru_cache *mru)
+ __releases(mru->lock) __acquires(mru->lock)
{
- xfs_mru_cache_elem_t *elem, *next;
+ struct xfs_mru_cache_elem *elem, *next;
struct list_head tmp;
INIT_LIST_HEAD(&tmp);
@@ -252,15 +258,8 @@ _xfs_mru_cache_clear_reap_list(
spin_unlock(&mru->lock);
list_for_each_entry_safe(elem, next, &tmp, list_node) {
-
- /* Remove the element from the reap list. */
list_del_init(&elem->list_node);
-
- /* Call the client's free function with the key and value pointer. */
- mru->free_func(elem->key, elem->value);
-
- /* Free the element structure. */
- kmem_zone_free(xfs_mru_elem_zone, elem);
+ mru->free_func(elem);
}
spin_lock(&mru->lock);
@@ -277,7 +276,8 @@ STATIC void
_xfs_mru_cache_reap(
struct work_struct *work)
{
- xfs_mru_cache_t *mru = container_of(work, xfs_mru_cache_t, work.work);
+ struct xfs_mru_cache *mru =
+ container_of(work, struct xfs_mru_cache, work.work);
unsigned long now, next;
ASSERT(mru && mru->lists);
@@ -304,28 +304,16 @@ _xfs_mru_cache_reap(
int
xfs_mru_cache_init(void)
{
- xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t),
- "xfs_mru_cache_elem");
- if (!xfs_mru_elem_zone)
- goto out;
-
xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1);
if (!xfs_mru_reap_wq)
- goto out_destroy_mru_elem_zone;
-
+ return -ENOMEM;
return 0;
-
- out_destroy_mru_elem_zone:
- kmem_zone_destroy(xfs_mru_elem_zone);
- out:
- return -ENOMEM;
}
void
xfs_mru_cache_uninit(void)
{
destroy_workqueue(xfs_mru_reap_wq);
- kmem_zone_destroy(xfs_mru_elem_zone);
}
/*
@@ -336,14 +324,14 @@ xfs_mru_cache_uninit(void)
*/
int
xfs_mru_cache_create(
- xfs_mru_cache_t **mrup,
+ struct xfs_mru_cache **mrup,
unsigned int lifetime_ms,
unsigned int grp_count,
xfs_mru_cache_free_func_t free_func)
{
- xfs_mru_cache_t *mru = NULL;
- int err = 0, grp;
- unsigned int grp_time;
+ struct xfs_mru_cache *mru = NULL;
+ int err = 0, grp;
+ unsigned int grp_time;
if (mrup)
*mrup = NULL;
@@ -400,7 +388,7 @@ exit:
*/
static void
xfs_mru_cache_flush(
- xfs_mru_cache_t *mru)
+ struct xfs_mru_cache *mru)
{
if (!mru || !mru->lists)
return;
@@ -420,7 +408,7 @@ xfs_mru_cache_flush(
void
xfs_mru_cache_destroy(
- xfs_mru_cache_t *mru)
+ struct xfs_mru_cache *mru)
{
if (!mru || !mru->lists)
return;
@@ -438,38 +426,30 @@ xfs_mru_cache_destroy(
*/
int
xfs_mru_cache_insert(
- xfs_mru_cache_t *mru,
- unsigned long key,
- void *value)
+ struct xfs_mru_cache *mru,
+ unsigned long key,
+ struct xfs_mru_cache_elem *elem)
{
- xfs_mru_cache_elem_t *elem;
+ int error;
ASSERT(mru && mru->lists);
if (!mru || !mru->lists)
return EINVAL;
- elem = kmem_zone_zalloc(xfs_mru_elem_zone, KM_SLEEP);
- if (!elem)
+ if (radix_tree_preload(GFP_KERNEL))
return ENOMEM;
- if (radix_tree_preload(GFP_KERNEL)) {
- kmem_zone_free(xfs_mru_elem_zone, elem);
- return ENOMEM;
- }
-
INIT_LIST_HEAD(&elem->list_node);
elem->key = key;
- elem->value = value;
spin_lock(&mru->lock);
-
- radix_tree_insert(&mru->store, key, elem);
+ error = -radix_tree_insert(&mru->store, key, elem);
radix_tree_preload_end();
- _xfs_mru_cache_list_insert(mru, elem);
-
+ if (!error)
+ _xfs_mru_cache_list_insert(mru, elem);
spin_unlock(&mru->lock);
- return 0;
+ return error;
}
/*
@@ -478,13 +458,12 @@ xfs_mru_cache_insert(
* the client data pointer for the removed element is returned, otherwise this
* function will return a NULL pointer.
*/
-void *
+struct xfs_mru_cache_elem *
xfs_mru_cache_remove(
- xfs_mru_cache_t *mru,
- unsigned long key)
+ struct xfs_mru_cache *mru,
+ unsigned long key)
{
- xfs_mru_cache_elem_t *elem;
- void *value = NULL;
+ struct xfs_mru_cache_elem *elem;
ASSERT(mru && mru->lists);
if (!mru || !mru->lists)
@@ -492,17 +471,11 @@ xfs_mru_cache_remove(
spin_lock(&mru->lock);
elem = radix_tree_delete(&mru->store, key);
- if (elem) {
- value = elem->value;
+ if (elem)
list_del(&elem->list_node);
- }
-
spin_unlock(&mru->lock);
- if (elem)
- kmem_zone_free(xfs_mru_elem_zone, elem);
-
- return value;
+ return elem;
}
/*
@@ -511,13 +484,14 @@ xfs_mru_cache_remove(
*/
void
xfs_mru_cache_delete(
- xfs_mru_cache_t *mru,
- unsigned long key)
+ struct xfs_mru_cache *mru,
+ unsigned long key)
{
- void *value = xfs_mru_cache_remove(mru, key);
+ struct xfs_mru_cache_elem *elem;
- if (value)
- mru->free_func(key, value);
+ elem = xfs_mru_cache_remove(mru, key);
+ if (elem)
+ mru->free_func(elem);
}
/*
@@ -540,12 +514,12 @@ xfs_mru_cache_delete(
* status, we need to help it get it right by annotating the path that does
* not release the lock.
*/
-void *
+struct xfs_mru_cache_elem *
xfs_mru_cache_lookup(
- xfs_mru_cache_t *mru,
- unsigned long key)
+ struct xfs_mru_cache *mru,
+ unsigned long key)
{
- xfs_mru_cache_elem_t *elem;
+ struct xfs_mru_cache_elem *elem;
ASSERT(mru && mru->lists);
if (!mru || !mru->lists)
@@ -560,7 +534,7 @@ xfs_mru_cache_lookup(
} else
spin_unlock(&mru->lock);
- return elem ? elem->value : NULL;
+ return elem;
}
/*
@@ -570,7 +544,8 @@ xfs_mru_cache_lookup(
*/
void
xfs_mru_cache_done(
- xfs_mru_cache_t *mru) __releases(mru->lock)
+ struct xfs_mru_cache *mru)
+ __releases(mru->lock)
{
spin_unlock(&mru->lock);
}
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index 36dd3ec8b4e..fb5245ba5ff 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -18,24 +18,15 @@
#ifndef __XFS_MRU_CACHE_H__
#define __XFS_MRU_CACHE_H__
+struct xfs_mru_cache;
-/* Function pointer type for callback to free a client's data pointer. */
-typedef void (*xfs_mru_cache_free_func_t)(unsigned long, void*);
+struct xfs_mru_cache_elem {
+ struct list_head list_node;
+ unsigned long key;
+};
-typedef struct xfs_mru_cache
-{
- struct radix_tree_root store; /* Core storage data structure. */
- struct list_head *lists; /* Array of lists, one per grp. */
- struct list_head reap_list; /* Elements overdue for reaping. */
- spinlock_t lock; /* Lock to protect this struct. */
- unsigned int grp_count; /* Number of discrete groups. */
- unsigned int grp_time; /* Time period spanned by grps. */
- unsigned int lru_grp; /* Group containing time zero. */
- unsigned long time_zero; /* Time first element was added. */
- xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */
- struct delayed_work work; /* Workqueue data for reaping. */
- unsigned int queued; /* work has been queued */
-} xfs_mru_cache_t;
+/* Function pointer type for callback to free a client's data pointer. */
+typedef void (*xfs_mru_cache_free_func_t)(struct xfs_mru_cache_elem *elem);
int xfs_mru_cache_init(void);
void xfs_mru_cache_uninit(void);
@@ -44,10 +35,12 @@ int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms,
xfs_mru_cache_free_func_t free_func);
void xfs_mru_cache_destroy(struct xfs_mru_cache *mru);
int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
- void *value);
-void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
+ struct xfs_mru_cache_elem *elem);
+struct xfs_mru_cache_elem *
+xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key);
-void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
+struct xfs_mru_cache_elem *
+xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
void xfs_mru_cache_done(struct xfs_mru_cache *mru);
#endif /* __XFS_MRU_CACHE_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 60eff476315..6d26759c779 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -17,30 +17,28 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_ialloc.h"
#include "xfs_itable.h"
-#include "xfs_rtalloc.h"
+#include "xfs_quota.h"
#include "xfs_error.h"
#include "xfs_bmap.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans.h"
#include "xfs_trans_space.h"
-#include "xfs_utils.h"
#include "xfs_qm.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_cksum.h"
+#include "xfs_dinode.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -50,8 +48,9 @@
*/
STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *);
+
+STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp);
/*
* We use the batch lookup interface to iterate over the dquots as it
* currently is the only interface into the radix tree code that allows
@@ -69,7 +68,7 @@ xfs_qm_dquot_walk(
void *data)
{
struct xfs_quotainfo *qi = mp->m_quotainfo;
- struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
uint32_t next_index;
int last_error = 0;
int skipped;
@@ -135,7 +134,6 @@ xfs_qm_dqpurge(
{
struct xfs_mount *mp = dqp->q_mount;
struct xfs_quotainfo *qi = mp->m_quotainfo;
- struct xfs_dquot *gdqp = NULL;
xfs_dqlock(dqp);
if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
@@ -143,16 +141,6 @@ xfs_qm_dqpurge(
return EAGAIN;
}
- /*
- * If this quota has a group hint attached, prepare for releasing it
- * now.
- */
- gdqp = dqp->q_gdquot;
- if (gdqp) {
- xfs_dqlock(gdqp);
- dqp->q_gdquot = NULL;
- }
-
dqp->dq_flags |= XFS_DQ_FREEING;
xfs_dqflock(dqp);
@@ -188,7 +176,7 @@ xfs_qm_dqpurge(
xfs_dqfunlock(dqp);
xfs_dqunlock(dqp);
- radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+ radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
be32_to_cpu(dqp->q_core.d_id));
qi->qi_dquots--;
@@ -196,17 +184,11 @@ xfs_qm_dqpurge(
* We move dquots to the freelist as soon as their reference count
* hits zero, so it really should be on the freelist here.
*/
- mutex_lock(&qi->qi_lru_lock);
ASSERT(!list_empty(&dqp->q_lru));
- list_del_init(&dqp->q_lru);
- qi->qi_lru_count--;
+ list_lru_del(&qi->qi_lru, &dqp->q_lru);
XFS_STATS_DEC(xs_qm_dquot_unused);
- mutex_unlock(&qi->qi_lru_lock);
xfs_qm_dqdestroy(dqp);
-
- if (gdqp)
- xfs_qm_dqput(gdqp);
return 0;
}
@@ -298,8 +280,10 @@ xfs_qm_mount_quotas(
*/
if (!XFS_IS_UQUOTA_ON(mp))
mp->m_qflags &= ~XFS_UQUOTA_CHKD;
- if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp)))
- mp->m_qflags &= ~XFS_OQUOTA_CHKD;
+ if (!XFS_IS_GQUOTA_ON(mp))
+ mp->m_qflags &= ~XFS_GQUOTA_CHKD;
+ if (!XFS_IS_PQUOTA_ON(mp))
+ mp->m_qflags &= ~XFS_PQUOTA_CHKD;
write_changes:
/*
@@ -361,6 +345,10 @@ xfs_qm_unmount_quotas(
IRELE(mp->m_quotainfo->qi_gquotaip);
mp->m_quotainfo->qi_gquotaip = NULL;
}
+ if (mp->m_quotainfo->qi_pquotaip) {
+ IRELE(mp->m_quotainfo->qi_pquotaip);
+ mp->m_quotainfo->qi_pquotaip = NULL;
+ }
}
}
@@ -370,7 +358,6 @@ xfs_qm_dqattach_one(
xfs_dqid_t id,
uint type,
uint doalloc,
- xfs_dquot_t *udqhint, /* hint */
xfs_dquot_t **IO_idqpp)
{
xfs_dquot_t *dqp;
@@ -380,9 +367,9 @@ xfs_qm_dqattach_one(
error = 0;
/*
- * See if we already have it in the inode itself. IO_idqpp is
- * &i_udquot or &i_gdquot. This made the code look weird, but
- * made the logic a lot simpler.
+ * See if we already have it in the inode itself. IO_idqpp is &i_udquot
+ * or &i_gdquot. This made the code look weird, but made the logic a lot
+ * simpler.
*/
dqp = *IO_idqpp;
if (dqp) {
@@ -391,46 +378,10 @@ xfs_qm_dqattach_one(
}
/*
- * udqhint is the i_udquot field in inode, and is non-NULL only
- * when the type arg is group/project. Its purpose is to save a
- * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
- * the user dquot.
- */
- if (udqhint) {
- ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
- xfs_dqlock(udqhint);
-
- /*
- * No need to take dqlock to look at the id.
- *
- * The ID can't change until it gets reclaimed, and it won't
- * be reclaimed as long as we have a ref from inode and we
- * hold the ilock.
- */
- dqp = udqhint->q_gdquot;
- if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
- ASSERT(*IO_idqpp == NULL);
-
- *IO_idqpp = xfs_qm_dqhold(dqp);
- xfs_dqunlock(udqhint);
- return 0;
- }
-
- /*
- * We can't hold a dquot lock when we call the dqget code.
- * We'll deadlock in no time, because of (not conforming to)
- * lock ordering - the inodelock comes before any dquot lock,
- * and we may drop and reacquire the ilock in xfs_qm_dqget().
- */
- xfs_dqunlock(udqhint);
- }
-
- /*
- * Find the dquot from somewhere. This bumps the
- * reference count of dquot and returns it locked.
- * This can return ENOENT if dquot didn't exist on
- * disk and we didn't ask it to allocate;
- * ESRCH if quotas got turned off suddenly.
+ * Find the dquot from somewhere. This bumps the reference count of
+ * dquot and returns it locked. This can return ENOENT if dquot didn't
+ * exist on disk and we didn't ask it to allocate; ESRCH if quotas got
+ * turned off suddenly.
*/
error = xfs_qm_dqget(ip->i_mount, ip, id, type,
doalloc | XFS_QMOPT_DOWARN, &dqp);
@@ -448,34 +399,6 @@ xfs_qm_dqattach_one(
return 0;
}
-
-/*
- * Given a udquot and gdquot, attach a ptr to the group dquot in the
- * udquot as a hint for future lookups.
- */
-STATIC void
-xfs_qm_dqattach_grouphint(
- xfs_dquot_t *udq,
- xfs_dquot_t *gdq)
-{
- xfs_dquot_t *tmp;
-
- xfs_dqlock(udq);
-
- tmp = udq->q_gdquot;
- if (tmp) {
- if (tmp == gdq)
- goto done;
-
- udq->q_gdquot = NULL;
- xfs_qm_dqrele(tmp);
- }
-
- udq->q_gdquot = xfs_qm_dqhold(gdq);
-done:
- xfs_dqunlock(udq);
-}
-
static bool
xfs_qm_need_dqattach(
struct xfs_inode *ip)
@@ -488,8 +411,7 @@ xfs_qm_need_dqattach(
return false;
if (!XFS_NOT_DQATTACHED(mp, ip))
return false;
- if (ip->i_ino == mp->m_sb.sb_uquotino ||
- ip->i_ino == mp->m_sb.sb_gquotino)
+ if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
return false;
return true;
}
@@ -507,7 +429,6 @@ xfs_qm_dqattach_locked(
uint flags)
{
xfs_mount_t *mp = ip->i_mount;
- uint nquotas = 0;
int error = 0;
if (!xfs_qm_need_dqattach(ip))
@@ -515,62 +436,39 @@ xfs_qm_dqattach_locked(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (XFS_IS_UQUOTA_ON(mp)) {
+ if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
flags & XFS_QMOPT_DQALLOC,
- NULL, &ip->i_udquot);
+ &ip->i_udquot);
if (error)
goto done;
- nquotas++;
+ ASSERT(ip->i_udquot);
}
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (XFS_IS_OQUOTA_ON(mp)) {
- error = XFS_IS_GQUOTA_ON(mp) ?
- xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
+ if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) {
+ error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
flags & XFS_QMOPT_DQALLOC,
- ip->i_udquot, &ip->i_gdquot) :
- xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
+ &ip->i_gdquot);
+ if (error)
+ goto done;
+ ASSERT(ip->i_gdquot);
+ }
+
+ if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) {
+ error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
flags & XFS_QMOPT_DQALLOC,
- ip->i_udquot, &ip->i_gdquot);
- /*
- * Don't worry about the udquot that we may have
- * attached above. It'll get detached, if not already.
- */
+ &ip->i_pdquot);
if (error)
goto done;
- nquotas++;
+ ASSERT(ip->i_pdquot);
}
+done:
/*
- * Attach this group quota to the user quota as a hint.
- * This WON'T, in general, result in a thrash.
+ * Don't worry about the dquots that we may have attached before any
+ * error - they'll get detached later if it has not already been done.
*/
- if (nquotas == 2) {
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(ip->i_udquot);
- ASSERT(ip->i_gdquot);
-
- /*
- * We do not have i_udquot locked at this point, but this check
- * is OK since we don't depend on the i_gdquot to be accurate
- * 100% all the time. It is just a hint, and this will
- * succeed in general.
- */
- if (ip->i_udquot->q_gdquot != ip->i_gdquot)
- xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
- }
-
- done:
-#ifdef DEBUG
- if (!error) {
- if (XFS_IS_UQUOTA_ON(mp))
- ASSERT(ip->i_udquot);
- if (XFS_IS_OQUOTA_ON(mp))
- ASSERT(ip->i_gdquot);
- }
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-#endif
return error;
}
@@ -600,13 +498,12 @@ void
xfs_qm_dqdetach(
xfs_inode_t *ip)
{
- if (!(ip->i_udquot || ip->i_gdquot))
+ if (!(ip->i_udquot || ip->i_gdquot || ip->i_pdquot))
return;
trace_xfs_dquot_dqdetach(ip);
- ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
- ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
+ ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino));
if (ip->i_udquot) {
xfs_qm_dqrele(ip->i_udquot);
ip->i_udquot = NULL;
@@ -615,6 +512,147 @@ xfs_qm_dqdetach(
xfs_qm_dqrele(ip->i_gdquot);
ip->i_gdquot = NULL;
}
+ if (ip->i_pdquot) {
+ xfs_qm_dqrele(ip->i_pdquot);
+ ip->i_pdquot = NULL;
+ }
+}
+
+struct xfs_qm_isolate {
+ struct list_head buffers;
+ struct list_head dispose;
+};
+
+static enum lru_status
+xfs_qm_dquot_isolate(
+ struct list_head *item,
+ spinlock_t *lru_lock,
+ void *arg)
+{
+ struct xfs_dquot *dqp = container_of(item,
+ struct xfs_dquot, q_lru);
+ struct xfs_qm_isolate *isol = arg;
+
+ if (!xfs_dqlock_nowait(dqp))
+ goto out_miss_busy;
+
+ /*
+ * This dquot has acquired a reference in the meantime remove it from
+ * the freelist and try again.
+ */
+ if (dqp->q_nrefs) {
+ xfs_dqunlock(dqp);
+ XFS_STATS_INC(xs_qm_dqwants);
+
+ trace_xfs_dqreclaim_want(dqp);
+ list_del_init(&dqp->q_lru);
+ XFS_STATS_DEC(xs_qm_dquot_unused);
+ return LRU_REMOVED;
+ }
+
+ /*
+ * If the dquot is dirty, flush it. If it's already being flushed, just
+ * skip it so there is time for the IO to complete before we try to
+ * reclaim it again on the next LRU pass.
+ */
+ if (!xfs_dqflock_nowait(dqp)) {
+ xfs_dqunlock(dqp);
+ goto out_miss_busy;
+ }
+
+ if (XFS_DQ_IS_DIRTY(dqp)) {
+ struct xfs_buf *bp = NULL;
+ int error;
+
+ trace_xfs_dqreclaim_dirty(dqp);
+
+ /* we have to drop the LRU lock to flush the dquot */
+ spin_unlock(lru_lock);
+
+ error = xfs_qm_dqflush(dqp, &bp);
+ if (error) {
+ xfs_warn(dqp->q_mount, "%s: dquot %p flush failed",
+ __func__, dqp);
+ goto out_unlock_dirty;
+ }
+
+ xfs_buf_delwri_queue(bp, &isol->buffers);
+ xfs_buf_relse(bp);
+ goto out_unlock_dirty;
+ }
+ xfs_dqfunlock(dqp);
+
+ /*
+ * Prevent lookups now that we are past the point of no return.
+ */
+ dqp->dq_flags |= XFS_DQ_FREEING;
+ xfs_dqunlock(dqp);
+
+ ASSERT(dqp->q_nrefs == 0);
+ list_move_tail(&dqp->q_lru, &isol->dispose);
+ XFS_STATS_DEC(xs_qm_dquot_unused);
+ trace_xfs_dqreclaim_done(dqp);
+ XFS_STATS_INC(xs_qm_dqreclaims);
+ return LRU_REMOVED;
+
+out_miss_busy:
+ trace_xfs_dqreclaim_busy(dqp);
+ XFS_STATS_INC(xs_qm_dqreclaim_misses);
+ return LRU_SKIP;
+
+out_unlock_dirty:
+ trace_xfs_dqreclaim_busy(dqp);
+ XFS_STATS_INC(xs_qm_dqreclaim_misses);
+ xfs_dqunlock(dqp);
+ spin_lock(lru_lock);
+ return LRU_RETRY;
+}
+
+static unsigned long
+xfs_qm_shrink_scan(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_quotainfo *qi = container_of(shrink,
+ struct xfs_quotainfo, qi_shrinker);
+ struct xfs_qm_isolate isol;
+ unsigned long freed;
+ int error;
+ unsigned long nr_to_scan = sc->nr_to_scan;
+
+ if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
+ return 0;
+
+ INIT_LIST_HEAD(&isol.buffers);
+ INIT_LIST_HEAD(&isol.dispose);
+
+ freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol,
+ &nr_to_scan);
+
+ error = xfs_buf_delwri_submit(&isol.buffers);
+ if (error)
+ xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
+
+ while (!list_empty(&isol.dispose)) {
+ struct xfs_dquot *dqp;
+
+ dqp = list_first_entry(&isol.dispose, struct xfs_dquot, q_lru);
+ list_del_init(&dqp->q_lru);
+ xfs_qm_dqfree_one(dqp);
+ }
+
+ return freed;
+}
+
+static unsigned long
+xfs_qm_shrink_count(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_quotainfo *qi = container_of(shrink,
+ struct xfs_quotainfo, qi_shrinker);
+
+ return list_lru_count_node(&qi->qi_lru, sc->nid);
}
/*
@@ -633,32 +671,29 @@ xfs_qm_init_quotainfo(
qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
+ error = -list_lru_init(&qinf->qi_lru);
+ if (error)
+ goto out_free_qinf;
+
/*
* See if quotainodes are setup, and if not, allocate them,
* and change the superblock accordingly.
*/
- if ((error = xfs_qm_init_quotainos(mp))) {
- kmem_free(qinf);
- mp->m_quotainfo = NULL;
- return error;
- }
+ error = xfs_qm_init_quotainos(mp);
+ if (error)
+ goto out_free_lru;
INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
+ INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS);
mutex_init(&qinf->qi_tree_lock);
- INIT_LIST_HEAD(&qinf->qi_lru_list);
- qinf->qi_lru_count = 0;
- mutex_init(&qinf->qi_lru_lock);
-
/* mutex used to serialize quotaoffs */
mutex_init(&qinf->qi_quotaofflock);
/* Precalc some constants */
qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
- ASSERT(qinf->qi_dqchunklen);
- qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen);
- do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t));
+ qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen);
mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
@@ -705,7 +740,7 @@ xfs_qm_init_quotainfo(
qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
-
+
xfs_qm_dqdestroy(dqp);
} else {
qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
@@ -716,10 +751,19 @@ xfs_qm_init_quotainfo(
qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
}
- qinf->qi_shrinker.shrink = xfs_qm_shake;
+ qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
+ qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
+ qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
register_shrinker(&qinf->qi_shrinker);
return 0;
+
+out_free_lru:
+ list_lru_destroy(&qinf->qi_lru);
+out_free_qinf:
+ kmem_free(qinf);
+ mp->m_quotainfo = NULL;
+ return error;
}
@@ -738,6 +782,7 @@ xfs_qm_destroy_quotainfo(
ASSERT(qi != NULL);
unregister_shrinker(&qi->qi_shrinker);
+ list_lru_destroy(&qi->qi_lru);
if (qi->qi_uquotaip) {
IRELE(qi->qi_uquotaip);
@@ -747,6 +792,10 @@ xfs_qm_destroy_quotainfo(
IRELE(qi->qi_gquotaip);
qi->qi_gquotaip = NULL;
}
+ if (qi->qi_pquotaip) {
+ IRELE(qi->qi_pquotaip);
+ qi->qi_pquotaip = NULL;
+ }
mutex_destroy(&qi->qi_quotaofflock);
kmem_free(qi);
mp->m_quotainfo = NULL;
@@ -767,21 +816,52 @@ xfs_qm_qino_alloc(
int error;
int committed;
+ *ip = NULL;
+ /*
+ * With superblock that doesn't have separate pquotino, we
+ * share an inode between gquota and pquota. If the on-disk
+ * superblock has GQUOTA and the filesystem is now mounted
+ * with PQUOTA, just use sb_gquotino for sb_pquotino and
+ * vice-versa.
+ */
+ if (!xfs_sb_version_has_pquotino(&mp->m_sb) &&
+ (flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) {
+ xfs_ino_t ino = NULLFSINO;
+
+ if ((flags & XFS_QMOPT_PQUOTA) &&
+ (mp->m_sb.sb_gquotino != NULLFSINO)) {
+ ino = mp->m_sb.sb_gquotino;
+ ASSERT(mp->m_sb.sb_pquotino == NULLFSINO);
+ } else if ((flags & XFS_QMOPT_GQUOTA) &&
+ (mp->m_sb.sb_pquotino != NULLFSINO)) {
+ ino = mp->m_sb.sb_pquotino;
+ ASSERT(mp->m_sb.sb_gquotino == NULLFSINO);
+ }
+ if (ino != NULLFSINO) {
+ error = xfs_iget(mp, NULL, ino, 0, 0, ip);
+ if (error)
+ return error;
+ mp->m_sb.sb_gquotino = NULLFSINO;
+ mp->m_sb.sb_pquotino = NULLFSINO;
+ }
+ }
+
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
- if ((error = xfs_trans_reserve(tp,
- XFS_QM_QINOCREATE_SPACE_RES(mp),
- XFS_CREATE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_CREATE_LOG_COUNT))) {
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
+ XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
+ if (error) {
xfs_trans_cancel(tp, 0);
return error;
}
- error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed);
- if (error) {
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
- XFS_TRANS_ABORT);
- return error;
+ if (!*ip) {
+ error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
+ &committed);
+ if (error) {
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
+ XFS_TRANS_ABORT);
+ return error;
+ }
}
/*
@@ -793,21 +873,25 @@ xfs_qm_qino_alloc(
if (flags & XFS_QMOPT_SBVERSION) {
ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
- XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
- (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
- XFS_SB_GQUOTINO | XFS_SB_QFLAGS));
+ XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | XFS_SB_QFLAGS)) ==
+ (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
+ XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
+ XFS_SB_QFLAGS));
xfs_sb_version_addquota(&mp->m_sb);
mp->m_sb.sb_uquotino = NULLFSINO;
mp->m_sb.sb_gquotino = NULLFSINO;
+ mp->m_sb.sb_pquotino = NULLFSINO;
- /* qflags will get updated _after_ quotacheck */
- mp->m_sb.sb_qflags = 0;
+ /* qflags will get updated fully _after_ quotacheck */
+ mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT;
}
if (flags & XFS_QMOPT_UQUOTA)
mp->m_sb.sb_uquotino = (*ip)->i_ino;
- else
+ else if (flags & XFS_QMOPT_GQUOTA)
mp->m_sb.sb_gquotino = (*ip)->i_ino;
+ else
+ mp->m_sb.sb_pquotino = (*ip)->i_ino;
spin_unlock(&mp->m_sb_lock);
xfs_mod_sb(tp, sbfields);
@@ -826,7 +910,7 @@ xfs_qm_reset_dqcounts(
xfs_dqid_t id,
uint type)
{
- xfs_disk_dquot_t *ddq;
+ struct xfs_dqblk *dqb;
int j;
trace_xfs_reset_dqcounts(bp, _RET_IP_);
@@ -840,15 +924,19 @@ xfs_qm_reset_dqcounts(
do_div(j, sizeof(xfs_dqblk_t));
ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
#endif
- ddq = bp->b_addr;
+ dqb = bp->b_addr;
for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
+ struct xfs_disk_dquot *ddq;
+
+ ddq = (struct xfs_disk_dquot *)&dqb[j];
+
/*
* Do a sanity check, and if needed, repair the dqblk. Don't
* output any warnings because it's perfectly possible to
- * find uninitialised dquot blks. See comment in xfs_qm_dqcheck.
+ * find uninitialised dquot blks. See comment in xfs_dqcheck.
*/
- (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
- "xfs_quotacheck");
+ xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
+ "xfs_quotacheck");
ddq->d_bcount = 0;
ddq->d_icount = 0;
ddq->d_rtbcount = 0;
@@ -858,7 +946,12 @@ xfs_qm_reset_dqcounts(
ddq->d_bwarns = 0;
ddq->d_iwarns = 0;
ddq->d_rtbwarns = 0;
- ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ xfs_update_cksum((char *)&dqb[j],
+ sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
}
}
@@ -894,15 +987,29 @@ xfs_qm_dqiter_bufs(
XFS_FSB_TO_DADDR(mp, bno),
mp->m_quotainfo->qi_dqchunklen, 0, &bp,
&xfs_dquot_buf_ops);
+
+ /*
+ * CRC and validation errors will return a EFSCORRUPTED here. If
+ * this occurs, re-read without CRC validation so that we can
+ * repair the damage via xfs_qm_reset_dqcounts(). This process
+ * will leave a trace in the log indicating corruption has
+ * been detected.
+ */
+ if (error == EFSCORRUPTED) {
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, bno),
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+ NULL);
+ }
+
if (error)
break;
xfs_qm_reset_dqcounts(mp, bp, firstid, type);
xfs_buf_delwri_queue(bp, buffer_list);
xfs_buf_relse(bp);
- /*
- * goto the next block.
- */
+
+ /* goto the next block. */
bno++;
firstid += mp->m_quotainfo->qi_dqperchunk;
}
@@ -944,16 +1051,18 @@ xfs_qm_dqiterate(
lblkno = 0;
maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
do {
+ uint lock_mode;
+
nmaps = XFS_DQITER_MAP_SIZE;
/*
* We aren't changing the inode itself. Just changing
* some of its data. No new blocks are added here, and
* the inode is never added to the transaction.
*/
- xfs_ilock(qip, XFS_ILOCK_SHARED);
+ lock_mode = xfs_ilock_data_map_shared(qip);
error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno,
map, &nmaps, 0);
- xfs_iunlock(qip, XFS_ILOCK_SHARED);
+ xfs_iunlock(qip, lock_mode);
if (error)
break;
@@ -1057,7 +1166,7 @@ xfs_qm_quotacheck_dqadjust(
* There are no timers for the default values set in the root dquot.
*/
if (dqp->q_core.d_id) {
- xfs_qm_adjust_dqlimits(mp, &dqp->q_core);
+ xfs_qm_adjust_dqlimits(mp, dqp);
xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
}
@@ -1115,7 +1224,7 @@ xfs_qm_dqusage_adjust(
* rootino must have its resources accounted for, not so with the quota
* inodes.
*/
- if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
+ if (xfs_is_quota_inode(&mp->m_sb, ino)) {
*res = BULKSTAT_RV_NOTHING;
return XFS_ERROR(EINVAL);
}
@@ -1225,19 +1334,21 @@ int
xfs_qm_quotacheck(
xfs_mount_t *mp)
{
- int done, count, error, error2;
- xfs_ino_t lastino;
- size_t structsz;
- xfs_inode_t *uip, *gip;
- uint flags;
- LIST_HEAD (buffer_list);
+ int done, count, error, error2;
+ xfs_ino_t lastino;
+ size_t structsz;
+ uint flags;
+ LIST_HEAD (buffer_list);
+ struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip;
+ struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip;
+ struct xfs_inode *pip = mp->m_quotainfo->qi_pquotaip;
count = INT_MAX;
structsz = 1;
lastino = 0;
flags = 0;
- ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
+ ASSERT(uip || gip || pip);
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
xfs_notice(mp, "Quotacheck needed: Please wait.");
@@ -1247,7 +1358,6 @@ xfs_qm_quotacheck(
* their counters to zero. We need a clean slate.
* We don't log our changes till later.
*/
- uip = mp->m_quotainfo->qi_uquotaip;
if (uip) {
error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
&buffer_list);
@@ -1256,14 +1366,20 @@ xfs_qm_quotacheck(
flags |= XFS_UQUOTA_CHKD;
}
- gip = mp->m_quotainfo->qi_gquotaip;
if (gip) {
- error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
- XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA,
+ error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA,
+ &buffer_list);
+ if (error)
+ goto error_return;
+ flags |= XFS_GQUOTA_CHKD;
+ }
+
+ if (pip) {
+ error = xfs_qm_dqiterate(mp, pip, XFS_QMOPT_PQUOTA,
&buffer_list);
if (error)
goto error_return;
- flags |= XFS_OQUOTA_CHKD;
+ flags |= XFS_PQUOTA_CHKD;
}
do {
@@ -1358,15 +1474,14 @@ STATIC int
xfs_qm_init_quotainos(
xfs_mount_t *mp)
{
- xfs_inode_t *uip, *gip;
- int error;
- __int64_t sbflags;
- uint flags;
+ struct xfs_inode *uip = NULL;
+ struct xfs_inode *gip = NULL;
+ struct xfs_inode *pip = NULL;
+ int error;
+ __int64_t sbflags = 0;
+ uint flags = 0;
ASSERT(mp->m_quotainfo);
- uip = gip = NULL;
- sbflags = 0;
- flags = 0;
/*
* Get the uquota and gquota inodes
@@ -1375,57 +1490,80 @@ xfs_qm_init_quotainos(
if (XFS_IS_UQUOTA_ON(mp) &&
mp->m_sb.sb_uquotino != NULLFSINO) {
ASSERT(mp->m_sb.sb_uquotino > 0);
- if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
- 0, 0, &uip)))
+ error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
+ 0, 0, &uip);
+ if (error)
return XFS_ERROR(error);
}
- if (XFS_IS_OQUOTA_ON(mp) &&
+ if (XFS_IS_GQUOTA_ON(mp) &&
mp->m_sb.sb_gquotino != NULLFSINO) {
ASSERT(mp->m_sb.sb_gquotino > 0);
- if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
- 0, 0, &gip))) {
- if (uip)
- IRELE(uip);
- return XFS_ERROR(error);
- }
+ error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+ 0, 0, &gip);
+ if (error)
+ goto error_rele;
+ }
+ if (XFS_IS_PQUOTA_ON(mp) &&
+ mp->m_sb.sb_pquotino != NULLFSINO) {
+ ASSERT(mp->m_sb.sb_pquotino > 0);
+ error = xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
+ 0, 0, &pip);
+ if (error)
+ goto error_rele;
}
} else {
flags |= XFS_QMOPT_SBVERSION;
sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
- XFS_SB_GQUOTINO | XFS_SB_QFLAGS);
+ XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
+ XFS_SB_QFLAGS);
}
/*
- * Create the two inodes, if they don't exist already. The changes
+ * Create the three inodes, if they don't exist already. The changes
* made above will get added to a transaction and logged in one of
* the qino_alloc calls below. If the device is readonly,
* temporarily switch to read-write to do this.
*/
if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
- if ((error = xfs_qm_qino_alloc(mp, &uip,
+ error = xfs_qm_qino_alloc(mp, &uip,
sbflags | XFS_SB_UQUOTINO,
- flags | XFS_QMOPT_UQUOTA)))
- return XFS_ERROR(error);
+ flags | XFS_QMOPT_UQUOTA);
+ if (error)
+ goto error_rele;
flags &= ~XFS_QMOPT_SBVERSION;
}
- if (XFS_IS_OQUOTA_ON(mp) && gip == NULL) {
- flags |= (XFS_IS_GQUOTA_ON(mp) ?
- XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
+ if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) {
error = xfs_qm_qino_alloc(mp, &gip,
- sbflags | XFS_SB_GQUOTINO, flags);
- if (error) {
- if (uip)
- IRELE(uip);
+ sbflags | XFS_SB_GQUOTINO,
+ flags | XFS_QMOPT_GQUOTA);
+ if (error)
+ goto error_rele;
- return XFS_ERROR(error);
- }
+ flags &= ~XFS_QMOPT_SBVERSION;
+ }
+ if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) {
+ error = xfs_qm_qino_alloc(mp, &pip,
+ sbflags | XFS_SB_PQUOTINO,
+ flags | XFS_QMOPT_PQUOTA);
+ if (error)
+ goto error_rele;
}
mp->m_quotainfo->qi_uquotaip = uip;
mp->m_quotainfo->qi_gquotaip = gip;
+ mp->m_quotainfo->qi_pquotaip = pip;
return 0;
+
+error_rele:
+ if (uip)
+ IRELE(uip);
+ if (gip)
+ IRELE(gip);
+ if (pip)
+ IRELE(pip);
+ return XFS_ERROR(error);
}
STATIC void
@@ -1436,7 +1574,7 @@ xfs_qm_dqfree_one(
struct xfs_quotainfo *qi = mp->m_quotainfo;
mutex_lock(&qi->qi_tree_lock);
- radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+ radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
be32_to_cpu(dqp->q_core.d_id));
qi->qi_dquots--;
@@ -1445,132 +1583,6 @@ xfs_qm_dqfree_one(
xfs_qm_dqdestroy(dqp);
}
-STATIC void
-xfs_qm_dqreclaim_one(
- struct xfs_dquot *dqp,
- struct list_head *buffer_list,
- struct list_head *dispose_list)
-{
- struct xfs_mount *mp = dqp->q_mount;
- struct xfs_quotainfo *qi = mp->m_quotainfo;
- int error;
-
- if (!xfs_dqlock_nowait(dqp))
- goto out_move_tail;
-
- /*
- * This dquot has acquired a reference in the meantime remove it from
- * the freelist and try again.
- */
- if (dqp->q_nrefs) {
- xfs_dqunlock(dqp);
-
- trace_xfs_dqreclaim_want(dqp);
- XFS_STATS_INC(xs_qm_dqwants);
-
- list_del_init(&dqp->q_lru);
- qi->qi_lru_count--;
- XFS_STATS_DEC(xs_qm_dquot_unused);
- return;
- }
-
- /*
- * Try to grab the flush lock. If this dquot is in the process of
- * getting flushed to disk, we don't want to reclaim it.
- */
- if (!xfs_dqflock_nowait(dqp))
- goto out_unlock_move_tail;
-
- if (XFS_DQ_IS_DIRTY(dqp)) {
- struct xfs_buf *bp = NULL;
-
- trace_xfs_dqreclaim_dirty(dqp);
-
- error = xfs_qm_dqflush(dqp, &bp);
- if (error) {
- xfs_warn(mp, "%s: dquot %p flush failed",
- __func__, dqp);
- goto out_unlock_move_tail;
- }
-
- xfs_buf_delwri_queue(bp, buffer_list);
- xfs_buf_relse(bp);
- /*
- * Give the dquot another try on the freelist, as the
- * flushing will take some time.
- */
- goto out_unlock_move_tail;
- }
- xfs_dqfunlock(dqp);
-
- /*
- * Prevent lookups now that we are past the point of no return.
- */
- dqp->dq_flags |= XFS_DQ_FREEING;
- xfs_dqunlock(dqp);
-
- ASSERT(dqp->q_nrefs == 0);
- list_move_tail(&dqp->q_lru, dispose_list);
- qi->qi_lru_count--;
- XFS_STATS_DEC(xs_qm_dquot_unused);
-
- trace_xfs_dqreclaim_done(dqp);
- XFS_STATS_INC(xs_qm_dqreclaims);
- return;
-
- /*
- * Move the dquot to the tail of the list so that we don't spin on it.
- */
-out_unlock_move_tail:
- xfs_dqunlock(dqp);
-out_move_tail:
- list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
- trace_xfs_dqreclaim_busy(dqp);
- XFS_STATS_INC(xs_qm_dqreclaim_misses);
-}
-
-STATIC int
-xfs_qm_shake(
- struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct xfs_quotainfo *qi =
- container_of(shrink, struct xfs_quotainfo, qi_shrinker);
- int nr_to_scan = sc->nr_to_scan;
- LIST_HEAD (buffer_list);
- LIST_HEAD (dispose_list);
- struct xfs_dquot *dqp;
- int error;
-
- if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
- return 0;
- if (!nr_to_scan)
- goto out;
-
- mutex_lock(&qi->qi_lru_lock);
- while (!list_empty(&qi->qi_lru_list)) {
- if (nr_to_scan-- <= 0)
- break;
- dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot,
- q_lru);
- xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list);
- }
- mutex_unlock(&qi->qi_lru_lock);
-
- error = xfs_buf_delwri_submit(&buffer_list);
- if (error)
- xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
-
- while (!list_empty(&dispose_list)) {
- dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
- list_del_init(&dqp->q_lru);
- xfs_qm_dqfree_one(dqp);
- }
-
-out:
- return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure;
-}
-
/*
* Start a transaction and write the incore superblock changes to
* disk. flags parameter indicates which fields have changed.
@@ -1584,10 +1596,8 @@ xfs_qm_write_sb_changes(
int error;
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
- if ((error = xfs_trans_reserve(tp, 0,
- mp->m_sb.sb_sectsize + 128, 0,
- 0,
- XFS_DEFAULT_LOG_COUNT))) {
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
+ if (error) {
xfs_trans_cancel(tp, 0);
return error;
}
@@ -1615,15 +1625,18 @@ xfs_qm_write_sb_changes(
int
xfs_qm_vop_dqalloc(
struct xfs_inode *ip,
- uid_t uid,
- gid_t gid,
+ xfs_dqid_t uid,
+ xfs_dqid_t gid,
prid_t prid,
uint flags,
struct xfs_dquot **O_udqpp,
- struct xfs_dquot **O_gdqpp)
+ struct xfs_dquot **O_gdqpp,
+ struct xfs_dquot **O_pdqpp)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_dquot *uq, *gq;
+ struct xfs_dquot *uq = NULL;
+ struct xfs_dquot *gq = NULL;
+ struct xfs_dquot *pq = NULL;
int error;
uint lockflags;
@@ -1648,7 +1661,6 @@ xfs_qm_vop_dqalloc(
}
}
- uq = gq = NULL;
if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
if (ip->i_d.di_uid != uid) {
/*
@@ -1661,11 +1673,12 @@ xfs_qm_vop_dqalloc(
* holding ilock.
*/
xfs_iunlock(ip, lockflags);
- if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
+ error = xfs_qm_dqget(mp, NULL, uid,
XFS_DQ_USER,
XFS_QMOPT_DQALLOC |
XFS_QMOPT_DOWARN,
- &uq))) {
+ &uq);
+ if (error) {
ASSERT(error != ENOENT);
return error;
}
@@ -1687,15 +1700,14 @@ xfs_qm_vop_dqalloc(
if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
if (ip->i_d.di_gid != gid) {
xfs_iunlock(ip, lockflags);
- if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
+ error = xfs_qm_dqget(mp, NULL, gid,
XFS_DQ_GROUP,
XFS_QMOPT_DQALLOC |
XFS_QMOPT_DOWARN,
- &gq))) {
- if (uq)
- xfs_qm_dqrele(uq);
+ &gq);
+ if (error) {
ASSERT(error != ENOENT);
- return error;
+ goto error_rele;
}
xfs_dqunlock(gq);
lockflags = XFS_ILOCK_SHARED;
@@ -1704,25 +1716,25 @@ xfs_qm_vop_dqalloc(
ASSERT(ip->i_gdquot);
gq = xfs_qm_dqhold(ip->i_gdquot);
}
- } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
+ }
+ if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
if (xfs_get_projid(ip) != prid) {
xfs_iunlock(ip, lockflags);
- if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
+ error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
XFS_DQ_PROJ,
XFS_QMOPT_DQALLOC |
XFS_QMOPT_DOWARN,
- &gq))) {
- if (uq)
- xfs_qm_dqrele(uq);
+ &pq);
+ if (error) {
ASSERT(error != ENOENT);
- return (error);
+ goto error_rele;
}
- xfs_dqunlock(gq);
+ xfs_dqunlock(pq);
lockflags = XFS_ILOCK_SHARED;
xfs_ilock(ip, lockflags);
} else {
- ASSERT(ip->i_gdquot);
- gq = xfs_qm_dqhold(ip->i_gdquot);
+ ASSERT(ip->i_pdquot);
+ pq = xfs_qm_dqhold(ip->i_pdquot);
}
}
if (uq)
@@ -1737,7 +1749,18 @@ xfs_qm_vop_dqalloc(
*O_gdqpp = gq;
else if (gq)
xfs_qm_dqrele(gq);
+ if (O_pdqpp)
+ *O_pdqpp = pq;
+ else if (pq)
+ xfs_qm_dqrele(pq);
return 0;
+
+error_rele:
+ if (gq)
+ xfs_qm_dqrele(gq);
+ if (uq)
+ xfs_qm_dqrele(uq);
+ return error;
}
/*
@@ -1785,29 +1808,34 @@ xfs_qm_vop_chown(
*/
int
xfs_qm_vop_chown_reserve(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- xfs_dquot_t *udqp,
- xfs_dquot_t *gdqp,
- uint flags)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ uint flags)
{
- xfs_mount_t *mp = ip->i_mount;
- uint delblks, blkflags, prjflags = 0;
- xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq;
- int error;
+ struct xfs_mount *mp = ip->i_mount;
+ uint delblks, blkflags, prjflags = 0;
+ struct xfs_dquot *udq_unres = NULL;
+ struct xfs_dquot *gdq_unres = NULL;
+ struct xfs_dquot *pdq_unres = NULL;
+ struct xfs_dquot *udq_delblks = NULL;
+ struct xfs_dquot *gdq_delblks = NULL;
+ struct xfs_dquot *pdq_delblks = NULL;
+ int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
delblks = ip->i_delayed_blks;
- delblksudq = delblksgdq = unresudq = unresgdq = NULL;
blkflags = XFS_IS_REALTIME_INODE(ip) ?
XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
if (XFS_IS_UQUOTA_ON(mp) && udqp &&
- ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
- delblksudq = udqp;
+ ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) {
+ udq_delblks = udqp;
/*
* If there are delayed allocation blocks, then we have to
* unreserve those from the old dquot, and add them to the
@@ -1815,29 +1843,34 @@ xfs_qm_vop_chown_reserve(
*/
if (delblks) {
ASSERT(ip->i_udquot);
- unresudq = ip->i_udquot;
+ udq_unres = ip->i_udquot;
}
}
- if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
- if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
- xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
- prjflags = XFS_QMOPT_ENOSPC;
-
- if (prjflags ||
- (XFS_IS_GQUOTA_ON(ip->i_mount) &&
- ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) {
- delblksgdq = gdqp;
- if (delblks) {
- ASSERT(ip->i_gdquot);
- unresgdq = ip->i_gdquot;
- }
+ if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
+ ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) {
+ gdq_delblks = gdqp;
+ if (delblks) {
+ ASSERT(ip->i_gdquot);
+ gdq_unres = ip->i_gdquot;
}
}
- if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
- delblksudq, delblksgdq, ip->i_d.di_nblocks, 1,
- flags | blkflags | prjflags)))
- return (error);
+ if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
+ xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) {
+ prjflags = XFS_QMOPT_ENOSPC;
+ pdq_delblks = pdqp;
+ if (delblks) {
+ ASSERT(ip->i_pdquot);
+ pdq_unres = ip->i_pdquot;
+ }
+ }
+
+ error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
+ udq_delblks, gdq_delblks, pdq_delblks,
+ ip->i_d.di_nblocks, 1,
+ flags | blkflags | prjflags);
+ if (error)
+ return error;
/*
* Do the delayed blks reservations/unreservations now. Since, these
@@ -1849,15 +1882,17 @@ xfs_qm_vop_chown_reserve(
/*
* Do the reservations first. Unreservation can't fail.
*/
- ASSERT(delblksudq || delblksgdq);
- ASSERT(unresudq || unresgdq);
- if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0,
- flags | blkflags | prjflags)))
- return (error);
+ ASSERT(udq_delblks || gdq_delblks || pdq_delblks);
+ ASSERT(udq_unres || gdq_unres || pdq_unres);
+ error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
+ udq_delblks, gdq_delblks, pdq_delblks,
+ (xfs_qcnt_t)delblks, 0,
+ flags | blkflags | prjflags);
+ if (error)
+ return error;
xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0,
- blkflags);
+ udq_unres, gdq_unres, pdq_unres,
+ -((xfs_qcnt_t)delblks), 0, blkflags);
}
return (0);
@@ -1896,7 +1931,8 @@ xfs_qm_vop_create_dqattach(
struct xfs_trans *tp,
struct xfs_inode *ip,
struct xfs_dquot *udqp,
- struct xfs_dquot *gdqp)
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp)
{
struct xfs_mount *mp = tp->t_mountp;
@@ -1906,23 +1942,25 @@ xfs_qm_vop_create_dqattach(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
- if (udqp) {
+ if (udqp && XFS_IS_UQUOTA_ON(mp)) {
ASSERT(ip->i_udquot == NULL);
- ASSERT(XFS_IS_UQUOTA_ON(mp));
ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
ip->i_udquot = xfs_qm_dqhold(udqp);
xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
}
- if (gdqp) {
+ if (gdqp && XFS_IS_GQUOTA_ON(mp)) {
ASSERT(ip->i_gdquot == NULL);
- ASSERT(XFS_IS_OQUOTA_ON(mp));
- ASSERT((XFS_IS_GQUOTA_ON(mp) ?
- ip->i_d.di_gid : xfs_get_projid(ip)) ==
- be32_to_cpu(gdqp->q_core.d_id));
-
+ ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id));
ip->i_gdquot = xfs_qm_dqhold(gdqp);
xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
}
+ if (pdqp && XFS_IS_PQUOTA_ON(mp)) {
+ ASSERT(ip->i_pdquot == NULL);
+ ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id));
+
+ ip->i_pdquot = xfs_qm_dqhold(pdqp);
+ xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1);
+ }
}
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 44b858b79d7..797fd463627 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -20,13 +20,29 @@
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
-#include "xfs_quota_priv.h"
struct xfs_inode;
extern struct kmem_zone *xfs_qm_dqtrxzone;
/*
+ * Number of bmaps that we ask from bmapi when doing a quotacheck.
+ * We make this restriction to keep the memory usage to a minimum.
+ */
+#define XFS_DQITER_MAP_SIZE 10
+
+#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
+ !dqp->q_core.d_blk_hardlimit && \
+ !dqp->q_core.d_blk_softlimit && \
+ !dqp->q_core.d_rtb_hardlimit && \
+ !dqp->q_core.d_rtb_softlimit && \
+ !dqp->q_core.d_ino_hardlimit && \
+ !dqp->q_core.d_ino_softlimit && \
+ !dqp->q_core.d_bcount && \
+ !dqp->q_core.d_rtbcount && \
+ !dqp->q_core.d_icount)
+
+/*
* This defines the unit of allocation of dquots.
* Currently, it is just one file system block, and a 4K blk contains 30
* (136 * 30 = 4080) dquots. It's probably not worth trying to make
@@ -44,12 +60,12 @@ extern struct kmem_zone *xfs_qm_dqtrxzone;
typedef struct xfs_quotainfo {
struct radix_tree_root qi_uquota_tree;
struct radix_tree_root qi_gquota_tree;
+ struct radix_tree_root qi_pquota_tree;
struct mutex qi_tree_lock;
- xfs_inode_t *qi_uquotaip; /* user quota inode */
- xfs_inode_t *qi_gquotaip; /* group quota inode */
- struct list_head qi_lru_list;
- struct mutex qi_lru_lock;
- int qi_lru_count;
+ struct xfs_inode *qi_uquotaip; /* user quota inode */
+ struct xfs_inode *qi_gquotaip; /* group quota inode */
+ struct xfs_inode *qi_pquotaip; /* project quota inode */
+ struct list_lru qi_lru;
int qi_dquots;
time_t qi_btimelimit; /* limit for blks timer */
time_t qi_itimelimit; /* limit for inodes timer */
@@ -69,28 +85,64 @@ typedef struct xfs_quotainfo {
struct shrinker qi_shrinker;
} xfs_quotainfo_t;
-#define XFS_DQUOT_TREE(qi, type) \
- ((type & XFS_DQ_USER) ? \
- &((qi)->qi_uquota_tree) : \
- &((qi)->qi_gquota_tree))
+static inline struct radix_tree_root *
+xfs_dquot_tree(
+ struct xfs_quotainfo *qi,
+ int type)
+{
+ switch (type) {
+ case XFS_DQ_USER:
+ return &qi->qi_uquota_tree;
+ case XFS_DQ_GROUP:
+ return &qi->qi_gquota_tree;
+ case XFS_DQ_PROJ:
+ return &qi->qi_pquota_tree;
+ default:
+ ASSERT(0);
+ }
+ return NULL;
+}
+static inline struct xfs_inode *
+xfs_dq_to_quota_inode(struct xfs_dquot *dqp)
+{
+ switch (dqp->dq_flags & XFS_DQ_ALLTYPES) {
+ case XFS_DQ_USER:
+ return dqp->q_mount->m_quotainfo->qi_uquotaip;
+ case XFS_DQ_GROUP:
+ return dqp->q_mount->m_quotainfo->qi_gquotaip;
+ case XFS_DQ_PROJ:
+ return dqp->q_mount->m_quotainfo->qi_pquotaip;
+ default:
+ ASSERT(0);
+ }
+ return NULL;
+}
-extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
-extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
- xfs_dquot_t *, xfs_dquot_t *, long, long, uint);
-extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *);
-extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *);
+extern void xfs_trans_mod_dquot(struct xfs_trans *,
+ struct xfs_dquot *, uint, long);
+extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
+ struct xfs_mount *, struct xfs_dquot *,
+ struct xfs_dquot *, struct xfs_dquot *,
+ long, long, uint);
+extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
+extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *);
/*
- * We keep the usr and grp dquots separately so that locking will be easier
- * to do at commit time. All transactions that we know of at this point
+ * We keep the usr, grp, and prj dquots separately so that locking will be
+ * easier to do at commit time. All transactions that we know of at this point
* affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
*/
+enum {
+ XFS_QM_TRANS_USR = 0,
+ XFS_QM_TRANS_GRP,
+ XFS_QM_TRANS_PRJ,
+ XFS_QM_TRANS_DQTYPES
+};
#define XFS_QM_TRANS_MAXDQS 2
-typedef struct xfs_dquot_acct {
- xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS];
- xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS];
-} xfs_dquot_acct_t;
+struct xfs_dquot_acct {
+ struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS];
+};
/*
* Users are allowed to have a usage exceeding their softlimit for
@@ -104,22 +156,25 @@ typedef struct xfs_dquot_acct {
#define XFS_QM_IWARNLIMIT 5
#define XFS_QM_RTBWARNLIMIT 5
-extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
-extern int xfs_qm_quotacheck(xfs_mount_t *);
-extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
+extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
+extern int xfs_qm_quotacheck(struct xfs_mount *);
+extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
/* dquot stuff */
-extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint);
-extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
+extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint);
+extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
/* quota ops */
-extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
-extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
- fs_disk_quota_t *);
-extern int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
- fs_disk_quota_t *);
-extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
-extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
-extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
+extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
+extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
+ uint, struct fs_disk_quota *);
+extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
+ struct fs_disk_quota *);
+extern int xfs_qm_scall_getqstat(struct xfs_mount *,
+ struct fs_quota_stat *);
+extern int xfs_qm_scall_getqstatv(struct xfs_mount *,
+ struct fs_quota_statv *);
+extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
+extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 6b39115bf14..e9be63abd8d 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -17,21 +17,16 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
#include "xfs_quota.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
#include "xfs_error.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
+#include "xfs_trans.h"
#include "xfs_qm.h"
@@ -112,16 +107,16 @@ xfs_qm_newmount(
if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) ||
(!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) ||
- (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
- (!pquotaondisk && XFS_IS_PQUOTA_ON(mp)) ||
(gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
- (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) &&
+ (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) ||
+ (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
+ (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) &&
xfs_dev_is_read_only(mp, "changing quota state")) {
xfs_warn(mp, "please mount with%s%s%s%s.",
(!quotaondisk ? "out quota" : ""),
(uquotaondisk ? " usrquota" : ""),
- (pquotaondisk ? " prjquota" : ""),
- (gquotaondisk ? " grpquota" : ""));
+ (gquotaondisk ? " grpquota" : ""),
+ (pquotaondisk ? " prjquota" : ""));
return XFS_ERROR(EPERM);
}
@@ -146,7 +141,7 @@ xfs_qm_newmount(
* inode goes inactive and wants to free blocks,
* or via xfs_log_mount_finish.
*/
- *needquotamount = B_TRUE;
+ *needquotamount = true;
*quotaflags = mp->m_qflags;
mp->m_qflags = 0;
}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 8a59f854655..bbc813caba4 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -20,24 +20,18 @@
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
+#include "xfs_trans.h"
#include "xfs_error.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
+#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
@@ -117,11 +111,12 @@ xfs_qm_scall_quotaoff(
}
if (flags & XFS_GQUOTA_ACCT) {
dqtype |= XFS_QMOPT_GQUOTA;
- flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
+ flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD);
inactivate_flags |= XFS_GQUOTA_ACTIVE;
- } else if (flags & XFS_PQUOTA_ACCT) {
+ }
+ if (flags & XFS_PQUOTA_ACCT) {
dqtype |= XFS_QMOPT_PQUOTA;
- flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
+ flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD);
inactivate_flags |= XFS_PQUOTA_ACTIVE;
}
@@ -198,10 +193,9 @@ xfs_qm_scall_quotaoff(
}
/*
- * If quotas is completely disabled, close shop.
+ * If all quotas are completely turned off, close shop.
*/
- if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) ||
- ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
+ if (mp->m_qflags == 0) {
mutex_unlock(&q->qi_quotaofflock);
xfs_qm_destroy_quotainfo(mp);
return (0);
@@ -214,10 +208,14 @@ xfs_qm_scall_quotaoff(
IRELE(q->qi_uquotaip);
q->qi_uquotaip = NULL;
}
- if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) {
+ if ((dqtype & XFS_QMOPT_GQUOTA) && q->qi_gquotaip) {
IRELE(q->qi_gquotaip);
q->qi_gquotaip = NULL;
}
+ if ((dqtype & XFS_QMOPT_PQUOTA) && q->qi_pquotaip) {
+ IRELE(q->qi_pquotaip);
+ q->qi_pquotaip = NULL;
+ }
out_unlock:
mutex_unlock(&q->qi_quotaofflock);
@@ -243,9 +241,7 @@ xfs_qm_scall_trunc_qfile(
xfs_ilock(ip, XFS_IOLOCK_EXCL);
tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
- error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_ITRUNCATE_LOG_COUNT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error) {
xfs_trans_cancel(tp, 0);
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -282,20 +278,29 @@ xfs_qm_scall_trunc_qfiles(
xfs_mount_t *mp,
uint flags)
{
- int error = 0, error2 = 0;
+ int error = EINVAL;
- if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
- xfs_debug(mp, "%s: flags=%x m_qflags=%x\n",
+ if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 ||
+ (flags & ~XFS_DQ_ALLTYPES)) {
+ xfs_debug(mp, "%s: flags=%x m_qflags=%x",
__func__, flags, mp->m_qflags);
return XFS_ERROR(EINVAL);
}
- if (flags & XFS_DQ_USER)
+ if (flags & XFS_DQ_USER) {
error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
- if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ))
- error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+ if (error)
+ return error;
+ }
+ if (flags & XFS_DQ_GROUP) {
+ error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+ if (error)
+ return error;
+ }
+ if (flags & XFS_DQ_PROJ)
+ error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
- return error ? error : error2;
+ return error;
}
/*
@@ -321,7 +326,7 @@ xfs_qm_scall_quotaon(
sbflags = 0;
if (flags == 0) {
- xfs_debug(mp, "%s: zero flags, m_qflags=%x\n",
+ xfs_debug(mp, "%s: zero flags, m_qflags=%x",
__func__, mp->m_qflags);
return XFS_ERROR(EINVAL);
}
@@ -335,16 +340,16 @@ xfs_qm_scall_quotaon(
* quota acct on ondisk without m_qflags' knowing.
*/
if (((flags & XFS_UQUOTA_ACCT) == 0 &&
- (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
- (flags & XFS_UQUOTA_ENFD))
- ||
+ (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
+ (flags & XFS_UQUOTA_ENFD)) ||
+ ((flags & XFS_GQUOTA_ACCT) == 0 &&
+ (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
+ (flags & XFS_GQUOTA_ENFD)) ||
((flags & XFS_PQUOTA_ACCT) == 0 &&
- (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
- (flags & XFS_GQUOTA_ACCT) == 0 &&
- (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
- (flags & XFS_OQUOTA_ENFD))) {
+ (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
+ (flags & XFS_PQUOTA_ENFD))) {
xfs_debug(mp,
- "%s: Can't enforce without acct, flags=%x sbflags=%x\n",
+ "%s: Can't enforce without acct, flags=%x sbflags=%x",
__func__, flags, mp->m_sb.sb_qflags);
return XFS_ERROR(EINVAL);
}
@@ -400,6 +405,7 @@ xfs_qm_scall_quotaon(
/*
* Return quota status information, such as uquota-off, enforcements, etc.
+ * for Q_XGETQSTAT command.
*/
int
xfs_qm_scall_getqstat(
@@ -407,11 +413,13 @@ xfs_qm_scall_getqstat(
struct fs_quota_stat *out)
{
struct xfs_quotainfo *q = mp->m_quotainfo;
- struct xfs_inode *uip, *gip;
- boolean_t tempuqip, tempgqip;
+ struct xfs_inode *uip = NULL;
+ struct xfs_inode *gip = NULL;
+ struct xfs_inode *pip = NULL;
+ bool tempuqip = false;
+ bool tempgqip = false;
+ bool temppqip = false;
- uip = gip = NULL;
- tempuqip = tempgqip = B_FALSE;
memset(out, 0, sizeof(fs_quota_stat_t));
out->qs_version = FS_QSTAT_VERSION;
@@ -420,26 +428,121 @@ xfs_qm_scall_getqstat(
out->qs_gquota.qfs_ino = NULLFSINO;
return (0);
}
+
+ out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
+ (XFS_ALL_QUOTA_ACCT|
+ XFS_ALL_QUOTA_ENFD));
+ if (q) {
+ uip = q->qi_uquotaip;
+ gip = q->qi_gquotaip;
+ pip = q->qi_pquotaip;
+ }
+ if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
+ if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
+ 0, 0, &uip) == 0)
+ tempuqip = true;
+ }
+ if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
+ if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+ 0, 0, &gip) == 0)
+ tempgqip = true;
+ }
+ /*
+ * Q_XGETQSTAT doesn't have room for both group and project quotas.
+ * So, allow the project quota values to be copied out only if
+ * there is no group quota information available.
+ */
+ if (!gip) {
+ if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) {
+ if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
+ 0, 0, &pip) == 0)
+ temppqip = true;
+ }
+ } else
+ pip = NULL;
+ if (uip) {
+ out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
+ out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
+ out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
+ if (tempuqip)
+ IRELE(uip);
+ }
+
+ if (gip) {
+ out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
+ out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
+ out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
+ if (tempgqip)
+ IRELE(gip);
+ }
+ if (pip) {
+ out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
+ out->qs_gquota.qfs_nblks = pip->i_d.di_nblocks;
+ out->qs_gquota.qfs_nextents = pip->i_d.di_nextents;
+ if (temppqip)
+ IRELE(pip);
+ }
+ if (q) {
+ out->qs_incoredqs = q->qi_dquots;
+ out->qs_btimelimit = q->qi_btimelimit;
+ out->qs_itimelimit = q->qi_itimelimit;
+ out->qs_rtbtimelimit = q->qi_rtbtimelimit;
+ out->qs_bwarnlimit = q->qi_bwarnlimit;
+ out->qs_iwarnlimit = q->qi_iwarnlimit;
+ }
+ return 0;
+}
+
+/*
+ * Return quota status information, such as uquota-off, enforcements, etc.
+ * for Q_XGETQSTATV command, to support separate project quota field.
+ */
+int
+xfs_qm_scall_getqstatv(
+ struct xfs_mount *mp,
+ struct fs_quota_statv *out)
+{
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ struct xfs_inode *uip = NULL;
+ struct xfs_inode *gip = NULL;
+ struct xfs_inode *pip = NULL;
+ bool tempuqip = false;
+ bool tempgqip = false;
+ bool temppqip = false;
+
+ if (!xfs_sb_version_hasquota(&mp->m_sb)) {
+ out->qs_uquota.qfs_ino = NULLFSINO;
+ out->qs_gquota.qfs_ino = NULLFSINO;
+ out->qs_pquota.qfs_ino = NULLFSINO;
+ return (0);
+ }
+
out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
(XFS_ALL_QUOTA_ACCT|
XFS_ALL_QUOTA_ENFD));
- out->qs_pad = 0;
out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
+ out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino;
if (q) {
uip = q->qi_uquotaip;
gip = q->qi_gquotaip;
+ pip = q->qi_pquotaip;
}
if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
0, 0, &uip) == 0)
- tempuqip = B_TRUE;
+ tempuqip = true;
}
if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
0, 0, &gip) == 0)
- tempgqip = B_TRUE;
+ tempgqip = true;
+ }
+ if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) {
+ if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
+ 0, 0, &pip) == 0)
+ temppqip = true;
}
if (uip) {
out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
@@ -447,12 +550,19 @@ xfs_qm_scall_getqstat(
if (tempuqip)
IRELE(uip);
}
+
if (gip) {
out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
if (tempgqip)
IRELE(gip);
}
+ if (pip) {
+ out->qs_pquota.qfs_nblks = pip->i_d.di_nblocks;
+ out->qs_pquota.qfs_nextents = pip->i_d.di_nextents;
+ if (temppqip)
+ IRELE(pip);
+ }
if (q) {
out->qs_incoredqs = q->qi_dquots;
out->qs_btimelimit = q->qi_btimelimit;
@@ -472,15 +582,15 @@ xfs_qm_scall_getqstat(
*/
int
xfs_qm_scall_setqlim(
- xfs_mount_t *mp,
+ struct xfs_mount *mp,
xfs_dqid_t id,
uint type,
fs_disk_quota_t *newlim)
{
struct xfs_quotainfo *q = mp->m_quotainfo;
- xfs_disk_dquot_t *ddq;
- xfs_dquot_t *dqp;
- xfs_trans_t *tp;
+ struct xfs_disk_dquot *ddq;
+ struct xfs_dquot *dqp;
+ struct xfs_trans *tp;
int error;
xfs_qcnt_t hard, soft;
@@ -489,30 +599,35 @@ xfs_qm_scall_setqlim(
if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
return 0;
- tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
- if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
- 0, 0, XFS_DEFAULT_LOG_COUNT))) {
- xfs_trans_cancel(tp, 0);
- return (error);
- }
-
/*
* We don't want to race with a quotaoff so take the quotaoff lock.
- * (We don't hold an inode lock, so there's nothing else to stop
- * a quotaoff from happening). (XXXThis doesn't currently happen
- * because we take the vfslock before calling xfs_qm_sysent).
+ * We don't hold an inode lock, so there's nothing else to stop
+ * a quotaoff from happening.
*/
mutex_lock(&q->qi_quotaofflock);
/*
- * Get the dquot (locked), and join it to the transaction.
- * Allocate the dquot if this doesn't exist.
+ * Get the dquot (locked) before we start, as we need to do a
+ * transaction to allocate it if it doesn't exist. Once we have the
+ * dquot, unlock it so we can start the next transaction safely. We hold
+ * a reference to the dquot, so it's safe to do this unlock/lock without
+ * it being reclaimed in the mean time.
*/
- if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
- xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+ error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp);
+ if (error) {
ASSERT(error != ENOENT);
goto out_unlock;
}
+ xfs_dqunlock(dqp);
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto out_rele;
+ }
+
+ xfs_dqlock(dqp);
xfs_trans_dqjoin(tp, dqp);
ddq = &dqp->q_core;
@@ -528,12 +643,13 @@ xfs_qm_scall_setqlim(
if (hard == 0 || hard >= soft) {
ddq->d_blk_hardlimit = cpu_to_be64(hard);
ddq->d_blk_softlimit = cpu_to_be64(soft);
+ xfs_dquot_set_prealloc_limits(dqp);
if (id == 0) {
q->qi_bhardlimit = hard;
q->qi_bsoftlimit = soft;
}
} else {
- xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft);
+ xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft);
}
hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
(xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
@@ -549,7 +665,7 @@ xfs_qm_scall_setqlim(
q->qi_rtbsoftlimit = soft;
}
} else {
- xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
+ xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft);
}
hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
@@ -566,7 +682,7 @@ xfs_qm_scall_setqlim(
q->qi_isoftlimit = soft;
}
} else {
- xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft);
+ xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft);
}
/*
@@ -619,9 +735,10 @@ xfs_qm_scall_setqlim(
xfs_trans_log_dquot(tp, dqp);
error = xfs_trans_commit(tp, 0);
- xfs_qm_dqrele(dqp);
- out_unlock:
+out_rele:
+ xfs_qm_dqrele(dqp);
+out_unlock:
mutex_unlock(&q->qi_quotaofflock);
return error;
}
@@ -638,8 +755,8 @@ xfs_qm_log_quotaoff_end(
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
- if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2,
- 0, 0, XFS_DEFAULT_LOG_COUNT))) {
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
+ if (error) {
xfs_trans_cancel(tp, 0);
return (error);
}
@@ -671,14 +788,9 @@ xfs_qm_log_quotaoff(
uint oldsbqflag=0;
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
- if ((error = xfs_trans_reserve(tp, 0,
- sizeof(xfs_qoff_logitem_t) * 2 +
- mp->m_sb.sb_sectsize + 128,
- 0,
- 0,
- XFS_DEFAULT_LOG_COUNT))) {
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
+ if (error)
goto error0;
- }
qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
xfs_trans_log_quotaoff_item(tp, qoffi);
@@ -771,9 +883,12 @@ xfs_qm_scall_getquota(
* gets turned off. No need to confuse the user level code,
* so return zeroes in that case.
*/
- if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) ||
- (!XFS_IS_OQUOTA_ENFORCED(mp) &&
- (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) {
+ if ((!XFS_IS_UQUOTA_ENFORCED(mp) &&
+ dqp->q_core.d_flags == XFS_DQ_USER) ||
+ (!XFS_IS_GQUOTA_ENFORCED(mp) &&
+ dqp->q_core.d_flags == XFS_DQ_GROUP) ||
+ (!XFS_IS_PQUOTA_ENFORCED(mp) &&
+ dqp->q_core.d_flags == XFS_DQ_PROJ)) {
dst->d_btimer = 0;
dst->d_itimer = 0;
dst->d_rtbtimer = 0;
@@ -781,8 +896,8 @@ xfs_qm_scall_getquota(
#ifdef DEBUG
if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
- (XFS_IS_OQUOTA_ENFORCED(mp) &&
- (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
+ (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) ||
+ (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) &&
dst->d_id != 0) {
if ((dst->d_bcount > dst->d_blk_softlimit) &&
(dst->d_blk_softlimit > 0)) {
@@ -828,16 +943,16 @@ xfs_qm_export_flags(
uflags = 0;
if (flags & XFS_UQUOTA_ACCT)
uflags |= FS_QUOTA_UDQ_ACCT;
- if (flags & XFS_PQUOTA_ACCT)
- uflags |= FS_QUOTA_PDQ_ACCT;
if (flags & XFS_GQUOTA_ACCT)
uflags |= FS_QUOTA_GDQ_ACCT;
+ if (flags & XFS_PQUOTA_ACCT)
+ uflags |= FS_QUOTA_PDQ_ACCT;
if (flags & XFS_UQUOTA_ENFD)
uflags |= FS_QUOTA_UDQ_ENFD;
- if (flags & (XFS_OQUOTA_ENFD)) {
- uflags |= (flags & XFS_GQUOTA_ACCT) ?
- FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD;
- }
+ if (flags & XFS_GQUOTA_ENFD)
+ uflags |= FS_QUOTA_GDQ_ENFD;
+ if (flags & XFS_PQUOTA_ENFD)
+ uflags |= FS_QUOTA_PDQ_ENFD;
return (uflags);
}
@@ -845,15 +960,16 @@ xfs_qm_export_flags(
STATIC int
xfs_dqrele_inode(
struct xfs_inode *ip,
- struct xfs_perag *pag,
int flags,
void *args)
{
/* skip quota inodes */
if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
- ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
+ ip == ip->i_mount->m_quotainfo->qi_gquotaip ||
+ ip == ip->i_mount->m_quotainfo->qi_pquotaip) {
ASSERT(ip->i_udquot == NULL);
ASSERT(ip->i_gdquot == NULL);
+ ASSERT(ip->i_pdquot == NULL);
return 0;
}
@@ -862,10 +978,14 @@ xfs_dqrele_inode(
xfs_qm_dqrele(ip->i_udquot);
ip->i_udquot = NULL;
}
- if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) {
+ if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) {
xfs_qm_dqrele(ip->i_gdquot);
ip->i_gdquot = NULL;
}
+ if ((flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) {
+ xfs_qm_dqrele(ip->i_pdquot);
+ ip->i_pdquot = NULL;
+ }
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return 0;
}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index b50ec5b95d5..5376dd406ba 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -18,229 +18,14 @@
#ifndef __XFS_QUOTA_H__
#define __XFS_QUOTA_H__
-struct xfs_trans;
-
-/*
- * The ondisk form of a dquot structure.
- */
-#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
-#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */
-
-/*
- * uid_t and gid_t are hard-coded to 32 bits in the inode.
- * Hence, an 'id' in a dquot is 32 bits..
- */
-typedef __uint32_t xfs_dqid_t;
-
-/*
- * Even though users may not have quota limits occupying all 64-bits,
- * they may need 64-bit accounting. Hence, 64-bit quota-counters,
- * and quota-limits. This is a waste in the common case, but hey ...
- */
-typedef __uint64_t xfs_qcnt_t;
-typedef __uint16_t xfs_qwarncnt_t;
-
-/*
- * This is the main portion of the on-disk representation of quota
- * information for a user. This is the q_core of the xfs_dquot_t that
- * is kept in kernel memory. We pad this with some more expansion room
- * to construct the on disk structure.
- */
-typedef struct xfs_disk_dquot {
- __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */
- __u8 d_version; /* dquot version */
- __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */
- __be32 d_id; /* user,project,group id */
- __be64 d_blk_hardlimit;/* absolute limit on disk blks */
- __be64 d_blk_softlimit;/* preferred limit on disk blks */
- __be64 d_ino_hardlimit;/* maximum # allocated inodes */
- __be64 d_ino_softlimit;/* preferred inode limit */
- __be64 d_bcount; /* disk blocks owned by the user */
- __be64 d_icount; /* inodes owned by the user */
- __be32 d_itimer; /* zero if within inode limits if not,
- this is when we refuse service */
- __be32 d_btimer; /* similar to above; for disk blocks */
- __be16 d_iwarns; /* warnings issued wrt num inodes */
- __be16 d_bwarns; /* warnings issued wrt disk blocks */
- __be32 d_pad0; /* 64 bit align */
- __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */
- __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */
- __be64 d_rtbcount; /* realtime blocks owned */
- __be32 d_rtbtimer; /* similar to above; for RT disk blocks */
- __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */
- __be16 d_pad;
-} xfs_disk_dquot_t;
-
-/*
- * This is what goes on disk. This is separated from the xfs_disk_dquot because
- * carrying the unnecessary padding would be a waste of memory.
- */
-typedef struct xfs_dqblk {
- xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */
- char dd_fill[32]; /* filling for posterity */
-} xfs_dqblk_t;
+#include "xfs_quota_defs.h"
/*
- * flags for q_flags field in the dquot.
+ * Kernel only quota definitions and functions
*/
-#define XFS_DQ_USER 0x0001 /* a user quota */
-#define XFS_DQ_PROJ 0x0002 /* project quota */
-#define XFS_DQ_GROUP 0x0004 /* a group quota */
-#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
-#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */
-
-#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
-
-#define XFS_DQ_FLAGS \
- { XFS_DQ_USER, "USER" }, \
- { XFS_DQ_PROJ, "PROJ" }, \
- { XFS_DQ_GROUP, "GROUP" }, \
- { XFS_DQ_DIRTY, "DIRTY" }, \
- { XFS_DQ_FREEING, "FREEING" }
-
-/*
- * In the worst case, when both user and group quotas are on,
- * we can have a max of three dquots changing in a single transaction.
- */
-#define XFS_DQUOT_LOGRES(mp) (sizeof(xfs_disk_dquot_t) * 3)
-
-
-/*
- * These are the structures used to lay out dquots and quotaoff
- * records on the log. Quite similar to those of inodes.
- */
-
-/*
- * log format struct for dquots.
- * The first two fields must be the type and size fitting into
- * 32 bits : log_recovery code assumes that.
- */
-typedef struct xfs_dq_logformat {
- __uint16_t qlf_type; /* dquot log item type */
- __uint16_t qlf_size; /* size of this item */
- xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */
- __int64_t qlf_blkno; /* blkno of dquot buffer */
- __int32_t qlf_len; /* len of dquot buffer */
- __uint32_t qlf_boffset; /* off of dquot in buffer */
-} xfs_dq_logformat_t;
-
-/*
- * log format struct for QUOTAOFF records.
- * The first two fields must be the type and size fitting into
- * 32 bits : log_recovery code assumes that.
- * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer
- * to the first and ensures that the first logitem is taken out of the AIL
- * only when the last one is securely committed.
- */
-typedef struct xfs_qoff_logformat {
- unsigned short qf_type; /* quotaoff log item type */
- unsigned short qf_size; /* size of this item */
- unsigned int qf_flags; /* USR and/or GRP */
- char qf_pad[12]; /* padding for future */
-} xfs_qoff_logformat_t;
-
-/*
- * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
- */
-#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */
-#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */
-#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */
-#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */
-#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */
-#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */
-#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
-
-/*
- * Quota Accounting/Enforcement flags
- */
-#define XFS_ALL_QUOTA_ACCT \
- (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
-#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD)
-#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD)
-
-#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
-#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
-#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
-#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
-#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
-#define XFS_IS_OQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_OQUOTA_ENFD)
-
-/*
- * Incore only flags for quotaoff - these bits get cleared when quota(s)
- * are in the process of getting turned off. These flags are in m_qflags but
- * never in sb_qflags.
- */
-#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */
-#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */
-#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */
-#define XFS_ALL_QUOTA_ACTIVE \
- (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE)
-
-/*
- * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
- * quota will be not be switched off as long as that inode lock is held.
- */
-#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
- XFS_GQUOTA_ACTIVE | \
- XFS_PQUOTA_ACTIVE))
-#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \
- XFS_PQUOTA_ACTIVE))
-#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
-#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
-#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
-
-/*
- * Flags to tell various functions what to do. Not all of these are meaningful
- * to a single function. None of these XFS_QMOPT_* flags are meant to have
- * persistent values (ie. their values can and will change between versions)
- */
-#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */
-#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
-#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
-#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
-#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
-#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
-#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
-#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
-#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
-
-/*
- * flags to xfs_trans_mod_dquot to indicate which field needs to be
- * modified.
- */
-#define XFS_QMOPT_RES_REGBLKS 0x0010000
-#define XFS_QMOPT_RES_RTBLKS 0x0020000
-#define XFS_QMOPT_BCOUNT 0x0040000
-#define XFS_QMOPT_ICOUNT 0x0080000
-#define XFS_QMOPT_RTBCOUNT 0x0100000
-#define XFS_QMOPT_DELBCOUNT 0x0200000
-#define XFS_QMOPT_DELRTBCOUNT 0x0400000
-#define XFS_QMOPT_RES_INOS 0x0800000
-
-/*
- * flags for dqalloc.
- */
-#define XFS_QMOPT_INHERIT 0x1000000
-
-/*
- * flags to xfs_trans_mod_dquot.
- */
-#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS
-#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS
-#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS
-#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT
-#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
-#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT
-#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT
-#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
-
-
-#define XFS_QMOPT_QUOTALL \
- (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
-#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+struct xfs_trans;
-#ifdef __KERNEL__
/*
* This check is done typically without holding the inode lock;
* that may seem racy, but it is harmless in the context that it is used.
@@ -250,34 +35,18 @@ typedef struct xfs_qoff_logformat {
* we didn't have the inode locked, the appropriate dquot(s) will be
* attached atomically.
*/
-#define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\
- (ip)->i_udquot == NULL) || \
- (XFS_IS_OQUOTA_ON(mp) && \
- (ip)->i_gdquot == NULL))
+#define XFS_NOT_DQATTACHED(mp, ip) \
+ ((XFS_IS_UQUOTA_ON(mp) && (ip)->i_udquot == NULL) || \
+ (XFS_IS_GQUOTA_ON(mp) && (ip)->i_gdquot == NULL) || \
+ (XFS_IS_PQUOTA_ON(mp) && (ip)->i_pdquot == NULL))
#define XFS_QM_NEED_QUOTACHECK(mp) \
((XFS_IS_UQUOTA_ON(mp) && \
(mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \
(XFS_IS_GQUOTA_ON(mp) && \
- ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \
- (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \
+ (mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0) || \
(XFS_IS_PQUOTA_ON(mp) && \
- ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \
- (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT))))
-
-#define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
- XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
- XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
-
-#define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
- XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
- XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
-
-#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
- XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
- XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\
- XFS_GQUOTA_ACCT)
-
+ (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
/*
* The structure kept inside the xfs_trans_t keep track of dquot changes
@@ -309,17 +78,19 @@ extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
struct xfs_inode *, long, long, uint);
extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
struct xfs_mount *, struct xfs_dquot *,
- struct xfs_dquot *, long, long, uint);
+ struct xfs_dquot *, struct xfs_dquot *, long, long, uint);
-extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
- struct xfs_dquot **, struct xfs_dquot **);
+extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t,
+ prid_t, uint, struct xfs_dquot **, struct xfs_dquot **,
+ struct xfs_dquot **);
extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
- struct xfs_dquot *, struct xfs_dquot *);
+ struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *);
extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *);
extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
- struct xfs_dquot *, struct xfs_dquot *, uint);
+ struct xfs_dquot *, struct xfs_dquot *,
+ struct xfs_dquot *, uint);
extern int xfs_qm_dqattach(struct xfs_inode *, uint);
extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
extern void xfs_qm_dqdetach(struct xfs_inode *);
@@ -332,11 +103,13 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *);
#else
static inline int
-xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
- uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp)
+xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid,
+ prid_t prid, uint flags, struct xfs_dquot **udqp,
+ struct xfs_dquot **gdqp, struct xfs_dquot **pdqp)
{
*udqp = NULL;
*gdqp = NULL;
+ *pdqp = NULL;
return 0;
}
#define xfs_trans_dup_dqinfo(tp, tp2)
@@ -351,14 +124,15 @@ static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
}
static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
struct xfs_mount *mp, struct xfs_dquot *udqp,
- struct xfs_dquot *gdqp, long nblks, long nions, uint flags)
+ struct xfs_dquot *gdqp, struct xfs_dquot *pdqp,
+ long nblks, long nions, uint flags)
{
return 0;
}
-#define xfs_qm_vop_create_dqattach(tp, ip, u, g)
+#define xfs_qm_vop_create_dqattach(tp, ip, u, g, p)
#define xfs_qm_vop_rename_dqattach(it) (0)
#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
-#define xfs_qm_vop_chown_reserve(tp, ip, u, g, fl) (0)
+#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0)
#define xfs_qm_dqattach(ip, fl) (0)
#define xfs_qm_dqattach_locked(ip, fl) (0)
#define xfs_qm_dqdetach(ip)
@@ -372,13 +146,10 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags)
-#define xfs_trans_reserve_quota(tp, mp, ud, gd, nb, ni, f) \
- xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \
+#define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \
+ xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \
f | XFS_QMOPT_RES_REGBLKS)
-extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
- xfs_dqid_t, uint, uint, char *);
extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
-#endif /* __KERNEL__ */
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/xfs_quota_defs.h
new file mode 100644
index 00000000000..137e2093707
--- /dev/null
+++ b/fs/xfs/xfs_quota_defs.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_QUOTA_DEFS_H__
+#define __XFS_QUOTA_DEFS_H__
+
+/*
+ * Quota definitions shared between user and kernel source trees.
+ */
+
+/*
+ * Even though users may not have quota limits occupying all 64-bits,
+ * they may need 64-bit accounting. Hence, 64-bit quota-counters,
+ * and quota-limits. This is a waste in the common case, but hey ...
+ */
+typedef __uint64_t xfs_qcnt_t;
+typedef __uint16_t xfs_qwarncnt_t;
+
+/*
+ * flags for q_flags field in the dquot.
+ */
+#define XFS_DQ_USER 0x0001 /* a user quota */
+#define XFS_DQ_PROJ 0x0002 /* project quota */
+#define XFS_DQ_GROUP 0x0004 /* a group quota */
+#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
+#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */
+
+#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
+
+#define XFS_DQ_FLAGS \
+ { XFS_DQ_USER, "USER" }, \
+ { XFS_DQ_PROJ, "PROJ" }, \
+ { XFS_DQ_GROUP, "GROUP" }, \
+ { XFS_DQ_DIRTY, "DIRTY" }, \
+ { XFS_DQ_FREEING, "FREEING" }
+
+/*
+ * We have the possibility of all three quota types being active at once, and
+ * hence free space modification requires modification of all three current
+ * dquots in a single transaction. For this case we need to have a reservation
+ * of at least 3 dquots.
+ *
+ * However, a chmod operation can change both UID and GID in a single
+ * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
+ * modified. Hence for this case we need to reserve space for at least 4 dquots.
+ *
+ * And in the worst case, there's a rename operation that can be modifying up to
+ * 4 inodes with dquots attached to them. In reality, the only inodes that can
+ * have their dquots modified are the source and destination directory inodes
+ * due to directory name creation and removal. That can require space allocation
+ * and/or freeing on both directory inodes, and hence all three dquots on each
+ * inode can be modified. And if the directories are world writeable, all the
+ * dquots can be unique and so 6 dquots can be modified....
+ *
+ * And, of course, we also need to take into account the dquot log format item
+ * used to describe each dquot.
+ */
+#define XFS_DQUOT_LOGRES(mp) \
+ ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
+
+#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
+#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
+#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
+#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
+#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
+#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
+#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD)
+
+/*
+ * Incore only flags for quotaoff - these bits get cleared when quota(s)
+ * are in the process of getting turned off. These flags are in m_qflags but
+ * never in sb_qflags.
+ */
+#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */
+#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */
+#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */
+#define XFS_ALL_QUOTA_ACTIVE \
+ (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
+
+/*
+ * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
+ * quota will be not be switched off as long as that inode lock is held.
+ */
+#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
+ XFS_GQUOTA_ACTIVE | \
+ XFS_PQUOTA_ACTIVE))
+#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \
+ XFS_PQUOTA_ACTIVE))
+#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
+#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
+#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
+
+/*
+ * Flags to tell various functions what to do. Not all of these are meaningful
+ * to a single function. None of these XFS_QMOPT_* flags are meant to have
+ * persistent values (ie. their values can and will change between versions)
+ */
+#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */
+#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
+#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
+#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
+#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
+#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
+#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
+#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
+#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
+
+/*
+ * flags to xfs_trans_mod_dquot to indicate which field needs to be
+ * modified.
+ */
+#define XFS_QMOPT_RES_REGBLKS 0x0010000
+#define XFS_QMOPT_RES_RTBLKS 0x0020000
+#define XFS_QMOPT_BCOUNT 0x0040000
+#define XFS_QMOPT_ICOUNT 0x0080000
+#define XFS_QMOPT_RTBCOUNT 0x0100000
+#define XFS_QMOPT_DELBCOUNT 0x0200000
+#define XFS_QMOPT_DELRTBCOUNT 0x0400000
+#define XFS_QMOPT_RES_INOS 0x0800000
+
+/*
+ * flags for dqalloc.
+ */
+#define XFS_QMOPT_INHERIT 0x1000000
+
+/*
+ * flags to xfs_trans_mod_dquot.
+ */
+#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS
+#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS
+#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS
+#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT
+#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
+#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT
+#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT
+#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
+
+
+#define XFS_QMOPT_QUOTALL \
+ (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
+#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+
+extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq,
+ xfs_dqid_t id, uint type, uint flags, char *str);
+extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
+
+#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h
deleted file mode 100644
index 6d86219d93d..00000000000
--- a/fs/xfs/xfs_quota_priv.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_QUOTA_PRIV_H__
-#define __XFS_QUOTA_PRIV_H__
-
-/*
- * Number of bmaps that we ask from bmapi when doing a quotacheck.
- * We make this restriction to keep the memory usage to a minimum.
- */
-#define XFS_DQITER_MAP_SIZE 10
-
-#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
- !dqp->q_core.d_blk_hardlimit && \
- !dqp->q_core.d_blk_softlimit && \
- !dqp->q_core.d_rtb_hardlimit && \
- !dqp->q_core.d_rtb_softlimit && \
- !dqp->q_core.d_ino_hardlimit && \
- !dqp->q_core.d_ino_softlimit && \
- !dqp->q_core.d_bcount && \
- !dqp->q_core.d_rtbcount && \
- !dqp->q_core.d_icount)
-
-#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
- (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
- (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
-
-#endif /* __XFS_QUOTA_PRIV_H__ */
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 71926d63052..2ad1b9822e9 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -16,14 +16,15 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
-#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_inode.h"
#include "xfs_quota.h"
#include "xfs_trans.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
#include "xfs_qm.h"
#include <linux/quota.h>
@@ -54,6 +55,18 @@ xfs_fs_get_xstate(
}
STATIC int
+xfs_fs_get_xstatev(
+ struct super_block *sb,
+ struct fs_quota_statv *fqs)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ if (!XFS_IS_QUOTA_RUNNING(mp))
+ return -ENOSYS;
+ return -xfs_qm_scall_getqstatv(mp, fqs);
+}
+
+STATIC int
xfs_fs_set_xstate(
struct super_block *sb,
unsigned int uflags,
@@ -75,8 +88,10 @@ xfs_fs_set_xstate(
flags |= XFS_GQUOTA_ACCT;
if (uflags & FS_QUOTA_UDQ_ENFD)
flags |= XFS_UQUOTA_ENFD;
- if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD))
- flags |= XFS_OQUOTA_ENFD;
+ if (uflags & FS_QUOTA_GDQ_ENFD)
+ flags |= XFS_GQUOTA_ENFD;
+ if (uflags & FS_QUOTA_PDQ_ENFD)
+ flags |= XFS_PQUOTA_ENFD;
switch (op) {
case Q_XQUOTAON:
@@ -85,16 +100,36 @@ xfs_fs_set_xstate(
if (!XFS_IS_QUOTA_ON(mp))
return -EINVAL;
return -xfs_qm_scall_quotaoff(mp, flags);
- case Q_XQUOTARM:
- if (XFS_IS_QUOTA_ON(mp))
- return -EINVAL;
- return -xfs_qm_scall_trunc_qfiles(mp, flags);
}
return -EINVAL;
}
STATIC int
+xfs_fs_rm_xquota(
+ struct super_block *sb,
+ unsigned int uflags)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+ unsigned int flags = 0;
+
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ if (XFS_IS_QUOTA_ON(mp))
+ return -EINVAL;
+
+ if (uflags & FS_USER_QUOTA)
+ flags |= XFS_DQ_USER;
+ if (uflags & FS_GROUP_QUOTA)
+ flags |= XFS_DQ_GROUP;
+ if (uflags & FS_USER_QUOTA)
+ flags |= XFS_DQ_PROJ;
+
+ return -xfs_qm_scall_trunc_qfiles(mp, flags);
+}
+
+STATIC int
xfs_fs_get_dqblk(
struct super_block *sb,
struct kqid qid,
@@ -131,8 +166,10 @@ xfs_fs_set_dqblk(
}
const struct quotactl_ops xfs_quotactl_operations = {
+ .get_xstatev = xfs_fs_get_xstatev,
.get_xstate = xfs_fs_get_xstate,
.set_xstate = xfs_fs_set_xstate,
+ .rm_xquota = xfs_fs_rm_xquota,
.get_dqblk = xfs_fs_get_dqblk,
.set_dqblk = xfs_fs_set_dqblk,
};
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
deleted file mode 100644
index 30ff5f401d2..00000000000
--- a/fs/xfs/xfs_rename.c
+++ /dev/null
@@ -1,346 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_utils.h"
-#include "xfs_trans_space.h"
-#include "xfs_vnodeops.h"
-#include "xfs_trace.h"
-
-
-/*
- * Enter all inodes for a rename transaction into a sorted array.
- */
-STATIC void
-xfs_sort_for_rename(
- xfs_inode_t *dp1, /* in: old (source) directory inode */
- xfs_inode_t *dp2, /* in: new (target) directory inode */
- xfs_inode_t *ip1, /* in: inode of old entry */
- xfs_inode_t *ip2, /* in: inode of new entry, if it
- already exists, NULL otherwise. */
- xfs_inode_t **i_tab,/* out: array of inode returned, sorted */
- int *num_inodes) /* out: number of inodes in array */
-{
- xfs_inode_t *temp;
- int i, j;
-
- /*
- * i_tab contains a list of pointers to inodes. We initialize
- * the table here & we'll sort it. We will then use it to
- * order the acquisition of the inode locks.
- *
- * Note that the table may contain duplicates. e.g., dp1 == dp2.
- */
- i_tab[0] = dp1;
- i_tab[1] = dp2;
- i_tab[2] = ip1;
- if (ip2) {
- *num_inodes = 4;
- i_tab[3] = ip2;
- } else {
- *num_inodes = 3;
- i_tab[3] = NULL;
- }
-
- /*
- * Sort the elements via bubble sort. (Remember, there are at
- * most 4 elements to sort, so this is adequate.)
- */
- for (i = 0; i < *num_inodes; i++) {
- for (j = 1; j < *num_inodes; j++) {
- if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
- temp = i_tab[j];
- i_tab[j] = i_tab[j-1];
- i_tab[j-1] = temp;
- }
- }
- }
-}
-
-/*
- * xfs_rename
- */
-int
-xfs_rename(
- xfs_inode_t *src_dp,
- struct xfs_name *src_name,
- xfs_inode_t *src_ip,
- xfs_inode_t *target_dp,
- struct xfs_name *target_name,
- xfs_inode_t *target_ip)
-{
- xfs_trans_t *tp = NULL;
- xfs_mount_t *mp = src_dp->i_mount;
- int new_parent; /* moving to a new dir */
- int src_is_directory; /* src_name is a directory */
- int error;
- xfs_bmap_free_t free_list;
- xfs_fsblock_t first_block;
- int cancel_flags;
- int committed;
- xfs_inode_t *inodes[4];
- int spaceres;
- int num_inodes;
-
- trace_xfs_rename(src_dp, target_dp, src_name, target_name);
-
- new_parent = (src_dp != target_dp);
- src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
-
- xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
- inodes, &num_inodes);
-
- xfs_bmap_init(&free_list, &first_block);
- tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
- spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
- error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
- if (error == ENOSPC) {
- spaceres = 0;
- error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
- }
- if (error) {
- xfs_trans_cancel(tp, 0);
- goto std_return;
- }
-
- /*
- * Attach the dquots to the inodes
- */
- error = xfs_qm_vop_rename_dqattach(inodes);
- if (error) {
- xfs_trans_cancel(tp, cancel_flags);
- goto std_return;
- }
-
- /*
- * Lock all the participating inodes. Depending upon whether
- * the target_name exists in the target directory, and
- * whether the target directory is the same as the source
- * directory, we can lock from 2 to 4 inodes.
- */
- xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
-
- /*
- * Join all the inodes to the transaction. From this point on,
- * we can rely on either trans_commit or trans_cancel to unlock
- * them.
- */
- xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
- if (new_parent)
- xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
- if (target_ip)
- xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
-
- /*
- * If we are using project inheritance, we only allow renames
- * into our tree when the project IDs are the same; else the
- * tree quota mechanism would be circumvented.
- */
- if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
- (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
- error = XFS_ERROR(EXDEV);
- goto error_return;
- }
-
- /*
- * Set up the target.
- */
- if (target_ip == NULL) {
- /*
- * If there's no space reservation, check the entry will
- * fit before actually inserting it.
- */
- error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
- if (error)
- goto error_return;
- /*
- * If target does not exist and the rename crosses
- * directories, adjust the target directory link count
- * to account for the ".." reference from the new entry.
- */
- error = xfs_dir_createname(tp, target_dp, target_name,
- src_ip->i_ino, &first_block,
- &free_list, spaceres);
- if (error == ENOSPC)
- goto error_return;
- if (error)
- goto abort_return;
-
- xfs_trans_ichgtime(tp, target_dp,
- XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
- if (new_parent && src_is_directory) {
- error = xfs_bumplink(tp, target_dp);
- if (error)
- goto abort_return;
- }
- } else { /* target_ip != NULL */
- /*
- * If target exists and it's a directory, check that both
- * target and source are directories and that target can be
- * destroyed, or that neither is a directory.
- */
- if (S_ISDIR(target_ip->i_d.di_mode)) {
- /*
- * Make sure target dir is empty.
- */
- if (!(xfs_dir_isempty(target_ip)) ||
- (target_ip->i_d.di_nlink > 2)) {
- error = XFS_ERROR(EEXIST);
- goto error_return;
- }
- }
-
- /*
- * Link the source inode under the target name.
- * If the source inode is a directory and we are moving
- * it across directories, its ".." entry will be
- * inconsistent until we replace that down below.
- *
- * In case there is already an entry with the same
- * name at the destination directory, remove it first.
- */
- error = xfs_dir_replace(tp, target_dp, target_name,
- src_ip->i_ino,
- &first_block, &free_list, spaceres);
- if (error)
- goto abort_return;
-
- xfs_trans_ichgtime(tp, target_dp,
- XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
- /*
- * Decrement the link count on the target since the target
- * dir no longer points to it.
- */
- error = xfs_droplink(tp, target_ip);
- if (error)
- goto abort_return;
-
- if (src_is_directory) {
- /*
- * Drop the link from the old "." entry.
- */
- error = xfs_droplink(tp, target_ip);
- if (error)
- goto abort_return;
- }
- } /* target_ip != NULL */
-
- /*
- * Remove the source.
- */
- if (new_parent && src_is_directory) {
- /*
- * Rewrite the ".." entry to point to the new
- * directory.
- */
- error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
- target_dp->i_ino,
- &first_block, &free_list, spaceres);
- ASSERT(error != EEXIST);
- if (error)
- goto abort_return;
- }
-
- /*
- * We always want to hit the ctime on the source inode.
- *
- * This isn't strictly required by the standards since the source
- * inode isn't really being changed, but old unix file systems did
- * it and some incremental backup programs won't work without it.
- */
- xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
- xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
-
- /*
- * Adjust the link count on src_dp. This is necessary when
- * renaming a directory, either within one parent when
- * the target existed, or across two parent directories.
- */
- if (src_is_directory && (new_parent || target_ip != NULL)) {
-
- /*
- * Decrement link count on src_directory since the
- * entry that's moved no longer points to it.
- */
- error = xfs_droplink(tp, src_dp);
- if (error)
- goto abort_return;
- }
-
- error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
- &first_block, &free_list, spaceres);
- if (error)
- goto abort_return;
-
- xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
- if (new_parent)
- xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
-
- /*
- * If this is a synchronous mount, make sure that the
- * rename transaction goes to disk before returning to
- * the user.
- */
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
- xfs_trans_set_sync(tp);
- }
-
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error) {
- xfs_bmap_cancel(&free_list);
- xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
- XFS_TRANS_ABORT));
- goto std_return;
- }
-
- /*
- * trans_commit will unlock src_ip, target_ip & decrement
- * the vnode references.
- */
- return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-
- abort_return:
- cancel_flags |= XFS_TRANS_ABORT;
- error_return:
- xfs_bmap_cancel(&free_list);
- xfs_trans_cancel(tp, cancel_flags);
- std_return:
- return error;
-}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 98dc670d3ee..ec5ca65c621 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -17,173 +17,260 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_alloc.h"
#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
-#include "xfs_fsops.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc.h"
#include "xfs_error.h"
-#include "xfs_inode_item.h"
+#include "xfs_trans.h"
#include "xfs_trans_space.h"
-#include "xfs_utils.h"
#include "xfs_trace.h"
#include "xfs_buf.h"
#include "xfs_icache.h"
+#include "xfs_dinode.h"
+#include "xfs_rtalloc.h"
/*
- * Prototypes for internal functions.
+ * Read and return the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
*/
+STATIC int /* error */
+xfs_rtget_summary(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ int log, /* log2 of extent size */
+ xfs_rtblock_t bbno, /* bitmap block number */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ xfs_suminfo_t *sum) /* out: summary info for this block */
+{
+ xfs_buf_t *bp; /* buffer for summary block */
+ int error; /* error value */
+ xfs_fsblock_t sb; /* summary fsblock */
+ int so; /* index into the summary file */
+ xfs_suminfo_t *sp; /* pointer to returned data */
+ /*
+ * Compute entry number in the summary file.
+ */
+ so = XFS_SUMOFFS(mp, log, bbno);
+ /*
+ * Compute the block number in the summary file.
+ */
+ sb = XFS_SUMOFFSTOBLOCK(mp, so);
+ /*
+ * If we have an old buffer, and the block number matches, use that.
+ */
+ if (rbpp && *rbpp && *rsb == sb)
+ bp = *rbpp;
+ /*
+ * Otherwise we have to get the buffer.
+ */
+ else {
+ /*
+ * If there was an old one, get rid of it first.
+ */
+ if (rbpp && *rbpp)
+ xfs_trans_brelse(tp, *rbpp);
+ error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
+ if (error) {
+ return error;
+ }
+ /*
+ * Remember this buffer and block for the next call.
+ */
+ if (rbpp) {
+ *rbpp = bp;
+ *rsb = sb;
+ }
+ }
+ /*
+ * Point to the summary information & copy it out.
+ */
+ sp = XFS_SUMPTR(mp, bp, so);
+ *sum = *sp;
+ /*
+ * Drop the buffer if we're not asked to remember it.
+ */
+ if (!rbpp)
+ xfs_trans_brelse(tp, bp);
+ return 0;
+}
-STATIC int xfs_rtallocate_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
- xfs_extlen_t, xfs_buf_t **, xfs_fsblock_t *);
-STATIC int xfs_rtany_summary(xfs_mount_t *, xfs_trans_t *, int, int,
- xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, int *);
-STATIC int xfs_rtcheck_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
- xfs_extlen_t, int, xfs_rtblock_t *, int *);
-STATIC int xfs_rtfind_back(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
- xfs_rtblock_t, xfs_rtblock_t *);
-STATIC int xfs_rtfind_forw(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
- xfs_rtblock_t, xfs_rtblock_t *);
-STATIC int xfs_rtget_summary( xfs_mount_t *, xfs_trans_t *, int,
- xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, xfs_suminfo_t *);
-STATIC int xfs_rtmodify_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
- xfs_extlen_t, int);
-STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int,
- xfs_rtblock_t, int, xfs_buf_t **, xfs_fsblock_t *);
-
-/*
- * Internal functions.
- */
/*
- * Allocate space to the bitmap or summary file, and zero it, for growfs.
+ * Return whether there are any free extents in the size range given
+ * by low and high, for the bitmap block bbno.
*/
STATIC int /* error */
-xfs_growfs_rt_alloc(
- xfs_mount_t *mp, /* file system mount point */
- xfs_extlen_t oblocks, /* old count of blocks */
- xfs_extlen_t nblocks, /* new count of blocks */
- xfs_inode_t *ip) /* inode (bitmap/summary) */
+xfs_rtany_summary(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ int low, /* low log2 extent size */
+ int high, /* high log2 extent size */
+ xfs_rtblock_t bbno, /* bitmap block number */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ int *stat) /* out: any good extents here? */
{
- xfs_fileoff_t bno; /* block number in file */
- xfs_buf_t *bp; /* temporary buffer for zeroing */
- int committed; /* transaction committed flag */
- xfs_daddr_t d; /* disk block address */
- int error; /* error return value */
- xfs_fsblock_t firstblock; /* first block allocated in xaction */
- xfs_bmap_free_t flist; /* list of freed blocks */
- xfs_fsblock_t fsbno; /* filesystem block for bno */
- xfs_bmbt_irec_t map; /* block map output */
- int nmap; /* number of block maps */
- int resblks; /* space reservation */
+ int error; /* error value */
+ int log; /* loop counter, log2 of ext. size */
+ xfs_suminfo_t sum; /* summary data */
/*
- * Allocate space to the file, as necessary.
+ * Loop over logs of extent sizes. Order is irrelevant.
*/
- while (oblocks < nblocks) {
- int cancelflags = 0;
- xfs_trans_t *tp;
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
- resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
+ for (log = low; log <= high; log++) {
/*
- * Reserve space & log for one extent added to the file.
+ * Get one summary datum.
*/
- if ((error = xfs_trans_reserve(tp, resblks,
- XFS_GROWRTALLOC_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_DEFAULT_PERM_LOG_COUNT)))
- goto error_cancel;
- cancelflags = XFS_TRANS_RELEASE_LOG_RES;
+ error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum);
+ if (error) {
+ return error;
+ }
/*
- * Lock the inode.
+ * If there are any, return success.
*/
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ if (sum) {
+ *stat = 1;
+ return 0;
+ }
+ }
+ /*
+ * Found nothing, return failure.
+ */
+ *stat = 0;
+ return 0;
+}
- xfs_bmap_init(&flist, &firstblock);
- /*
- * Allocate blocks to the bitmap file.
- */
- nmap = 1;
- cancelflags |= XFS_TRANS_ABORT;
- error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
- XFS_BMAPI_METADATA, &firstblock,
- resblks, &map, &nmap, &flist);
- if (!error && nmap < 1)
- error = XFS_ERROR(ENOSPC);
- if (error)
- goto error_cancel;
- /*
- * Free any blocks freed up in the transaction, then commit.
- */
- error = xfs_bmap_finish(&tp, &flist, &committed);
- if (error)
- goto error_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- if (error)
- goto error;
- /*
- * Now we need to clear the allocated blocks.
- * Do this one block per transaction, to keep it simple.
- */
- cancelflags = 0;
- for (bno = map.br_startoff, fsbno = map.br_startblock;
- bno < map.br_startoff + map.br_blockcount;
- bno++, fsbno++) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO);
- /*
- * Reserve log for one block zeroing.
- */
- if ((error = xfs_trans_reserve(tp, 0,
- XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0)))
- goto error_cancel;
- /*
- * Lock the bitmap inode.
- */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- /*
- * Get a buffer for the block.
- */
- d = XFS_FSB_TO_DADDR(mp, fsbno);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- mp->m_bsize, 0);
- if (bp == NULL) {
- error = XFS_ERROR(EIO);
-error_cancel:
- xfs_trans_cancel(tp, cancelflags);
- goto error;
- }
- memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
- xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
- /*
- * Commit the transaction.
- */
- error = xfs_trans_commit(tp, 0);
+
+/*
+ * Copy and transform the summary file, given the old and new
+ * parameters in the mount structures.
+ */
+STATIC int /* error */
+xfs_rtcopy_summary(
+ xfs_mount_t *omp, /* old file system mount point */
+ xfs_mount_t *nmp, /* new file system mount point */
+ xfs_trans_t *tp) /* transaction pointer */
+{
+ xfs_rtblock_t bbno; /* bitmap block number */
+ xfs_buf_t *bp; /* summary buffer */
+ int error; /* error return value */
+ int log; /* summary level number (log length) */
+ xfs_suminfo_t sum; /* summary data */
+ xfs_fsblock_t sumbno; /* summary block number */
+
+ bp = NULL;
+ for (log = omp->m_rsumlevels - 1; log >= 0; log--) {
+ for (bbno = omp->m_sb.sb_rbmblocks - 1;
+ (xfs_srtblock_t)bbno >= 0;
+ bbno--) {
+ error = xfs_rtget_summary(omp, tp, log, bbno, &bp,
+ &sumbno, &sum);
if (error)
- goto error;
+ return error;
+ if (sum == 0)
+ continue;
+ error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum,
+ &bp, &sumbno);
+ if (error)
+ return error;
+ error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum,
+ &bp, &sumbno);
+ if (error)
+ return error;
+ ASSERT(sum > 0);
}
- /*
- * Go on to the next extent, if any.
- */
- oblocks = map.br_startoff + map.br_blockcount;
}
return 0;
+}
+/*
+ * Mark an extent specified by start and len allocated.
+ * Updates all the summary information as well as the bitmap.
+ */
+STATIC int /* error */
+xfs_rtallocate_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* start block to allocate */
+ xfs_extlen_t len, /* length to allocate */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb) /* in/out: summary block number */
+{
+ xfs_rtblock_t end; /* end of the allocated extent */
+ int error; /* error value */
+ xfs_rtblock_t postblock = 0; /* first block allocated > end */
+ xfs_rtblock_t preblock = 0; /* first block allocated < start */
-error:
+ end = start + len - 1;
+ /*
+ * Assume we're allocating out of the middle of a free extent.
+ * We need to find the beginning and end of the extent so we can
+ * properly update the summary.
+ */
+ error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+ if (error) {
+ return error;
+ }
+ /*
+ * Find the next allocated block (end of free extent).
+ */
+ error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+ &postblock);
+ if (error) {
+ return error;
+ }
+ /*
+ * Decrement the summary information corresponding to the entire
+ * (old) free extent.
+ */
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ /*
+ * If there are blocks not being allocated at the front of the
+ * old extent, add summary data for them to be free.
+ */
+ if (preblock < start) {
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(start - preblock),
+ XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ }
+ /*
+ * If there are blocks not being allocated at the end of the
+ * old extent, add summary data for them to be free.
+ */
+ if (postblock > end) {
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(postblock - end),
+ XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ }
+ /*
+ * Modify the bitmap to mark this extent allocated.
+ */
+ error = xfs_rtmodify_range(mp, tp, start, len, 0);
return error;
}
@@ -722,1112 +809,126 @@ xfs_rtallocate_extent_size(
}
/*
- * Mark an extent specified by start and len allocated.
- * Updates all the summary information as well as the bitmap.
+ * Allocate space to the bitmap or summary file, and zero it, for growfs.
*/
STATIC int /* error */
-xfs_rtallocate_range(
+xfs_growfs_rt_alloc(
xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* start block to allocate */
- xfs_extlen_t len, /* length to allocate */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
+ xfs_extlen_t oblocks, /* old count of blocks */
+ xfs_extlen_t nblocks, /* new count of blocks */
+ xfs_inode_t *ip) /* inode (bitmap/summary) */
{
- xfs_rtblock_t end; /* end of the allocated extent */
- int error; /* error value */
- xfs_rtblock_t postblock; /* first block allocated > end */
- xfs_rtblock_t preblock; /* first block allocated < start */
+ xfs_fileoff_t bno; /* block number in file */
+ xfs_buf_t *bp; /* temporary buffer for zeroing */
+ int committed; /* transaction committed flag */
+ xfs_daddr_t d; /* disk block address */
+ int error; /* error return value */
+ xfs_fsblock_t firstblock; /* first block allocated in xaction */
+ xfs_bmap_free_t flist; /* list of freed blocks */
+ xfs_fsblock_t fsbno; /* filesystem block for bno */
+ xfs_bmbt_irec_t map; /* block map output */
+ int nmap; /* number of block maps */
+ int resblks; /* space reservation */
- end = start + len - 1;
- /*
- * Assume we're allocating out of the middle of a free extent.
- * We need to find the beginning and end of the extent so we can
- * properly update the summary.
- */
- error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
- if (error) {
- return error;
- }
- /*
- * Find the next allocated block (end of free extent).
- */
- error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
- &postblock);
- if (error) {
- return error;
- }
- /*
- * Decrement the summary information corresponding to the entire
- * (old) free extent.
- */
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock + 1 - preblock),
- XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
- if (error) {
- return error;
- }
- /*
- * If there are blocks not being allocated at the front of the
- * old extent, add summary data for them to be free.
- */
- if (preblock < start) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(start - preblock),
- XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
- if (error) {
- return error;
- }
- }
/*
- * If there are blocks not being allocated at the end of the
- * old extent, add summary data for them to be free.
- */
- if (postblock > end) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock - end),
- XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb);
- if (error) {
- return error;
- }
- }
- /*
- * Modify the bitmap to mark this extent allocated.
+ * Allocate space to the file, as necessary.
*/
- error = xfs_rtmodify_range(mp, tp, start, len, 0);
- return error;
-}
-
-/*
- * Return whether there are any free extents in the size range given
- * by low and high, for the bitmap block bbno.
- */
-STATIC int /* error */
-xfs_rtany_summary(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int low, /* low log2 extent size */
- int high, /* high log2 extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- int *stat) /* out: any good extents here? */
-{
- int error; /* error value */
- int log; /* loop counter, log2 of ext. size */
- xfs_suminfo_t sum; /* summary data */
+ while (oblocks < nblocks) {
+ int cancelflags = 0;
+ xfs_trans_t *tp;
- /*
- * Loop over logs of extent sizes. Order is irrelevant.
- */
- for (log = low; log <= high; log++) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
+ resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
/*
- * Get one summary datum.
+ * Reserve space & log for one extent added to the file.
*/
- error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum);
- if (error) {
- return error;
- }
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc,
+ resblks, 0);
+ if (error)
+ goto error_cancel;
+ cancelflags = XFS_TRANS_RELEASE_LOG_RES;
/*
- * If there are any, return success.
+ * Lock the inode.
*/
- if (sum) {
- *stat = 1;
- return 0;
- }
- }
- /*
- * Found nothing, return failure.
- */
- *stat = 0;
- return 0;
-}
-
-/*
- * Get a buffer for the bitmap or summary file block specified.
- * The buffer is returned read and locked.
- */
-STATIC int /* error */
-xfs_rtbuf_get(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t block, /* block number in bitmap or summary */
- int issum, /* is summary not bitmap */
- xfs_buf_t **bpp) /* output: buffer for the block */
-{
- xfs_buf_t *bp; /* block buffer, result */
- xfs_inode_t *ip; /* bitmap or summary inode */
- xfs_bmbt_irec_t map;
- int nmap = 1;
- int error; /* error value */
-
- ip = issum ? mp->m_rsumip : mp->m_rbmip;
-
- error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
- if (error)
- return error;
-
- ASSERT(map.br_startblock != NULLFSBLOCK);
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(mp, map.br_startblock),
- mp->m_bsize, 0, &bp, NULL);
- if (error)
- return error;
- ASSERT(!xfs_buf_geterror(bp));
- *bpp = bp;
- return 0;
-}
-
-#ifdef DEBUG
-/*
- * Check that the given extent (block range) is allocated already.
- */
-STATIC int /* error */
-xfs_rtcheck_alloc_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number of extent */
- xfs_extlen_t len, /* length of extent */
- int *stat) /* out: 1 for allocated, 0 for not */
-{
- xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */
-
- return xfs_rtcheck_range(mp, tp, bno, len, 0, &new, stat);
-}
-#endif
-
-/*
- * Check that the given range is either all allocated (val = 0) or
- * all free (val = 1).
- */
-STATIC int /* error */
-xfs_rtcheck_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block number of extent */
- xfs_extlen_t len, /* length of extent */
- int val, /* 1 for free, 0 for allocated */
- xfs_rtblock_t *new, /* out: first block not matching */
- int *stat) /* out: 1 for matches, 0 for not */
-{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- xfs_buf_t *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtblock_t i; /* current bit number rel. to start */
- xfs_rtblock_t lastbit; /* last useful bit in word */
- xfs_rtword_t mask; /* mask of relevant bits for value */
- xfs_rtword_t wdiff; /* difference from wanted value */
- int word; /* word number in the buffer */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- /*
- * Compute starting bitmap block number
- */
- block = XFS_BITTOBLOCK(mp, start);
- /*
- * Read the bitmap block.
- */
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
- return error;
- }
- bufp = bp->b_addr;
- /*
- * Compute the starting word's address, and starting bit.
- */
- word = XFS_BITTOWORD(mp, start);
- b = &bufp[word];
- bit = (int)(start & (XFS_NBWORD - 1));
- /*
- * 0 (allocated) => all zero's; 1 (free) => all one's.
- */
- val = -val;
- /*
- * If not starting on a word boundary, deal with the first
- * (partial) word.
- */
- if (bit) {
- /*
- * Compute first bit not examined.
- */
- lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
- /*
- * Mask of relevant bits.
- */
- mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
- /*
- * Compute difference between actual and desired value.
- */
- if ((wdiff = (*b ^ val) & mask)) {
- /*
- * Different, compute first wrong bit and return.
- */
- xfs_trans_brelse(tp, bp);
- i = XFS_RTLOBIT(wdiff) - bit;
- *new = start + i;
- *stat = 0;
- return 0;
- }
- i = lastbit - bit;
- /*
- * Go on to next block if that's where the next word is
- * and we need the next word.
- */
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
- /*
- * If done with this block, get the next one.
- */
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
- return error;
- }
- b = bufp = bp->b_addr;
- word = 0;
- } else {
- /*
- * Go on to the next word in the buffer.
- */
- b++;
- }
- } else {
- /*
- * Starting on a word boundary, no partial word.
- */
- i = 0;
- }
- /*
- * Loop over whole words in buffers. When we use up one buffer
- * we move on to the next one.
- */
- while (len - i >= XFS_NBWORD) {
- /*
- * Compute difference between actual and desired value.
- */
- if ((wdiff = *b ^ val)) {
- /*
- * Different, compute first wrong bit and return.
- */
- xfs_trans_brelse(tp, bp);
- i += XFS_RTLOBIT(wdiff);
- *new = start + i;
- *stat = 0;
- return 0;
- }
- i += XFS_NBWORD;
+ xfs_bmap_init(&flist, &firstblock);
/*
- * Go on to next block if that's where the next word is
- * and we need the next word.
+ * Allocate blocks to the bitmap file.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
- /*
- * If done with this block, get the next one.
- */
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
- return error;
- }
- b = bufp = bp->b_addr;
- word = 0;
- } else {
- /*
- * Go on to the next word in the buffer.
- */
- b++;
- }
- }
- /*
- * If not ending on a word boundary, deal with the last
- * (partial) word.
- */
- if ((lastbit = len - i)) {
+ nmap = 1;
+ cancelflags |= XFS_TRANS_ABORT;
+ error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
+ XFS_BMAPI_METADATA, &firstblock,
+ resblks, &map, &nmap, &flist);
+ if (!error && nmap < 1)
+ error = XFS_ERROR(ENOSPC);
+ if (error)
+ goto error_cancel;
/*
- * Mask of relevant bits.
+ * Free any blocks freed up in the transaction, then commit.
*/
- mask = ((xfs_rtword_t)1 << lastbit) - 1;
+ error = xfs_bmap_finish(&tp, &flist, &committed);
+ if (error)
+ goto error_cancel;
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto error;
/*
- * Compute difference between actual and desired value.
+ * Now we need to clear the allocated blocks.
+ * Do this one block per transaction, to keep it simple.
*/
- if ((wdiff = (*b ^ val) & mask)) {
+ cancelflags = 0;
+ for (bno = map.br_startoff, fsbno = map.br_startblock;
+ bno < map.br_startoff + map.br_blockcount;
+ bno++, fsbno++) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO);
/*
- * Different, compute first wrong bit and return.
+ * Reserve log for one block zeroing.
*/
- xfs_trans_brelse(tp, bp);
- i += XFS_RTLOBIT(wdiff);
- *new = start + i;
- *stat = 0;
- return 0;
- } else
- i = len;
- }
- /*
- * Successful, return.
- */
- xfs_trans_brelse(tp, bp);
- *new = start + i;
- *stat = 1;
- return 0;
-}
-
-/*
- * Copy and transform the summary file, given the old and new
- * parameters in the mount structures.
- */
-STATIC int /* error */
-xfs_rtcopy_summary(
- xfs_mount_t *omp, /* old file system mount point */
- xfs_mount_t *nmp, /* new file system mount point */
- xfs_trans_t *tp) /* transaction pointer */
-{
- xfs_rtblock_t bbno; /* bitmap block number */
- xfs_buf_t *bp; /* summary buffer */
- int error; /* error return value */
- int log; /* summary level number (log length) */
- xfs_suminfo_t sum; /* summary data */
- xfs_fsblock_t sumbno; /* summary block number */
-
- bp = NULL;
- for (log = omp->m_rsumlevels - 1; log >= 0; log--) {
- for (bbno = omp->m_sb.sb_rbmblocks - 1;
- (xfs_srtblock_t)bbno >= 0;
- bbno--) {
- error = xfs_rtget_summary(omp, tp, log, bbno, &bp,
- &sumbno, &sum);
- if (error)
- return error;
- if (sum == 0)
- continue;
- error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum,
- &bp, &sumbno);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
+ 0, 0);
if (error)
- return error;
- error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum,
- &bp, &sumbno);
- if (error)
- return error;
- ASSERT(sum > 0);
- }
- }
- return 0;
-}
-
-/*
- * Searching backward from start to limit, find the first block whose
- * allocated/free state is different from start's.
- */
-STATIC int /* error */
-xfs_rtfind_back(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to look at */
- xfs_rtblock_t limit, /* last block to look at */
- xfs_rtblock_t *rtblock) /* out: start block found */
-{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- xfs_buf_t *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtblock_t firstbit; /* first useful bit in the word */
- xfs_rtblock_t i; /* current bit number rel. to start */
- xfs_rtblock_t len; /* length of inspected area */
- xfs_rtword_t mask; /* mask of relevant bits for value */
- xfs_rtword_t want; /* mask for "good" values */
- xfs_rtword_t wdiff; /* difference from wanted value */
- int word; /* word number in the buffer */
-
- /*
- * Compute and read in starting bitmap block for starting block.
- */
- block = XFS_BITTOBLOCK(mp, start);
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
- return error;
- }
- bufp = bp->b_addr;
- /*
- * Get the first word's index & point to it.
- */
- word = XFS_BITTOWORD(mp, start);
- b = &bufp[word];
- bit = (int)(start & (XFS_NBWORD - 1));
- len = start - limit + 1;
- /*
- * Compute match value, based on the bit at start: if 1 (free)
- * then all-ones, else all-zeroes.
- */
- want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
- /*
- * If the starting position is not word-aligned, deal with the
- * partial word.
- */
- if (bit < XFS_NBWORD - 1) {
- /*
- * Calculate first (leftmost) bit number to look at,
- * and mask for all the relevant bits in this word.
- */
- firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
- mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
- firstbit;
- /*
- * Calculate the difference between the value there
- * and what we're looking for.
- */
- if ((wdiff = (*b ^ want) & mask)) {
- /*
- * Different. Mark where we are and return.
- */
- xfs_trans_brelse(tp, bp);
- i = bit - XFS_RTHIBIT(wdiff);
- *rtblock = start - i + 1;
- return 0;
- }
- i = bit - firstbit + 1;
- /*
- * Go on to previous block if that's where the previous word is
- * and we need the previous word.
- */
- if (--word == -1 && i < len) {
- /*
- * If done with this block, get the previous one.
- */
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
- if (error) {
- return error;
- }
- bufp = bp->b_addr;
- word = XFS_BLOCKWMASK(mp);
- b = &bufp[word];
- } else {
- /*
- * Go on to the previous word in the buffer.
- */
- b--;
- }
- } else {
- /*
- * Starting on a word boundary, no partial word.
- */
- i = 0;
- }
- /*
- * Loop over whole words in buffers. When we use up one buffer
- * we move on to the previous one.
- */
- while (len - i >= XFS_NBWORD) {
- /*
- * Compute difference between actual and desired value.
- */
- if ((wdiff = *b ^ want)) {
- /*
- * Different, mark where we are and return.
- */
- xfs_trans_brelse(tp, bp);
- i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
- *rtblock = start - i + 1;
- return 0;
- }
- i += XFS_NBWORD;
- /*
- * Go on to previous block if that's where the previous word is
- * and we need the previous word.
- */
- if (--word == -1 && i < len) {
- /*
- * If done with this block, get the previous one.
- */
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
- if (error) {
- return error;
- }
- bufp = bp->b_addr;
- word = XFS_BLOCKWMASK(mp);
- b = &bufp[word];
- } else {
- /*
- * Go on to the previous word in the buffer.
- */
- b--;
- }
- }
- /*
- * If not ending on a word boundary, deal with the last
- * (partial) word.
- */
- if (len - i) {
- /*
- * Calculate first (leftmost) bit number to look at,
- * and mask for all the relevant bits in this word.
- */
- firstbit = XFS_NBWORD - (len - i);
- mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
- /*
- * Compute difference between actual and desired value.
- */
- if ((wdiff = (*b ^ want) & mask)) {
- /*
- * Different, mark where we are and return.
- */
- xfs_trans_brelse(tp, bp);
- i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
- *rtblock = start - i + 1;
- return 0;
- } else
- i = len;
- }
- /*
- * No match, return that we scanned the whole area.
- */
- xfs_trans_brelse(tp, bp);
- *rtblock = start - i + 1;
- return 0;
-}
-
-/*
- * Searching forward from start to limit, find the first block whose
- * allocated/free state is different from start's.
- */
-STATIC int /* error */
-xfs_rtfind_forw(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to look at */
- xfs_rtblock_t limit, /* last block to look at */
- xfs_rtblock_t *rtblock) /* out: start block found */
-{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- xfs_buf_t *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtblock_t i; /* current bit number rel. to start */
- xfs_rtblock_t lastbit; /* last useful bit in the word */
- xfs_rtblock_t len; /* length of inspected area */
- xfs_rtword_t mask; /* mask of relevant bits for value */
- xfs_rtword_t want; /* mask for "good" values */
- xfs_rtword_t wdiff; /* difference from wanted value */
- int word; /* word number in the buffer */
-
- /*
- * Compute and read in starting bitmap block for starting block.
- */
- block = XFS_BITTOBLOCK(mp, start);
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
- return error;
- }
- bufp = bp->b_addr;
- /*
- * Get the first word's index & point to it.
- */
- word = XFS_BITTOWORD(mp, start);
- b = &bufp[word];
- bit = (int)(start & (XFS_NBWORD - 1));
- len = limit - start + 1;
- /*
- * Compute match value, based on the bit at start: if 1 (free)
- * then all-ones, else all-zeroes.
- */
- want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
- /*
- * If the starting position is not word-aligned, deal with the
- * partial word.
- */
- if (bit) {
- /*
- * Calculate last (rightmost) bit number to look at,
- * and mask for all the relevant bits in this word.
- */
- lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
- mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
- /*
- * Calculate the difference between the value there
- * and what we're looking for.
- */
- if ((wdiff = (*b ^ want) & mask)) {
- /*
- * Different. Mark where we are and return.
- */
- xfs_trans_brelse(tp, bp);
- i = XFS_RTLOBIT(wdiff) - bit;
- *rtblock = start + i - 1;
- return 0;
- }
- i = lastbit - bit;
- /*
- * Go on to next block if that's where the next word is
- * and we need the next word.
- */
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
- /*
- * If done with this block, get the previous one.
- */
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
- return error;
- }
- b = bufp = bp->b_addr;
- word = 0;
- } else {
- /*
- * Go on to the previous word in the buffer.
- */
- b++;
- }
- } else {
- /*
- * Starting on a word boundary, no partial word.
- */
- i = 0;
- }
- /*
- * Loop over whole words in buffers. When we use up one buffer
- * we move on to the next one.
- */
- while (len - i >= XFS_NBWORD) {
- /*
- * Compute difference between actual and desired value.
- */
- if ((wdiff = *b ^ want)) {
+ goto error_cancel;
/*
- * Different, mark where we are and return.
+ * Lock the bitmap inode.
*/
- xfs_trans_brelse(tp, bp);
- i += XFS_RTLOBIT(wdiff);
- *rtblock = start + i - 1;
- return 0;
- }
- i += XFS_NBWORD;
- /*
- * Go on to next block if that's where the next word is
- * and we need the next word.
- */
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
/*
- * If done with this block, get the next one.
+ * Get a buffer for the block.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
- return error;
+ d = XFS_FSB_TO_DADDR(mp, fsbno);
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ mp->m_bsize, 0);
+ if (bp == NULL) {
+ error = XFS_ERROR(EIO);
+error_cancel:
+ xfs_trans_cancel(tp, cancelflags);
+ goto error;
}
- b = bufp = bp->b_addr;
- word = 0;
- } else {
+ memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
+ xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
/*
- * Go on to the next word in the buffer.
+ * Commit the transaction.
*/
- b++;
+ error = xfs_trans_commit(tp, 0);
+ if (error)
+ goto error;
}
- }
- /*
- * If not ending on a word boundary, deal with the last
- * (partial) word.
- */
- if ((lastbit = len - i)) {
/*
- * Calculate mask for all the relevant bits in this word.
- */
- mask = ((xfs_rtword_t)1 << lastbit) - 1;
- /*
- * Compute difference between actual and desired value.
+ * Go on to the next extent, if any.
*/
- if ((wdiff = (*b ^ want) & mask)) {
- /*
- * Different, mark where we are and return.
- */
- xfs_trans_brelse(tp, bp);
- i += XFS_RTLOBIT(wdiff);
- *rtblock = start + i - 1;
- return 0;
- } else
- i = len;
+ oblocks = map.br_startoff + map.br_blockcount;
}
- /*
- * No match, return that we scanned the whole area.
- */
- xfs_trans_brelse(tp, bp);
- *rtblock = start + i - 1;
return 0;
-}
-/*
- * Mark an extent specified by start and len freed.
- * Updates all the summary information as well as the bitmap.
- */
-STATIC int /* error */
-xfs_rtfree_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to free */
- xfs_extlen_t len, /* length to free */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
-{
- xfs_rtblock_t end; /* end of the freed extent */
- int error; /* error value */
- xfs_rtblock_t postblock; /* first block freed > end */
- xfs_rtblock_t preblock; /* first block freed < start */
-
- end = start + len - 1;
- /*
- * Modify the bitmap to mark this extent freed.
- */
- error = xfs_rtmodify_range(mp, tp, start, len, 1);
- if (error) {
- return error;
- }
- /*
- * Assume we're freeing out of the middle of an allocated extent.
- * We need to find the beginning and end of the extent so we can
- * properly update the summary.
- */
- error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
- if (error) {
- return error;
- }
- /*
- * Find the next allocated block (end of allocated extent).
- */
- error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
- &postblock);
- if (error)
- return error;
- /*
- * If there are blocks not being freed at the front of the
- * old extent, add summary data for them to be allocated.
- */
- if (preblock < start) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(start - preblock),
- XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
- if (error) {
- return error;
- }
- }
- /*
- * If there are blocks not being freed at the end of the
- * old extent, add summary data for them to be allocated.
- */
- if (postblock > end) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock - end),
- XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
- if (error) {
- return error;
- }
- }
- /*
- * Increment the summary information corresponding to the entire
- * (new) free extent.
- */
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock + 1 - preblock),
- XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+error:
return error;
}
/*
- * Read and return the summary information for a given extent size,
- * bitmap block combination.
- * Keeps track of a current summary block, so we don't keep reading
- * it from the buffer cache.
- */
-STATIC int /* error */
-xfs_rtget_summary(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int log, /* log2 of extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_suminfo_t *sum) /* out: summary info for this block */
-{
- xfs_buf_t *bp; /* buffer for summary block */
- int error; /* error value */
- xfs_fsblock_t sb; /* summary fsblock */
- int so; /* index into the summary file */
- xfs_suminfo_t *sp; /* pointer to returned data */
-
- /*
- * Compute entry number in the summary file.
- */
- so = XFS_SUMOFFS(mp, log, bbno);
- /*
- * Compute the block number in the summary file.
- */
- sb = XFS_SUMOFFSTOBLOCK(mp, so);
- /*
- * If we have an old buffer, and the block number matches, use that.
- */
- if (rbpp && *rbpp && *rsb == sb)
- bp = *rbpp;
- /*
- * Otherwise we have to get the buffer.
- */
- else {
- /*
- * If there was an old one, get rid of it first.
- */
- if (rbpp && *rbpp)
- xfs_trans_brelse(tp, *rbpp);
- error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
- if (error) {
- return error;
- }
- /*
- * Remember this buffer and block for the next call.
- */
- if (rbpp) {
- *rbpp = bp;
- *rsb = sb;
- }
- }
- /*
- * Point to the summary information & copy it out.
- */
- sp = XFS_SUMPTR(mp, bp, so);
- *sum = *sp;
- /*
- * Drop the buffer if we're not asked to remember it.
- */
- if (!rbpp)
- xfs_trans_brelse(tp, bp);
- return 0;
-}
-
-/*
- * Set the given range of bitmap bits to the given value.
- * Do whatever I/O and logging is required.
- */
-STATIC int /* error */
-xfs_rtmodify_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to modify */
- xfs_extlen_t len, /* length of extent to modify */
- int val) /* 1 for free, 0 for allocated */
-{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- xfs_buf_t *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtword_t *first; /* first used word in the buffer */
- int i; /* current bit number rel. to start */
- int lastbit; /* last useful bit in word */
- xfs_rtword_t mask; /* mask o frelevant bits for value */
- int word; /* word number in the buffer */
-
- /*
- * Compute starting bitmap block number.
- */
- block = XFS_BITTOBLOCK(mp, start);
- /*
- * Read the bitmap block, and point to its data.
- */
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
- return error;
- }
- bufp = bp->b_addr;
- /*
- * Compute the starting word's address, and starting bit.
- */
- word = XFS_BITTOWORD(mp, start);
- first = b = &bufp[word];
- bit = (int)(start & (XFS_NBWORD - 1));
- /*
- * 0 (allocated) => all zeroes; 1 (free) => all ones.
- */
- val = -val;
- /*
- * If not starting on a word boundary, deal with the first
- * (partial) word.
- */
- if (bit) {
- /*
- * Compute first bit not changed and mask of relevant bits.
- */
- lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
- mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
- /*
- * Set/clear the active bits.
- */
- if (val)
- *b |= mask;
- else
- *b &= ~mask;
- i = lastbit - bit;
- /*
- * Go on to the next block if that's where the next word is
- * and we need the next word.
- */
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
- /*
- * Log the changed part of this block.
- * Get the next one.
- */
- xfs_trans_log_buf(tp, bp,
- (uint)((char *)first - (char *)bufp),
- (uint)((char *)b - (char *)bufp));
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
- return error;
- }
- first = b = bufp = bp->b_addr;
- word = 0;
- } else {
- /*
- * Go on to the next word in the buffer
- */
- b++;
- }
- } else {
- /*
- * Starting on a word boundary, no partial word.
- */
- i = 0;
- }
- /*
- * Loop over whole words in buffers. When we use up one buffer
- * we move on to the next one.
- */
- while (len - i >= XFS_NBWORD) {
- /*
- * Set the word value correctly.
- */
- *b = val;
- i += XFS_NBWORD;
- /*
- * Go on to the next block if that's where the next word is
- * and we need the next word.
- */
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
- /*
- * Log the changed part of this block.
- * Get the next one.
- */
- xfs_trans_log_buf(tp, bp,
- (uint)((char *)first - (char *)bufp),
- (uint)((char *)b - (char *)bufp));
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
- return error;
- }
- first = b = bufp = bp->b_addr;
- word = 0;
- } else {
- /*
- * Go on to the next word in the buffer
- */
- b++;
- }
- }
- /*
- * If not ending on a word boundary, deal with the last
- * (partial) word.
- */
- if ((lastbit = len - i)) {
- /*
- * Compute a mask of relevant bits.
- */
- bit = 0;
- mask = ((xfs_rtword_t)1 << lastbit) - 1;
- /*
- * Set/clear the active bits.
- */
- if (val)
- *b |= mask;
- else
- *b &= ~mask;
- b++;
- }
- /*
- * Log any remaining changed bytes.
- */
- if (b > first)
- xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
- (uint)((char *)b - (char *)bufp - 1));
- return 0;
-}
-
-/*
- * Read and modify the summary information for a given extent size,
- * bitmap block combination.
- * Keeps track of a current summary block, so we don't keep reading
- * it from the buffer cache.
- */
-STATIC int /* error */
-xfs_rtmodify_summary(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- int log, /* log2 of extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- int delta, /* change to make to summary info */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
-{
- xfs_buf_t *bp; /* buffer for the summary block */
- int error; /* error value */
- xfs_fsblock_t sb; /* summary fsblock */
- int so; /* index into the summary file */
- xfs_suminfo_t *sp; /* pointer to returned data */
-
- /*
- * Compute entry number in the summary file.
- */
- so = XFS_SUMOFFS(mp, log, bbno);
- /*
- * Compute the block number in the summary file.
- */
- sb = XFS_SUMOFFSTOBLOCK(mp, so);
- /*
- * If we have an old buffer, and the block number matches, use that.
- */
- if (rbpp && *rbpp && *rsb == sb)
- bp = *rbpp;
- /*
- * Otherwise we have to get the buffer.
- */
- else {
- /*
- * If there was an old one, get rid of it first.
- */
- if (rbpp && *rbpp)
- xfs_trans_brelse(tp, *rbpp);
- error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
- if (error) {
- return error;
- }
- /*
- * Remember this buffer and block for the next call.
- */
- if (rbpp) {
- *rbpp = bp;
- *rsb = sb;
- }
- }
- /*
- * Point to the summary information, modify and log it.
- */
- sp = XFS_SUMPTR(mp, bp, so);
- *sp += delta;
- xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
- (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
- return 0;
-}
-
-/*
* Visible (exported) functions.
*/
@@ -1958,8 +1059,9 @@ xfs_growfs_rt(
* Start a transaction, get the log reservation.
*/
tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
- if ((error = xfs_trans_reserve(tp, 0,
- XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0)))
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree,
+ 0, 0);
+ if (error)
goto error_cancel;
/*
* Lock out other callers by grabbing the bitmap inode lock.
@@ -2129,66 +1231,6 @@ xfs_rtallocate_extent(
}
/*
- * Free an extent in the realtime subvolume. Length is expressed in
- * realtime extents, as is the block number.
- */
-int /* error */
-xfs_rtfree_extent(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to free */
- xfs_extlen_t len) /* length of extent freed */
-{
- int error; /* error value */
- xfs_mount_t *mp; /* file system mount structure */
- xfs_fsblock_t sb; /* summary file block number */
- xfs_buf_t *sumbp; /* summary file block buffer */
-
- mp = tp->t_mountp;
-
- ASSERT(mp->m_rbmip->i_itemp != NULL);
- ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
-
-#if defined(__KERNEL__) && defined(DEBUG)
- /*
- * Check to see that this whole range is currently allocated.
- */
- {
- int stat; /* result from checking range */
-
- error = xfs_rtcheck_alloc_range(mp, tp, bno, len, &stat);
- if (error) {
- return error;
- }
- ASSERT(stat);
- }
-#endif
- sumbp = NULL;
- /*
- * Free the range of realtime blocks.
- */
- error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
- if (error) {
- return error;
- }
- /*
- * Mark more blocks free in the superblock.
- */
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
- /*
- * If we've now freed all the blocks, reset the file sequence
- * number to 0.
- */
- if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
- mp->m_sb.sb_rextents) {
- if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
- mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
- *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
- xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
- }
- return 0;
-}
-
-/*
* Initialize realtime fields in the mount structure.
*/
int /* error */
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index f7f3a359c1c..752b63d1030 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -18,58 +18,11 @@
#ifndef __XFS_RTALLOC_H__
#define __XFS_RTALLOC_H__
+/* kernel only definitions and functions */
+
struct xfs_mount;
struct xfs_trans;
-/* Min and max rt extent sizes, specified in bytes */
-#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */
-#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */
-#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
-
-/*
- * Constants for bit manipulations.
- */
-#define XFS_NBBYLOG 3 /* log2(NBBY) */
-#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */
-#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG)
-#define XFS_NBWORD (1 << XFS_NBWORDLOG)
-#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1)
-
-#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
-#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
-#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize)
-#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask)
-
-/*
- * Summary and bit manipulation macros.
- */
-#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
-#define XFS_SUMOFFSTOBLOCK(mp,s) \
- (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
-#define XFS_SUMPTR(mp,bp,so) \
- ((xfs_suminfo_t *)((bp)->b_addr + \
- (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
-
-#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log)
-#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log)
-#define XFS_BITTOWORD(mp,bi) \
- ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
-
-#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
-#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
-
-#define XFS_RTLOBIT(w) xfs_lowbit32(w)
-#define XFS_RTHIBIT(w) xfs_highbit32(w)
-
-#if XFS_BIG_BLKNOS
-#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
-#else
-#define XFS_RTBLOCKLOG(b) xfs_highbit32(b)
-#endif
-
-
-#ifdef __KERNEL__
-
#ifdef CONFIG_XFS_RT
/*
* Function prototypes for exported functions.
@@ -142,6 +95,30 @@ xfs_growfs_rt(
struct xfs_mount *mp, /* file system mount structure */
xfs_growfs_rt_t *in); /* user supplied growfs struct */
+/*
+ * From xfs_rtbitmap.c
+ */
+int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
+int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_extlen_t len, int val,
+ xfs_rtblock_t *new, int *stat);
+int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_rtblock_t limit,
+ xfs_rtblock_t *rtblock);
+int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_rtblock_t limit,
+ xfs_rtblock_t *rtblock);
+int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_extlen_t len, int val);
+int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
+ xfs_rtblock_t bbno, int delta, xfs_buf_t **rbpp,
+ xfs_fsblock_t *rsb);
+int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_extlen_t len,
+ struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
+
+
#else
# define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb) (ENOSYS)
# define xfs_rtfree_extent(t,b,l) (ENOSYS)
@@ -161,6 +138,4 @@ xfs_rtmount_init(
# define xfs_rtunmount_inodes(m)
#endif /* CONFIG_XFS_RT */
-#endif /* __KERNEL__ */
-
#endif /* __XFS_RTALLOC_H__ */
diff --git a/fs/xfs/xfs_rtbitmap.c b/fs/xfs/xfs_rtbitmap.c
new file mode 100644
index 00000000000..f4dd697cac0
--- /dev/null
+++ b/fs/xfs/xfs_rtbitmap.c
@@ -0,0 +1,973 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_buf.h"
+#include "xfs_icache.h"
+#include "xfs_dinode.h"
+#include "xfs_rtalloc.h"
+
+
+/*
+ * Realtime allocator bitmap functions shared with userspace.
+ */
+
+/*
+ * Get a buffer for the bitmap or summary file block specified.
+ * The buffer is returned read and locked.
+ */
+int
+xfs_rtbuf_get(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t block, /* block number in bitmap or summary */
+ int issum, /* is summary not bitmap */
+ xfs_buf_t **bpp) /* output: buffer for the block */
+{
+ xfs_buf_t *bp; /* block buffer, result */
+ xfs_inode_t *ip; /* bitmap or summary inode */
+ xfs_bmbt_irec_t map;
+ int nmap = 1;
+ int error; /* error value */
+
+ ip = issum ? mp->m_rsumip : mp->m_rbmip;
+
+ error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ ASSERT(map.br_startblock != NULLFSBLOCK);
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, map.br_startblock),
+ mp->m_bsize, 0, &bp, NULL);
+ if (error)
+ return error;
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Searching backward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+int
+xfs_rtfind_back(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to look at */
+ xfs_rtblock_t limit, /* last block to look at */
+ xfs_rtblock_t *rtblock) /* out: start block found */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ xfs_buf_t *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtblock_t firstbit; /* first useful bit in the word */
+ xfs_rtblock_t i; /* current bit number rel. to start */
+ xfs_rtblock_t len; /* length of inspected area */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t want; /* mask for "good" values */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute and read in starting bitmap block for starting block.
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ /*
+ * Get the first word's index & point to it.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ len = start - limit + 1;
+ /*
+ * Compute match value, based on the bit at start: if 1 (free)
+ * then all-ones, else all-zeroes.
+ */
+ want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+ /*
+ * If the starting position is not word-aligned, deal with the
+ * partial word.
+ */
+ if (bit < XFS_NBWORD - 1) {
+ /*
+ * Calculate first (leftmost) bit number to look at,
+ * and mask for all the relevant bits in this word.
+ */
+ firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
+ mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
+ firstbit;
+ /*
+ * Calculate the difference between the value there
+ * and what we're looking for.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different. Mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i = bit - XFS_RTHIBIT(wdiff);
+ *rtblock = start - i + 1;
+ return 0;
+ }
+ i = bit - firstbit + 1;
+ /*
+ * Go on to previous block if that's where the previous word is
+ * and we need the previous word.
+ */
+ if (--word == -1 && i < len) {
+ /*
+ * If done with this block, get the previous one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ word = XFS_BLOCKWMASK(mp);
+ b = &bufp[word];
+ } else {
+ /*
+ * Go on to the previous word in the buffer.
+ */
+ b--;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the previous one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = *b ^ want)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+ *rtblock = start - i + 1;
+ return 0;
+ }
+ i += XFS_NBWORD;
+ /*
+ * Go on to previous block if that's where the previous word is
+ * and we need the previous word.
+ */
+ if (--word == -1 && i < len) {
+ /*
+ * If done with this block, get the previous one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ word = XFS_BLOCKWMASK(mp);
+ b = &bufp[word];
+ } else {
+ /*
+ * Go on to the previous word in the buffer.
+ */
+ b--;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if (len - i) {
+ /*
+ * Calculate first (leftmost) bit number to look at,
+ * and mask for all the relevant bits in this word.
+ */
+ firstbit = XFS_NBWORD - (len - i);
+ mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+ *rtblock = start - i + 1;
+ return 0;
+ } else
+ i = len;
+ }
+ /*
+ * No match, return that we scanned the whole area.
+ */
+ xfs_trans_brelse(tp, bp);
+ *rtblock = start - i + 1;
+ return 0;
+}
+
+/*
+ * Searching forward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+int
+xfs_rtfind_forw(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to look at */
+ xfs_rtblock_t limit, /* last block to look at */
+ xfs_rtblock_t *rtblock) /* out: start block found */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ xfs_buf_t *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtblock_t i; /* current bit number rel. to start */
+ xfs_rtblock_t lastbit; /* last useful bit in the word */
+ xfs_rtblock_t len; /* length of inspected area */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t want; /* mask for "good" values */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute and read in starting bitmap block for starting block.
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ /*
+ * Get the first word's index & point to it.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ len = limit - start + 1;
+ /*
+ * Compute match value, based on the bit at start: if 1 (free)
+ * then all-ones, else all-zeroes.
+ */
+ want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+ /*
+ * If the starting position is not word-aligned, deal with the
+ * partial word.
+ */
+ if (bit) {
+ /*
+ * Calculate last (rightmost) bit number to look at,
+ * and mask for all the relevant bits in this word.
+ */
+ lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+ /*
+ * Calculate the difference between the value there
+ * and what we're looking for.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different. Mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i = XFS_RTLOBIT(wdiff) - bit;
+ *rtblock = start + i - 1;
+ return 0;
+ }
+ i = lastbit - bit;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the previous one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the previous word in the buffer.
+ */
+ b++;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the next one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = *b ^ want)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *rtblock = start + i - 1;
+ return 0;
+ }
+ i += XFS_NBWORD;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the next one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer.
+ */
+ b++;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if ((lastbit = len - i)) {
+ /*
+ * Calculate mask for all the relevant bits in this word.
+ */
+ mask = ((xfs_rtword_t)1 << lastbit) - 1;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *rtblock = start + i - 1;
+ return 0;
+ } else
+ i = len;
+ }
+ /*
+ * No match, return that we scanned the whole area.
+ */
+ xfs_trans_brelse(tp, bp);
+ *rtblock = start + i - 1;
+ return 0;
+}
+
+/*
+ * Read and modify the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
+ */
+int
+xfs_rtmodify_summary(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ int log, /* log2 of extent size */
+ xfs_rtblock_t bbno, /* bitmap block number */
+ int delta, /* change to make to summary info */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb) /* in/out: summary block number */
+{
+ xfs_buf_t *bp; /* buffer for the summary block */
+ int error; /* error value */
+ xfs_fsblock_t sb; /* summary fsblock */
+ int so; /* index into the summary file */
+ xfs_suminfo_t *sp; /* pointer to returned data */
+
+ /*
+ * Compute entry number in the summary file.
+ */
+ so = XFS_SUMOFFS(mp, log, bbno);
+ /*
+ * Compute the block number in the summary file.
+ */
+ sb = XFS_SUMOFFSTOBLOCK(mp, so);
+ /*
+ * If we have an old buffer, and the block number matches, use that.
+ */
+ if (rbpp && *rbpp && *rsb == sb)
+ bp = *rbpp;
+ /*
+ * Otherwise we have to get the buffer.
+ */
+ else {
+ /*
+ * If there was an old one, get rid of it first.
+ */
+ if (rbpp && *rbpp)
+ xfs_trans_brelse(tp, *rbpp);
+ error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
+ if (error) {
+ return error;
+ }
+ /*
+ * Remember this buffer and block for the next call.
+ */
+ if (rbpp) {
+ *rbpp = bp;
+ *rsb = sb;
+ }
+ }
+ /*
+ * Point to the summary information, modify and log it.
+ */
+ sp = XFS_SUMPTR(mp, bp, so);
+ *sp += delta;
+ xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
+ (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
+ return 0;
+}
+
+/*
+ * Set the given range of bitmap bits to the given value.
+ * Do whatever I/O and logging is required.
+ */
+int
+xfs_rtmodify_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to modify */
+ xfs_extlen_t len, /* length of extent to modify */
+ int val) /* 1 for free, 0 for allocated */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ xfs_buf_t *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtword_t *first; /* first used word in the buffer */
+ int i; /* current bit number rel. to start */
+ int lastbit; /* last useful bit in word */
+ xfs_rtword_t mask; /* mask o frelevant bits for value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute starting bitmap block number.
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ /*
+ * Read the bitmap block, and point to its data.
+ */
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ /*
+ * Compute the starting word's address, and starting bit.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ first = b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ /*
+ * 0 (allocated) => all zeroes; 1 (free) => all ones.
+ */
+ val = -val;
+ /*
+ * If not starting on a word boundary, deal with the first
+ * (partial) word.
+ */
+ if (bit) {
+ /*
+ * Compute first bit not changed and mask of relevant bits.
+ */
+ lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+ /*
+ * Set/clear the active bits.
+ */
+ if (val)
+ *b |= mask;
+ else
+ *b &= ~mask;
+ i = lastbit - bit;
+ /*
+ * Go on to the next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * Log the changed part of this block.
+ * Get the next one.
+ */
+ xfs_trans_log_buf(tp, bp,
+ (uint)((char *)first - (char *)bufp),
+ (uint)((char *)b - (char *)bufp));
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ first = b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer
+ */
+ b++;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the next one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Set the word value correctly.
+ */
+ *b = val;
+ i += XFS_NBWORD;
+ /*
+ * Go on to the next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * Log the changed part of this block.
+ * Get the next one.
+ */
+ xfs_trans_log_buf(tp, bp,
+ (uint)((char *)first - (char *)bufp),
+ (uint)((char *)b - (char *)bufp));
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ first = b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer
+ */
+ b++;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if ((lastbit = len - i)) {
+ /*
+ * Compute a mask of relevant bits.
+ */
+ bit = 0;
+ mask = ((xfs_rtword_t)1 << lastbit) - 1;
+ /*
+ * Set/clear the active bits.
+ */
+ if (val)
+ *b |= mask;
+ else
+ *b &= ~mask;
+ b++;
+ }
+ /*
+ * Log any remaining changed bytes.
+ */
+ if (b > first)
+ xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
+ (uint)((char *)b - (char *)bufp - 1));
+ return 0;
+}
+
+/*
+ * Mark an extent specified by start and len freed.
+ * Updates all the summary information as well as the bitmap.
+ */
+int
+xfs_rtfree_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to free */
+ xfs_extlen_t len, /* length to free */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb) /* in/out: summary block number */
+{
+ xfs_rtblock_t end; /* end of the freed extent */
+ int error; /* error value */
+ xfs_rtblock_t postblock; /* first block freed > end */
+ xfs_rtblock_t preblock; /* first block freed < start */
+
+ end = start + len - 1;
+ /*
+ * Modify the bitmap to mark this extent freed.
+ */
+ error = xfs_rtmodify_range(mp, tp, start, len, 1);
+ if (error) {
+ return error;
+ }
+ /*
+ * Assume we're freeing out of the middle of an allocated extent.
+ * We need to find the beginning and end of the extent so we can
+ * properly update the summary.
+ */
+ error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+ if (error) {
+ return error;
+ }
+ /*
+ * Find the next allocated block (end of allocated extent).
+ */
+ error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+ &postblock);
+ if (error)
+ return error;
+ /*
+ * If there are blocks not being freed at the front of the
+ * old extent, add summary data for them to be allocated.
+ */
+ if (preblock < start) {
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(start - preblock),
+ XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ }
+ /*
+ * If there are blocks not being freed at the end of the
+ * old extent, add summary data for them to be allocated.
+ */
+ if (postblock > end) {
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(postblock - end),
+ XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ }
+ /*
+ * Increment the summary information corresponding to the entire
+ * (new) free extent.
+ */
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+ return error;
+}
+
+/*
+ * Check that the given range is either all allocated (val = 0) or
+ * all free (val = 1).
+ */
+int
+xfs_rtcheck_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block number of extent */
+ xfs_extlen_t len, /* length of extent */
+ int val, /* 1 for free, 0 for allocated */
+ xfs_rtblock_t *new, /* out: first block not matching */
+ int *stat) /* out: 1 for matches, 0 for not */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ xfs_buf_t *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtblock_t i; /* current bit number rel. to start */
+ xfs_rtblock_t lastbit; /* last useful bit in word */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute starting bitmap block number
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ /*
+ * Read the bitmap block.
+ */
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ /*
+ * Compute the starting word's address, and starting bit.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ /*
+ * 0 (allocated) => all zero's; 1 (free) => all one's.
+ */
+ val = -val;
+ /*
+ * If not starting on a word boundary, deal with the first
+ * (partial) word.
+ */
+ if (bit) {
+ /*
+ * Compute first bit not examined.
+ */
+ lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ /*
+ * Mask of relevant bits.
+ */
+ mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ val) & mask)) {
+ /*
+ * Different, compute first wrong bit and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i = XFS_RTLOBIT(wdiff) - bit;
+ *new = start + i;
+ *stat = 0;
+ return 0;
+ }
+ i = lastbit - bit;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the next one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer.
+ */
+ b++;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the next one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = *b ^ val)) {
+ /*
+ * Different, compute first wrong bit and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *new = start + i;
+ *stat = 0;
+ return 0;
+ }
+ i += XFS_NBWORD;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the next one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer.
+ */
+ b++;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if ((lastbit = len - i)) {
+ /*
+ * Mask of relevant bits.
+ */
+ mask = ((xfs_rtword_t)1 << lastbit) - 1;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ val) & mask)) {
+ /*
+ * Different, compute first wrong bit and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *new = start + i;
+ *stat = 0;
+ return 0;
+ } else
+ i = len;
+ }
+ /*
+ * Successful, return.
+ */
+ xfs_trans_brelse(tp, bp);
+ *new = start + i;
+ *stat = 1;
+ return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that the given extent (block range) is allocated already.
+ */
+STATIC int /* error */
+xfs_rtcheck_alloc_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number of extent */
+ xfs_extlen_t len) /* length of extent */
+{
+ xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */
+ int stat;
+ int error;
+
+ error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat);
+ if (error)
+ return error;
+ ASSERT(stat);
+ return 0;
+}
+#else
+#define xfs_rtcheck_alloc_range(m,t,b,l) (0)
+#endif
+/*
+ * Free an extent in the realtime subvolume. Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int /* error */
+xfs_rtfree_extent(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number to free */
+ xfs_extlen_t len) /* length of extent freed */
+{
+ int error; /* error value */
+ xfs_mount_t *mp; /* file system mount structure */
+ xfs_fsblock_t sb; /* summary file block number */
+ xfs_buf_t *sumbp = NULL; /* summary file block buffer */
+
+ mp = tp->t_mountp;
+
+ ASSERT(mp->m_rbmip->i_itemp != NULL);
+ ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+
+ error = xfs_rtcheck_alloc_range(mp, tp, bno, len);
+ if (error)
+ return error;
+
+ /*
+ * Free the range of realtime blocks.
+ */
+ error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
+ if (error) {
+ return error;
+ }
+ /*
+ * Mark more blocks free in the superblock.
+ */
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
+ /*
+ * If we've now freed all the blocks, reset the file sequence
+ * number to 0.
+ */
+ if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
+ mp->m_sb.sb_rextents) {
+ if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
+ mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+ *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
+ xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+ }
+ return 0;
+}
+
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
new file mode 100644
index 00000000000..7703fa6770f
--- /dev/null
+++ b/fs/xfs/xfs_sb.c
@@ -0,0 +1,836 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_dinode.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+
+/*
+ * Physical superblock buffer manipulations. Shared with libxfs in userspace.
+ */
+
+static const struct {
+ short offset;
+ short type; /* 0 = integer
+ * 1 = binary / string (no translation)
+ */
+} xfs_sb_info[] = {
+ { offsetof(xfs_sb_t, sb_magicnum), 0 },
+ { offsetof(xfs_sb_t, sb_blocksize), 0 },
+ { offsetof(xfs_sb_t, sb_dblocks), 0 },
+ { offsetof(xfs_sb_t, sb_rblocks), 0 },
+ { offsetof(xfs_sb_t, sb_rextents), 0 },
+ { offsetof(xfs_sb_t, sb_uuid), 1 },
+ { offsetof(xfs_sb_t, sb_logstart), 0 },
+ { offsetof(xfs_sb_t, sb_rootino), 0 },
+ { offsetof(xfs_sb_t, sb_rbmino), 0 },
+ { offsetof(xfs_sb_t, sb_rsumino), 0 },
+ { offsetof(xfs_sb_t, sb_rextsize), 0 },
+ { offsetof(xfs_sb_t, sb_agblocks), 0 },
+ { offsetof(xfs_sb_t, sb_agcount), 0 },
+ { offsetof(xfs_sb_t, sb_rbmblocks), 0 },
+ { offsetof(xfs_sb_t, sb_logblocks), 0 },
+ { offsetof(xfs_sb_t, sb_versionnum), 0 },
+ { offsetof(xfs_sb_t, sb_sectsize), 0 },
+ { offsetof(xfs_sb_t, sb_inodesize), 0 },
+ { offsetof(xfs_sb_t, sb_inopblock), 0 },
+ { offsetof(xfs_sb_t, sb_fname[0]), 1 },
+ { offsetof(xfs_sb_t, sb_blocklog), 0 },
+ { offsetof(xfs_sb_t, sb_sectlog), 0 },
+ { offsetof(xfs_sb_t, sb_inodelog), 0 },
+ { offsetof(xfs_sb_t, sb_inopblog), 0 },
+ { offsetof(xfs_sb_t, sb_agblklog), 0 },
+ { offsetof(xfs_sb_t, sb_rextslog), 0 },
+ { offsetof(xfs_sb_t, sb_inprogress), 0 },
+ { offsetof(xfs_sb_t, sb_imax_pct), 0 },
+ { offsetof(xfs_sb_t, sb_icount), 0 },
+ { offsetof(xfs_sb_t, sb_ifree), 0 },
+ { offsetof(xfs_sb_t, sb_fdblocks), 0 },
+ { offsetof(xfs_sb_t, sb_frextents), 0 },
+ { offsetof(xfs_sb_t, sb_uquotino), 0 },
+ { offsetof(xfs_sb_t, sb_gquotino), 0 },
+ { offsetof(xfs_sb_t, sb_qflags), 0 },
+ { offsetof(xfs_sb_t, sb_flags), 0 },
+ { offsetof(xfs_sb_t, sb_shared_vn), 0 },
+ { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
+ { offsetof(xfs_sb_t, sb_unit), 0 },
+ { offsetof(xfs_sb_t, sb_width), 0 },
+ { offsetof(xfs_sb_t, sb_dirblklog), 0 },
+ { offsetof(xfs_sb_t, sb_logsectlog), 0 },
+ { offsetof(xfs_sb_t, sb_logsectsize), 0 },
+ { offsetof(xfs_sb_t, sb_logsunit), 0 },
+ { offsetof(xfs_sb_t, sb_features2), 0 },
+ { offsetof(xfs_sb_t, sb_bad_features2), 0 },
+ { offsetof(xfs_sb_t, sb_features_compat), 0 },
+ { offsetof(xfs_sb_t, sb_features_ro_compat), 0 },
+ { offsetof(xfs_sb_t, sb_features_incompat), 0 },
+ { offsetof(xfs_sb_t, sb_features_log_incompat), 0 },
+ { offsetof(xfs_sb_t, sb_crc), 0 },
+ { offsetof(xfs_sb_t, sb_pad), 0 },
+ { offsetof(xfs_sb_t, sb_pquotino), 0 },
+ { offsetof(xfs_sb_t, sb_lsn), 0 },
+ { sizeof(xfs_sb_t), 0 }
+};
+
+/*
+ * Reference counting access wrappers to the perag structures.
+ * Because we never free per-ag structures, the only thing we
+ * have to protect against changes is the tree structure itself.
+ */
+struct xfs_perag *
+xfs_perag_get(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+ int ref = 0;
+
+ rcu_read_lock();
+ pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+ if (pag) {
+ ASSERT(atomic_read(&pag->pag_ref) >= 0);
+ ref = atomic_inc_return(&pag->pag_ref);
+ }
+ rcu_read_unlock();
+ trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
+ return pag;
+}
+
+/*
+ * search from @first to find the next perag with the given tag set.
+ */
+struct xfs_perag *
+xfs_perag_get_tag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t first,
+ int tag)
+{
+ struct xfs_perag *pag;
+ int found;
+ int ref;
+
+ rcu_read_lock();
+ found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+ (void **)&pag, first, 1, tag);
+ if (found <= 0) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ ref = atomic_inc_return(&pag->pag_ref);
+ rcu_read_unlock();
+ trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
+ return pag;
+}
+
+void
+xfs_perag_put(
+ struct xfs_perag *pag)
+{
+ int ref;
+
+ ASSERT(atomic_read(&pag->pag_ref) > 0);
+ ref = atomic_dec_return(&pag->pag_ref);
+ trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
+}
+
+/*
+ * Check the validity of the SB found.
+ */
+STATIC int
+xfs_mount_validate_sb(
+ xfs_mount_t *mp,
+ xfs_sb_t *sbp,
+ bool check_inprogress,
+ bool check_version)
+{
+
+ /*
+ * If the log device and data device have the
+ * same device number, the log is internal.
+ * Consequently, the sb_logstart should be non-zero. If
+ * we have a zero sb_logstart in this case, we may be trying to mount
+ * a volume filesystem in a non-volume manner.
+ */
+ if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+ xfs_warn(mp, "bad magic number");
+ return XFS_ERROR(EWRONGFS);
+ }
+
+
+ if (!xfs_sb_good_version(sbp)) {
+ xfs_warn(mp, "bad version");
+ return XFS_ERROR(EWRONGFS);
+ }
+
+ /*
+ * Version 5 superblock feature mask validation. Reject combinations the
+ * kernel cannot support up front before checking anything else. For
+ * write validation, we don't need to check feature masks.
+ */
+ if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
+ if (xfs_sb_has_compat_feature(sbp,
+ XFS_SB_FEAT_COMPAT_UNKNOWN)) {
+ xfs_warn(mp,
+"Superblock has unknown compatible features (0x%x) enabled.\n"
+"Using a more recent kernel is recommended.",
+ (sbp->sb_features_compat &
+ XFS_SB_FEAT_COMPAT_UNKNOWN));
+ }
+
+ if (xfs_sb_has_ro_compat_feature(sbp,
+ XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+ xfs_alert(mp,
+"Superblock has unknown read-only compatible features (0x%x) enabled.",
+ (sbp->sb_features_ro_compat &
+ XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+ if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ xfs_warn(mp,
+"Attempted to mount read-only compatible filesystem read-write.\n"
+"Filesystem can only be safely mounted read only.");
+ return XFS_ERROR(EINVAL);
+ }
+ }
+ if (xfs_sb_has_incompat_feature(sbp,
+ XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
+ xfs_warn(mp,
+"Superblock has unknown incompatible features (0x%x) enabled.\n"
+"Filesystem can not be safely mounted by this kernel.",
+ (sbp->sb_features_incompat &
+ XFS_SB_FEAT_INCOMPAT_UNKNOWN));
+ return XFS_ERROR(EINVAL);
+ }
+ }
+
+ if (xfs_sb_version_has_pquotino(sbp)) {
+ if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
+ xfs_notice(mp,
+ "Version 5 of Super block has XFS_OQUOTA bits.");
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
+ XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
+ xfs_notice(mp,
+"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.");
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ if (unlikely(
+ sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
+ xfs_warn(mp,
+ "filesystem is marked as having an external log; "
+ "specify logdev on the mount command line.");
+ return XFS_ERROR(EINVAL);
+ }
+
+ if (unlikely(
+ sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
+ xfs_warn(mp,
+ "filesystem is marked as having an internal log; "
+ "do not specify logdev on the mount command line.");
+ return XFS_ERROR(EINVAL);
+ }
+
+ /*
+ * More sanity checking. Most of these were stolen directly from
+ * xfs_repair.
+ */
+ if (unlikely(
+ sbp->sb_agcount <= 0 ||
+ sbp->sb_sectsize < XFS_MIN_SECTORSIZE ||
+ sbp->sb_sectsize > XFS_MAX_SECTORSIZE ||
+ sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG ||
+ sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG ||
+ sbp->sb_sectsize != (1 << sbp->sb_sectlog) ||
+ sbp->sb_blocksize < XFS_MIN_BLOCKSIZE ||
+ sbp->sb_blocksize > XFS_MAX_BLOCKSIZE ||
+ sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
+ sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
+ sbp->sb_blocksize != (1 << sbp->sb_blocklog) ||
+ sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
+ sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
+ sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
+ sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
+ sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
+ sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) ||
+ (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
+ (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
+ (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
+ (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) ||
+ sbp->sb_dblocks == 0 ||
+ sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
+ sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) ||
+ sbp->sb_shared_vn != 0)) {
+ xfs_notice(mp, "SB sanity check failed");
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ /*
+ * Until this is fixed only page-sized or smaller data blocks work.
+ */
+ if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
+ xfs_warn(mp,
+ "File system with blocksize %d bytes. "
+ "Only pagesize (%ld) or less will currently work.",
+ sbp->sb_blocksize, PAGE_SIZE);
+ return XFS_ERROR(ENOSYS);
+ }
+
+ /*
+ * Currently only very few inode sizes are supported.
+ */
+ switch (sbp->sb_inodesize) {
+ case 256:
+ case 512:
+ case 1024:
+ case 2048:
+ break;
+ default:
+ xfs_warn(mp, "inode size of %d bytes not supported",
+ sbp->sb_inodesize);
+ return XFS_ERROR(ENOSYS);
+ }
+
+ if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
+ xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
+ xfs_warn(mp,
+ "file system too large to be mounted on this system.");
+ return XFS_ERROR(EFBIG);
+ }
+
+ if (check_inprogress && sbp->sb_inprogress) {
+ xfs_warn(mp, "Offline file system operation in progress!");
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ return 0;
+}
+
+void
+xfs_sb_quota_from_disk(struct xfs_sb *sbp)
+{
+ /*
+ * older mkfs doesn't initialize quota inodes to NULLFSINO. This
+ * leads to in-core values having two different values for a quota
+ * inode to be invalid: 0 and NULLFSINO. Change it to a single value
+ * NULLFSINO.
+ *
+ * Note that this change affect only the in-core values. These
+ * values are not written back to disk unless any quota information
+ * is written to the disk. Even in that case, sb_pquotino field is
+ * not written to disk unless the superblock supports pquotino.
+ */
+ if (sbp->sb_uquotino == 0)
+ sbp->sb_uquotino = NULLFSINO;
+ if (sbp->sb_gquotino == 0)
+ sbp->sb_gquotino = NULLFSINO;
+ if (sbp->sb_pquotino == 0)
+ sbp->sb_pquotino = NULLFSINO;
+
+ /*
+ * We need to do these manipilations only if we are working
+ * with an older version of on-disk superblock.
+ */
+ if (xfs_sb_version_has_pquotino(sbp))
+ return;
+
+ if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
+ sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+ XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
+ if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
+ sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+ XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
+ sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
+
+ if (sbp->sb_qflags & XFS_PQUOTA_ACCT) {
+ /*
+ * In older version of superblock, on-disk superblock only
+ * has sb_gquotino, and in-core superblock has both sb_gquotino
+ * and sb_pquotino. But, only one of them is supported at any
+ * point of time. So, if PQUOTA is set in disk superblock,
+ * copy over sb_gquotino to sb_pquotino.
+ */
+ sbp->sb_pquotino = sbp->sb_gquotino;
+ sbp->sb_gquotino = NULLFSINO;
+ }
+}
+
+void
+xfs_sb_from_disk(
+ struct xfs_sb *to,
+ xfs_dsb_t *from)
+{
+ to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
+ to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
+ to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
+ to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
+ to->sb_rextents = be64_to_cpu(from->sb_rextents);
+ memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
+ to->sb_logstart = be64_to_cpu(from->sb_logstart);
+ to->sb_rootino = be64_to_cpu(from->sb_rootino);
+ to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
+ to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
+ to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
+ to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
+ to->sb_agcount = be32_to_cpu(from->sb_agcount);
+ to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
+ to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
+ to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
+ to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
+ to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
+ to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
+ memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
+ to->sb_blocklog = from->sb_blocklog;
+ to->sb_sectlog = from->sb_sectlog;
+ to->sb_inodelog = from->sb_inodelog;
+ to->sb_inopblog = from->sb_inopblog;
+ to->sb_agblklog = from->sb_agblklog;
+ to->sb_rextslog = from->sb_rextslog;
+ to->sb_inprogress = from->sb_inprogress;
+ to->sb_imax_pct = from->sb_imax_pct;
+ to->sb_icount = be64_to_cpu(from->sb_icount);
+ to->sb_ifree = be64_to_cpu(from->sb_ifree);
+ to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
+ to->sb_frextents = be64_to_cpu(from->sb_frextents);
+ to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
+ to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
+ to->sb_qflags = be16_to_cpu(from->sb_qflags);
+ to->sb_flags = from->sb_flags;
+ to->sb_shared_vn = from->sb_shared_vn;
+ to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
+ to->sb_unit = be32_to_cpu(from->sb_unit);
+ to->sb_width = be32_to_cpu(from->sb_width);
+ to->sb_dirblklog = from->sb_dirblklog;
+ to->sb_logsectlog = from->sb_logsectlog;
+ to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
+ to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
+ to->sb_features2 = be32_to_cpu(from->sb_features2);
+ to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
+ to->sb_features_compat = be32_to_cpu(from->sb_features_compat);
+ to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat);
+ to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
+ to->sb_features_log_incompat =
+ be32_to_cpu(from->sb_features_log_incompat);
+ to->sb_pad = 0;
+ to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
+ to->sb_lsn = be64_to_cpu(from->sb_lsn);
+}
+
+static inline void
+xfs_sb_quota_to_disk(
+ xfs_dsb_t *to,
+ xfs_sb_t *from,
+ __int64_t *fields)
+{
+ __uint16_t qflags = from->sb_qflags;
+
+ /*
+ * We need to do these manipilations only if we are working
+ * with an older version of on-disk superblock.
+ */
+ if (xfs_sb_version_has_pquotino(from))
+ return;
+
+ if (*fields & XFS_SB_QFLAGS) {
+ /*
+ * The in-core version of sb_qflags do not have
+ * XFS_OQUOTA_* flags, whereas the on-disk version
+ * does. So, convert incore XFS_{PG}QUOTA_* flags
+ * to on-disk XFS_OQUOTA_* flags.
+ */
+ qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
+ XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
+
+ if (from->sb_qflags &
+ (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
+ qflags |= XFS_OQUOTA_ENFD;
+ if (from->sb_qflags &
+ (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
+ qflags |= XFS_OQUOTA_CHKD;
+ to->sb_qflags = cpu_to_be16(qflags);
+ *fields &= ~XFS_SB_QFLAGS;
+ }
+
+ /*
+ * GQUOTINO and PQUOTINO cannot be used together in versions of
+ * superblock that do not have pquotino. from->sb_flags tells us which
+ * quota is active and should be copied to disk. If neither are active,
+ * make sure we write NULLFSINO to the sb_gquotino field as a quota
+ * inode value of "0" is invalid when the XFS_SB_VERSION_QUOTA feature
+ * bit is set.
+ *
+ * Note that we don't need to handle the sb_uquotino or sb_pquotino here
+ * as they do not require any translation. Hence the main sb field loop
+ * will write them appropriately from the in-core superblock.
+ */
+ if ((*fields & XFS_SB_GQUOTINO) &&
+ (from->sb_qflags & XFS_GQUOTA_ACCT))
+ to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
+ else if ((*fields & XFS_SB_PQUOTINO) &&
+ (from->sb_qflags & XFS_PQUOTA_ACCT))
+ to->sb_gquotino = cpu_to_be64(from->sb_pquotino);
+ else {
+ /*
+ * We can't rely on just the fields being logged to tell us
+ * that it is safe to write NULLFSINO - we should only do that
+ * if quotas are not actually enabled. Hence only write
+ * NULLFSINO if both in-core quota inodes are NULL.
+ */
+ if (from->sb_gquotino == NULLFSINO &&
+ from->sb_pquotino == NULLFSINO)
+ to->sb_gquotino = cpu_to_be64(NULLFSINO);
+ }
+
+ *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO);
+}
+
+/*
+ * Copy in core superblock to ondisk one.
+ *
+ * The fields argument is mask of superblock fields to copy.
+ */
+void
+xfs_sb_to_disk(
+ xfs_dsb_t *to,
+ xfs_sb_t *from,
+ __int64_t fields)
+{
+ xfs_caddr_t to_ptr = (xfs_caddr_t)to;
+ xfs_caddr_t from_ptr = (xfs_caddr_t)from;
+ xfs_sb_field_t f;
+ int first;
+ int size;
+
+ ASSERT(fields);
+ if (!fields)
+ return;
+
+ xfs_sb_quota_to_disk(to, from, &fields);
+ while (fields) {
+ f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
+ first = xfs_sb_info[f].offset;
+ size = xfs_sb_info[f + 1].offset - first;
+
+ ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
+
+ if (size == 1 || xfs_sb_info[f].type == 1) {
+ memcpy(to_ptr + first, from_ptr + first, size);
+ } else {
+ switch (size) {
+ case 2:
+ *(__be16 *)(to_ptr + first) =
+ cpu_to_be16(*(__u16 *)(from_ptr + first));
+ break;
+ case 4:
+ *(__be32 *)(to_ptr + first) =
+ cpu_to_be32(*(__u32 *)(from_ptr + first));
+ break;
+ case 8:
+ *(__be64 *)(to_ptr + first) =
+ cpu_to_be64(*(__u64 *)(from_ptr + first));
+ break;
+ default:
+ ASSERT(0);
+ }
+ }
+
+ fields &= ~(1LL << f);
+ }
+}
+
+static int
+xfs_sb_verify(
+ struct xfs_buf *bp,
+ bool check_version)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_sb sb;
+
+ xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
+
+ /*
+ * Only check the in progress field for the primary superblock as
+ * mkfs.xfs doesn't clear it from secondary superblocks.
+ */
+ return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
+ check_version);
+}
+
+/*
+ * If the superblock has the CRC feature bit set or the CRC field is non-null,
+ * check that the CRC is valid. We check the CRC field is non-null because a
+ * single bit error could clear the feature bit and unused parts of the
+ * superblock are supposed to be zero. Hence a non-null crc field indicates that
+ * we've potentially lost a feature bit and we should check it anyway.
+ *
+ * However, past bugs (i.e. in growfs) left non-zeroed regions beyond the
+ * last field in V4 secondary superblocks. So for secondary superblocks,
+ * we are more forgiving, and ignore CRC failures if the primary doesn't
+ * indicate that the fs version is V5.
+ */
+static void
+xfs_sb_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
+ int error;
+
+ /*
+ * open code the version check to avoid needing to convert the entire
+ * superblock from disk order just to check the version number
+ */
+ if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) &&
+ (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) ==
+ XFS_SB_VERSION_5) ||
+ dsb->sb_crc != 0)) {
+
+ if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) {
+ /* Only fail bad secondaries on a known V5 filesystem */
+ if (bp->b_bn == XFS_SB_DADDR ||
+ xfs_sb_version_hascrc(&mp->m_sb)) {
+ error = EFSBADCRC;
+ goto out_error;
+ }
+ }
+ }
+ error = xfs_sb_verify(bp, true);
+
+out_error:
+ if (error) {
+ xfs_buf_ioerror(bp, error);
+ if (error == EFSCORRUPTED || error == EFSBADCRC)
+ xfs_verifier_error(bp);
+ }
+}
+
+/*
+ * We may be probed for a filesystem match, so we may not want to emit
+ * messages when the superblock buffer is not actually an XFS superblock.
+ * If we find an XFS superblock, then run a normal, noisy mount because we are
+ * really going to mount it and want to know about errors.
+ */
+static void
+xfs_sb_quiet_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
+
+ if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
+ /* XFS filesystem, verify noisily! */
+ xfs_sb_read_verify(bp);
+ return;
+ }
+ /* quietly fail */
+ xfs_buf_ioerror(bp, EWRONGFS);
+}
+
+static void
+xfs_sb_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+ int error;
+
+ error = xfs_sb_verify(bp, false);
+ if (error) {
+ xfs_buf_ioerror(bp, error);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_sb_buf_ops = {
+ .verify_read = xfs_sb_read_verify,
+ .verify_write = xfs_sb_write_verify,
+};
+
+const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
+ .verify_read = xfs_sb_quiet_read_verify,
+ .verify_write = xfs_sb_write_verify,
+};
+
+/*
+ * xfs_mount_common
+ *
+ * Mount initialization code establishing various mount
+ * fields from the superblock associated with the given
+ * mount structure
+ */
+void
+xfs_sb_mount_common(
+ struct xfs_mount *mp,
+ struct xfs_sb *sbp)
+{
+ mp->m_agfrotor = mp->m_agirotor = 0;
+ spin_lock_init(&mp->m_agirotor_lock);
+ mp->m_maxagi = mp->m_sb.sb_agcount;
+ mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
+ mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
+ mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
+ mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
+ mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
+ mp->m_blockmask = sbp->sb_blocksize - 1;
+ mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
+ mp->m_blockwmask = mp->m_blockwsize - 1;
+
+ mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
+ mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
+ mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
+ mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
+
+ mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
+ mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+ mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
+ mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
+
+ mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
+ mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
+ mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
+ mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
+
+ mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
+ mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
+ sbp->sb_inopblock);
+ mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
+}
+
+/*
+ * xfs_initialize_perag_data
+ *
+ * Read in each per-ag structure so we can count up the number of
+ * allocated inodes, free inodes and used filesystem blocks as this
+ * information is no longer persistent in the superblock. Once we have
+ * this information, write it into the in-core superblock structure.
+ */
+int
+xfs_initialize_perag_data(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agcount)
+{
+ xfs_agnumber_t index;
+ xfs_perag_t *pag;
+ xfs_sb_t *sbp = &mp->m_sb;
+ uint64_t ifree = 0;
+ uint64_t ialloc = 0;
+ uint64_t bfree = 0;
+ uint64_t bfreelst = 0;
+ uint64_t btree = 0;
+ int error;
+
+ for (index = 0; index < agcount; index++) {
+ /*
+ * read the agf, then the agi. This gets us
+ * all the information we need and populates the
+ * per-ag structures for us.
+ */
+ error = xfs_alloc_pagf_init(mp, NULL, index, 0);
+ if (error)
+ return error;
+
+ error = xfs_ialloc_pagi_init(mp, NULL, index);
+ if (error)
+ return error;
+ pag = xfs_perag_get(mp, index);
+ ifree += pag->pagi_freecount;
+ ialloc += pag->pagi_count;
+ bfree += pag->pagf_freeblks;
+ bfreelst += pag->pagf_flcount;
+ btree += pag->pagf_btreeblks;
+ xfs_perag_put(pag);
+ }
+ /*
+ * Overwrite incore superblock counters with just-read data
+ */
+ spin_lock(&mp->m_sb_lock);
+ sbp->sb_ifree = ifree;
+ sbp->sb_icount = ialloc;
+ sbp->sb_fdblocks = bfree + bfreelst + btree;
+ spin_unlock(&mp->m_sb_lock);
+
+ /* Fixup the per-cpu counters as well. */
+ xfs_icsb_reinit_counters(mp);
+
+ return 0;
+}
+
+/*
+ * xfs_mod_sb() can be used to copy arbitrary changes to the
+ * in-core superblock into the superblock buffer to be logged.
+ * It does not provide the higher level of locking that is
+ * needed to protect the in-core superblock from concurrent
+ * access.
+ */
+void
+xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
+{
+ xfs_buf_t *bp;
+ int first;
+ int last;
+ xfs_mount_t *mp;
+ xfs_sb_field_t f;
+
+ ASSERT(fields);
+ if (!fields)
+ return;
+ mp = tp->t_mountp;
+ bp = xfs_trans_getsb(tp, mp, 0);
+ first = sizeof(xfs_sb_t);
+ last = 0;
+
+ /* translate/copy */
+
+ xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
+
+ /* find modified range */
+ f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
+ ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+ last = xfs_sb_info[f + 1].offset - 1;
+
+ f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
+ ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+ first = xfs_sb_info[f].offset;
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
+ xfs_trans_log_buf(tp, bp, first, last);
+}
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index a05b45175fb..c43c2d609a2 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -26,16 +26,16 @@
struct xfs_buf;
struct xfs_mount;
+struct xfs_trans;
#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */
#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */
#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */
#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */
#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */
+#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */
#define XFS_SB_VERSION_NUMBITS 0x000f
#define XFS_SB_VERSION_ALLFBITS 0xfff0
-#define XFS_SB_VERSION_SASHFBITS 0xf000
-#define XFS_SB_VERSION_REALFBITS 0x0ff0
#define XFS_SB_VERSION_ATTRBIT 0x0010
#define XFS_SB_VERSION_NLINKBIT 0x0020
#define XFS_SB_VERSION_QUOTABIT 0x0040
@@ -48,24 +48,15 @@ struct xfs_mount;
#define XFS_SB_VERSION_DIRV2BIT 0x2000
#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */
#define XFS_SB_VERSION_MOREBITSBIT 0x8000
-#define XFS_SB_VERSION_OKSASHFBITS \
- (XFS_SB_VERSION_EXTFLGBIT | \
- XFS_SB_VERSION_DIRV2BIT | \
- XFS_SB_VERSION_BORGBIT)
-#define XFS_SB_VERSION_OKREALFBITS \
- (XFS_SB_VERSION_ATTRBIT | \
- XFS_SB_VERSION_NLINKBIT | \
- XFS_SB_VERSION_QUOTABIT | \
- XFS_SB_VERSION_ALIGNBIT | \
- XFS_SB_VERSION_DALIGNBIT | \
- XFS_SB_VERSION_SHAREDBIT | \
- XFS_SB_VERSION_LOGV2BIT | \
- XFS_SB_VERSION_SECTORBIT | \
- XFS_SB_VERSION_MOREBITSBIT)
-#define XFS_SB_VERSION_OKREALBITS \
- (XFS_SB_VERSION_NUMBITS | \
- XFS_SB_VERSION_OKREALFBITS | \
- XFS_SB_VERSION_OKSASHFBITS)
+
+/*
+ * Supported feature bit list is just all bits in the versionnum field because
+ * we've used them all up and understand them all. Except, of course, for the
+ * shared superblock bit, which nobody knows what it does and so is unsupported.
+ */
+#define XFS_SB_VERSION_OKBITS \
+ ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \
+ ~XFS_SB_VERSION_SHAREDBIT)
/*
* There are two words to hold XFS "feature" bits: the original
@@ -74,7 +65,6 @@ struct xfs_mount;
*
* These defines represent bits in sb_features2.
*/
-#define XFS_SB_VERSION2_REALFBITS 0x00ffffff /* Mask: features */
#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001
#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */
#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
@@ -82,16 +72,13 @@ struct xfs_mount;
#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
+#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */
-#define XFS_SB_VERSION2_OKREALFBITS \
+#define XFS_SB_VERSION2_OKBITS \
(XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
XFS_SB_VERSION2_ATTR2BIT | \
- XFS_SB_VERSION2_PROJID32BIT)
-#define XFS_SB_VERSION2_OKSASHFBITS \
- (0)
-#define XFS_SB_VERSION2_OKREALBITS \
- (XFS_SB_VERSION2_OKREALFBITS | \
- XFS_SB_VERSION2_OKSASHFBITS )
+ XFS_SB_VERSION2_PROJID32BIT | \
+ XFS_SB_VERSION2_FTYPE)
/*
* Superblock - in core version. Must match the ondisk version below.
@@ -161,9 +148,25 @@ typedef struct xfs_sb {
*/
__uint32_t sb_bad_features2;
+ /* version 5 superblock fields start here */
+
+ /* feature masks */
+ __uint32_t sb_features_compat;
+ __uint32_t sb_features_ro_compat;
+ __uint32_t sb_features_incompat;
+ __uint32_t sb_features_log_incompat;
+
+ __uint32_t sb_crc; /* superblock crc */
+ __uint32_t sb_pad;
+
+ xfs_ino_t sb_pquotino; /* project quota inode */
+ xfs_lsn_t sb_lsn; /* last write sequence */
+
/* must be padded to 64 bit alignment */
} xfs_sb_t;
+#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc)
+
/*
* Superblock - on disk version. Must match the in core version above.
* Must be padded to 64 bit alignment.
@@ -229,7 +232,21 @@ typedef struct xfs_dsb {
* for features2 bits. Easiest just to mark it bad and not use
* it for anything else.
*/
- __be32 sb_bad_features2;
+ __be32 sb_bad_features2;
+
+ /* version 5 superblock fields start here */
+
+ /* feature masks */
+ __be32 sb_features_compat;
+ __be32 sb_features_ro_compat;
+ __be32 sb_features_incompat;
+ __be32 sb_features_log_incompat;
+
+ __le32 sb_crc; /* superblock crc */
+ __be32 sb_pad;
+
+ __be64 sb_pquotino; /* project quota inode */
+ __be64 sb_lsn; /* last write sequence */
/* must be padded to 64 bit alignment */
} xfs_dsb_t;
@@ -250,7 +267,10 @@ typedef enum {
XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
- XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2,
+ XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT,
+ XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT,
+ XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD,
+ XFS_SBS_PQUOTINO, XFS_SBS_LSN,
XFS_SBS_FIELDCOUNT
} xfs_sb_field_t;
@@ -276,6 +296,12 @@ typedef enum {
#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS)
#define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2)
#define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2)
+#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT)
+#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
+#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
+#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT)
+#define XFS_SB_CRC XFS_SB_MVAL(CRC)
+#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO)
#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT)
#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1)
#define XFS_SB_MOD_BITS \
@@ -283,7 +309,9 @@ typedef enum {
XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
- XFS_SB_BAD_FEATURES2)
+ XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \
+ XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \
+ XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO)
/*
@@ -300,220 +328,248 @@ typedef enum {
#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
-static inline int xfs_sb_good_version(xfs_sb_t *sbp)
+/*
+ * The first XFS version we support is a v4 superblock with V2 directories.
+ */
+static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
{
- /* We always support version 1-3 */
- if (sbp->sb_versionnum >= XFS_SB_VERSION_1 &&
- sbp->sb_versionnum <= XFS_SB_VERSION_3)
- return 1;
-
- /* We support version 4 if all feature bits are supported */
- if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) {
- if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) ||
- ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
- (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS)))
- return 0;
-
-#ifdef __KERNEL__
- if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
- return 0;
-#else
- if ((sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) &&
- sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
- return 0;
-#endif
-
- return 1;
- }
-
- return 0;
+ if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
+ return false;
+
+ /* check for unknown features in the fs */
+ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
+ ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
+ (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
+ return false;
+
+ return true;
+}
+
+static inline bool xfs_sb_good_version(struct xfs_sb *sbp)
+{
+ if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
+ return true;
+ if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
+ return xfs_sb_good_v4_features(sbp);
+ return false;
}
/*
* Detect a mismatched features2 field. Older kernels read/wrote
* this into the wrong slot, so to be safe we keep them in sync.
*/
-static inline int xfs_sb_has_mismatched_features2(xfs_sb_t *sbp)
+static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
{
- return (sbp->sb_bad_features2 != sbp->sb_features2);
+ return sbp->sb_bad_features2 != sbp->sb_features2;
}
-static inline unsigned xfs_sb_version_tonew(unsigned v)
+static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp)
{
- if (v == XFS_SB_VERSION_1)
- return XFS_SB_VERSION_4;
-
- if (v == XFS_SB_VERSION_2)
- return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
-
- return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT |
- XFS_SB_VERSION_NLINKBIT;
+ return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT);
}
-static inline unsigned xfs_sb_version_toold(unsigned v)
+static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
{
- if (v & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT))
- return 0;
- if (v & XFS_SB_VERSION_NLINKBIT)
- return XFS_SB_VERSION_3;
- if (v & XFS_SB_VERSION_ATTRBIT)
- return XFS_SB_VERSION_2;
- return XFS_SB_VERSION_1;
+ sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
}
-static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp)
{
- return sbp->sb_versionnum == XFS_SB_VERSION_2 ||
- sbp->sb_versionnum == XFS_SB_VERSION_3 ||
- (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT));
+ return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
}
-static inline void xfs_sb_version_addattr(xfs_sb_t *sbp)
+static inline void xfs_sb_version_addquota(struct xfs_sb *sbp)
{
- if (sbp->sb_versionnum == XFS_SB_VERSION_1)
- sbp->sb_versionnum = XFS_SB_VERSION_2;
- else if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
- sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
- else
- sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
+ sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
}
-static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp)
{
- return sbp->sb_versionnum == XFS_SB_VERSION_3 ||
- (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT));
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+ (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT));
}
-static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp)
{
- if (sbp->sb_versionnum <= XFS_SB_VERSION_2)
- sbp->sb_versionnum = XFS_SB_VERSION_3;
- else
- sbp->sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
+ return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
}
-static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+ (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
}
-static inline void xfs_sb_version_addquota(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
{
- if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
- sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
- else
- sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) |
- XFS_SB_VERSION_QUOTABIT;
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+ (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
}
-static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT);
+ return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
}
-static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
+ return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
}
-static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT);
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+ (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
}
-static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp)
+/*
+ * sb_features2 bit version macros.
+ */
+static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+ (xfs_sb_version_hasmorebits(sbp) &&
+ (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
}
-static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+ (xfs_sb_version_hasmorebits(sbp) &&
+ (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT));
}
-static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp)
+static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
+ sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
+ sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
+ sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT;
}
-static inline int xfs_sb_version_hassector(xfs_sb_t *sbp)
+static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
+ sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
+ sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
+ if (!sbp->sb_features2)
+ sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
}
-static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+ (xfs_sb_version_hasmorebits(sbp) &&
+ (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
}
-static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
+static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
+ sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
+ sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
+ sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
}
/*
- * sb_features2 bit version macros.
+ * Extended v5 superblock feature masks. These are to be used for new v5
+ * superblock features only.
+ *
+ * Compat features are new features that old kernels will not notice or affect
+ * and so can mount read-write without issues.
*
- * For example, for a bit defined as XFS_SB_VERSION2_FUNBIT, has a macro:
+ * RO-Compat (read only) are features that old kernels can read but will break
+ * if they write. Hence only read-only mounts of such filesystems are allowed on
+ * kernels that don't support the feature bit.
*
- * SB_VERSION_HASFUNBIT(xfs_sb_t *sbp)
- * ((xfs_sb_version_hasmorebits(sbp) &&
- * ((sbp)->sb_features2 & XFS_SB_VERSION2_FUNBIT)
+ * InCompat features are features which old kernels will not understand and so
+ * must not mount.
+ *
+ * Log-InCompat features are for changes to log formats or new transactions that
+ * can't be replayed on older kernels. The fields are set when the filesystem is
+ * mounted, and a clean unmount clears the fields.
*/
+#define XFS_SB_FEAT_COMPAT_ALL 0
+#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL
+static inline bool
+xfs_sb_has_compat_feature(
+ struct xfs_sb *sbp,
+ __uint32_t feature)
+{
+ return (sbp->sb_features_compat & feature) != 0;
+}
-static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp)
+#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
+#define XFS_SB_FEAT_RO_COMPAT_ALL \
+ (XFS_SB_FEAT_RO_COMPAT_FINOBT)
+#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
+static inline bool
+xfs_sb_has_ro_compat_feature(
+ struct xfs_sb *sbp,
+ __uint32_t feature)
{
- return xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT);
+ return (sbp->sb_features_ro_compat & feature) != 0;
}
-static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp)
+#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
+#define XFS_SB_FEAT_INCOMPAT_ALL \
+ (XFS_SB_FEAT_INCOMPAT_FTYPE)
+
+#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
+static inline bool
+xfs_sb_has_incompat_feature(
+ struct xfs_sb *sbp,
+ __uint32_t feature)
{
- return xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT);
+ return (sbp->sb_features_incompat & feature) != 0;
}
-static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp)
+#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
+#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
+static inline bool
+xfs_sb_has_incompat_log_feature(
+ struct xfs_sb *sbp,
+ __uint32_t feature)
{
- sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
- sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
+ return (sbp->sb_features_log_incompat & feature) != 0;
}
-static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
+/*
+ * V5 superblock specific feature checks
+ */
+static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp)
{
- sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
- if (!sbp->sb_features2)
- sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
+
+static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
}
-static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
+static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
{
- return xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) ||
+ (xfs_sb_version_hasmorebits(sbp) &&
+ (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
}
-static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
+static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
{
- return (xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT));
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
+ (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
}
/*
* end of superblock version macros
*/
+static inline bool
+xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
+{
+ return (ino == sbp->sb_uquotino ||
+ ino == sbp->sb_gquotino ||
+ ino == sbp->sb_pquotino);
+}
+
#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr))
@@ -546,4 +602,20 @@ static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask)
+/*
+ * perag get/put wrappers for ref counting
+ */
+extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t);
+extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
+ int tag);
+extern void xfs_perag_put(struct xfs_perag *pag);
+extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
+
+extern void xfs_sb_calc_crc(struct xfs_buf *);
+extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
+extern void xfs_sb_mount_common(struct xfs_mount *, struct xfs_sb *);
+extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
+extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
+extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp);
+
#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/xfs_shared.h
new file mode 100644
index 00000000000..82404da2ca6
--- /dev/null
+++ b/fs/xfs/xfs_shared.h
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_SHARED_H__
+#define __XFS_SHARED_H__
+
+/*
+ * Definitions shared between kernel and userspace that don't fit into any other
+ * header file that is shared with userspace.
+ */
+struct xfs_ifork;
+struct xfs_buf;
+struct xfs_buf_ops;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+
+/*
+ * Buffer verifier operations are widely used, including userspace tools
+ */
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+extern const struct xfs_buf_ops xfs_agfl_buf_ops;
+extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
+extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
+extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
+extern const struct xfs_buf_ops xfs_da3_node_buf_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_inode_buf_ops;
+extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+extern const struct xfs_buf_ops xfs_sb_buf_ops;
+extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
+extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+
+/*
+ * Transaction types. Used to distinguish types of buffers. These never reach
+ * the log.
+ */
+#define XFS_TRANS_SETATTR_NOT_SIZE 1
+#define XFS_TRANS_SETATTR_SIZE 2
+#define XFS_TRANS_INACTIVE 3
+#define XFS_TRANS_CREATE 4
+#define XFS_TRANS_CREATE_TRUNC 5
+#define XFS_TRANS_TRUNCATE_FILE 6
+#define XFS_TRANS_REMOVE 7
+#define XFS_TRANS_LINK 8
+#define XFS_TRANS_RENAME 9
+#define XFS_TRANS_MKDIR 10
+#define XFS_TRANS_RMDIR 11
+#define XFS_TRANS_SYMLINK 12
+#define XFS_TRANS_SET_DMATTRS 13
+#define XFS_TRANS_GROWFS 14
+#define XFS_TRANS_STRAT_WRITE 15
+#define XFS_TRANS_DIOSTRAT 16
+/* 17 was XFS_TRANS_WRITE_SYNC */
+#define XFS_TRANS_WRITEID 18
+#define XFS_TRANS_ADDAFORK 19
+#define XFS_TRANS_ATTRINVAL 20
+#define XFS_TRANS_ATRUNCATE 21
+#define XFS_TRANS_ATTR_SET 22
+#define XFS_TRANS_ATTR_RM 23
+#define XFS_TRANS_ATTR_FLAG 24
+#define XFS_TRANS_CLEAR_AGI_BUCKET 25
+#define XFS_TRANS_QM_SBCHANGE 26
+/*
+ * Dummy entries since we use the transaction type to index into the
+ * trans_type[] in xlog_recover_print_trans_head()
+ */
+#define XFS_TRANS_DUMMY1 27
+#define XFS_TRANS_DUMMY2 28
+#define XFS_TRANS_QM_QUOTAOFF 29
+#define XFS_TRANS_QM_DQALLOC 30
+#define XFS_TRANS_QM_SETQLIM 31
+#define XFS_TRANS_QM_DQCLUSTER 32
+#define XFS_TRANS_QM_QINOCREATE 33
+#define XFS_TRANS_QM_QUOTAOFF_END 34
+#define XFS_TRANS_SB_UNIT 35
+#define XFS_TRANS_FSYNC_TS 36
+#define XFS_TRANS_GROWFSRT_ALLOC 37
+#define XFS_TRANS_GROWFSRT_ZERO 38
+#define XFS_TRANS_GROWFSRT_FREE 39
+#define XFS_TRANS_SWAPEXT 40
+#define XFS_TRANS_SB_COUNT 41
+#define XFS_TRANS_CHECKPOINT 42
+#define XFS_TRANS_ICREATE 43
+#define XFS_TRANS_CREATE_TMPFILE 44
+#define XFS_TRANS_TYPE_MAX 44
+/* new transaction types need to be reflected in xfs_logprint(8) */
+
+#define XFS_TRANS_TYPES \
+ { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \
+ { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \
+ { XFS_TRANS_INACTIVE, "INACTIVE" }, \
+ { XFS_TRANS_CREATE, "CREATE" }, \
+ { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \
+ { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \
+ { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \
+ { XFS_TRANS_REMOVE, "REMOVE" }, \
+ { XFS_TRANS_LINK, "LINK" }, \
+ { XFS_TRANS_RENAME, "RENAME" }, \
+ { XFS_TRANS_MKDIR, "MKDIR" }, \
+ { XFS_TRANS_RMDIR, "RMDIR" }, \
+ { XFS_TRANS_SYMLINK, "SYMLINK" }, \
+ { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \
+ { XFS_TRANS_GROWFS, "GROWFS" }, \
+ { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \
+ { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \
+ { XFS_TRANS_WRITEID, "WRITEID" }, \
+ { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \
+ { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \
+ { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \
+ { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \
+ { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \
+ { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \
+ { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \
+ { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \
+ { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \
+ { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \
+ { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \
+ { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \
+ { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \
+ { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \
+ { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \
+ { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \
+ { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \
+ { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \
+ { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
+ { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
+ { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \
+ { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
+ { XFS_TRANS_DUMMY1, "DUMMY1" }, \
+ { XFS_TRANS_DUMMY2, "DUMMY2" }, \
+ { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
+
+/*
+ * This structure is used to track log items associated with
+ * a transaction. It points to the log item and keeps some
+ * flags to track the state of the log item. It also tracks
+ * the amount of space needed to log the item it describes
+ * once we get to commit processing (see xfs_trans_commit()).
+ */
+struct xfs_log_item_desc {
+ struct xfs_log_item *lid_item;
+ struct list_head lid_trans;
+ unsigned char lid_flags;
+};
+
+#define XFS_LID_DIRTY 0x1
+
+/* log size calculation functions */
+int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
+int xfs_log_calc_minimum_size(struct xfs_mount *);
+
+
+/*
+ * Values for t_flags.
+ */
+#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */
+#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */
+#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */
+#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
+#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
+#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
+#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer
+ count in superblock */
+/*
+ * Values for call flags parameter.
+ */
+#define XFS_TRANS_RELEASE_LOG_RES 0x4
+#define XFS_TRANS_ABORT 0x8
+
+/*
+ * Field values for xfs_trans_mod_sb.
+ */
+#define XFS_TRANS_SB_ICOUNT 0x00000001
+#define XFS_TRANS_SB_IFREE 0x00000002
+#define XFS_TRANS_SB_FDBLOCKS 0x00000004
+#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008
+#define XFS_TRANS_SB_FREXTENTS 0x00000010
+#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020
+#define XFS_TRANS_SB_DBLOCKS 0x00000040
+#define XFS_TRANS_SB_AGCOUNT 0x00000080
+#define XFS_TRANS_SB_IMAXPCT 0x00000100
+#define XFS_TRANS_SB_REXTSIZE 0x00000200
+#define XFS_TRANS_SB_RBMBLOCKS 0x00000400
+#define XFS_TRANS_SB_RBLOCKS 0x00000800
+#define XFS_TRANS_SB_REXTENTS 0x00001000
+#define XFS_TRANS_SB_REXTSLOG 0x00002000
+
+/*
+ * Here we centralize the specification of XFS meta-data buffer reference count
+ * values. This determines how hard the buffer cache tries to hold onto the
+ * buffer.
+ */
+#define XFS_AGF_REF 4
+#define XFS_AGI_REF 4
+#define XFS_AGFL_REF 3
+#define XFS_INO_BTREE_REF 3
+#define XFS_ALLOC_BTREE_REF 2
+#define XFS_BMAP_BTREE_REF 2
+#define XFS_DIR_BTREE_REF 2
+#define XFS_INO_REF 2
+#define XFS_ATTR_BTREE_REF 1
+#define XFS_DQUOT_REF 1
+
+/*
+ * Flags for xfs_trans_ichgtime().
+ */
+#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
+#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
+#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */
+
+
+/*
+ * Symlink decoding/encoding functions
+ */
+int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
+int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
+ uint32_t size, struct xfs_buf *bp);
+bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
+ uint32_t size, struct xfs_buf *bp);
+void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_inode *ip, struct xfs_ifork *ifp);
+
+#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index ce372b7d564..f2240383d4b 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -59,6 +59,7 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v)
{ "abtc2", XFSSTAT_END_ABTC_V2 },
{ "bmbt2", XFSSTAT_END_BMBT_V2 },
{ "ibt2", XFSSTAT_END_IBT_V2 },
+ { "fibt2", XFSSTAT_END_FIBT_V2 },
/* we print both series of quota information together */
{ "qm", XFSSTAT_END_QM },
};
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index c03ad38ceae..c8f238b8299 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -183,7 +183,23 @@ struct xfsstats {
__uint32_t xs_ibt_2_alloc;
__uint32_t xs_ibt_2_free;
__uint32_t xs_ibt_2_moves;
-#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_IBT_V2+6)
+#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2+15)
+ __uint32_t xs_fibt_2_lookup;
+ __uint32_t xs_fibt_2_compare;
+ __uint32_t xs_fibt_2_insrec;
+ __uint32_t xs_fibt_2_delrec;
+ __uint32_t xs_fibt_2_newroot;
+ __uint32_t xs_fibt_2_killroot;
+ __uint32_t xs_fibt_2_increment;
+ __uint32_t xs_fibt_2_decrement;
+ __uint32_t xs_fibt_2_lshift;
+ __uint32_t xs_fibt_2_rshift;
+ __uint32_t xs_fibt_2_split;
+ __uint32_t xs_fibt_2_join;
+ __uint32_t xs_fibt_2_alloc;
+ __uint32_t xs_fibt_2_free;
+ __uint32_t xs_fibt_2_moves;
+#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_FIBT_V2+6)
__uint32_t xs_qm_dqreclaims;
__uint32_t xs_qm_dqreclaim_misses;
__uint32_t xs_qm_dquot_dups;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ab8839b2627..8f0333b3f7a 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -17,40 +17,36 @@
*/
#include "xfs.h"
-#include "xfs_log.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
+#include "xfs_da_format.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
-#include "xfs_ialloc.h"
#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
+#include "xfs_alloc.h"
#include "xfs_error.h"
-#include "xfs_itable.h"
#include "xfs_fsops.h"
-#include "xfs_attr.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_vnodeops.h"
+#include "xfs_log.h"
#include "xfs_log_priv.h"
-#include "xfs_trans_priv.h"
-#include "xfs_filestream.h"
#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
#include "xfs_extfree_item.h"
#include "xfs_mru_cache.h"
#include "xfs_inode_item.h"
#include "xfs_icache.h"
#include "xfs_trace.h"
+#include "xfs_icreate_item.h"
+#include "xfs_dinode.h"
+#include "xfs_filestream.h"
+#include "xfs_quota.h"
#include <linux/namei.h>
#include <linux/init.h>
@@ -139,9 +135,9 @@ static const match_table_t tokens = {
STATIC unsigned long
-suffix_strtoul(char *s, char **endp, unsigned int base)
+suffix_kstrtoint(char *s, unsigned int base, int *res)
{
- int last, shift_left_factor = 0;
+ int last, shift_left_factor = 0, _res;
char *value = s;
last = strlen(value) - 1;
@@ -158,7 +154,10 @@ suffix_strtoul(char *s, char **endp, unsigned int base)
value[last] = '\0';
}
- return simple_strtoul((const char *)s, endp, base) << shift_left_factor;
+ if (kstrtoint(s, base, &_res))
+ return -EINVAL;
+ *res = _res << shift_left_factor;
+ return 0;
}
/*
@@ -174,7 +173,7 @@ xfs_parseargs(
char *options)
{
struct super_block *sb = mp->m_super;
- char *this_char, *value, *eov;
+ char *this_char, *value;
int dsunit = 0;
int dswidth = 0;
int iosize = 0;
@@ -230,14 +229,16 @@ xfs_parseargs(
this_char);
return EINVAL;
}
- mp->m_logbufs = simple_strtoul(value, &eov, 10);
+ if (kstrtoint(value, 10, &mp->m_logbufs))
+ return EINVAL;
} else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
if (!value || !*value) {
xfs_warn(mp, "%s option requires an argument",
this_char);
return EINVAL;
}
- mp->m_logbsize = suffix_strtoul(value, &eov, 10);
+ if (suffix_kstrtoint(value, 10, &mp->m_logbsize))
+ return EINVAL;
} else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
if (!value || !*value) {
xfs_warn(mp, "%s option requires an argument",
@@ -266,7 +267,8 @@ xfs_parseargs(
this_char);
return EINVAL;
}
- iosize = simple_strtoul(value, &eov, 10);
+ if (kstrtoint(value, 10, &iosize))
+ return EINVAL;
iosizelog = ffs(iosize) - 1;
} else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
if (!value || !*value) {
@@ -274,7 +276,8 @@ xfs_parseargs(
this_char);
return EINVAL;
}
- iosize = suffix_strtoul(value, &eov, 10);
+ if (suffix_kstrtoint(value, 10, &iosize))
+ return EINVAL;
iosizelog = ffs(iosize) - 1;
} else if (!strcmp(this_char, MNTOPT_GRPID) ||
!strcmp(this_char, MNTOPT_BSDGROUPS)) {
@@ -296,14 +299,16 @@ xfs_parseargs(
this_char);
return EINVAL;
}
- dsunit = simple_strtoul(value, &eov, 10);
+ if (kstrtoint(value, 10, &dsunit))
+ return EINVAL;
} else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
if (!value || !*value) {
xfs_warn(mp, "%s option requires an argument",
this_char);
return EINVAL;
}
- dswidth = simple_strtoul(value, &eov, 10);
+ if (kstrtoint(value, 10, &dswidth))
+ return EINVAL;
} else if (!strcmp(this_char, MNTOPT_32BITINODE)) {
mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
} else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
@@ -350,17 +355,17 @@ xfs_parseargs(
} else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
!strcmp(this_char, MNTOPT_PRJQUOTA)) {
mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
- XFS_OQUOTA_ENFD);
+ XFS_PQUOTA_ENFD);
} else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_OQUOTA_ENFD;
+ mp->m_qflags &= ~XFS_PQUOTA_ENFD;
} else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
!strcmp(this_char, MNTOPT_GRPQUOTA)) {
mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
- XFS_OQUOTA_ENFD);
+ XFS_GQUOTA_ENFD);
} else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_OQUOTA_ENFD;
+ mp->m_qflags &= ~XFS_GQUOTA_ENFD;
} else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
xfs_warn(mp,
"delaylog is the default now, option is deprecated.");
@@ -411,12 +416,6 @@ xfs_parseargs(
}
#endif
- if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
- (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
- xfs_warn(mp, "cannot mount with both project and group quota");
- return EINVAL;
- }
-
if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
xfs_warn(mp, "sunit and swidth must be specified together");
return EINVAL;
@@ -430,20 +429,15 @@ xfs_parseargs(
}
done:
- if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
+ if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) {
/*
* At this point the superblock has not been read
* in, therefore we do not know the block size.
* Before the mount call ends we will convert
* these to FSBs.
*/
- if (dsunit) {
- mp->m_dalign = dsunit;
- mp->m_flags |= XFS_MOUNT_RETERR;
- }
-
- if (dswidth)
- mp->m_swidth = dswidth;
+ mp->m_dalign = dsunit;
+ mp->m_swidth = dswidth;
}
if (mp->m_logbufs != -1 &&
@@ -551,15 +545,14 @@ xfs_showargs(
else if (mp->m_qflags & XFS_UQUOTA_ACCT)
seq_puts(m, "," MNTOPT_UQUOTANOENF);
- /* Either project or group quotas can be active, not both */
-
if (mp->m_qflags & XFS_PQUOTA_ACCT) {
- if (mp->m_qflags & XFS_OQUOTA_ENFD)
+ if (mp->m_qflags & XFS_PQUOTA_ENFD)
seq_puts(m, "," MNTOPT_PRJQUOTA);
else
seq_puts(m, "," MNTOPT_PQUOTANOENF);
- } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
- if (mp->m_qflags & XFS_OQUOTA_ENFD)
+ }
+ if (mp->m_qflags & XFS_GQUOTA_ACCT) {
+ if (mp->m_qflags & XFS_GQUOTA_ENFD)
seq_puts(m, "," MNTOPT_GRPQUOTA);
else
seq_puts(m, "," MNTOPT_GQUOTANOENF);
@@ -772,20 +765,18 @@ xfs_open_devices(
* Setup xfs_mount buffer target pointers
*/
error = ENOMEM;
- mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname);
+ mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
if (!mp->m_ddev_targp)
goto out_close_rtdev;
if (rtdev) {
- mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1,
- mp->m_fsname);
+ mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
if (!mp->m_rtdev_targp)
goto out_free_ddev_targ;
}
if (logdev && logdev != ddev) {
- mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1,
- mp->m_fsname);
+ mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev);
if (!mp->m_logdev_targp)
goto out_free_rtdev_targ;
} else {
@@ -818,8 +809,7 @@ xfs_setup_devices(
{
int error;
- error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize,
- mp->m_sb.sb_sectsize);
+ error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize);
if (error)
return error;
@@ -829,14 +819,12 @@ xfs_setup_devices(
if (xfs_sb_version_hassector(&mp->m_sb))
log_sector_size = mp->m_sb.sb_logsectsize;
error = xfs_setsize_buftarg(mp->m_logdev_targp,
- mp->m_sb.sb_blocksize,
log_sector_size);
if (error)
return error;
}
if (mp->m_rtdev_targp) {
error = xfs_setsize_buftarg(mp->m_rtdev_targp,
- mp->m_sb.sb_blocksize,
mp->m_sb.sb_sectsize);
if (error)
return error;
@@ -865,17 +853,17 @@ xfs_init_mount_workqueues(
goto out_destroy_unwritten;
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- WQ_NON_REENTRANT, 0, mp->m_fsname);
+ 0, 0, mp->m_fsname);
if (!mp->m_reclaim_workqueue)
goto out_destroy_cil;
mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
- WQ_NON_REENTRANT, 0, mp->m_fsname);
+ 0, 0, mp->m_fsname);
if (!mp->m_log_workqueue)
goto out_destroy_reclaim;
mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
- WQ_NON_REENTRANT, 0, mp->m_fsname);
+ 0, 0, mp->m_fsname);
if (!mp->m_eofblocks_workqueue)
goto out_destroy_log;
@@ -948,10 +936,6 @@ xfs_fs_destroy_inode(
XFS_STATS_INC(vn_reclaim);
- /* bad inode, get out here ASAP */
- if (is_bad_inode(inode))
- goto out_reclaim;
-
ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
/*
@@ -967,7 +951,6 @@ xfs_fs_destroy_inode(
* this more efficiently than we can here, so simply let background
* reclaim tear down all inodes.
*/
-out_reclaim:
xfs_inode_set_reclaim_tag(ip);
}
@@ -1008,7 +991,7 @@ xfs_fs_evict_inode(
trace_xfs_evict_inode(ip);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
XFS_STATS_INC(vn_rele);
XFS_STATS_INC(vn_remove);
@@ -1127,8 +1110,8 @@ xfs_fs_statfs(
spin_unlock(&mp->m_sb_lock);
if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
- ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
- (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
+ ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
+ (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
xfs_qm_statvfs(ip, statp);
return 0;
}
@@ -1167,7 +1150,7 @@ xfs_restore_resvblks(struct xfs_mount *mp)
* Note: xfs_log_quiesce() stops background log work - the callers must ensure
* it is started again when appropriate.
*/
-void
+static void
xfs_quiesce_attr(
struct xfs_mount *mp)
{
@@ -1209,6 +1192,7 @@ xfs_fs_remount(
char *p;
int error;
+ sync_filesystem(sb);
while ((p = strsep(&options, ",")) != NULL) {
int token;
@@ -1248,7 +1232,7 @@ xfs_fs_remount(
*/
#if 0
xfs_info(mp,
- "mount option \"%s\" not supported for remount\n", p);
+ "mount option \"%s\" not supported for remount", p);
return -EINVAL;
#else
break;
@@ -1364,6 +1348,17 @@ xfs_finish_flags(
}
/*
+ * V5 filesystems always use attr2 format for attributes.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ (mp->m_flags & XFS_MOUNT_NOATTR2)) {
+ xfs_warn(mp,
+"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.",
+ MNTOPT_NOATTR2, MNTOPT_ATTR2);
+ return XFS_ERROR(EINVAL);
+ }
+
+ /*
* mkfs'ed attr2 will turn on attr2 mount unless explicitly
* told by noattr2 to turn it off
*/
@@ -1380,6 +1375,14 @@ xfs_finish_flags(
return XFS_ERROR(EROFS);
}
+ if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
+ (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE)) &&
+ !xfs_sb_version_has_pquotino(&mp->m_sb)) {
+ xfs_warn(mp,
+ "Super block does not support project and group quota together");
+ return XFS_ERROR(EINVAL);
+ }
+
return 0;
}
@@ -1425,11 +1428,11 @@ xfs_fs_fill_super(
if (error)
goto out_free_fsname;
- error = xfs_init_mount_workqueues(mp);
+ error = -xfs_init_mount_workqueues(mp);
if (error)
goto out_close_devices;
- error = xfs_icsb_init_counters(mp);
+ error = -xfs_icsb_init_counters(mp);
if (error)
goto out_destroy_workqueues;
@@ -1461,6 +1464,10 @@ xfs_fs_fill_super(
sb->s_time_gran = 1;
set_posix_acl_flag(sb);
+ /* version 5 superblocks support inode version counters. */
+ if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
+ sb->s_flags |= MS_I_VERSION;
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
@@ -1470,10 +1477,6 @@ xfs_fs_fill_super(
error = ENOENT;
goto out_unmount;
}
- if (is_bad_inode(root)) {
- error = EINVAL;
- goto out_unmount;
- }
sb->s_root = d_make_root(root);
if (!sb->s_root) {
error = ENOMEM;
@@ -1514,19 +1517,21 @@ xfs_fs_mount(
return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
}
-static int
+static long
xfs_fs_nr_cached_objects(
- struct super_block *sb)
+ struct super_block *sb,
+ int nid)
{
return xfs_reclaim_inodes_count(XFS_M(sb));
}
-static void
+static long
xfs_fs_free_cached_objects(
struct super_block *sb,
- int nr_to_scan)
+ long nr_to_scan,
+ int nid)
{
- xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
+ return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
}
static const struct super_operations xfs_super_operations = {
@@ -1552,6 +1557,7 @@ static struct file_system_type xfs_fs_type = {
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
};
+MODULE_ALIAS_FS("xfs");
STATIC int __init
xfs_init_zones(void)
@@ -1634,9 +1640,15 @@ xfs_init_zones(void)
KM_ZONE_SPREAD, NULL);
if (!xfs_ili_zone)
goto out_destroy_inode_zone;
+ xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item),
+ "xfs_icr");
+ if (!xfs_icreate_zone)
+ goto out_destroy_ili_zone;
return 0;
+ out_destroy_ili_zone:
+ kmem_zone_destroy(xfs_ili_zone);
out_destroy_inode_zone:
kmem_zone_destroy(xfs_inode_zone);
out_destroy_efi_zone:
@@ -1675,6 +1687,7 @@ xfs_destroy_zones(void)
* destroy caches.
*/
rcu_barrier();
+ kmem_zone_destroy(xfs_icreate_zone);
kmem_zone_destroy(xfs_ili_zone);
kmem_zone_destroy(xfs_inode_zone);
kmem_zone_destroy(xfs_efi_zone);
@@ -1736,13 +1749,9 @@ init_xfs_fs(void)
if (error)
goto out_destroy_wq;
- error = xfs_filestream_init();
- if (error)
- goto out_mru_cache_uninit;
-
error = xfs_buf_init();
if (error)
- goto out_filestream_uninit;
+ goto out_mru_cache_uninit;
error = xfs_init_procfs();
if (error)
@@ -1769,8 +1778,6 @@ init_xfs_fs(void)
xfs_cleanup_procfs();
out_buf_terminate:
xfs_buf_terminate();
- out_filestream_uninit:
- xfs_filestream_uninit();
out_mru_cache_uninit:
xfs_mru_cache_uninit();
out_destroy_wq:
@@ -1789,7 +1796,6 @@ exit_xfs_fs(void)
xfs_sysctl_unregister();
xfs_cleanup_procfs();
xfs_buf_terminate();
- xfs_filestream_uninit();
xfs_mru_cache_uninit();
xfs_destroy_workqueues();
xfs_destroy_zones();
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
new file mode 100644
index 00000000000..d69363c833e
--- /dev/null
+++ b/fs/xfs/xfs_symlink.c
@@ -0,0 +1,599 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012-2013 Red Hat, Inc.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap_util.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
+#include "xfs_dinode.h"
+
+/* ----- Kernel only functions below ----- */
+STATIC int
+xfs_readlink_bmap(
+ struct xfs_inode *ip,
+ char *link)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
+ struct xfs_buf *bp;
+ xfs_daddr_t d;
+ char *cur_chunk;
+ int pathlen = ip->i_d.di_size;
+ int nmaps = XFS_SYMLINK_MAPS;
+ int byte_cnt;
+ int n;
+ int error = 0;
+ int fsblocks = 0;
+ int offset;
+
+ fsblocks = xfs_symlink_blocks(mp, pathlen);
+ error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0);
+ if (error)
+ goto out;
+
+ offset = 0;
+ for (n = 0; n < nmaps; n++) {
+ d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+ byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+
+ bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
+ &xfs_symlink_buf_ops);
+ if (!bp)
+ return XFS_ERROR(ENOMEM);
+ error = bp->b_error;
+ if (error) {
+ xfs_buf_ioerror_alert(bp, __func__);
+ xfs_buf_relse(bp);
+
+ /* bad CRC means corrupted metadata */
+ if (error == EFSBADCRC)
+ error = EFSCORRUPTED;
+ goto out;
+ }
+ byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+ if (pathlen < byte_cnt)
+ byte_cnt = pathlen;
+
+ cur_chunk = bp->b_addr;
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
+ byte_cnt, bp)) {
+ error = EFSCORRUPTED;
+ xfs_alert(mp,
+"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)",
+ offset, byte_cnt, ip->i_ino);
+ xfs_buf_relse(bp);
+ goto out;
+
+ }
+
+ cur_chunk += sizeof(struct xfs_dsymlink_hdr);
+ }
+
+ memcpy(link + offset, bp->b_addr, byte_cnt);
+
+ pathlen -= byte_cnt;
+ offset += byte_cnt;
+
+ xfs_buf_relse(bp);
+ }
+ ASSERT(pathlen == 0);
+
+ link[ip->i_d.di_size] = '\0';
+ error = 0;
+
+ out:
+ return error;
+}
+
+int
+xfs_readlink(
+ struct xfs_inode *ip,
+ char *link)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fsize_t pathlen;
+ int error = 0;
+
+ trace_xfs_readlink(ip);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+ pathlen = ip->i_d.di_size;
+ if (!pathlen)
+ goto out;
+
+ if (pathlen < 0 || pathlen > MAXPATHLEN) {
+ xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
+ __func__, (unsigned long long) ip->i_ino,
+ (long long) pathlen);
+ ASSERT(0);
+ error = XFS_ERROR(EFSCORRUPTED);
+ goto out;
+ }
+
+
+ if (ip->i_df.if_flags & XFS_IFINLINE) {
+ memcpy(link, ip->i_df.if_u1.if_data, pathlen);
+ link[pathlen] = '\0';
+ } else {
+ error = xfs_readlink_bmap(ip, link);
+ }
+
+ out:
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return error;
+}
+
+int
+xfs_symlink(
+ struct xfs_inode *dp,
+ struct xfs_name *link_name,
+ const char *target_path,
+ umode_t mode,
+ struct xfs_inode **ipp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_trans *tp = NULL;
+ struct xfs_inode *ip = NULL;
+ int error = 0;
+ int pathlen;
+ struct xfs_bmap_free free_list;
+ xfs_fsblock_t first_block;
+ bool unlock_dp_on_error = false;
+ uint cancel_flags;
+ int committed;
+ xfs_fileoff_t first_fsb;
+ xfs_filblks_t fs_blocks;
+ int nmaps;
+ struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS];
+ xfs_daddr_t d;
+ const char *cur_chunk;
+ int byte_cnt;
+ int n;
+ xfs_buf_t *bp;
+ prid_t prid;
+ struct xfs_dquot *udqp = NULL;
+ struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
+ uint resblks;
+
+ *ipp = NULL;
+
+ trace_xfs_symlink(dp, link_name);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ /*
+ * Check component lengths of the target path name.
+ */
+ pathlen = strlen(target_path);
+ if (pathlen >= MAXPATHLEN) /* total string too long */
+ return XFS_ERROR(ENAMETOOLONG);
+
+ udqp = gdqp = NULL;
+ prid = xfs_get_initial_prid(dp);
+
+ /*
+ * Make sure that we have allocated dquot(s) on disk.
+ */
+ error = xfs_qm_vop_dqalloc(dp,
+ xfs_kuid_to_uid(current_fsuid()),
+ xfs_kgid_to_gid(current_fsgid()), prid,
+ XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+ &udqp, &gdqp, &pdqp);
+ if (error)
+ goto std_return;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
+ cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+ /*
+ * The symlink will fit into the inode data fork?
+ * There can't be any attributes so we get the whole variable part.
+ */
+ if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version))
+ fs_blocks = 0;
+ else
+ fs_blocks = xfs_symlink_blocks(mp, pathlen);
+ resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0);
+ if (error == ENOSPC && fs_blocks == 0) {
+ resblks = 0;
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
+ }
+ if (error) {
+ cancel_flags = 0;
+ goto error_return;
+ }
+
+ xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+ unlock_dp_on_error = true;
+
+ /*
+ * Check whether the directory allows new symlinks or not.
+ */
+ if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
+ error = XFS_ERROR(EPERM);
+ goto error_return;
+ }
+
+ /*
+ * Reserve disk quota : blocks and inode.
+ */
+ error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+ pdqp, resblks, 1, 0);
+ if (error)
+ goto error_return;
+
+ /*
+ * Check for ability to enter directory entry, if no space reserved.
+ */
+ error = xfs_dir_canenter(tp, dp, link_name, resblks);
+ if (error)
+ goto error_return;
+ /*
+ * Initialize the bmap freelist prior to calling either
+ * bmapi or the directory create code.
+ */
+ xfs_bmap_init(&free_list, &first_block);
+
+ /*
+ * Allocate an inode for the symlink.
+ */
+ error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
+ prid, resblks > 0, &ip, NULL);
+ if (error) {
+ if (error == ENOSPC)
+ goto error_return;
+ goto error1;
+ }
+
+ /*
+ * An error after we've joined dp to the transaction will result in the
+ * transaction cancel unlocking dp so don't do it explicitly in the
+ * error path.
+ */
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ unlock_dp_on_error = false;
+
+ /*
+ * Also attach the dquot(s) to it, if applicable.
+ */
+ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+
+ if (resblks)
+ resblks -= XFS_IALLOC_SPACE_RES(mp);
+ /*
+ * If the symlink will fit into the inode, write it inline.
+ */
+ if (pathlen <= XFS_IFORK_DSIZE(ip)) {
+ xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
+ memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
+ ip->i_d.di_size = pathlen;
+
+ /*
+ * The inode was initially created in extent format.
+ */
+ ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
+ ip->i_df.if_flags |= XFS_IFINLINE;
+
+ ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
+
+ } else {
+ int offset;
+
+ first_fsb = 0;
+ nmaps = XFS_SYMLINK_MAPS;
+
+ error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
+ XFS_BMAPI_METADATA, &first_block, resblks,
+ mval, &nmaps, &free_list);
+ if (error)
+ goto error2;
+
+ if (resblks)
+ resblks -= fs_blocks;
+ ip->i_d.di_size = pathlen;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ cur_chunk = target_path;
+ offset = 0;
+ for (n = 0; n < nmaps; n++) {
+ char *buf;
+
+ d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+ byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ BTOBB(byte_cnt), 0);
+ if (!bp) {
+ error = ENOMEM;
+ goto error2;
+ }
+ bp->b_ops = &xfs_symlink_buf_ops;
+
+ byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+ byte_cnt = min(byte_cnt, pathlen);
+
+ buf = bp->b_addr;
+ buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset,
+ byte_cnt, bp);
+
+ memcpy(buf, cur_chunk, byte_cnt);
+
+ cur_chunk += byte_cnt;
+ pathlen -= byte_cnt;
+ offset += byte_cnt;
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
+ xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
+ (char *)bp->b_addr);
+ }
+ ASSERT(pathlen == 0);
+ }
+
+ /*
+ * Create the directory entry for the symlink.
+ */
+ error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
+ &first_block, &free_list, resblks);
+ if (error)
+ goto error2;
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * symlink transaction goes to disk before returning to
+ * the user.
+ */
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+ xfs_trans_set_sync(tp);
+ }
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error) {
+ goto error2;
+ }
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ *ipp = ip;
+ return 0;
+
+ error2:
+ IRELE(ip);
+ error1:
+ xfs_bmap_cancel(&free_list);
+ cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+ xfs_trans_cancel(tp, cancel_flags);
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ if (unlock_dp_on_error)
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ std_return:
+ return error;
+}
+
+/*
+ * Free a symlink that has blocks associated with it.
+ */
+STATIC int
+xfs_inactive_symlink_rmt(
+ struct xfs_inode *ip)
+{
+ xfs_buf_t *bp;
+ int committed;
+ int done;
+ int error;
+ xfs_fsblock_t first_block;
+ xfs_bmap_free_t free_list;
+ int i;
+ xfs_mount_t *mp;
+ xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS];
+ int nmaps;
+ int size;
+ xfs_trans_t *tp;
+
+ mp = ip->i_mount;
+ ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS);
+ /*
+ * We're freeing a symlink that has some
+ * blocks allocated to it. Free the
+ * blocks here. We know that we've got
+ * either 1 or 2 extents and that we can
+ * free them all in one bunmapi call.
+ */
+ ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ /*
+ * Lock the inode, fix the size, and join it to the transaction.
+ * Hold it so in the normal path, we still have it locked for
+ * the second transaction. In the error paths we need it
+ * held so the cancel won't rele it, see below.
+ */
+ size = (int)ip->i_d.di_size;
+ ip->i_d.di_size = 0;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ /*
+ * Find the block(s) so we can inval and unmap them.
+ */
+ done = 0;
+ xfs_bmap_init(&free_list, &first_block);
+ nmaps = ARRAY_SIZE(mval);
+ error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size),
+ mval, &nmaps, 0);
+ if (error)
+ goto error_trans_cancel;
+ /*
+ * Invalidate the block(s). No validation is done.
+ */
+ for (i = 0; i < nmaps; i++) {
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
+ XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
+ if (!bp) {
+ error = ENOMEM;
+ goto error_bmap_cancel;
+ }
+ xfs_trans_binval(tp, bp);
+ }
+ /*
+ * Unmap the dead block(s) to the free_list.
+ */
+ error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
+ &first_block, &free_list, &done);
+ if (error)
+ goto error_bmap_cancel;
+ ASSERT(done);
+ /*
+ * Commit the first transaction. This logs the EFI and the inode.
+ */
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ goto error_bmap_cancel;
+ /*
+ * The transaction must have been committed, since there were
+ * actually extents freed by xfs_bunmapi. See xfs_bmap_finish.
+ * The new tp has the extent freeing and EFDs.
+ */
+ ASSERT(committed);
+ /*
+ * The first xact was committed, so add the inode to the new one.
+ * Mark it dirty so it will be logged and moved forward in the log as
+ * part of every commit.
+ */
+ xfs_trans_ijoin(tp, ip, 0);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ /*
+ * Commit the transaction containing extent freeing and EFDs.
+ */
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error) {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ goto error_unlock;
+ }
+
+ /*
+ * Remove the memory for extent descriptions (just bookkeeping).
+ */
+ if (ip->i_df.if_bytes)
+ xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
+ ASSERT(ip->i_df.if_bytes == 0);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
+
+error_bmap_cancel:
+ xfs_bmap_cancel(&free_list);
+error_trans_cancel:
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+error_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * xfs_inactive_symlink - free a symlink
+ */
+int
+xfs_inactive_symlink(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int pathlen;
+
+ trace_xfs_inactive_symlink(ip);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ /*
+ * Zero length symlinks _can_ exist.
+ */
+ pathlen = (int)ip->i_d.di_size;
+ if (!pathlen) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
+ }
+
+ if (pathlen < 0 || pathlen > MAXPATHLEN) {
+ xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)",
+ __func__, (unsigned long long)ip->i_ino, pathlen);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ ASSERT(0);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ if (ip->i_df.if_flags & XFS_IFINLINE) {
+ if (ip->i_df.if_bytes > 0)
+ xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
+ XFS_DATA_FORK);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ ASSERT(ip->i_df.if_bytes == 0);
+ return 0;
+ }
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ /* remove the remote symlink */
+ return xfs_inactive_symlink_rmt(ip);
+}
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
new file mode 100644
index 00000000000..e75245d0911
--- /dev/null
+++ b/fs/xfs/xfs_symlink.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2012 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_SYMLINK_H
+#define __XFS_SYMLINK_H 1
+
+/* Kernel only symlink defintions */
+
+int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
+ const char *target_path, umode_t mode, struct xfs_inode **ipp);
+int xfs_readlink(struct xfs_inode *ip, char *link);
+int xfs_inactive_symlink(struct xfs_inode *ip);
+
+#endif /* __XFS_SYMLINK_H */
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c
new file mode 100644
index 00000000000..23c2f2577c8
--- /dev/null
+++ b/fs/xfs/xfs_symlink_remote.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012-2013 Red Hat, Inc.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_ag.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+
+
+/*
+ * Each contiguous block has a header, so it is not just a simple pathlen
+ * to FSB conversion.
+ */
+int
+xfs_symlink_blocks(
+ struct xfs_mount *mp,
+ int pathlen)
+{
+ int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+
+ return (pathlen + buflen - 1) / buflen;
+}
+
+int
+xfs_symlink_hdr_set(
+ struct xfs_mount *mp,
+ xfs_ino_t ino,
+ uint32_t offset,
+ uint32_t size,
+ struct xfs_buf *bp)
+{
+ struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return 0;
+
+ dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
+ dsl->sl_offset = cpu_to_be32(offset);
+ dsl->sl_bytes = cpu_to_be32(size);
+ uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
+ dsl->sl_owner = cpu_to_be64(ino);
+ dsl->sl_blkno = cpu_to_be64(bp->b_bn);
+ bp->b_ops = &xfs_symlink_buf_ops;
+
+ return sizeof(struct xfs_dsymlink_hdr);
+}
+
+/*
+ * Checking of the symlink header is split into two parts. the verifier does
+ * CRC, location and bounds checking, the unpacking function checks the path
+ * parameters and owner.
+ */
+bool
+xfs_symlink_hdr_ok(
+ xfs_ino_t ino,
+ uint32_t offset,
+ uint32_t size,
+ struct xfs_buf *bp)
+{
+ struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+ if (offset != be32_to_cpu(dsl->sl_offset))
+ return false;
+ if (size != be32_to_cpu(dsl->sl_bytes))
+ return false;
+ if (ino != be64_to_cpu(dsl->sl_owner))
+ return false;
+
+ /* ok */
+ return true;
+}
+
+static bool
+xfs_symlink_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
+ return false;
+ if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
+ return false;
+ if (be32_to_cpu(dsl->sl_offset) +
+ be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN)
+ return false;
+ if (dsl->sl_owner == 0)
+ return false;
+
+ return true;
+}
+
+static void
+xfs_symlink_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ /* no verification of non-crc buffers */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_symlink_verify(bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
+}
+
+static void
+xfs_symlink_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ /* no verification of non-crc buffers */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (!xfs_symlink_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (bip) {
+ struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+ dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ }
+ xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_symlink_buf_ops = {
+ .verify_read = xfs_symlink_read_verify,
+ .verify_write = xfs_symlink_write_verify,
+};
+
+void
+xfs_symlink_local_to_remote(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ char *buf;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+ bp->b_ops = NULL;
+ memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+ return;
+ }
+
+ /*
+ * As this symlink fits in an inode literal area, it must also fit in
+ * the smallest buffer the filesystem supports.
+ */
+ ASSERT(BBTOB(bp->b_length) >=
+ ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
+
+ bp->b_ops = &xfs_symlink_buf_ops;
+
+ buf = bp->b_addr;
+ buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
+ memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
+}
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 2801b5ce6cd..1743b9f8e23 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -25,11 +25,11 @@ static struct ctl_table_header *xfs_table_header;
#ifdef CONFIG_PROC_FS
STATIC int
xfs_stats_clear_proc_handler(
- ctl_table *ctl,
- int write,
- void __user *buffer,
- size_t *lenp,
- loff_t *ppos)
+ struct ctl_table *ctl,
+ int write,
+ void __user *buffer,
+ size_t *lenp,
+ loff_t *ppos)
{
int c, ret, *valp = ctl->data;
__uint32_t vn_active;
@@ -55,11 +55,11 @@ xfs_stats_clear_proc_handler(
STATIC int
xfs_panic_mask_proc_handler(
- ctl_table *ctl,
- int write,
- void __user *buffer,
- size_t *lenp,
- loff_t *ppos)
+ struct ctl_table *ctl,
+ int write,
+ void __user *buffer,
+ size_t *lenp,
+ loff_t *ppos)
{
int ret, *valp = ctl->data;
@@ -74,7 +74,7 @@ xfs_panic_mask_proc_handler(
}
#endif /* CONFIG_PROC_FS */
-static ctl_table xfs_table[] = {
+static struct ctl_table xfs_table[] = {
{
.procname = "irix_sgid_inherit",
.data = &xfs_params.sgid_inherit.val,
@@ -227,7 +227,7 @@ static ctl_table xfs_table[] = {
{}
};
-static ctl_table xfs_dir_table[] = {
+static struct ctl_table xfs_dir_table[] = {
{
.procname = "xfs",
.mode = 0555,
@@ -236,7 +236,7 @@ static ctl_table xfs_dir_table[] = {
{}
};
-static ctl_table xfs_root_table[] = {
+static struct ctl_table xfs_root_table[] = {
{
.procname = "fs",
.mode = 0555,
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 624bedd8135..1e85bcd0e41 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -17,25 +17,25 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
-#include "xfs_mount.h"
+#include "xfs_da_btree.h"
#include "xfs_ialloc.h"
#include "xfs_itable.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_buf_item.h"
#include "xfs_quota.h"
@@ -45,6 +45,8 @@
#include "xfs_dquot.h"
#include "xfs_log_recover.h"
#include "xfs_inode_item.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_filestream.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 2e137d4a85a..152f8278263 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -31,8 +31,8 @@ struct xfs_da_args;
struct xfs_da_node_entry;
struct xfs_dquot;
struct xfs_log_item;
-struct xlog_ticket;
struct xlog;
+struct xlog_ticket;
struct xlog_recover;
struct xlog_recover_item;
struct xfs_buf_log_format;
@@ -135,6 +135,31 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
+DECLARE_EVENT_CLASS(xfs_ag_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
+ TP_ARGS(mp, agno),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ ),
+ TP_printk("dev %d:%d agno %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno)
+);
+#define DEFINE_AG_EVENT(name) \
+DEFINE_EVENT(xfs_ag_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), \
+ TP_ARGS(mp, agno))
+
+DEFINE_AG_EVENT(xfs_read_agf);
+DEFINE_AG_EVENT(xfs_alloc_read_agf);
+DEFINE_AG_EVENT(xfs_read_agi);
+DEFINE_AG_EVENT(xfs_ialloc_read_agi);
+
TRACE_EVENT(xfs_attr_list_node_descend,
TP_PROTO(struct xfs_attr_list_context *ctx,
struct xfs_da_node_entry *btree),
@@ -341,6 +366,7 @@ DEFINE_BUF_EVENT(xfs_buf_item_relse);
DEFINE_BUF_EVENT(xfs_buf_item_iodone);
DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
DEFINE_BUF_EVENT(xfs_buf_error_relse);
+DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
DEFINE_BUF_EVENT(xfs_trans_read_buf_io);
DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
@@ -485,9 +511,12 @@ DEFINE_EVENT(xfs_buf_item_class, name, \
TP_PROTO(struct xfs_buf_log_item *bip), \
TP_ARGS(bip))
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
@@ -507,6 +536,65 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
+
+DECLARE_EVENT_CLASS(xfs_filestream_class,
+ TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno),
+ TP_ARGS(ip, agno),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agnumber_t, agno)
+ __field(int, streams)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->agno = agno;
+ __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx agno %u streams %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->agno,
+ __entry->streams)
+)
+#define DEFINE_FILESTREAM_EVENT(name) \
+DEFINE_EVENT(xfs_filestream_class, name, \
+ TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), \
+ TP_ARGS(ip, agno))
+DEFINE_FILESTREAM_EVENT(xfs_filestream_free);
+DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup);
+DEFINE_FILESTREAM_EVENT(xfs_filestream_scan);
+
+TRACE_EVENT(xfs_filestream_pick,
+ TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno,
+ xfs_extlen_t free, int nscan),
+ TP_ARGS(ip, agno, free, nscan),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agnumber_t, agno)
+ __field(int, streams)
+ __field(xfs_extlen_t, free)
+ __field(int, nscan)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->agno = agno;
+ __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno);
+ __entry->free = free;
+ __entry->nscan = nscan;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx agno %u streams %d free %d nscan %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->agno,
+ __entry->streams,
+ __entry->free,
+ __entry->nscan)
+);
DECLARE_EVENT_CLASS(xfs_lock_class,
TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
@@ -570,8 +658,11 @@ DEFINE_INODE_EVENT(xfs_iget_miss);
DEFINE_INODE_EVENT(xfs_getattr);
DEFINE_INODE_EVENT(xfs_setattr);
DEFINE_INODE_EVENT(xfs_readlink);
+DEFINE_INODE_EVENT(xfs_inactive_symlink);
DEFINE_INODE_EVENT(xfs_alloc_file_space);
DEFINE_INODE_EVENT(xfs_free_file_space);
+DEFINE_INODE_EVENT(xfs_zero_file_space);
+DEFINE_INODE_EVENT(xfs_collapse_file_space);
DEFINE_INODE_EVENT(xfs_readdir);
#ifdef CONFIG_XFS_POSIX_ACL
DEFINE_INODE_EVENT(xfs_get_acl);
@@ -618,6 +709,30 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
(char *)__entry->caller_ip)
)
+TRACE_EVENT(xfs_iomap_prealloc_size,
+ TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t blocks, int shift,
+ unsigned int writeio_blocks),
+ TP_ARGS(ip, blocks, shift, writeio_blocks),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsblock_t, blocks)
+ __field(int, shift)
+ __field(unsigned int, writeio_blocks)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->blocks = blocks;
+ __entry->shift = shift;
+ __entry->writeio_blocks = writeio_blocks;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx prealloc blocks %llu shift %d "
+ "m_writeio_blocks %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino,
+ __entry->blocks, __entry->shift, __entry->writeio_blocks)
+)
+
#define DEFINE_IREF_EVENT(name) \
DEFINE_EVENT(xfs_iref_class, name, \
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
@@ -908,6 +1023,63 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
+DECLARE_EVENT_CLASS(xfs_ail_class,
+ TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn),
+ TP_ARGS(lip, old_lsn, new_lsn),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(void *, lip)
+ __field(uint, type)
+ __field(uint, flags)
+ __field(xfs_lsn_t, old_lsn)
+ __field(xfs_lsn_t, new_lsn)
+ ),
+ TP_fast_assign(
+ __entry->dev = lip->li_mountp->m_super->s_dev;
+ __entry->lip = lip;
+ __entry->type = lip->li_type;
+ __entry->flags = lip->li_flags;
+ __entry->old_lsn = old_lsn;
+ __entry->new_lsn = new_lsn;
+ ),
+ TP_printk("dev %d:%d lip 0x%p old lsn %d/%d new lsn %d/%d type %s flags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->lip,
+ CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn),
+ CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn),
+ __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
+ __print_flags(__entry->flags, "|", XFS_LI_FLAGS))
+)
+
+#define DEFINE_AIL_EVENT(name) \
+DEFINE_EVENT(xfs_ail_class, name, \
+ TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), \
+ TP_ARGS(lip, old_lsn, new_lsn))
+DEFINE_AIL_EVENT(xfs_ail_insert);
+DEFINE_AIL_EVENT(xfs_ail_move);
+DEFINE_AIL_EVENT(xfs_ail_delete);
+
+TRACE_EVENT(xfs_log_assign_tail_lsn,
+ TP_PROTO(struct xlog *log, xfs_lsn_t new_lsn),
+ TP_ARGS(log, new_lsn),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_lsn_t, new_lsn)
+ __field(xfs_lsn_t, old_lsn)
+ __field(xfs_lsn_t, last_sync_lsn)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->new_lsn = new_lsn;
+ __entry->old_lsn = atomic64_read(&log->l_tail_lsn);
+ __entry->last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
+ ),
+ TP_printk("dev %d:%d new tail lsn %d/%d, old lsn %d/%d, last sync %d/%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn),
+ CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn),
+ CYCLE_LSN(__entry->last_sync_lsn), BLOCK_LSN(__entry->last_sync_lsn))
+)
DECLARE_EVENT_CLASS(xfs_file_class,
TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
@@ -946,17 +1118,18 @@ DEFINE_RW_EVENT(xfs_file_read);
DEFINE_RW_EVENT(xfs_file_buffered_write);
DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_splice_read);
-DEFINE_RW_EVENT(xfs_file_splice_write);
DECLARE_EVENT_CLASS(xfs_page_class,
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
- TP_ARGS(inode, page, off),
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
+ unsigned int len),
+ TP_ARGS(inode, page, off, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(pgoff_t, pgoff)
__field(loff_t, size)
__field(unsigned long, offset)
+ __field(unsigned int, length)
__field(int, delalloc)
__field(int, unwritten)
),
@@ -970,24 +1143,27 @@ DECLARE_EVENT_CLASS(xfs_page_class,
__entry->pgoff = page_offset(page);
__entry->size = i_size_read(inode);
__entry->offset = off;
+ __entry->length = len;
__entry->delalloc = delalloc;
__entry->unwritten = unwritten;
),
TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
- "delalloc %d unwritten %d",
+ "length %x delalloc %d unwritten %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->pgoff,
__entry->size,
__entry->offset,
+ __entry->length,
__entry->delalloc,
__entry->unwritten)
)
#define DEFINE_PAGE_EVENT(name) \
DEFINE_EVENT(xfs_page_class, name, \
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
- TP_ARGS(inode, page, off))
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
+ unsigned int len), \
+ TP_ARGS(inode, page, off, len))
DEFINE_PAGE_EVENT(xfs_writepage);
DEFINE_PAGE_EVENT(xfs_releasepage);
DEFINE_PAGE_EVENT(xfs_invalidatepage);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 06ed520a767..d03932564cc 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -18,518 +18,25 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_error.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
-#include "xfs_bmap.h"
#include "xfs_quota.h"
+#include "xfs_trans.h"
#include "xfs_trans_priv.h"
-#include "xfs_trans_space.h"
-#include "xfs_inode_item.h"
+#include "xfs_log.h"
#include "xfs_trace.h"
+#include "xfs_error.h"
kmem_zone_t *xfs_trans_zone;
kmem_zone_t *xfs_log_item_desc_zone;
-
-/*
- * Various log reservation values.
- *
- * These are based on the size of the file system block because that is what
- * most transactions manipulate. Each adds in an additional 128 bytes per
- * item logged to try to account for the overhead of the transaction mechanism.
- *
- * Note: Most of the reservations underestimate the number of allocation
- * groups into which they could free extents in the xfs_bmap_finish() call.
- * This is because the number in the worst case is quite high and quite
- * unusual. In order to fix this we need to change xfs_bmap_finish() to free
- * extents in only a single AG at a time. This will require changes to the
- * EFI code as well, however, so that the EFI for the extents not freed is
- * logged again in each transaction. See SGI PV #261917.
- *
- * Reservation functions here avoid a huge stack in xfs_trans_init due to
- * register overflow from temporaries in the calculations.
- */
-
-
-/*
- * In a write transaction we can allocate a maximum of 2
- * extents. This gives:
- * the inode getting the new extents: inode size
- * the inode's bmap btree: max depth * block size
- * the agfs of the ags from which the extents are allocated: 2 * sector
- * the superblock free block counter: sector size
- * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- * And the bmap_finish transaction can free bmap blocks in a join:
- * the agfs of the ags containing the blocks: 2 * sector size
- * the agfls of the ags containing the blocks: 2 * sector size
- * the super block free block counter: sector size
- * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_write_reservation(
- struct xfs_mount *mp)
-{
- return XFS_DQUOT_LOGRES(mp) +
- MAX((mp->m_sb.sb_inodesize +
- XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
- 2 * mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- XFS_ALLOCFREE_LOG_RES(mp, 2) +
- 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
- XFS_ALLOCFREE_LOG_COUNT(mp, 2))),
- (2 * mp->m_sb.sb_sectsize +
- 2 * mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- XFS_ALLOCFREE_LOG_RES(mp, 2) +
- 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
-}
-
-/*
- * In truncating a file we free up to two extents at once. We can modify:
- * the inode being truncated: inode size
- * the inode's bmap btree: (max depth + 1) * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- * the agf for each of the ags: 4 * sector size
- * the agfl for each of the ags: 4 * sector size
- * the super block to reflect the freed blocks: sector size
- * worst case split in allocation btrees per extent assuming 4 extents:
- * 4 exts * 2 trees * (2 * max depth - 1) * block size
- * the inode btree: max depth * blocksize
- * the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_itruncate_reservation(
- struct xfs_mount *mp)
-{
- return XFS_DQUOT_LOGRES(mp) +
- MAX((mp->m_sb.sb_inodesize +
- XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) +
- 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
- (4 * mp->m_sb.sb_sectsize +
- 4 * mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- XFS_ALLOCFREE_LOG_RES(mp, 4) +
- 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) +
- 128 * 5 +
- XFS_ALLOCFREE_LOG_RES(mp, 1) +
- 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
- XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
-}
-
-/*
- * In renaming a files we can modify:
- * the four inodes involved: 4 * inode size
- * the two directory btrees: 2 * (max depth + v2) * dir block size
- * the two directory bmap btrees: 2 * max depth * block size
- * And the bmap_finish transaction can free dir and bmap blocks (two sets
- * of bmap blocks) giving:
- * the agf for the ags in which the blocks live: 3 * sector size
- * the agfl for the ags in which the blocks live: 3 * sector size
- * the superblock for the free block count: sector size
- * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_rename_reservation(
- struct xfs_mount *mp)
-{
- return XFS_DQUOT_LOGRES(mp) +
- MAX((4 * mp->m_sb.sb_inodesize +
- 2 * XFS_DIROP_LOG_RES(mp) +
- 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))),
- (3 * mp->m_sb.sb_sectsize +
- 3 * mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- XFS_ALLOCFREE_LOG_RES(mp, 3) +
- 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))));
-}
-
-/*
- * For creating a link to an inode:
- * the parent directory inode: inode size
- * the linked inode: inode size
- * the directory btree could split: (max depth + v2) * dir block size
- * the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free some bmap blocks giving:
- * the agf for the ag in which the blocks live: sector size
- * the agfl for the ag in which the blocks live: sector size
- * the superblock for the free block count: sector size
- * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_link_reservation(
- struct xfs_mount *mp)
-{
- return XFS_DQUOT_LOGRES(mp) +
- MAX((mp->m_sb.sb_inodesize +
- mp->m_sb.sb_inodesize +
- XFS_DIROP_LOG_RES(mp) +
- 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
- (mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- XFS_ALLOCFREE_LOG_RES(mp, 1) +
- 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
-}
-
-/*
- * For removing a directory entry we can modify:
- * the parent directory inode: inode size
- * the removed inode: inode size
- * the directory btree could join: (max depth + v2) * dir block size
- * the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free the dir and bmap blocks giving:
- * the agf for the ag in which the blocks live: 2 * sector size
- * the agfl for the ag in which the blocks live: 2 * sector size
- * the superblock for the free block count: sector size
- * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_remove_reservation(
- struct xfs_mount *mp)
-{
- return XFS_DQUOT_LOGRES(mp) +
- MAX((mp->m_sb.sb_inodesize +
- mp->m_sb.sb_inodesize +
- XFS_DIROP_LOG_RES(mp) +
- 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
- (2 * mp->m_sb.sb_sectsize +
- 2 * mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- XFS_ALLOCFREE_LOG_RES(mp, 2) +
- 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
-}
-
-/*
- * For symlink we can modify:
- * the parent directory inode: inode size
- * the new inode: inode size
- * the inode btree entry: 1 block
- * the directory btree: (max depth + v2) * dir block size
- * the directory inode's bmap btree: (max depth + v2) * block size
- * the blocks for the symlink: 1 kB
- * Or in the first xact we allocate some inodes giving:
- * the agi and agf of the ag getting the new inodes: 2 * sectorsize
- * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
- * the inode btree: max depth * blocksize
- * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_symlink_reservation(
- struct xfs_mount *mp)
-{
- return XFS_DQUOT_LOGRES(mp) +
- MAX((mp->m_sb.sb_inodesize +
- mp->m_sb.sb_inodesize +
- XFS_FSB_TO_B(mp, 1) +
- XFS_DIROP_LOG_RES(mp) +
- 1024 +
- 128 * (4 + XFS_DIROP_LOG_COUNT(mp))),
- (2 * mp->m_sb.sb_sectsize +
- XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
- XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
- XFS_ALLOCFREE_LOG_RES(mp, 1) +
- 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
- XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
-}
-
-/*
- * For create we can modify:
- * the parent directory inode: inode size
- * the new inode: inode size
- * the inode btree entry: block size
- * the superblock for the nlink flag: sector size
- * the directory btree: (max depth + v2) * dir block size
- * the directory inode's bmap btree: (max depth + v2) * block size
- * Or in the first xact we allocate some inodes giving:
- * the agi and agf of the ag getting the new inodes: 2 * sectorsize
- * the superblock for the nlink flag: sector size
- * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
- * the inode btree: max depth * blocksize
- * the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_create_reservation(
- struct xfs_mount *mp)
-{
- return XFS_DQUOT_LOGRES(mp) +
- MAX((mp->m_sb.sb_inodesize +
- mp->m_sb.sb_inodesize +
- mp->m_sb.sb_sectsize +
- XFS_FSB_TO_B(mp, 1) +
- XFS_DIROP_LOG_RES(mp) +
- 128 * (3 + XFS_DIROP_LOG_COUNT(mp))),
- (3 * mp->m_sb.sb_sectsize +
- XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
- XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
- XFS_ALLOCFREE_LOG_RES(mp, 1) +
- 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
- XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
-}
-
-/*
- * Making a new directory is the same as creating a new file.
- */
-STATIC uint
-xfs_calc_mkdir_reservation(
- struct xfs_mount *mp)
-{
- return xfs_calc_create_reservation(mp);
-}
-
-/*
- * In freeing an inode we can modify:
- * the inode being freed: inode size
- * the super block free inode counter: sector size
- * the agi hash list and counters: sector size
- * the inode btree entry: block size
- * the on disk inode before ours in the agi hash list: inode cluster size
- * the inode btree: max depth * blocksize
- * the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_ifree_reservation(
- struct xfs_mount *mp)
-{
- return XFS_DQUOT_LOGRES(mp) +
- mp->m_sb.sb_inodesize +
- mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- XFS_FSB_TO_B(mp, 1) +
- MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
- XFS_INODE_CLUSTER_SIZE(mp)) +
- 128 * 5 +
- XFS_ALLOCFREE_LOG_RES(mp, 1) +
- 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
- XFS_ALLOCFREE_LOG_COUNT(mp, 1));
-}
-
-/*
- * When only changing the inode we log the inode and possibly the superblock
- * We also add a bit of slop for the transaction stuff.
- */
-STATIC uint
-xfs_calc_ichange_reservation(
- struct xfs_mount *mp)
-{
- return XFS_DQUOT_LOGRES(mp) +
- mp->m_sb.sb_inodesize +
- mp->m_sb.sb_sectsize +
- 512;
-
-}
-
-/*
- * Growing the data section of the filesystem.
- * superblock
- * agi and agf
- * allocation btrees
- */
-STATIC uint
-xfs_calc_growdata_reservation(
- struct xfs_mount *mp)
-{
- return mp->m_sb.sb_sectsize * 3 +
- XFS_ALLOCFREE_LOG_RES(mp, 1) +
- 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1));
-}
-
-/*
- * Growing the rt section of the filesystem.
- * In the first set of transactions (ALLOC) we allocate space to the
- * bitmap or summary files.
- * superblock: sector size
- * agf of the ag from which the extent is allocated: sector size
- * bmap btree for bitmap/summary inode: max depth * blocksize
- * bitmap/summary inode: inode size
- * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
- */
-STATIC uint
-xfs_calc_growrtalloc_reservation(
- struct xfs_mount *mp)
-{
- return 2 * mp->m_sb.sb_sectsize +
- XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
- mp->m_sb.sb_inodesize +
- XFS_ALLOCFREE_LOG_RES(mp, 1) +
- 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
- XFS_ALLOCFREE_LOG_COUNT(mp, 1));
-}
-
-/*
- * Growing the rt section of the filesystem.
- * In the second set of transactions (ZERO) we zero the new metadata blocks.
- * one bitmap/summary block: blocksize
- */
-STATIC uint
-xfs_calc_growrtzero_reservation(
- struct xfs_mount *mp)
-{
- return mp->m_sb.sb_blocksize + 128;
-}
-
-/*
- * Growing the rt section of the filesystem.
- * In the third set of transactions (FREE) we update metadata without
- * allocating any new blocks.
- * superblock: sector size
- * bitmap inode: inode size
- * summary inode: inode size
- * one bitmap block: blocksize
- * summary blocks: new summary size
- */
-STATIC uint
-xfs_calc_growrtfree_reservation(
- struct xfs_mount *mp)
-{
- return mp->m_sb.sb_sectsize +
- 2 * mp->m_sb.sb_inodesize +
- mp->m_sb.sb_blocksize +
- mp->m_rsumsize +
- 128 * 5;
-}
-
-/*
- * Logging the inode modification timestamp on a synchronous write.
- * inode
- */
-STATIC uint
-xfs_calc_swrite_reservation(
- struct xfs_mount *mp)
-{
- return mp->m_sb.sb_inodesize + 128;
-}
-
-/*
- * Logging the inode mode bits when writing a setuid/setgid file
- * inode
- */
-STATIC uint
-xfs_calc_writeid_reservation(xfs_mount_t *mp)
-{
- return mp->m_sb.sb_inodesize + 128;
-}
-
-/*
- * Converting the inode from non-attributed to attributed.
- * the inode being converted: inode size
- * agf block and superblock (for block allocation)
- * the new block (directory sized)
- * bmap blocks for the new directory block
- * allocation btrees
- */
-STATIC uint
-xfs_calc_addafork_reservation(
- struct xfs_mount *mp)
-{
- return XFS_DQUOT_LOGRES(mp) +
- mp->m_sb.sb_inodesize +
- mp->m_sb.sb_sectsize * 2 +
- mp->m_dirblksize +
- XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) +
- XFS_ALLOCFREE_LOG_RES(mp, 1) +
- 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 +
- XFS_ALLOCFREE_LOG_COUNT(mp, 1));
-}
-
-/*
- * Removing the attribute fork of a file
- * the inode being truncated: inode size
- * the inode's bmap btree: max depth * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- * the agf for each of the ags: 4 * sector size
- * the agfl for each of the ags: 4 * sector size
- * the super block to reflect the freed blocks: sector size
- * worst case split in allocation btrees per extent assuming 4 extents:
- * 4 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_attrinval_reservation(
- struct xfs_mount *mp)
-{
- return MAX((mp->m_sb.sb_inodesize +
- XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
- 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))),
- (4 * mp->m_sb.sb_sectsize +
- 4 * mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- XFS_ALLOCFREE_LOG_RES(mp, 4) +
- 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))));
-}
-
-/*
- * Setting an attribute.
- * the inode getting the attribute
- * the superblock for allocations
- * the agfs extents are allocated from
- * the attribute btree * max depth
- * the inode allocation btree
- * Since attribute transaction space is dependent on the size of the attribute,
- * the calculation is done partially at mount time and partially at runtime.
- */
-STATIC uint
-xfs_calc_attrset_reservation(
- struct xfs_mount *mp)
-{
- return XFS_DQUOT_LOGRES(mp) +
- mp->m_sb.sb_inodesize +
- mp->m_sb.sb_sectsize +
- XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
- 128 * (2 + XFS_DA_NODE_MAXDEPTH);
-}
-
-/*
- * Removing an attribute.
- * the inode: inode size
- * the attribute btree could join: max depth * block size
- * the inode bmap btree could join or split: max depth * block size
- * And the bmap_finish transaction can free the attr blocks freed giving:
- * the agf for the ag in which the blocks live: 2 * sector size
- * the agfl for the ag in which the blocks live: 2 * sector size
- * the superblock for the free block count: sector size
- * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_attrrm_reservation(
- struct xfs_mount *mp)
-{
- return XFS_DQUOT_LOGRES(mp) +
- MAX((mp->m_sb.sb_inodesize +
- XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
- XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
- 128 * (1 + XFS_DA_NODE_MAXDEPTH +
- XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
- (2 * mp->m_sb.sb_sectsize +
- 2 * mp->m_sb.sb_sectsize +
- mp->m_sb.sb_sectsize +
- XFS_ALLOCFREE_LOG_RES(mp, 2) +
- 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
-}
-
-/*
- * Clearing a bad agino number in an agi hash bucket.
- */
-STATIC uint
-xfs_calc_clear_agi_bucket_reservation(
- struct xfs_mount *mp)
-{
- return mp->m_sb.sb_sectsize + 128;
-}
-
/*
* Initialize the precomputed transaction reservation values
* in the mount structure.
@@ -538,29 +45,7 @@ void
xfs_trans_init(
struct xfs_mount *mp)
{
- struct xfs_trans_reservations *resp = &mp->m_reservations;
-
- resp->tr_write = xfs_calc_write_reservation(mp);
- resp->tr_itruncate = xfs_calc_itruncate_reservation(mp);
- resp->tr_rename = xfs_calc_rename_reservation(mp);
- resp->tr_link = xfs_calc_link_reservation(mp);
- resp->tr_remove = xfs_calc_remove_reservation(mp);
- resp->tr_symlink = xfs_calc_symlink_reservation(mp);
- resp->tr_create = xfs_calc_create_reservation(mp);
- resp->tr_mkdir = xfs_calc_mkdir_reservation(mp);
- resp->tr_ifree = xfs_calc_ifree_reservation(mp);
- resp->tr_ichange = xfs_calc_ichange_reservation(mp);
- resp->tr_growdata = xfs_calc_growdata_reservation(mp);
- resp->tr_swrite = xfs_calc_swrite_reservation(mp);
- resp->tr_writeid = xfs_calc_writeid_reservation(mp);
- resp->tr_addafork = xfs_calc_addafork_reservation(mp);
- resp->tr_attrinval = xfs_calc_attrinval_reservation(mp);
- resp->tr_attrset = xfs_calc_attrset_reservation(mp);
- resp->tr_attrrm = xfs_calc_attrrm_reservation(mp);
- resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp);
- resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp);
- resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp);
- resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp);
+ xfs_trans_resv_calc(mp, M_RES(mp));
}
/*
@@ -596,7 +81,7 @@ _xfs_trans_alloc(
atomic_inc(&mp->m_active_trans);
tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
- tp->t_magic = XFS_TRANS_MAGIC;
+ tp->t_magic = XFS_TRANS_HEADER_MAGIC;
tp->t_type = type;
tp->t_mountp = mp;
INIT_LIST_HEAD(&tp->t_items);
@@ -641,7 +126,7 @@ xfs_trans_dup(
/*
* Initialize the new transaction structure.
*/
- ntp->t_magic = XFS_TRANS_MAGIC;
+ ntp->t_magic = XFS_TRANS_HEADER_MAGIC;
ntp->t_type = tp->t_type;
ntp->t_mountp = tp->t_mountp;
INIT_LIST_HEAD(&ntp->t_items);
@@ -684,12 +169,10 @@ xfs_trans_dup(
*/
int
xfs_trans_reserve(
- xfs_trans_t *tp,
- uint blocks,
- uint logspace,
- uint rtextents,
- uint flags,
- uint logcount)
+ struct xfs_trans *tp,
+ struct xfs_trans_res *resp,
+ uint blocks,
+ uint rtextents)
{
int error = 0;
int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
@@ -715,13 +198,15 @@ xfs_trans_reserve(
/*
* Reserve the log space needed for this transaction.
*/
- if (logspace > 0) {
+ if (resp->tr_logres > 0) {
bool permanent = false;
- ASSERT(tp->t_log_res == 0 || tp->t_log_res == logspace);
- ASSERT(tp->t_log_count == 0 || tp->t_log_count == logcount);
+ ASSERT(tp->t_log_res == 0 ||
+ tp->t_log_res == resp->tr_logres);
+ ASSERT(tp->t_log_count == 0 ||
+ tp->t_log_count == resp->tr_logcount);
- if (flags & XFS_TRANS_PERM_LOG_RES) {
+ if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
permanent = true;
} else {
@@ -730,20 +215,21 @@ xfs_trans_reserve(
}
if (tp->t_ticket != NULL) {
- ASSERT(flags & XFS_TRANS_PERM_LOG_RES);
+ ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
} else {
- error = xfs_log_reserve(tp->t_mountp, logspace,
- logcount, &tp->t_ticket,
- XFS_TRANSACTION, permanent,
- tp->t_type);
+ error = xfs_log_reserve(tp->t_mountp,
+ resp->tr_logres,
+ resp->tr_logcount,
+ &tp->t_ticket, XFS_TRANSACTION,
+ permanent, tp->t_type);
}
if (error)
goto undo_blocks;
- tp->t_log_res = logspace;
- tp->t_log_count = logcount;
+ tp->t_log_res = resp->tr_logres;
+ tp->t_log_count = resp->tr_logcount;
}
/*
@@ -768,10 +254,10 @@ xfs_trans_reserve(
* reservations which have already been performed.
*/
undo_log:
- if (logspace > 0) {
+ if (resp->tr_logres > 0) {
int log_flags;
- if (flags & XFS_TRANS_PERM_LOG_RES) {
+ if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
log_flags = XFS_LOG_REL_PERM_RESERV;
} else {
log_flags = 0;
@@ -1219,10 +705,10 @@ xfs_trans_free_items(
lip->li_desc = NULL;
if (commit_lsn != NULLCOMMITLSN)
- IOP_COMMITTING(lip, commit_lsn);
+ lip->li_ops->iop_committing(lip, commit_lsn);
if (flags & XFS_TRANS_ABORT)
lip->li_flags |= XFS_LI_ABORTED;
- IOP_UNLOCK(lip);
+ lip->li_ops->iop_unlock(lip);
xfs_trans_free_item_desc(lidp);
}
@@ -1242,8 +728,11 @@ xfs_log_item_batch_insert(
/* xfs_trans_ail_update_bulk drops ailp->xa_lock */
xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn);
- for (i = 0; i < nr_items; i++)
- IOP_UNPIN(log_items[i], 0);
+ for (i = 0; i < nr_items; i++) {
+ struct xfs_log_item *lip = log_items[i];
+
+ lip->li_ops->iop_unpin(lip, 0);
+ }
}
/*
@@ -1253,11 +742,11 @@ xfs_log_item_batch_insert(
*
* If we are called with the aborted flag set, it is because a log write during
* a CIL checkpoint commit has failed. In this case, all the items in the
- * checkpoint have already gone through IOP_COMMITED and IOP_UNLOCK, which
+ * checkpoint have already gone through iop_commited and iop_unlock, which
* means that checkpoint commit abort handling is treated exactly the same
* as an iclog write error even though we haven't started any IO yet. Hence in
- * this case all we need to do is IOP_COMMITTED processing, followed by an
- * IOP_UNPIN(aborted) call.
+ * this case all we need to do is iop_committed processing, followed by an
+ * iop_unpin(aborted) call.
*
* The AIL cursor is used to optimise the insert process. If commit_lsn is not
* at the end of the AIL, the insert cursor avoids the need to walk
@@ -1290,7 +779,7 @@ xfs_trans_committed_bulk(
if (aborted)
lip->li_flags |= XFS_LI_ABORTED;
- item_lsn = IOP_COMMITTED(lip, commit_lsn);
+ item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
/* item_lsn of -1 means the item needs no further processing */
if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
@@ -1302,7 +791,7 @@ xfs_trans_committed_bulk(
*/
if (aborted) {
ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
- IOP_UNPIN(lip, 1);
+ lip->li_ops->iop_unpin(lip, 1);
continue;
}
@@ -1320,7 +809,7 @@ xfs_trans_committed_bulk(
xfs_trans_ail_update(ailp, lip, item_lsn);
else
spin_unlock(&ailp->xa_lock);
- IOP_UNPIN(lip, 0);
+ lip->li_ops->iop_unpin(lip, 0);
continue;
}
@@ -1338,7 +827,7 @@ xfs_trans_committed_bulk(
xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn);
spin_lock(&ailp->xa_lock);
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
}
@@ -1398,12 +887,7 @@ xfs_trans_commit(
xfs_trans_apply_sb_deltas(tp);
xfs_trans_apply_dquot_deltas(tp);
- error = xfs_log_commit_cil(mp, tp, &commit_lsn, flags);
- if (error == ENOMEM) {
- xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
- error = XFS_ERROR(EIO);
- goto out_unreserve;
- }
+ xfs_log_commit_cil(mp, tp, &commit_lsn, flags);
current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
xfs_trans_free(tp);
@@ -1413,10 +897,7 @@ xfs_trans_commit(
* log out now and wait for it.
*/
if (sync) {
- if (!error) {
- error = _xfs_log_force_lsn(mp, commit_lsn,
- XFS_LOG_SYNC, NULL);
- }
+ error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
XFS_STATS_INC(xs_trans_sync);
} else {
XFS_STATS_INC(xs_trans_async);
@@ -1518,7 +999,7 @@ xfs_trans_roll(
struct xfs_inode *dp)
{
struct xfs_trans *trans;
- unsigned int logres, count;
+ struct xfs_trans_res tres;
int error;
/*
@@ -1530,8 +1011,8 @@ xfs_trans_roll(
/*
* Copy the critical parameters from one trans to the next.
*/
- logres = trans->t_log_res;
- count = trans->t_log_count;
+ tres.tr_logres = trans->t_log_res;
+ tres.tr_logcount = trans->t_log_count;
*tpp = xfs_trans_dup(trans);
/*
@@ -1562,8 +1043,8 @@ xfs_trans_roll(
* across this call, or that anything that is locked be logged in
* the prior and the next transactions.
*/
- error = xfs_trans_reserve(trans, 0, logres, 0,
- XFS_TRANS_PERM_LOG_RES, count);
+ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ error = xfs_trans_reserve(trans, &tres, 0, 0);
/*
* Ensure that the inode is in the new transaction and locked.
*/
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c6c0601abd7..b5bc1ab3c4d 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -18,288 +18,7 @@
#ifndef __XFS_TRANS_H__
#define __XFS_TRANS_H__
-struct xfs_log_item;
-
-/*
- * This is the structure written in the log at the head of
- * every transaction. It identifies the type and id of the
- * transaction, and contains the number of items logged by
- * the transaction so we know how many to expect during recovery.
- *
- * Do not change the below structure without redoing the code in
- * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
- */
-typedef struct xfs_trans_header {
- uint th_magic; /* magic number */
- uint th_type; /* transaction type */
- __int32_t th_tid; /* transaction id (unused) */
- uint th_num_items; /* num items logged by trans */
-} xfs_trans_header_t;
-
-#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */
-
-/*
- * Log item types.
- */
-#define XFS_LI_EFI 0x1236
-#define XFS_LI_EFD 0x1237
-#define XFS_LI_IUNLINK 0x1238
-#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */
-#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */
-#define XFS_LI_DQUOT 0x123d
-#define XFS_LI_QUOTAOFF 0x123e
-
-#define XFS_LI_TYPE_DESC \
- { XFS_LI_EFI, "XFS_LI_EFI" }, \
- { XFS_LI_EFD, "XFS_LI_EFD" }, \
- { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \
- { XFS_LI_INODE, "XFS_LI_INODE" }, \
- { XFS_LI_BUF, "XFS_LI_BUF" }, \
- { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \
- { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }
-
-/*
- * Transaction types. Used to distinguish types of buffers.
- */
-#define XFS_TRANS_SETATTR_NOT_SIZE 1
-#define XFS_TRANS_SETATTR_SIZE 2
-#define XFS_TRANS_INACTIVE 3
-#define XFS_TRANS_CREATE 4
-#define XFS_TRANS_CREATE_TRUNC 5
-#define XFS_TRANS_TRUNCATE_FILE 6
-#define XFS_TRANS_REMOVE 7
-#define XFS_TRANS_LINK 8
-#define XFS_TRANS_RENAME 9
-#define XFS_TRANS_MKDIR 10
-#define XFS_TRANS_RMDIR 11
-#define XFS_TRANS_SYMLINK 12
-#define XFS_TRANS_SET_DMATTRS 13
-#define XFS_TRANS_GROWFS 14
-#define XFS_TRANS_STRAT_WRITE 15
-#define XFS_TRANS_DIOSTRAT 16
-/* 17 was XFS_TRANS_WRITE_SYNC */
-#define XFS_TRANS_WRITEID 18
-#define XFS_TRANS_ADDAFORK 19
-#define XFS_TRANS_ATTRINVAL 20
-#define XFS_TRANS_ATRUNCATE 21
-#define XFS_TRANS_ATTR_SET 22
-#define XFS_TRANS_ATTR_RM 23
-#define XFS_TRANS_ATTR_FLAG 24
-#define XFS_TRANS_CLEAR_AGI_BUCKET 25
-#define XFS_TRANS_QM_SBCHANGE 26
-/*
- * Dummy entries since we use the transaction type to index into the
- * trans_type[] in xlog_recover_print_trans_head()
- */
-#define XFS_TRANS_DUMMY1 27
-#define XFS_TRANS_DUMMY2 28
-#define XFS_TRANS_QM_QUOTAOFF 29
-#define XFS_TRANS_QM_DQALLOC 30
-#define XFS_TRANS_QM_SETQLIM 31
-#define XFS_TRANS_QM_DQCLUSTER 32
-#define XFS_TRANS_QM_QINOCREATE 33
-#define XFS_TRANS_QM_QUOTAOFF_END 34
-#define XFS_TRANS_SB_UNIT 35
-#define XFS_TRANS_FSYNC_TS 36
-#define XFS_TRANS_GROWFSRT_ALLOC 37
-#define XFS_TRANS_GROWFSRT_ZERO 38
-#define XFS_TRANS_GROWFSRT_FREE 39
-#define XFS_TRANS_SWAPEXT 40
-#define XFS_TRANS_SB_COUNT 41
-#define XFS_TRANS_CHECKPOINT 42
-#define XFS_TRANS_TYPE_MAX 42
-/* new transaction types need to be reflected in xfs_logprint(8) */
-
-#define XFS_TRANS_TYPES \
- { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \
- { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \
- { XFS_TRANS_INACTIVE, "INACTIVE" }, \
- { XFS_TRANS_CREATE, "CREATE" }, \
- { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \
- { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \
- { XFS_TRANS_REMOVE, "REMOVE" }, \
- { XFS_TRANS_LINK, "LINK" }, \
- { XFS_TRANS_RENAME, "RENAME" }, \
- { XFS_TRANS_MKDIR, "MKDIR" }, \
- { XFS_TRANS_RMDIR, "RMDIR" }, \
- { XFS_TRANS_SYMLINK, "SYMLINK" }, \
- { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \
- { XFS_TRANS_GROWFS, "GROWFS" }, \
- { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \
- { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \
- { XFS_TRANS_WRITEID, "WRITEID" }, \
- { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \
- { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \
- { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \
- { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \
- { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \
- { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \
- { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \
- { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \
- { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \
- { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \
- { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \
- { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \
- { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \
- { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \
- { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \
- { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \
- { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \
- { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \
- { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
- { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
- { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \
- { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
- { XFS_TRANS_DUMMY1, "DUMMY1" }, \
- { XFS_TRANS_DUMMY2, "DUMMY2" }, \
- { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
-
-/*
- * This structure is used to track log items associated with
- * a transaction. It points to the log item and keeps some
- * flags to track the state of the log item. It also tracks
- * the amount of space needed to log the item it describes
- * once we get to commit processing (see xfs_trans_commit()).
- */
-struct xfs_log_item_desc {
- struct xfs_log_item *lid_item;
- struct list_head lid_trans;
- unsigned char lid_flags;
-};
-
-#define XFS_LID_DIRTY 0x1
-
-#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */
-/*
- * Values for t_flags.
- */
-#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */
-#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */
-#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */
-#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
-#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
-#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
-#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer
- count in superblock */
-
-/*
- * Values for call flags parameter.
- */
-#define XFS_TRANS_RELEASE_LOG_RES 0x4
-#define XFS_TRANS_ABORT 0x8
-
-/*
- * Field values for xfs_trans_mod_sb.
- */
-#define XFS_TRANS_SB_ICOUNT 0x00000001
-#define XFS_TRANS_SB_IFREE 0x00000002
-#define XFS_TRANS_SB_FDBLOCKS 0x00000004
-#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008
-#define XFS_TRANS_SB_FREXTENTS 0x00000010
-#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020
-#define XFS_TRANS_SB_DBLOCKS 0x00000040
-#define XFS_TRANS_SB_AGCOUNT 0x00000080
-#define XFS_TRANS_SB_IMAXPCT 0x00000100
-#define XFS_TRANS_SB_REXTSIZE 0x00000200
-#define XFS_TRANS_SB_RBMBLOCKS 0x00000400
-#define XFS_TRANS_SB_RBLOCKS 0x00000800
-#define XFS_TRANS_SB_REXTENTS 0x00001000
-#define XFS_TRANS_SB_REXTSLOG 0x00002000
-
-
-/*
- * Per-extent log reservation for the allocation btree changes
- * involved in freeing or allocating an extent.
- * 2 trees * (2 blocks/level * max depth - 1) * block size
- */
-#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
- ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
-#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
- ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
-
-/*
- * Per-directory log reservation for any directory change.
- * dir blocks: (1 btree block per level + data block + free block) * dblock size
- * bmap btree: (levels + 2) * max depth * block size
- * v2 directory blocks can be fragmented below the dirblksize down to the fsb
- * size, so account for that in the DAENTER macros.
- */
-#define XFS_DIROP_LOG_RES(mp) \
- (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
- (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
-#define XFS_DIROP_LOG_COUNT(mp) \
- (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
- XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
-
-
-#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write)
-#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate)
-#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename)
-#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link)
-#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove)
-#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
-#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create)
-#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir)
-#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree)
-#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange)
-#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata)
-#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc)
-#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero)
-#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree)
-#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
-/*
- * Logging the inode timestamps on an fsync -- same as SWRITE
- * as long as SWRITE logs the entire inode core
- */
-#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
-#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
-#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork)
-#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval)
-#define XFS_ATTRSET_LOG_RES(mp, ext) \
- ((mp)->m_reservations.tr_attrset + \
- (ext * (mp)->m_sb.sb_sectsize) + \
- (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
- (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
-#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm)
-#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi)
-
-
-/*
- * Various log count values.
- */
-#define XFS_DEFAULT_LOG_COUNT 1
-#define XFS_DEFAULT_PERM_LOG_COUNT 2
-#define XFS_ITRUNCATE_LOG_COUNT 2
-#define XFS_INACTIVE_LOG_COUNT 2
-#define XFS_CREATE_LOG_COUNT 2
-#define XFS_MKDIR_LOG_COUNT 3
-#define XFS_SYMLINK_LOG_COUNT 3
-#define XFS_REMOVE_LOG_COUNT 2
-#define XFS_LINK_LOG_COUNT 2
-#define XFS_RENAME_LOG_COUNT 2
-#define XFS_WRITE_LOG_COUNT 2
-#define XFS_ADDAFORK_LOG_COUNT 2
-#define XFS_ATTRINVAL_LOG_COUNT 1
-#define XFS_ATTRSET_LOG_COUNT 3
-#define XFS_ATTRRM_LOG_COUNT 3
-
-/*
- * Here we centralize the specification of XFS meta-data buffer
- * reference count values. This determine how hard the buffer
- * cache tries to hold onto the buffer.
- */
-#define XFS_AGF_REF 4
-#define XFS_AGI_REF 4
-#define XFS_AGFL_REF 3
-#define XFS_INO_BTREE_REF 3
-#define XFS_ALLOC_BTREE_REF 2
-#define XFS_BMAP_BTREE_REF 2
-#define XFS_DIR_BTREE_REF 2
-#define XFS_INO_REF 2
-#define XFS_ATTR_BTREE_REF 1
-#define XFS_DQUOT_REF 1
-
-#ifdef __KERNEL__
+/* kernel only transaction subsystem defines */
struct xfs_buf;
struct xfs_buftarg;
@@ -311,6 +30,7 @@ struct xfs_log_iovec;
struct xfs_log_item_desc;
struct xfs_mount;
struct xfs_trans;
+struct xfs_trans_res;
struct xfs_dquot_acct;
struct xfs_busy_extent;
@@ -343,8 +63,8 @@ typedef struct xfs_log_item {
{ XFS_LI_ABORTED, "ABORTED" }
struct xfs_item_ops {
- uint (*iop_size)(xfs_log_item_t *);
- void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
+ void (*iop_size)(xfs_log_item_t *, int *, int *);
+ void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *);
void (*iop_pin)(xfs_log_item_t *);
void (*iop_unpin)(xfs_log_item_t *, int remove);
uint (*iop_push)(struct xfs_log_item *, struct list_head *);
@@ -353,35 +73,23 @@ struct xfs_item_ops {
void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
};
-#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
-#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
-#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
-#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove)
-#define IOP_PUSH(ip, list) (*(ip)->li_ops->iop_push)(ip, list)
-#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
-#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
-#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
+void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
+ int type, const struct xfs_item_ops *ops);
/*
- * Return values for the IOP_PUSH() routines.
+ * Return values for the iop_push() routines.
*/
#define XFS_ITEM_SUCCESS 0
#define XFS_ITEM_PINNED 1
#define XFS_ITEM_LOCKED 2
#define XFS_ITEM_FLUSHING 3
-/*
- * This is the type of function which can be given to xfs_trans_callback()
- * to be called upon the transaction's commit to disk.
- */
-typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
/*
* This is the structure maintained for every active transaction.
*/
typedef struct xfs_trans {
unsigned int t_magic; /* magic number */
- xfs_log_callback_t t_logcb; /* log callback struct */
unsigned int t_type; /* transaction type */
unsigned int t_log_res; /* amt of log space resvd */
unsigned int t_log_count; /* count for perm log res */
@@ -403,7 +111,7 @@ typedef struct xfs_trans {
int64_t t_res_fdblocks_delta; /* on-disk only chg */
int64_t t_frextents_delta;/* superblock freextents chg*/
int64_t t_res_frextents_delta; /* on-disk only chg */
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
int64_t t_ag_freeblks_delta; /* debugging counter */
int64_t t_ag_flist_delta; /* debugging counter */
int64_t t_ag_btree_delta; /* debugging counter */
@@ -417,7 +125,6 @@ typedef struct xfs_trans {
int64_t t_rextents_delta;/* superblocks rextents chg */
int64_t t_rextslog_delta;/* superblocks rextslog chg */
struct list_head t_items; /* log item descriptors */
- xfs_trans_header_t t_header; /* header for in-log trans */
struct list_head t_busy; /* list of busy extents */
unsigned long t_pflags; /* saved process flags state */
} xfs_trans_t;
@@ -431,7 +138,7 @@ typedef struct xfs_trans {
#define xfs_trans_get_block_res(tp) ((tp)->t_blk_res)
#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC)
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d)
#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (int64_t)d)
#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (int64_t)d)
@@ -447,7 +154,7 @@ typedef struct xfs_trans {
xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
-int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
+int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
uint, uint);
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
@@ -501,6 +208,7 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
@@ -521,16 +229,17 @@ void xfs_trans_log_efd_extent(xfs_trans_t *,
xfs_fsblock_t,
xfs_extlen_t);
int xfs_trans_commit(xfs_trans_t *, uint flags);
+int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
void xfs_trans_cancel(xfs_trans_t *, int);
int xfs_trans_ail_init(struct xfs_mount *);
void xfs_trans_ail_destroy(struct xfs_mount *);
+void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
+ enum xfs_blft);
+void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
+ struct xfs_buf *src_bp);
+
extern kmem_zone_t *xfs_trans_zone;
extern kmem_zone_t *xfs_log_item_desc_zone;
-#endif /* __KERNEL__ */
-
-void xfs_trans_init(struct xfs_mount *);
-int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
-
#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 6011ee66133..cb0f3a84cc6 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -18,15 +18,16 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_trace.h"
#include "xfs_error.h"
+#include "xfs_log.h"
#ifdef DEBUG
/*
@@ -55,40 +56,12 @@ xfs_ail_check(
ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
-#ifdef XFS_TRANS_DEBUG
- /*
- * Walk the list checking lsn ordering, and that every entry has the
- * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
- * when specifically debugging the transaction subsystem.
- */
- prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
- list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
- if (&prev_lip->li_ail != &ailp->xa_ail)
- ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
- ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
- prev_lip = lip;
- }
-#endif /* XFS_TRANS_DEBUG */
}
#else /* !DEBUG */
#define xfs_ail_check(a,l)
#endif /* DEBUG */
/*
- * Return a pointer to the first item in the AIL. If the AIL is empty, then
- * return NULL.
- */
-xfs_log_item_t *
-xfs_ail_min(
- struct xfs_ail *ailp)
-{
- if (list_empty(&ailp->xa_ail))
- return NULL;
-
- return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
-}
-
- /*
* Return a pointer to the last item in the AIL. If the AIL is empty, then
* return NULL.
*/
@@ -200,7 +173,6 @@ xfs_trans_ail_cursor_next(
*/
void
xfs_trans_ail_cursor_done(
- struct xfs_ail *ailp,
struct xfs_ail_cursor *cur)
{
cur->item = NULL;
@@ -395,7 +367,7 @@ xfsaild_push(
* If the AIL is empty or our push has reached the end we are
* done now.
*/
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
goto out_done;
}
@@ -407,11 +379,11 @@ xfsaild_push(
int lock_result;
/*
- * Note that IOP_PUSH may unlock and reacquire the AIL lock. We
+ * Note that iop_push may unlock and reacquire the AIL lock. We
* rely on the AIL cursor implementation to be able to deal with
* the dropped lock.
*/
- lock_result = IOP_PUSH(lip, &ailp->xa_buf_list);
+ lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list);
switch (lock_result) {
case XFS_ITEM_SUCCESS:
XFS_STATS_INC(xs_push_ail_success);
@@ -480,7 +452,7 @@ xfsaild_push(
break;
lsn = lip->li_lsn;
}
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list))
@@ -686,11 +658,13 @@ xfs_trans_ail_update_bulk(
if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
continue;
+ trace_xfs_ail_move(lip, lip->li_lsn, lsn);
xfs_ail_delete(ailp, lip);
if (mlip == lip)
mlip_changed = 1;
} else {
lip->li_flags |= XFS_LI_IN_AIL;
+ trace_xfs_ail_insert(lip, 0, lsn);
}
lip->li_lsn = lsn;
list_add(&lip->li_ail, &tmp);
@@ -759,6 +733,7 @@ xfs_trans_ail_delete_bulk(
return;
}
+ trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
xfs_ail_delete(ailp, lip);
lip->li_flags &= ~XFS_LI_IN_AIL;
lip->li_lsn = 0;
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 3edf5dbee00..b8eef0549f3 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -17,17 +17,15 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
#include "xfs_error.h"
@@ -277,6 +275,10 @@ xfs_trans_read_buf_map(
XFS_BUF_UNDONE(bp);
xfs_buf_stale(bp);
xfs_buf_relse(bp);
+
+ /* bad CRC means corrupted metadata */
+ if (error == EFSBADCRC)
+ error = EFSCORRUPTED;
return error;
}
#ifdef DEBUG
@@ -316,7 +318,18 @@ xfs_trans_read_buf_map(
ASSERT(bp->b_iodone == NULL);
XFS_BUF_READ(bp);
bp->b_ops = ops;
- xfsbdstrat(tp->t_mountp, bp);
+
+ /*
+ * XXX(hch): clean up the error handling here to be less
+ * of a mess..
+ */
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ trace_xfs_bdstrat_shut(bp, _RET_IP_);
+ xfs_bioerror_relse(bp);
+ } else {
+ xfs_buf_iorequest(bp);
+ }
+
error = xfs_buf_iowait(bp);
if (error) {
xfs_buf_ioerror_alert(bp, __func__);
@@ -329,6 +342,9 @@ xfs_trans_read_buf_map(
if (tp->t_flags & XFS_TRANS_DIRTY)
xfs_force_shutdown(tp->t_mountp,
SHUTDOWN_META_IO_ERROR);
+ /* bad CRC means corrupted metadata */
+ if (error == EFSBADCRC)
+ error = EFSCORRUPTED;
return error;
}
}
@@ -366,6 +382,10 @@ xfs_trans_read_buf_map(
if (tp->t_flags & XFS_TRANS_DIRTY)
xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
xfs_buf_relse(bp);
+
+ /* bad CRC means corrupted metadata */
+ if (error == EFSBADCRC)
+ error = EFSCORRUPTED;
return error;
}
#ifdef DEBUG
@@ -397,7 +417,6 @@ shutdown_abort:
return XFS_ERROR(EIO);
}
-
/*
* Release the buffer bp which was previously acquired with one of the
* xfs_trans_... buffer allocation routines if the buffer has not
@@ -506,7 +525,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
/*
* Mark the buffer as not needing to be unlocked when the buf item's
- * IOP_UNLOCK() routine is called. The buffer must already be locked
+ * iop_unlock() routine is called. The buffer must already be locked
* and associated with the given transaction.
*/
/* ARGSUSED */
@@ -603,8 +622,14 @@ xfs_trans_log_buf(xfs_trans_t *tp,
tp->t_flags |= XFS_TRANS_DIRTY;
bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
- bip->bli_flags |= XFS_BLI_LOGGED;
- xfs_buf_item_log(bip, first, last);
+
+ /*
+ * If we have an ordered buffer we are not logging any dirty range but
+ * it still needs to be marked dirty and that it has been logged.
+ */
+ bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
+ if (!(bip->bli_flags & XFS_BLI_ORDERED))
+ xfs_buf_item_log(bip, first, last);
}
@@ -659,6 +684,7 @@ xfs_trans_binval(
ASSERT(XFS_BUF_ISSTALE(bp));
ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF));
+ ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK));
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY);
ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
@@ -671,6 +697,7 @@ xfs_trans_binval(
bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
bip->__bli_format.blf_flags |= XFS_BLF_CANCEL;
+ bip->__bli_format.blf_flags &= ~XFS_BLFT_MASK;
for (i = 0; i < bip->bli_format_count; i++) {
memset(bip->bli_formats[i].blf_data_map, 0,
(bip->bli_formats[i].blf_map_size * sizeof(uint)));
@@ -702,12 +729,13 @@ xfs_trans_inode_buf(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_flags |= XFS_BLI_INODE_BUF;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
}
/*
* This call is used to indicate that the buffer is going to
* be staled and was an inode buffer. This means it gets
- * special processing during unpin - where any inodes
+ * special processing during unpin - where any inodes
* associated with the buffer should be removed from ail.
* There is also special processing during recovery,
* any replay of the inodes in the buffer needs to be
@@ -726,6 +754,7 @@ xfs_trans_stale_inode_buf(
bip->bli_flags |= XFS_BLI_STALE_INODE;
bip->bli_item.li_cb = xfs_buf_iodone;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
}
/*
@@ -749,8 +778,66 @@ xfs_trans_inode_alloc_buf(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
}
+/*
+ * Mark the buffer as ordered for this transaction. This means
+ * that the contents of the buffer are not recorded in the transaction
+ * but it is tracked in the AIL as though it was. This allows us
+ * to record logical changes in transactions rather than the physical
+ * changes we make to the buffer without changing writeback ordering
+ * constraints of metadata buffers.
+ */
+void
+xfs_trans_ordered_buf(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip != NULL);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ bip->bli_flags |= XFS_BLI_ORDERED;
+ trace_xfs_buf_item_ordered(bip);
+}
+
+/*
+ * Set the type of the buffer for log recovery so that it can correctly identify
+ * and hence attach the correct buffer ops to the buffer after replay.
+ */
+void
+xfs_trans_buf_set_type(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ enum xfs_blft type)
+{
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ if (!tp)
+ return;
+
+ ASSERT(bp->b_transp == tp);
+ ASSERT(bip != NULL);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ xfs_blft_to_flags(&bip->__bli_format, type);
+}
+
+void
+xfs_trans_buf_copy_type(
+ struct xfs_buf *dst_bp,
+ struct xfs_buf *src_bp)
+{
+ struct xfs_buf_log_item *sbip = src_bp->b_fspriv;
+ struct xfs_buf_log_item *dbip = dst_bp->b_fspriv;
+ enum xfs_blft type;
+
+ type = xfs_blft_from_flags(&sbip->__bli_format);
+ xfs_blft_to_flags(&dbip->__bli_format, type);
+}
/*
* Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of
@@ -769,14 +856,28 @@ xfs_trans_dquot_buf(
xfs_buf_t *bp,
uint type)
{
- xfs_buf_log_item_t *bip = bp->b_fspriv;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
- ASSERT(bp->b_transp == tp);
- ASSERT(bip != NULL);
ASSERT(type == XFS_BLF_UDQUOT_BUF ||
type == XFS_BLF_PDQUOT_BUF ||
type == XFS_BLF_GDQUOT_BUF);
- ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->__bli_format.blf_flags |= type;
+
+ switch (type) {
+ case XFS_BLF_UDQUOT_BUF:
+ type = XFS_BLFT_UDQUOT_BUF;
+ break;
+ case XFS_BLF_PDQUOT_BUF:
+ type = XFS_BLFT_PDQUOT_BUF;
+ break;
+ case XFS_BLF_GDQUOT_BUF:
+ type = XFS_BLFT_GDQUOT_BUF;
+ break;
+ default:
+ type = XFS_BLFT_UNKNOWN_BUF;
+ break;
+ }
+
+ xfs_trans_buf_set_type(tp, bp, type);
}
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 0c7fa54f309..41172861e85 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -17,22 +17,18 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
#include "xfs_error.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
+#include "xfs_trans.h"
#include "xfs_trans_priv.h"
+#include "xfs_quota.h"
#include "xfs_qm.h"
STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *);
@@ -103,8 +99,6 @@ xfs_trans_dup_dqinfo(
return;
xfs_trans_alloc_dqinfo(ntp);
- oqa = otp->t_dqinfo->dqa_usrdquots;
- nqa = ntp->t_dqinfo->dqa_usrdquots;
/*
* Because the quota blk reservation is carried forward,
@@ -113,7 +107,9 @@ xfs_trans_dup_dqinfo(
if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
- for (j = 0; j < 2; j++) {
+ for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+ oqa = otp->t_dqinfo->dqs[j];
+ nqa = ntp->t_dqinfo->dqs[j];
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
if (oqa[i].qt_dquot == NULL)
break;
@@ -138,8 +134,6 @@ xfs_trans_dup_dqinfo(
oq->qt_ino_res = oq->qt_ino_res_used;
}
- oqa = otp->t_dqinfo->dqa_grpdquots;
- nqa = ntp->t_dqinfo->dqa_grpdquots;
}
}
@@ -157,8 +151,7 @@ xfs_trans_mod_dquot_byino(
if (!XFS_IS_QUOTA_RUNNING(mp) ||
!XFS_IS_QUOTA_ON(mp) ||
- ip->i_ino == mp->m_sb.sb_uquotino ||
- ip->i_ino == mp->m_sb.sb_gquotino)
+ xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
return;
if (tp->t_dqinfo == NULL)
@@ -166,20 +159,28 @@ xfs_trans_mod_dquot_byino(
if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
(void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
- if (XFS_IS_OQUOTA_ON(mp) && ip->i_gdquot)
+ if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot)
(void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
+ if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot)
+ (void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta);
}
-STATIC xfs_dqtrx_t *
+STATIC struct xfs_dqtrx *
xfs_trans_get_dqtrx(
- xfs_trans_t *tp,
- xfs_dquot_t *dqp)
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
{
- int i;
- xfs_dqtrx_t *qa;
-
- qa = XFS_QM_ISUDQ(dqp) ?
- tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
+ int i;
+ struct xfs_dqtrx *qa;
+
+ if (XFS_QM_ISUDQ(dqp))
+ qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR];
+ else if (XFS_QM_ISGDQ(dqp))
+ qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP];
+ else if (XFS_QM_ISPDQ(dqp))
+ qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ];
+ else
+ return NULL;
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
if (qa[i].qt_dquot == NULL ||
@@ -292,11 +293,10 @@ xfs_trans_mod_dquot(
/*
- * Given an array of dqtrx structures, lock all the dquots associated
- * and join them to the transaction, provided they have been modified.
- * We know that the highest number of dquots (of one type - usr OR grp),
- * involved in a transaction is 2 and that both usr and grp combined - 3.
- * So, we don't attempt to make this very generic.
+ * Given an array of dqtrx structures, lock all the dquots associated and join
+ * them to the transaction, provided they have been modified. We know that the
+ * highest number of dquots of one type - usr, grp and prj - involved in a
+ * transaction is 3 so we don't need to make this very generic.
*/
STATIC void
xfs_trans_dqlockedjoin(
@@ -326,12 +326,12 @@ xfs_trans_dqlockedjoin(
*/
void
xfs_trans_apply_dquot_deltas(
- xfs_trans_t *tp)
+ struct xfs_trans *tp)
{
int i, j;
- xfs_dquot_t *dqp;
- xfs_dqtrx_t *qtrx, *qa;
- xfs_disk_dquot_t *d;
+ struct xfs_dquot *dqp;
+ struct xfs_dqtrx *qtrx, *qa;
+ struct xfs_disk_dquot *d;
long totalbdelta;
long totalrtbdelta;
@@ -339,12 +339,10 @@ xfs_trans_apply_dquot_deltas(
return;
ASSERT(tp->t_dqinfo);
- qa = tp->t_dqinfo->dqa_usrdquots;
- for (j = 0; j < 2; j++) {
- if (qa[0].qt_dquot == NULL) {
- qa = tp->t_dqinfo->dqa_grpdquots;
+ for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+ qa = tp->t_dqinfo->dqs[j];
+ if (qa[0].qt_dquot == NULL)
continue;
- }
/*
* Lock all of the dquots and join them to the transaction.
@@ -412,7 +410,7 @@ xfs_trans_apply_dquot_deltas(
* Start/reset the timer(s) if needed.
*/
if (d->d_id) {
- xfs_qm_adjust_dqlimits(tp->t_mountp, d);
+ xfs_qm_adjust_dqlimits(tp->t_mountp, dqp);
xfs_qm_adjust_dqtimers(tp->t_mountp, d);
}
@@ -495,10 +493,6 @@ xfs_trans_apply_dquot_deltas(
ASSERT(dqp->q_res_rtbcount >=
be64_to_cpu(dqp->q_core.d_rtbcount));
}
- /*
- * Do the group quotas next
- */
- qa = tp->t_dqinfo->dqa_grpdquots;
}
}
@@ -516,14 +510,14 @@ xfs_trans_unreserve_and_mod_dquots(
int i, j;
xfs_dquot_t *dqp;
xfs_dqtrx_t *qtrx, *qa;
- boolean_t locked;
+ bool locked;
if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
return;
- qa = tp->t_dqinfo->dqa_usrdquots;
+ for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+ qa = tp->t_dqinfo->dqs[j];
- for (j = 0; j < 2; j++) {
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
qtrx = &qa[i];
/*
@@ -537,17 +531,17 @@ xfs_trans_unreserve_and_mod_dquots(
* about the number of blocks used field, or deltas.
* Also we don't bother to zero the fields.
*/
- locked = B_FALSE;
+ locked = false;
if (qtrx->qt_blk_res) {
xfs_dqlock(dqp);
- locked = B_TRUE;
+ locked = true;
dqp->q_res_bcount -=
(xfs_qcnt_t)qtrx->qt_blk_res;
}
if (qtrx->qt_ino_res) {
if (!locked) {
xfs_dqlock(dqp);
- locked = B_TRUE;
+ locked = true;
}
dqp->q_res_icount -=
(xfs_qcnt_t)qtrx->qt_ino_res;
@@ -556,7 +550,7 @@ xfs_trans_unreserve_and_mod_dquots(
if (qtrx->qt_rtblk_res) {
if (!locked) {
xfs_dqlock(dqp);
- locked = B_TRUE;
+ locked = true;
}
dqp->q_res_rtbcount -=
(xfs_qcnt_t)qtrx->qt_rtblk_res;
@@ -565,7 +559,6 @@ xfs_trans_unreserve_and_mod_dquots(
xfs_dqunlock(dqp);
}
- qa = tp->t_dqinfo->dqa_grpdquots;
}
}
@@ -640,8 +633,8 @@ xfs_trans_dqresv(
if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
dqp->q_core.d_id &&
((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
- (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
- (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
+ (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
+ (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {
if (nblks > 0) {
/*
* dquot is locked already. See if we'd go over the
@@ -736,8 +729,8 @@ error_return:
/*
* Given dquot(s), make disk block and/or inode reservations against them.
- * The fact that this does the reservation against both the usr and
- * grp/prj quotas is important, because this follows a both-or-nothing
+ * The fact that this does the reservation against user, group and
+ * project quotas is important, because this follows a all-or-nothing
* approach.
*
* flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
@@ -748,15 +741,16 @@ error_return:
*/
int
xfs_trans_reserve_quota_bydquots(
- xfs_trans_t *tp,
- xfs_mount_t *mp,
- xfs_dquot_t *udqp,
- xfs_dquot_t *gdqp,
- long nblks,
- long ninos,
- uint flags)
+ struct xfs_trans *tp,
+ struct xfs_mount *mp,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ long nblks,
+ long ninos,
+ uint flags)
{
- int resvd = 0, error;
+ int error;
if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
return 0;
@@ -771,28 +765,34 @@ xfs_trans_reserve_quota_bydquots(
(flags & ~XFS_QMOPT_ENOSPC));
if (error)
return error;
- resvd = 1;
}
if (gdqp) {
error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags);
- if (error) {
- /*
- * can't do it, so backout previous reservation
- */
- if (resvd) {
- flags |= XFS_QMOPT_FORCE_RES;
- xfs_trans_dqresv(tp, mp, udqp,
- -nblks, -ninos, flags);
- }
- return error;
- }
+ if (error)
+ goto unwind_usr;
+ }
+
+ if (pdqp) {
+ error = xfs_trans_dqresv(tp, mp, pdqp, nblks, ninos, flags);
+ if (error)
+ goto unwind_grp;
}
/*
* Didn't change anything critical, so, no need to log
*/
return 0;
+
+unwind_grp:
+ flags |= XFS_QMOPT_FORCE_RES;
+ if (gdqp)
+ xfs_trans_dqresv(tp, mp, gdqp, -nblks, -ninos, flags);
+unwind_usr:
+ flags |= XFS_QMOPT_FORCE_RES;
+ if (udqp)
+ xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags);
+ return error;
}
@@ -816,8 +816,7 @@ xfs_trans_reserve_quota_nblks(
if (XFS_IS_PQUOTA_ON(mp))
flags |= XFS_QMOPT_ENOSPC;
- ASSERT(ip->i_ino != mp->m_sb.sb_uquotino);
- ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
+ ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
@@ -830,6 +829,7 @@ xfs_trans_reserve_quota_nblks(
*/
return xfs_trans_reserve_quota_bydquots(tp, mp,
ip->i_udquot, ip->i_gdquot,
+ ip->i_pdquot,
nblks, ninos, flags);
}
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 8d71b16ecca..47978ba89da 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -17,12 +17,13 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_extfree_item.h"
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index d2eee20d5f5..50c3f561428 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -17,30 +17,19 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_btree.h"
+#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
#include "xfs_trace.h"
-#ifdef XFS_TRANS_DEBUG
-STATIC void
-xfs_trans_inode_broot_debug(
- xfs_inode_t *ip);
-#else
-#define xfs_trans_inode_broot_debug(ip)
-#endif
-
/*
* Add a locked inode to the transaction.
*
@@ -67,8 +56,6 @@ xfs_trans_ijoin(
* Get a log_item_desc to point at the new item.
*/
xfs_trans_add_item(tp, &iip->ili_item);
-
- xfs_trans_inode_broot_debug(ip);
}
/*
@@ -122,6 +109,19 @@ xfs_trans_log_inode(
ASSERT(ip->i_itemp != NULL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ /*
+ * First time we log the inode in a transaction, bump the inode change
+ * counter if it is configured for this to occur. We don't use
+ * inode_inc_version() because there is no need for extra locking around
+ * i_version as we already hold the inode locked exclusively for
+ * metadata modification.
+ */
+ if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
+ IS_I_VERSION(VFS_I(ip))) {
+ ip->i_d.di_changecount = ++VFS_I(ip)->i_version;
+ flags |= XFS_ILOG_CORE;
+ }
+
tp->t_flags |= XFS_TRANS_DIRTY;
ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
@@ -135,34 +135,3 @@ xfs_trans_log_inode(
flags |= ip->i_itemp->ili_last_fields;
ip->i_itemp->ili_fields |= flags;
}
-
-#ifdef XFS_TRANS_DEBUG
-/*
- * Keep track of the state of the inode btree root to make sure we
- * log it properly.
- */
-STATIC void
-xfs_trans_inode_broot_debug(
- xfs_inode_t *ip)
-{
- xfs_inode_log_item_t *iip;
-
- ASSERT(ip->i_itemp != NULL);
- iip = ip->i_itemp;
- if (iip->ili_root_size != 0) {
- ASSERT(iip->ili_orig_root != NULL);
- kmem_free(iip->ili_orig_root);
- iip->ili_root_size = 0;
- iip->ili_orig_root = NULL;
- }
- if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
- ASSERT((ip->i_df.if_broot != NULL) &&
- (ip->i_df.if_broot_bytes > 0));
- iip->ili_root_size = ip->i_df.if_broot_bytes;
- iip->ili_orig_root =
- (char*)kmem_alloc(iip->ili_root_size, KM_SLEEP);
- memcpy(iip->ili_orig_root, (char*)(ip->i_df.if_broot),
- iip->ili_root_size);
- }
-}
-#endif
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 53b7c9b0f8f..bd1281862ad 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -25,6 +25,8 @@ struct xfs_trans;
struct xfs_ail;
struct xfs_log_vec;
+
+void xfs_trans_init(struct xfs_mount *);
void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
void xfs_trans_del_item(struct xfs_log_item *);
void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
@@ -83,6 +85,18 @@ void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
struct xfs_ail_cursor *cur,
struct xfs_log_item **log_items, int nr_items,
xfs_lsn_t lsn) __releases(ailp->xa_lock);
+/*
+ * Return a pointer to the first item in the AIL. If the AIL is empty, then
+ * return NULL.
+ */
+static inline struct xfs_log_item *
+xfs_ail_min(
+ struct xfs_ail *ailp)
+{
+ return list_first_entry_or_null(&ailp->xa_ail, struct xfs_log_item,
+ li_ail);
+}
+
static inline void
xfs_trans_ail_update(
struct xfs_ail *ailp,
@@ -119,8 +133,7 @@ struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp,
xfs_lsn_t lsn);
struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
struct xfs_ail_cursor *cur);
-void xfs_trans_ail_cursor_done(struct xfs_ail *ailp,
- struct xfs_ail_cursor *cur);
+void xfs_trans_ail_cursor_done(struct xfs_ail_cursor *cur);
#if BITS_PER_LONG != 64
static inline void
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
new file mode 100644
index 00000000000..f2bda7c76b8
--- /dev/null
+++ b/fs/xfs/xfs_trans_resv.c
@@ -0,0 +1,894 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_qm.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+
+/*
+ * A buffer has a format structure overhead in the log in addition
+ * to the data, so we need to take this into account when reserving
+ * space in a transaction for a buffer. Round the space required up
+ * to a multiple of 128 bytes so that we don't change the historical
+ * reservation that has been used for this overhead.
+ */
+STATIC uint
+xfs_buf_log_overhead(void)
+{
+ return round_up(sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_buf_log_format), 128);
+}
+
+/*
+ * Calculate out transaction log reservation per item in bytes.
+ *
+ * The nbufs argument is used to indicate the number of items that
+ * will be changed in a transaction. size is used to tell how many
+ * bytes should be reserved per item.
+ */
+STATIC uint
+xfs_calc_buf_res(
+ uint nbufs,
+ uint size)
+{
+ return nbufs * (size + xfs_buf_log_overhead());
+}
+
+/*
+ * Logging inodes is really tricksy. They are logged in memory format,
+ * which means that what we write into the log doesn't directly translate into
+ * the amount of space they use on disk.
+ *
+ * Case in point - btree format forks in memory format use more space than the
+ * on-disk format. In memory, the buffer contains a normal btree block header so
+ * the btree code can treat it as though it is just another generic buffer.
+ * However, when we write it to the inode fork, we don't write all of this
+ * header as it isn't needed. e.g. the root is only ever in the inode, so
+ * there's no need for sibling pointers which would waste 16 bytes of space.
+ *
+ * Hence when we have an inode with a maximally sized btree format fork, then
+ * amount of information we actually log is greater than the size of the inode
+ * on disk. Hence we need an inode reservation function that calculates all this
+ * correctly. So, we log:
+ *
+ * - 4 log op headers for object
+ * - for the ilf, the inode core and 2 forks
+ * - inode log format object
+ * - the inode core
+ * - two inode forks containing bmap btree root blocks.
+ * - the btree data contained by both forks will fit into the inode size,
+ * hence when combined with the inode core above, we have a total of the
+ * actual inode size.
+ * - the BMBT headers need to be accounted separately, as they are
+ * additional to the records and pointers that fit inside the inode
+ * forks.
+ */
+STATIC uint
+xfs_calc_inode_res(
+ struct xfs_mount *mp,
+ uint ninodes)
+{
+ return ninodes *
+ (4 * sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_inode_log_format) +
+ mp->m_sb.sb_inodesize +
+ 2 * XFS_BMBT_BLOCK_LEN(mp));
+}
+
+/*
+ * The free inode btree is a conditional feature and the log reservation
+ * requirements differ slightly from that of the traditional inode allocation
+ * btree. The finobt tracks records for inode chunks with at least one free
+ * inode. A record can be removed from the tree for an inode allocation
+ * or free and thus the finobt reservation is unconditional across:
+ *
+ * - inode allocation
+ * - inode free
+ * - inode chunk allocation
+ *
+ * The 'modify' param indicates to include the record modification scenario. The
+ * 'alloc' param indicates to include the reservation for free space btree
+ * modifications on behalf of finobt modifications. This is required only for
+ * transactions that do not already account for free space btree modifications.
+ *
+ * the free inode btree: max depth * block size
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ * the free inode btree entry: block size
+ */
+STATIC uint
+xfs_calc_finobt_res(
+ struct xfs_mount *mp,
+ int alloc,
+ int modify)
+{
+ uint res;
+
+ if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+ return 0;
+
+ res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1));
+ if (alloc)
+ res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+ if (modify)
+ res += (uint)XFS_FSB_TO_B(mp, 1);
+
+ return res;
+}
+
+/*
+ * Various log reservation values.
+ *
+ * These are based on the size of the file system block because that is what
+ * most transactions manipulate. Each adds in an additional 128 bytes per
+ * item logged to try to account for the overhead of the transaction mechanism.
+ *
+ * Note: Most of the reservations underestimate the number of allocation
+ * groups into which they could free extents in the xfs_bmap_finish() call.
+ * This is because the number in the worst case is quite high and quite
+ * unusual. In order to fix this we need to change xfs_bmap_finish() to free
+ * extents in only a single AG at a time. This will require changes to the
+ * EFI code as well, however, so that the EFI for the extents not freed is
+ * logged again in each transaction. See SGI PV #261917.
+ *
+ * Reservation functions here avoid a huge stack in xfs_trans_init due to
+ * register overflow from temporaries in the calculations.
+ */
+
+
+/*
+ * In a write transaction we can allocate a maximum of 2
+ * extents. This gives:
+ * the inode getting the new extents: inode size
+ * the inode's bmap btree: max depth * block size
+ * the agfs of the ags from which the extents are allocated: 2 * sector
+ * the superblock free block counter: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join:
+ * the agfs of the ags containing the blocks: 2 * sector size
+ * the agfls of the ags containing the blocks: 2 * sector size
+ * the super block free block counter: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_write_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+ XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * In truncating a file we free up to two extents at once. We can modify:
+ * the inode being truncated: inode size
+ * the inode's bmap btree: (max depth + 1) * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ * the agf for each of the ags: 4 * sector size
+ * the agfl for each of the ags: 4 * sector size
+ * the super block to reflect the freed blocks: sector size
+ * worst case split in allocation btrees per extent assuming 4 extents:
+ * 4 exts * 2 trees * (2 * max depth - 1) * block size
+ * the inode btree: max depth * blocksize
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_itruncate_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(5, 0) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(2 + mp->m_ialloc_blks +
+ mp->m_in_maxlevels, 0)));
+}
+
+/*
+ * In renaming a files we can modify:
+ * the four inodes involved: 4 * inode size
+ * the two directory btrees: 2 * (max depth + v2) * dir block size
+ * the two directory bmap btrees: 2 * max depth * block size
+ * And the bmap_finish transaction can free dir and bmap blocks (two sets
+ * of bmap blocks) giving:
+ * the agf for the ags in which the blocks live: 3 * sector size
+ * the agfl for the ags in which the blocks live: 3 * sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_rename_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((xfs_calc_inode_res(mp, 4) +
+ xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
+ XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For removing an inode from unlinked list at first, we can modify:
+ * the agi hash list and counters: sector size
+ * the on disk inode before ours in the agi hash list: inode cluster size
+ */
+STATIC uint
+xfs_calc_iunlink_remove_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size);
+}
+
+/*
+ * For creating a link to an inode:
+ * the parent directory inode: inode size
+ * the linked inode: inode size
+ * the directory btree could split: (max depth + v2) * dir block size
+ * the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free some bmap blocks giving:
+ * the agf for the ag in which the blocks live: sector size
+ * the agfl for the ag in which the blocks live: sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_link_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_iunlink_remove_reservation(mp) +
+ MAX((xfs_calc_inode_res(mp, 2) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For adding an inode to unlinked list we can modify:
+ * the agi hash list: sector size
+ * the unlinked inode: inode size
+ */
+STATIC uint
+xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * For removing a directory entry we can modify:
+ * the parent directory inode: inode size
+ * the removed inode: inode size
+ * the directory btree could join: (max depth + v2) * dir block size
+ * the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free the dir and bmap blocks giving:
+ * the agf for the ag in which the blocks live: 2 * sector size
+ * the agfl for the ag in which the blocks live: 2 * sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_remove_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_iunlink_add_reservation(mp) +
+ MAX((xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+ XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For create, break it in to the two cases that the transaction
+ * covers. We start with the modify case - allocation done by modification
+ * of the state of existing inodes - and the allocation case.
+ */
+
+/*
+ * For create we can modify:
+ * the parent directory inode: inode size
+ * the new inode: inode size
+ * the inode btree entry: block size
+ * the superblock for the nlink flag: sector size
+ * the directory btree: (max depth + v2) * dir block size
+ * the directory inode's bmap btree: (max depth + v2) * block size
+ * the finobt (record modification and allocation btrees)
+ */
+STATIC uint
+xfs_calc_create_resv_modify(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_inode_res(mp, 2) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ (uint)XFS_FSB_TO_B(mp, 1) +
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_finobt_res(mp, 1, 1);
+}
+
+/*
+ * For create we can allocate some inodes giving:
+ * the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ * the superblock for the nlink flag: sector size
+ * the inode blocks allocated: mp->m_ialloc_blks * blocksize
+ * the inode btree: max depth * blocksize
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_create_resv_alloc(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ mp->m_sb.sb_sectsize +
+ xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+__xfs_calc_create_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX(xfs_calc_create_resv_alloc(mp),
+ xfs_calc_create_resv_modify(mp));
+}
+
+/*
+ * For icreate we can allocate some inodes giving:
+ * the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ * the superblock for the nlink flag: sector size
+ * the inode btree: max depth * blocksize
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ * the finobt (record insertion)
+ */
+STATIC uint
+xfs_calc_icreate_resv_alloc(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ mp->m_sb.sb_sectsize +
+ xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_finobt_res(mp, 0, 0);
+}
+
+STATIC uint
+xfs_calc_icreate_reservation(xfs_mount_t *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX(xfs_calc_icreate_resv_alloc(mp),
+ xfs_calc_create_resv_modify(mp));
+}
+
+STATIC uint
+xfs_calc_create_reservation(
+ struct xfs_mount *mp)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ return xfs_calc_icreate_reservation(mp);
+ return __xfs_calc_create_reservation(mp);
+
+}
+
+STATIC uint
+xfs_calc_create_tmpfile_reservation(
+ struct xfs_mount *mp)
+{
+ uint res = XFS_DQUOT_LOGRES(mp);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ res += xfs_calc_icreate_resv_alloc(mp);
+ else
+ res += xfs_calc_create_resv_alloc(mp);
+
+ return res + xfs_calc_iunlink_add_reservation(mp);
+}
+
+/*
+ * Making a new directory is the same as creating a new file.
+ */
+STATIC uint
+xfs_calc_mkdir_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_create_reservation(mp);
+}
+
+
+/*
+ * Making a new symplink is the same as creating a new file, but
+ * with the added blocks for remote symlink data which can be up to 1kB in
+ * length (MAXPATHLEN).
+ */
+STATIC uint
+xfs_calc_symlink_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_create_reservation(mp) +
+ xfs_calc_buf_res(1, MAXPATHLEN);
+}
+
+/*
+ * In freeing an inode we can modify:
+ * the inode being freed: inode size
+ * the super block free inode counter: sector size
+ * the agi hash list and counters: sector size
+ * the inode btree entry: block size
+ * the on disk inode before ours in the agi hash list: inode cluster size
+ * the inode btree: max depth * blocksize
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ * the finobt (record insertion, removal or modification)
+ */
+STATIC uint
+xfs_calc_ifree_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_iunlink_remove_reservation(mp) +
+ xfs_calc_buf_res(1, 0) +
+ xfs_calc_buf_res(2 + mp->m_ialloc_blks +
+ mp->m_in_maxlevels, 0) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_finobt_res(mp, 0, 1);
+}
+
+/*
+ * When only changing the inode we log the inode and possibly the superblock
+ * We also add a bit of slop for the transaction stuff.
+ */
+STATIC uint
+xfs_calc_ichange_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+
+}
+
+/*
+ * Growing the data section of the filesystem.
+ * superblock
+ * agi and agf
+ * allocation btrees
+ */
+STATIC uint
+xfs_calc_growdata_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the first set of transactions (ALLOC) we allocate space to the
+ * bitmap or summary files.
+ * superblock: sector size
+ * agf of the ag from which the extent is allocated: sector size
+ * bmap btree for bitmap/summary inode: max depth * blocksize
+ * bitmap/summary inode: inode size
+ * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
+ */
+STATIC uint
+xfs_calc_growrtalloc_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the second set of transactions (ZERO) we zero the new metadata blocks.
+ * one bitmap/summary block: blocksize
+ */
+STATIC uint
+xfs_calc_growrtzero_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the third set of transactions (FREE) we update metadata without
+ * allocating any new blocks.
+ * superblock: sector size
+ * bitmap inode: inode size
+ * summary inode: inode size
+ * one bitmap block: blocksize
+ * summary blocks: new summary size
+ */
+STATIC uint
+xfs_calc_growrtfree_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ xfs_calc_inode_res(mp, 2) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
+ xfs_calc_buf_res(1, mp->m_rsumsize);
+}
+
+/*
+ * Logging the inode modification timestamp on a synchronous write.
+ * inode
+ */
+STATIC uint
+xfs_calc_swrite_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * Logging the inode mode bits when writing a setuid/setgid file
+ * inode
+ */
+STATIC uint
+xfs_calc_writeid_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * Converting the inode from non-attributed to attributed.
+ * the inode being converted: inode size
+ * agf block and superblock (for block allocation)
+ * the new block (directory sized)
+ * bmap blocks for the new directory block
+ * allocation btrees
+ */
+STATIC uint
+xfs_calc_addafork_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
+ xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Removing the attribute fork of a file
+ * the inode being truncated: inode size
+ * the inode's bmap btree: max depth * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ * the agf for each of the ags: 4 * sector size
+ * the agfl for each of the ags: 4 * sector size
+ * the super block to reflect the freed blocks: sector size
+ * worst case split in allocation btrees per extent assuming 4 extents:
+ * 4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrinval_reservation(
+ struct xfs_mount *mp)
+{
+ return MAX((xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+ XFS_FSB_TO_B(mp, 1))),
+ (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+ XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * Setting an attribute at mount time.
+ * the inode getting the attribute
+ * the superblock for allocations
+ * the agfs extents are allocated from
+ * the attribute btree * max depth
+ * the inode allocation btree
+ * Since attribute transaction space is dependent on the size of the attribute,
+ * the calculation is done partially at mount time and partially at runtime(see
+ * below).
+ */
+STATIC uint
+xfs_calc_attrsetm_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Setting an attribute at runtime, transaction space unit per block.
+ * the superblock for allocations: sector size
+ * the inode bmap btree could join or split: max depth * block size
+ * Since the runtime attribute transaction space is dependent on the total
+ * blocks needed for the 1st bmap, here we calculate out the space unit for
+ * one block so that the caller could figure out the total space according
+ * to the attibute extent length in blocks by:
+ * ext * M_RES(mp)->tr_attrsetrt.tr_logres
+ */
+STATIC uint
+xfs_calc_attrsetrt_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+ XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Removing an attribute.
+ * the inode: inode size
+ * the attribute btree could join: max depth * block size
+ * the inode bmap btree could join or split: max depth * block size
+ * And the bmap_finish transaction can free the attr blocks freed giving:
+ * the agf for the ag in which the blocks live: 2 * sector size
+ * the agfl for the ag in which the blocks live: 2 * sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrrm_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
+ XFS_FSB_TO_B(mp, 1)) +
+ (uint)XFS_FSB_TO_B(mp,
+ XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
+ (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+ XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * Clearing a bad agino number in an agi hash bucket.
+ */
+STATIC uint
+xfs_calc_clear_agi_bucket_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Clearing the quotaflags in the superblock.
+ * the super block for changing quota flags: sector size
+ */
+STATIC uint
+xfs_calc_qm_sbchange_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Adjusting quota limits.
+ * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
+ */
+STATIC uint
+xfs_calc_qm_setqlim_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
+}
+
+/*
+ * Allocating quota on disk if needed.
+ * the write transaction log space for quota file extent allocation
+ * the unit of quota allocation: one system block size
+ */
+STATIC uint
+xfs_calc_qm_dqalloc_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_write_reservation(mp) +
+ xfs_calc_buf_res(1,
+ XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
+}
+
+/*
+ * Turning off quotas.
+ * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ * the superblock for the quota flags: sector size
+ */
+STATIC uint
+xfs_calc_qm_quotaoff_reservation(
+ struct xfs_mount *mp)
+{
+ return sizeof(struct xfs_qoff_logitem) * 2 +
+ xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * End of turning off quotas.
+ * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ */
+STATIC uint
+xfs_calc_qm_quotaoff_end_reservation(
+ struct xfs_mount *mp)
+{
+ return sizeof(struct xfs_qoff_logitem) * 2;
+}
+
+/*
+ * Syncing the incore super block changes to disk.
+ * the super block to reflect the changes: sector size
+ */
+STATIC uint
+xfs_calc_sb_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+void
+xfs_trans_resv_calc(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ /*
+ * The following transactions are logged in physical format and
+ * require a permanent reservation on space.
+ */
+ resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
+ resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
+ resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
+ resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT;
+ resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_link.tr_logres = xfs_calc_link_reservation(mp);
+ resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT;
+ resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp);
+ resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT;
+ resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp);
+ resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT;
+ resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_create.tr_logres = xfs_calc_create_reservation(mp);
+ resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
+ resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_create_tmpfile.tr_logres =
+ xfs_calc_create_tmpfile_reservation(mp);
+ resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT;
+ resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
+ resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
+ resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp);
+ resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT;
+ resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_addafork.tr_logres = xfs_calc_addafork_reservation(mp);
+ resp->tr_addafork.tr_logcount = XFS_ADDAFORK_LOG_COUNT;
+ resp->tr_addafork.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_attrinval.tr_logres = xfs_calc_attrinval_reservation(mp);
+ resp->tr_attrinval.tr_logcount = XFS_ATTRINVAL_LOG_COUNT;
+ resp->tr_attrinval.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_attrsetm.tr_logres = xfs_calc_attrsetm_reservation(mp);
+ resp->tr_attrsetm.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+ resp->tr_attrsetm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_attrrm.tr_logres = xfs_calc_attrrm_reservation(mp);
+ resp->tr_attrrm.tr_logcount = XFS_ATTRRM_LOG_COUNT;
+ resp->tr_attrrm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_growrtalloc.tr_logres = xfs_calc_growrtalloc_reservation(mp);
+ resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
+ resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
+ resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+ /*
+ * The following transactions are logged in logical format with
+ * a default log count.
+ */
+ resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp);
+ resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+ resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp);
+ resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+ resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp);
+ resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+ resp->tr_qm_equotaoff.tr_logres =
+ xfs_calc_qm_quotaoff_end_reservation(mp);
+ resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+ resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
+ resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+ /* The following transaction are logged in logical format */
+ resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
+ resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
+ resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
+ resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
+ resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
+ resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
+ resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
+ resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
+}
diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/xfs_trans_resv.h
new file mode 100644
index 00000000000..1097d14cd58
--- /dev/null
+++ b/fs/xfs/xfs_trans_resv.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_TRANS_RESV_H__
+#define __XFS_TRANS_RESV_H__
+
+struct xfs_mount;
+
+/*
+ * structure for maintaining pre-calculated transaction reservations.
+ */
+struct xfs_trans_res {
+ uint tr_logres; /* log space unit in bytes per log ticket */
+ int tr_logcount; /* number of log operations per log ticket */
+ int tr_logflags; /* log flags, currently only used for indicating
+ * a reservation request is permanent or not */
+};
+
+struct xfs_trans_resv {
+ struct xfs_trans_res tr_write; /* extent alloc trans */
+ struct xfs_trans_res tr_itruncate; /* truncate trans */
+ struct xfs_trans_res tr_rename; /* rename trans */
+ struct xfs_trans_res tr_link; /* link trans */
+ struct xfs_trans_res tr_remove; /* unlink trans */
+ struct xfs_trans_res tr_symlink; /* symlink trans */
+ struct xfs_trans_res tr_create; /* create trans */
+ struct xfs_trans_res tr_create_tmpfile; /* create O_TMPFILE trans */
+ struct xfs_trans_res tr_mkdir; /* mkdir trans */
+ struct xfs_trans_res tr_ifree; /* inode free trans */
+ struct xfs_trans_res tr_ichange; /* inode update trans */
+ struct xfs_trans_res tr_growdata; /* fs data section grow trans */
+ struct xfs_trans_res tr_addafork; /* add inode attr fork trans */
+ struct xfs_trans_res tr_writeid; /* write setuid/setgid file */
+ struct xfs_trans_res tr_attrinval; /* attr fork buffer
+ * invalidation */
+ struct xfs_trans_res tr_attrsetm; /* set/create an attribute at
+ * mount time */
+ struct xfs_trans_res tr_attrsetrt; /* set/create an attribute at
+ * runtime */
+ struct xfs_trans_res tr_attrrm; /* remove an attribute */
+ struct xfs_trans_res tr_clearagi; /* clear agi unlinked bucket */
+ struct xfs_trans_res tr_growrtalloc; /* grow realtime allocations */
+ struct xfs_trans_res tr_growrtzero; /* grow realtime zeroing */
+ struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */
+ struct xfs_trans_res tr_qm_sbchange; /* change quota flags */
+ struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */
+ struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */
+ struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */
+ struct xfs_trans_res tr_qm_equotaoff;/* end of turn quota off */
+ struct xfs_trans_res tr_sb; /* modify superblock */
+ struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */
+};
+
+/* shorthand way of accessing reservation structure */
+#define M_RES(mp) (&(mp)->m_resv)
+
+/*
+ * Per-extent log reservation for the allocation btree changes
+ * involved in freeing or allocating an extent.
+ * 2 trees * (2 blocks/level * max depth - 1) * block size
+ */
+#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
+ ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
+#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
+ ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
+
+/*
+ * Per-directory log reservation for any directory change.
+ * dir blocks: (1 btree block per level + data block + free block) * dblock size
+ * bmap btree: (levels + 2) * max depth * block size
+ * v2 directory blocks can be fragmented below the dirblksize down to the fsb
+ * size, so account for that in the DAENTER macros.
+ */
+#define XFS_DIROP_LOG_RES(mp) \
+ (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
+ (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
+#define XFS_DIROP_LOG_COUNT(mp) \
+ (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
+ XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
+
+/*
+ * Various log count values.
+ */
+#define XFS_DEFAULT_LOG_COUNT 1
+#define XFS_DEFAULT_PERM_LOG_COUNT 2
+#define XFS_ITRUNCATE_LOG_COUNT 2
+#define XFS_INACTIVE_LOG_COUNT 2
+#define XFS_CREATE_LOG_COUNT 2
+#define XFS_CREATE_TMPFILE_LOG_COUNT 2
+#define XFS_MKDIR_LOG_COUNT 3
+#define XFS_SYMLINK_LOG_COUNT 3
+#define XFS_REMOVE_LOG_COUNT 2
+#define XFS_LINK_LOG_COUNT 2
+#define XFS_RENAME_LOG_COUNT 2
+#define XFS_WRITE_LOG_COUNT 2
+#define XFS_ADDAFORK_LOG_COUNT 2
+#define XFS_ATTRINVAL_LOG_COUNT 1
+#define XFS_ATTRSET_LOG_COUNT 3
+#define XFS_ATTRRM_LOG_COUNT 3
+
+void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
+
+#endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
index 7d2c920dfb9..bf9c4579334 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/xfs_trans_space.h
@@ -28,7 +28,8 @@
(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
XFS_EXTENTADD_SPACE_RES(mp,w))
-#define XFS_DAENTER_1B(mp,w) ((w) == XFS_DATA_FORK ? (mp)->m_dirblkfsbs : 1)
+#define XFS_DAENTER_1B(mp,w) \
+ ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
#define XFS_DAENTER_DBS(mp,w) \
(XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0))
#define XFS_DAENTER_BLOCKS(mp,w) \
@@ -47,13 +48,15 @@
#define XFS_DIRREMOVE_SPACE_RES(mp) \
XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
#define XFS_IALLOC_SPACE_RES(mp) \
- (XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1)
+ ((mp)->m_ialloc_blks + \
+ (xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \
+ ((mp)->m_in_maxlevels - 1)))
/*
* Space reservation values for various transactions.
*/
#define XFS_ADDAFORK_SPACE_RES(mp) \
- ((mp)->m_dirblkfsbs + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK))
+ ((mp)->m_dir_geo->fsbcount + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK))
#define XFS_ATTRRM_SPACE_RES(mp) \
XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK)
/* This macro is not used - see inline code in xfs_attr_set */
@@ -82,5 +85,8 @@
(XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \
(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
+#define XFS_IFREE_SPACE_RES(mp) \
+ (xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0)
+
#endif /* __XFS_TRANS_SPACE_H__ */
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 7a41874f4c2..65c6e6650b1 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -18,43 +18,7 @@
#ifndef __XFS_TYPES_H__
#define __XFS_TYPES_H__
-#ifdef __KERNEL__
-
-/*
- * Additional type declarations for XFS
- */
-typedef signed char __int8_t;
-typedef unsigned char __uint8_t;
-typedef signed short int __int16_t;
-typedef unsigned short int __uint16_t;
-typedef signed int __int32_t;
-typedef unsigned int __uint32_t;
-typedef signed long long int __int64_t;
-typedef unsigned long long int __uint64_t;
-
-typedef enum { B_FALSE,B_TRUE } boolean_t;
-typedef __uint32_t prid_t; /* project ID */
-typedef __uint32_t inst_t; /* an instruction */
-
-typedef __s64 xfs_off_t; /* <file offset> type */
-typedef unsigned long long xfs_ino_t; /* <inode> type */
-typedef __s64 xfs_daddr_t; /* <disk address> type */
-typedef char * xfs_caddr_t; /* <core address> type */
-typedef __u32 xfs_dev_t;
-typedef __u32 xfs_nlink_t;
-
-/* __psint_t is the same size as a pointer */
-#if (BITS_PER_LONG == 32)
-typedef __int32_t __psint_t;
-typedef __uint32_t __psunsigned_t;
-#elif (BITS_PER_LONG == 64)
-typedef __int64_t __psint_t;
-typedef __uint64_t __psunsigned_t;
-#else
-#error BITS_PER_LONG must be 32 or 64
-#endif
-
-#endif /* __KERNEL__ */
+typedef __uint32_t prid_t; /* project ID */
typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */
typedef __uint32_t xfs_agino_t; /* inode # within allocation grp */
@@ -147,6 +111,12 @@ typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */
#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG)
/*
+ * Inode fork identifiers.
+ */
+#define XFS_DATA_FORK 0
+#define XFS_ATTR_FORK 1
+
+/*
* Min numbers of data/attr fork btree root pointers.
*/
#define MINDBTPTRS 3
@@ -164,12 +134,29 @@ typedef enum {
typedef enum {
XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi,
- XFS_BTNUM_MAX
+ XFS_BTNUM_FINOi, XFS_BTNUM_MAX
} xfs_btnum_t;
struct xfs_name {
const unsigned char *name;
int len;
+ int type;
};
+/*
+ * uid_t and gid_t are hard-coded to 32 bits in the inode.
+ * Hence, an 'id' in a dquot is 32 bits..
+ */
+typedef __uint32_t xfs_dqid_t;
+
+/*
+ * Constants for bit manipulations.
+ */
+#define XFS_NBBYLOG 3 /* log2(NBBY) */
+#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */
+#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG)
+#define XFS_NBWORD (1 << XFS_NBWORDLOG)
+#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1)
+
+
#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
deleted file mode 100644
index 0025c78ac03..00000000000
--- a/fs/xfs/xfs_utils.c
+++ /dev/null
@@ -1,314 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_itable.h"
-#include "xfs_utils.h"
-
-
-/*
- * Allocates a new inode from disk and return a pointer to the
- * incore copy. This routine will internally commit the current
- * transaction and allocate a new one if the Space Manager needed
- * to do an allocation to replenish the inode free-list.
- *
- * This routine is designed to be called from xfs_create and
- * xfs_create_dir.
- *
- */
-int
-xfs_dir_ialloc(
- xfs_trans_t **tpp, /* input: current transaction;
- output: may be a new transaction. */
- xfs_inode_t *dp, /* directory within whose allocate
- the inode. */
- umode_t mode,
- xfs_nlink_t nlink,
- xfs_dev_t rdev,
- prid_t prid, /* project id */
- int okalloc, /* ok to allocate new space */
- xfs_inode_t **ipp, /* pointer to inode; it will be
- locked. */
- int *committed)
-
-{
- xfs_trans_t *tp;
- xfs_trans_t *ntp;
- xfs_inode_t *ip;
- xfs_buf_t *ialloc_context = NULL;
- int code;
- uint log_res;
- uint log_count;
- void *dqinfo;
- uint tflags;
-
- tp = *tpp;
- ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
-
- /*
- * xfs_ialloc will return a pointer to an incore inode if
- * the Space Manager has an available inode on the free
- * list. Otherwise, it will do an allocation and replenish
- * the freelist. Since we can only do one allocation per
- * transaction without deadlocks, we will need to commit the
- * current transaction and start a new one. We will then
- * need to call xfs_ialloc again to get the inode.
- *
- * If xfs_ialloc did an allocation to replenish the freelist,
- * it returns the bp containing the head of the freelist as
- * ialloc_context. We will hold a lock on it across the
- * transaction commit so that no other process can steal
- * the inode(s) that we've just allocated.
- */
- code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
- &ialloc_context, &ip);
-
- /*
- * Return an error if we were unable to allocate a new inode.
- * This should only happen if we run out of space on disk or
- * encounter a disk error.
- */
- if (code) {
- *ipp = NULL;
- return code;
- }
- if (!ialloc_context && !ip) {
- *ipp = NULL;
- return XFS_ERROR(ENOSPC);
- }
-
- /*
- * If the AGI buffer is non-NULL, then we were unable to get an
- * inode in one operation. We need to commit the current
- * transaction and call xfs_ialloc() again. It is guaranteed
- * to succeed the second time.
- */
- if (ialloc_context) {
- /*
- * Normally, xfs_trans_commit releases all the locks.
- * We call bhold to hang on to the ialloc_context across
- * the commit. Holding this buffer prevents any other
- * processes from doing any allocations in this
- * allocation group.
- */
- xfs_trans_bhold(tp, ialloc_context);
- /*
- * Save the log reservation so we can use
- * them in the next transaction.
- */
- log_res = xfs_trans_get_log_res(tp);
- log_count = xfs_trans_get_log_count(tp);
-
- /*
- * We want the quota changes to be associated with the next
- * transaction, NOT this one. So, detach the dqinfo from this
- * and attach it to the next transaction.
- */
- dqinfo = NULL;
- tflags = 0;
- if (tp->t_dqinfo) {
- dqinfo = (void *)tp->t_dqinfo;
- tp->t_dqinfo = NULL;
- tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
- tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
- }
-
- ntp = xfs_trans_dup(tp);
- code = xfs_trans_commit(tp, 0);
- tp = ntp;
- if (committed != NULL) {
- *committed = 1;
- }
- /*
- * If we get an error during the commit processing,
- * release the buffer that is still held and return
- * to the caller.
- */
- if (code) {
- xfs_buf_relse(ialloc_context);
- if (dqinfo) {
- tp->t_dqinfo = dqinfo;
- xfs_trans_free_dqinfo(tp);
- }
- *tpp = ntp;
- *ipp = NULL;
- return code;
- }
-
- /*
- * transaction commit worked ok so we can drop the extra ticket
- * reference that we gained in xfs_trans_dup()
- */
- xfs_log_ticket_put(tp->t_ticket);
- code = xfs_trans_reserve(tp, 0, log_res, 0,
- XFS_TRANS_PERM_LOG_RES, log_count);
- /*
- * Re-attach the quota info that we detached from prev trx.
- */
- if (dqinfo) {
- tp->t_dqinfo = dqinfo;
- tp->t_flags |= tflags;
- }
-
- if (code) {
- xfs_buf_relse(ialloc_context);
- *tpp = ntp;
- *ipp = NULL;
- return code;
- }
- xfs_trans_bjoin(tp, ialloc_context);
-
- /*
- * Call ialloc again. Since we've locked out all
- * other allocations in this allocation group,
- * this call should always succeed.
- */
- code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
- okalloc, &ialloc_context, &ip);
-
- /*
- * If we get an error at this point, return to the caller
- * so that the current transaction can be aborted.
- */
- if (code) {
- *tpp = tp;
- *ipp = NULL;
- return code;
- }
- ASSERT(!ialloc_context && ip);
-
- } else {
- if (committed != NULL)
- *committed = 0;
- }
-
- *ipp = ip;
- *tpp = tp;
-
- return 0;
-}
-
-/*
- * Decrement the link count on an inode & log the change.
- * If this causes the link count to go to zero, initiate the
- * logging activity required to truncate a file.
- */
-int /* error */
-xfs_droplink(
- xfs_trans_t *tp,
- xfs_inode_t *ip)
-{
- int error;
-
- xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-
- ASSERT (ip->i_d.di_nlink > 0);
- ip->i_d.di_nlink--;
- drop_nlink(VFS_I(ip));
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
- error = 0;
- if (ip->i_d.di_nlink == 0) {
- /*
- * We're dropping the last link to this file.
- * Move the on-disk inode to the AGI unlinked list.
- * From xfs_inactive() we will pull the inode from
- * the list and free it.
- */
- error = xfs_iunlink(tp, ip);
- }
- return error;
-}
-
-/*
- * This gets called when the inode's version needs to be changed from 1 to 2.
- * Currently this happens when the nlink field overflows the old 16-bit value
- * or when chproj is called to change the project for the first time.
- * As a side effect the superblock version will also get rev'd
- * to contain the NLINK bit.
- */
-void
-xfs_bump_ino_vers2(
- xfs_trans_t *tp,
- xfs_inode_t *ip)
-{
- xfs_mount_t *mp;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(ip->i_d.di_version == 1);
-
- ip->i_d.di_version = 2;
- ip->i_d.di_onlink = 0;
- memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
- mp = tp->t_mountp;
- if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
- spin_lock(&mp->m_sb_lock);
- if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
- xfs_sb_version_addnlink(&mp->m_sb);
- spin_unlock(&mp->m_sb_lock);
- xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
- } else {
- spin_unlock(&mp->m_sb_lock);
- }
- }
- /* Caller must log the inode */
-}
-
-/*
- * Increment the link count on an inode & log the change.
- */
-int
-xfs_bumplink(
- xfs_trans_t *tp,
- xfs_inode_t *ip)
-{
- xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-
- ASSERT(ip->i_d.di_nlink > 0);
- ip->i_d.di_nlink++;
- inc_nlink(VFS_I(ip));
- if ((ip->i_d.di_version == 1) &&
- (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
- /*
- * The inode has increased its number of links beyond
- * what can fit in an old format inode. It now needs
- * to be converted to a version 2 inode with a 32 bit
- * link count. If this is the first inode in the file
- * system to do this, then we need to bump the superblock
- * version number as well.
- */
- xfs_bump_ino_vers2(tp, ip);
- }
-
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- return 0;
-}
diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h
index db14d0c0868..e8a77383c0d 100644
--- a/fs/xfs/xfs_vnode.h
+++ b/fs/xfs/xfs_vnode.h
@@ -25,14 +25,6 @@ struct xfs_inode;
struct attrlist_cursor_kern;
/*
- * Return values for xfs_inactive. A return value of
- * VN_INACTIVE_NOCACHE implies that the file system behavior
- * has disassociated its state and bhv_desc_t from the vnode.
- */
-#define VN_INACTIVE_CACHE 0
-#define VN_INACTIVE_NOCACHE 1
-
-/*
* Flags for read/write calls - same values as IRIX
*/
#define IO_ISDIRECT 0x00004 /* bypass page cache */
@@ -43,15 +35,6 @@ struct attrlist_cursor_kern;
{ IO_INVIS, "INVIS"}
/*
- * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
- */
-#define FI_NONE 0 /* none */
-#define FI_REMAPF 1 /* Do a remapf prior to the operation */
-#define FI_REMAPF_LOCKED 2 /* Do a remapf prior to the operation.
- Prevent VM access to the pages until
- the operation completes. */
-
-/*
* Some useful predicates.
*/
#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping)
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
deleted file mode 100644
index d95f565a390..00000000000
--- a/fs/xfs/xfs_vnodeops.c
+++ /dev/null
@@ -1,2348 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_itable.h"
-#include "xfs_ialloc.h"
-#include "xfs_alloc.h"
-#include "xfs_bmap.h"
-#include "xfs_acl.h"
-#include "xfs_attr.h"
-#include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_utils.h"
-#include "xfs_rtalloc.h"
-#include "xfs_trans_space.h"
-#include "xfs_log_priv.h"
-#include "xfs_filestream.h"
-#include "xfs_vnodeops.h"
-#include "xfs_trace.h"
-#include "xfs_icache.h"
-
-/*
- * The maximum pathlen is 1024 bytes. Since the minimum file system
- * blocksize is 512 bytes, we can get a max of 2 extents back from
- * bmapi.
- */
-#define SYMLINK_MAPS 2
-
-STATIC int
-xfs_readlink_bmap(
- xfs_inode_t *ip,
- char *link)
-{
- xfs_mount_t *mp = ip->i_mount;
- int pathlen = ip->i_d.di_size;
- int nmaps = SYMLINK_MAPS;
- xfs_bmbt_irec_t mval[SYMLINK_MAPS];
- xfs_daddr_t d;
- int byte_cnt;
- int n;
- xfs_buf_t *bp;
- int error = 0;
-
- error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps,
- 0);
- if (error)
- goto out;
-
- for (n = 0; n < nmaps; n++) {
- d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
- byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
-
- bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL);
- if (!bp)
- return XFS_ERROR(ENOMEM);
- error = bp->b_error;
- if (error) {
- xfs_buf_ioerror_alert(bp, __func__);
- xfs_buf_relse(bp);
- goto out;
- }
- if (pathlen < byte_cnt)
- byte_cnt = pathlen;
- pathlen -= byte_cnt;
-
- memcpy(link, bp->b_addr, byte_cnt);
- xfs_buf_relse(bp);
- }
-
- link[ip->i_d.di_size] = '\0';
- error = 0;
-
- out:
- return error;
-}
-
-int
-xfs_readlink(
- xfs_inode_t *ip,
- char *link)
-{
- xfs_mount_t *mp = ip->i_mount;
- xfs_fsize_t pathlen;
- int error = 0;
-
- trace_xfs_readlink(ip);
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
-
- pathlen = ip->i_d.di_size;
- if (!pathlen)
- goto out;
-
- if (pathlen < 0 || pathlen > MAXPATHLEN) {
- xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
- __func__, (unsigned long long) ip->i_ino,
- (long long) pathlen);
- ASSERT(0);
- error = XFS_ERROR(EFSCORRUPTED);
- goto out;
- }
-
-
- if (ip->i_df.if_flags & XFS_IFINLINE) {
- memcpy(link, ip->i_df.if_u1.if_data, pathlen);
- link[pathlen] = '\0';
- } else {
- error = xfs_readlink_bmap(ip, link);
- }
-
- out:
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return error;
-}
-
-/*
- * This is called by xfs_inactive to free any blocks beyond eof
- * when the link count isn't zero and by xfs_dm_punch_hole() when
- * punching a hole to EOF.
- */
-int
-xfs_free_eofblocks(
- xfs_mount_t *mp,
- xfs_inode_t *ip,
- bool need_iolock)
-{
- xfs_trans_t *tp;
- int error;
- xfs_fileoff_t end_fsb;
- xfs_fileoff_t last_fsb;
- xfs_filblks_t map_len;
- int nimaps;
- xfs_bmbt_irec_t imap;
-
- /*
- * Figure out if there are any blocks beyond the end
- * of the file. If not, then there is nothing to do.
- */
- end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
- last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
- if (last_fsb <= end_fsb)
- return 0;
- map_len = last_fsb - end_fsb;
-
- nimaps = 1;
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- if (!error && (nimaps != 0) &&
- (imap.br_startblock != HOLESTARTBLOCK ||
- ip->i_delayed_blks)) {
- /*
- * Attach the dquots to the inode up front.
- */
- error = xfs_qm_dqattach(ip, 0);
- if (error)
- return error;
-
- /*
- * There are blocks after the end of file.
- * Free them up now by truncating the file to
- * its current size.
- */
- tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-
- if (need_iolock) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
- xfs_trans_cancel(tp, 0);
- return EAGAIN;
- }
- }
-
- error = xfs_trans_reserve(tp, 0,
- XFS_ITRUNCATE_LOG_RES(mp),
- 0, XFS_TRANS_PERM_LOG_RES,
- XFS_ITRUNCATE_LOG_COUNT);
- if (error) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp, 0);
- if (need_iolock)
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return error;
- }
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
- /*
- * Do not update the on-disk file size. If we update the
- * on-disk file size and then the system crashes before the
- * contents of the file are flushed to disk then the files
- * may be full of holes (ie NULL files bug).
- */
- error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
- XFS_ISIZE(ip));
- if (error) {
- /*
- * If we get an error at this point we simply don't
- * bother truncating the file.
- */
- xfs_trans_cancel(tp,
- (XFS_TRANS_RELEASE_LOG_RES |
- XFS_TRANS_ABORT));
- } else {
- error = xfs_trans_commit(tp,
- XFS_TRANS_RELEASE_LOG_RES);
- if (!error)
- xfs_inode_clear_eofblocks_tag(ip);
- }
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (need_iolock)
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- }
- return error;
-}
-
-/*
- * Free a symlink that has blocks associated with it.
- */
-STATIC int
-xfs_inactive_symlink_rmt(
- xfs_inode_t *ip,
- xfs_trans_t **tpp)
-{
- xfs_buf_t *bp;
- int committed;
- int done;
- int error;
- xfs_fsblock_t first_block;
- xfs_bmap_free_t free_list;
- int i;
- xfs_mount_t *mp;
- xfs_bmbt_irec_t mval[SYMLINK_MAPS];
- int nmaps;
- xfs_trans_t *ntp;
- int size;
- xfs_trans_t *tp;
-
- tp = *tpp;
- mp = ip->i_mount;
- ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
- /*
- * We're freeing a symlink that has some
- * blocks allocated to it. Free the
- * blocks here. We know that we've got
- * either 1 or 2 extents and that we can
- * free them all in one bunmapi call.
- */
- ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
-
- /*
- * Lock the inode, fix the size, and join it to the transaction.
- * Hold it so in the normal path, we still have it locked for
- * the second transaction. In the error paths we need it
- * held so the cancel won't rele it, see below.
- */
- size = (int)ip->i_d.di_size;
- ip->i_d.di_size = 0;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- /*
- * Find the block(s) so we can inval and unmap them.
- */
- done = 0;
- xfs_bmap_init(&free_list, &first_block);
- nmaps = ARRAY_SIZE(mval);
- error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size),
- mval, &nmaps, 0);
- if (error)
- goto error0;
- /*
- * Invalidate the block(s).
- */
- for (i = 0; i < nmaps; i++) {
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
- XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
- if (!bp) {
- error = ENOMEM;
- goto error1;
- }
- xfs_trans_binval(tp, bp);
- }
- /*
- * Unmap the dead block(s) to the free_list.
- */
- if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
- &first_block, &free_list, &done)))
- goto error1;
- ASSERT(done);
- /*
- * Commit the first transaction. This logs the EFI and the inode.
- */
- if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
- goto error1;
- /*
- * The transaction must have been committed, since there were
- * actually extents freed by xfs_bunmapi. See xfs_bmap_finish.
- * The new tp has the extent freeing and EFDs.
- */
- ASSERT(committed);
- /*
- * The first xact was committed, so add the inode to the new one.
- * Mark it dirty so it will be logged and moved forward in the log as
- * part of every commit.
- */
- xfs_trans_ijoin(tp, ip, 0);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- /*
- * Get a new, empty transaction to return to our caller.
- */
- ntp = xfs_trans_dup(tp);
- /*
- * Commit the transaction containing extent freeing and EFDs.
- * If we get an error on the commit here or on the reserve below,
- * we need to unlock the inode since the new transaction doesn't
- * have the inode attached.
- */
- error = xfs_trans_commit(tp, 0);
- tp = ntp;
- if (error) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
- goto error0;
- }
- /*
- * transaction commit worked ok so we can drop the extra ticket
- * reference that we gained in xfs_trans_dup()
- */
- xfs_log_ticket_put(tp->t_ticket);
-
- /*
- * Remove the memory for extent descriptions (just bookkeeping).
- */
- if (ip->i_df.if_bytes)
- xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
- ASSERT(ip->i_df.if_bytes == 0);
- /*
- * Put an itruncate log reservation in the new transaction
- * for our caller.
- */
- if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
- goto error0;
- }
-
- xfs_trans_ijoin(tp, ip, 0);
- *tpp = tp;
- return 0;
-
- error1:
- xfs_bmap_cancel(&free_list);
- error0:
- return error;
-}
-
-int
-xfs_release(
- xfs_inode_t *ip)
-{
- xfs_mount_t *mp = ip->i_mount;
- int error;
-
- if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
- return 0;
-
- /* If this is a read-only mount, don't do this (would generate I/O) */
- if (mp->m_flags & XFS_MOUNT_RDONLY)
- return 0;
-
- if (!XFS_FORCED_SHUTDOWN(mp)) {
- int truncated;
-
- /*
- * If we are using filestreams, and we have an unlinked
- * file that we are processing the last close on, then nothing
- * will be able to reopen and write to this file. Purge this
- * inode from the filestreams cache so that it doesn't delay
- * teardown of the inode.
- */
- if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
- xfs_filestream_deassociate(ip);
-
- /*
- * If we previously truncated this file and removed old data
- * in the process, we want to initiate "early" writeout on
- * the last close. This is an attempt to combat the notorious
- * NULL files problem which is particularly noticeable from a
- * truncate down, buffered (re-)write (delalloc), followed by
- * a crash. What we are effectively doing here is
- * significantly reducing the time window where we'd otherwise
- * be exposed to that problem.
- */
- truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
- if (truncated) {
- xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
- if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
- error = -filemap_flush(VFS_I(ip)->i_mapping);
- if (error)
- return error;
- }
- }
- }
-
- if (ip->i_d.di_nlink == 0)
- return 0;
-
- if (xfs_can_free_eofblocks(ip, false)) {
-
- /*
- * If we can't get the iolock just skip truncating the blocks
- * past EOF because we could deadlock with the mmap_sem
- * otherwise. We'll get another chance to drop them once the
- * last reference to the inode is dropped, so we'll never leak
- * blocks permanently.
- *
- * Further, check if the inode is being opened, written and
- * closed frequently and we have delayed allocation blocks
- * outstanding (e.g. streaming writes from the NFS server),
- * truncating the blocks past EOF will cause fragmentation to
- * occur.
- *
- * In this case don't do the truncation, either, but we have to
- * be careful how we detect this case. Blocks beyond EOF show
- * up as i_delayed_blks even when the inode is clean, so we
- * need to truncate them away first before checking for a dirty
- * release. Hence on the first dirty close we will still remove
- * the speculative allocation, but after that we will leave it
- * in place.
- */
- if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
- return 0;
-
- error = xfs_free_eofblocks(mp, ip, true);
- if (error && error != EAGAIN)
- return error;
-
- /* delalloc blocks after truncation means it really is dirty */
- if (ip->i_delayed_blks)
- xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
- }
- return 0;
-}
-
-/*
- * xfs_inactive
- *
- * This is called when the vnode reference count for the vnode
- * goes to zero. If the file has been unlinked, then it must
- * now be truncated. Also, we clear all of the read-ahead state
- * kept for the inode here since the file is now closed.
- */
-int
-xfs_inactive(
- xfs_inode_t *ip)
-{
- xfs_bmap_free_t free_list;
- xfs_fsblock_t first_block;
- int committed;
- xfs_trans_t *tp;
- xfs_mount_t *mp;
- int error;
- int truncate = 0;
-
- /*
- * If the inode is already free, then there can be nothing
- * to clean up here.
- */
- if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
- ASSERT(ip->i_df.if_real_bytes == 0);
- ASSERT(ip->i_df.if_broot_bytes == 0);
- return VN_INACTIVE_CACHE;
- }
-
- mp = ip->i_mount;
-
- error = 0;
-
- /* If this is a read-only mount, don't do this (would generate I/O) */
- if (mp->m_flags & XFS_MOUNT_RDONLY)
- goto out;
-
- if (ip->i_d.di_nlink != 0) {
- /*
- * force is true because we are evicting an inode from the
- * cache. Post-eof blocks must be freed, lest we end up with
- * broken free space accounting.
- */
- if (xfs_can_free_eofblocks(ip, true)) {
- error = xfs_free_eofblocks(mp, ip, false);
- if (error)
- return VN_INACTIVE_CACHE;
- }
- goto out;
- }
-
- if (S_ISREG(ip->i_d.di_mode) &&
- (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
- ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
- truncate = 1;
-
- error = xfs_qm_dqattach(ip, 0);
- if (error)
- return VN_INACTIVE_CACHE;
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
- error = xfs_trans_reserve(tp, 0,
- (truncate || S_ISLNK(ip->i_d.di_mode)) ?
- XFS_ITRUNCATE_LOG_RES(mp) :
- XFS_IFREE_LOG_RES(mp),
- 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_ITRUNCATE_LOG_COUNT);
- if (error) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp, 0);
- return VN_INACTIVE_CACHE;
- }
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
- if (S_ISLNK(ip->i_d.di_mode)) {
- /*
- * Zero length symlinks _can_ exist.
- */
- if (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) {
- error = xfs_inactive_symlink_rmt(ip, &tp);
- if (error)
- goto out_cancel;
- } else if (ip->i_df.if_bytes > 0) {
- xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
- XFS_DATA_FORK);
- ASSERT(ip->i_df.if_bytes == 0);
- }
- } else if (truncate) {
- ip->i_d.di_size = 0;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
- error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
- if (error)
- goto out_cancel;
-
- ASSERT(ip->i_d.di_nextents == 0);
- }
-
- /*
- * If there are attributes associated with the file then blow them away
- * now. The code calls a routine that recursively deconstructs the
- * attribute fork. We need to just commit the current transaction
- * because we can't use it for xfs_attr_inactive().
- */
- if (ip->i_d.di_anextents > 0) {
- ASSERT(ip->i_d.di_forkoff != 0);
-
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- if (error)
- goto out_unlock;
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
- error = xfs_attr_inactive(ip);
- if (error)
- goto out;
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
- error = xfs_trans_reserve(tp, 0,
- XFS_IFREE_LOG_RES(mp),
- 0, XFS_TRANS_PERM_LOG_RES,
- XFS_INACTIVE_LOG_COUNT);
- if (error) {
- xfs_trans_cancel(tp, 0);
- goto out;
- }
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
- }
-
- if (ip->i_afp)
- xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-
- ASSERT(ip->i_d.di_anextents == 0);
-
- /*
- * Free the inode.
- */
- xfs_bmap_init(&free_list, &first_block);
- error = xfs_ifree(tp, ip, &free_list);
- if (error) {
- /*
- * If we fail to free the inode, shut down. The cancel
- * might do that, we need to make sure. Otherwise the
- * inode might be lost for a long time or forever.
- */
- if (!XFS_FORCED_SHUTDOWN(mp)) {
- xfs_notice(mp, "%s: xfs_ifree returned error %d",
- __func__, error);
- xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
- }
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- } else {
- /*
- * Credit the quota account(s). The inode is gone.
- */
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
-
- /*
- * Just ignore errors at this point. There is nothing we can
- * do except to try to keep going. Make sure it's not a silent
- * error.
- */
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error)
- xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
- __func__, error);
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- if (error)
- xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
- __func__, error);
- }
-
- /*
- * Release the dquots held by inode, if any.
- */
- xfs_qm_dqdetach(ip);
-out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out:
- return VN_INACTIVE_CACHE;
-out_cancel:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
- goto out_unlock;
-}
-
-/*
- * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
- * is allowed, otherwise it has to be an exact match. If a CI match is found,
- * ci_name->name will point to a the actual name (caller must free) or
- * will be set to NULL if an exact match is found.
- */
-int
-xfs_lookup(
- xfs_inode_t *dp,
- struct xfs_name *name,
- xfs_inode_t **ipp,
- struct xfs_name *ci_name)
-{
- xfs_ino_t inum;
- int error;
- uint lock_mode;
-
- trace_xfs_lookup(dp, name);
-
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return XFS_ERROR(EIO);
-
- lock_mode = xfs_ilock_map_shared(dp);
- error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
- xfs_iunlock_map_shared(dp, lock_mode);
-
- if (error)
- goto out;
-
- error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
- if (error)
- goto out_free_name;
-
- return 0;
-
-out_free_name:
- if (ci_name)
- kmem_free(ci_name->name);
-out:
- *ipp = NULL;
- return error;
-}
-
-int
-xfs_create(
- xfs_inode_t *dp,
- struct xfs_name *name,
- umode_t mode,
- xfs_dev_t rdev,
- xfs_inode_t **ipp)
-{
- int is_dir = S_ISDIR(mode);
- struct xfs_mount *mp = dp->i_mount;
- struct xfs_inode *ip = NULL;
- struct xfs_trans *tp = NULL;
- int error;
- xfs_bmap_free_t free_list;
- xfs_fsblock_t first_block;
- boolean_t unlock_dp_on_error = B_FALSE;
- uint cancel_flags;
- int committed;
- prid_t prid;
- struct xfs_dquot *udqp = NULL;
- struct xfs_dquot *gdqp = NULL;
- uint resblks;
- uint log_res;
- uint log_count;
-
- trace_xfs_create(dp, name);
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
-
- if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
- prid = xfs_get_projid(dp);
- else
- prid = XFS_PROJID_DEFAULT;
-
- /*
- * Make sure that we have allocated dquot(s) on disk.
- */
- error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
- XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
- if (error)
- return error;
-
- if (is_dir) {
- rdev = 0;
- resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
- log_res = XFS_MKDIR_LOG_RES(mp);
- log_count = XFS_MKDIR_LOG_COUNT;
- tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
- } else {
- resblks = XFS_CREATE_SPACE_RES(mp, name->len);
- log_res = XFS_CREATE_LOG_RES(mp);
- log_count = XFS_CREATE_LOG_COUNT;
- tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
- }
-
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-
- /*
- * Initially assume that the file does not exist and
- * reserve the resources for that case. If that is not
- * the case we'll drop the one we have and get a more
- * appropriate transaction later.
- */
- error = xfs_trans_reserve(tp, resblks, log_res, 0,
- XFS_TRANS_PERM_LOG_RES, log_count);
- if (error == ENOSPC) {
- /* flush outstanding delalloc blocks and retry */
- xfs_flush_inodes(mp);
- error = xfs_trans_reserve(tp, resblks, log_res, 0,
- XFS_TRANS_PERM_LOG_RES, log_count);
- }
- if (error == ENOSPC) {
- /* No space at all so try a "no-allocation" reservation */
- resblks = 0;
- error = xfs_trans_reserve(tp, 0, log_res, 0,
- XFS_TRANS_PERM_LOG_RES, log_count);
- }
- if (error) {
- cancel_flags = 0;
- goto out_trans_cancel;
- }
-
- xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
- unlock_dp_on_error = B_TRUE;
-
- xfs_bmap_init(&free_list, &first_block);
-
- /*
- * Reserve disk quota and the inode.
- */
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
- if (error)
- goto out_trans_cancel;
-
- error = xfs_dir_canenter(tp, dp, name, resblks);
- if (error)
- goto out_trans_cancel;
-
- /*
- * A newly created regular or special file just has one directory
- * entry pointing to them, but a directory also the "." entry
- * pointing to itself.
- */
- error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
- prid, resblks > 0, &ip, &committed);
- if (error) {
- if (error == ENOSPC)
- goto out_trans_cancel;
- goto out_trans_abort;
- }
-
- /*
- * Now we join the directory inode to the transaction. We do not do it
- * earlier because xfs_dir_ialloc might commit the previous transaction
- * (and release all the locks). An error from here on will result in
- * the transaction cancel unlocking dp so don't do it explicitly in the
- * error path.
- */
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
- unlock_dp_on_error = B_FALSE;
-
- error = xfs_dir_createname(tp, dp, name, ip->i_ino,
- &first_block, &free_list, resblks ?
- resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
- if (error) {
- ASSERT(error != ENOSPC);
- goto out_trans_abort;
- }
- xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-
- if (is_dir) {
- error = xfs_dir_init(tp, ip, dp);
- if (error)
- goto out_bmap_cancel;
-
- error = xfs_bumplink(tp, dp);
- if (error)
- goto out_bmap_cancel;
- }
-
- /*
- * If this is a synchronous mount, make sure that the
- * create transaction goes to disk before returning to
- * the user.
- */
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
- xfs_trans_set_sync(tp);
-
- /*
- * Attach the dquot(s) to the inodes and modify them incore.
- * These ids of the inode couldn't have changed since the new
- * inode has been locked ever since it was created.
- */
- xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
-
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error)
- goto out_bmap_cancel;
-
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- if (error)
- goto out_release_inode;
-
- xfs_qm_dqrele(udqp);
- xfs_qm_dqrele(gdqp);
-
- *ipp = ip;
- return 0;
-
- out_bmap_cancel:
- xfs_bmap_cancel(&free_list);
- out_trans_abort:
- cancel_flags |= XFS_TRANS_ABORT;
- out_trans_cancel:
- xfs_trans_cancel(tp, cancel_flags);
- out_release_inode:
- /*
- * Wait until after the current transaction is aborted to
- * release the inode. This prevents recursive transactions
- * and deadlocks from xfs_inactive.
- */
- if (ip)
- IRELE(ip);
-
- xfs_qm_dqrele(udqp);
- xfs_qm_dqrele(gdqp);
-
- if (unlock_dp_on_error)
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return error;
-}
-
-#ifdef DEBUG
-int xfs_locked_n;
-int xfs_small_retries;
-int xfs_middle_retries;
-int xfs_lots_retries;
-int xfs_lock_delays;
-#endif
-
-/*
- * Bump the subclass so xfs_lock_inodes() acquires each lock with
- * a different value
- */
-static inline int
-xfs_lock_inumorder(int lock_mode, int subclass)
-{
- if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
- lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
- if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
- lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
-
- return lock_mode;
-}
-
-/*
- * The following routine will lock n inodes in exclusive mode.
- * We assume the caller calls us with the inodes in i_ino order.
- *
- * We need to detect deadlock where an inode that we lock
- * is in the AIL and we start waiting for another inode that is locked
- * by a thread in a long running transaction (such as truncate). This can
- * result in deadlock since the long running trans might need to wait
- * for the inode we just locked in order to push the tail and free space
- * in the log.
- */
-void
-xfs_lock_inodes(
- xfs_inode_t **ips,
- int inodes,
- uint lock_mode)
-{
- int attempts = 0, i, j, try_lock;
- xfs_log_item_t *lp;
-
- ASSERT(ips && (inodes >= 2)); /* we need at least two */
-
- try_lock = 0;
- i = 0;
-
-again:
- for (; i < inodes; i++) {
- ASSERT(ips[i]);
-
- if (i && (ips[i] == ips[i-1])) /* Already locked */
- continue;
-
- /*
- * If try_lock is not set yet, make sure all locked inodes
- * are not in the AIL.
- * If any are, set try_lock to be used later.
- */
-
- if (!try_lock) {
- for (j = (i - 1); j >= 0 && !try_lock; j--) {
- lp = (xfs_log_item_t *)ips[j]->i_itemp;
- if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
- try_lock++;
- }
- }
- }
-
- /*
- * If any of the previous locks we have locked is in the AIL,
- * we must TRY to get the second and subsequent locks. If
- * we can't get any, we must release all we have
- * and try again.
- */
-
- if (try_lock) {
- /* try_lock must be 0 if i is 0. */
- /*
- * try_lock means we have an inode locked
- * that is in the AIL.
- */
- ASSERT(i != 0);
- if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
- attempts++;
-
- /*
- * Unlock all previous guys and try again.
- * xfs_iunlock will try to push the tail
- * if the inode is in the AIL.
- */
-
- for(j = i - 1; j >= 0; j--) {
-
- /*
- * Check to see if we've already
- * unlocked this one.
- * Not the first one going back,
- * and the inode ptr is the same.
- */
- if ((j != (i - 1)) && ips[j] ==
- ips[j+1])
- continue;
-
- xfs_iunlock(ips[j], lock_mode);
- }
-
- if ((attempts % 5) == 0) {
- delay(1); /* Don't just spin the CPU */
-#ifdef DEBUG
- xfs_lock_delays++;
-#endif
- }
- i = 0;
- try_lock = 0;
- goto again;
- }
- } else {
- xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
- }
- }
-
-#ifdef DEBUG
- if (attempts) {
- if (attempts < 5) xfs_small_retries++;
- else if (attempts < 100) xfs_middle_retries++;
- else xfs_lots_retries++;
- } else {
- xfs_locked_n++;
- }
-#endif
-}
-
-/*
- * xfs_lock_two_inodes() can only be used to lock one type of lock
- * at a time - the iolock or the ilock, but not both at once. If
- * we lock both at once, lockdep will report false positives saying
- * we have violated locking orders.
- */
-void
-xfs_lock_two_inodes(
- xfs_inode_t *ip0,
- xfs_inode_t *ip1,
- uint lock_mode)
-{
- xfs_inode_t *temp;
- int attempts = 0;
- xfs_log_item_t *lp;
-
- if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
- ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
- ASSERT(ip0->i_ino != ip1->i_ino);
-
- if (ip0->i_ino > ip1->i_ino) {
- temp = ip0;
- ip0 = ip1;
- ip1 = temp;
- }
-
- again:
- xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
-
- /*
- * If the first lock we have locked is in the AIL, we must TRY to get
- * the second lock. If we can't get it, we must release the first one
- * and try again.
- */
- lp = (xfs_log_item_t *)ip0->i_itemp;
- if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
- if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
- xfs_iunlock(ip0, lock_mode);
- if ((++attempts % 5) == 0)
- delay(1); /* Don't just spin the CPU */
- goto again;
- }
- } else {
- xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
- }
-}
-
-int
-xfs_remove(
- xfs_inode_t *dp,
- struct xfs_name *name,
- xfs_inode_t *ip)
-{
- xfs_mount_t *mp = dp->i_mount;
- xfs_trans_t *tp = NULL;
- int is_dir = S_ISDIR(ip->i_d.di_mode);
- int error = 0;
- xfs_bmap_free_t free_list;
- xfs_fsblock_t first_block;
- int cancel_flags;
- int committed;
- int link_zero;
- uint resblks;
- uint log_count;
-
- trace_xfs_remove(dp, name);
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
-
- error = xfs_qm_dqattach(dp, 0);
- if (error)
- goto std_return;
-
- error = xfs_qm_dqattach(ip, 0);
- if (error)
- goto std_return;
-
- if (is_dir) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
- log_count = XFS_DEFAULT_LOG_COUNT;
- } else {
- tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
- log_count = XFS_REMOVE_LOG_COUNT;
- }
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-
- /*
- * We try to get the real space reservation first,
- * allowing for directory btree deletion(s) implying
- * possible bmap insert(s). If we can't get the space
- * reservation then we use 0 instead, and avoid the bmap
- * btree insert(s) in the directory code by, if the bmap
- * insert tries to happen, instead trimming the LAST
- * block from the directory.
- */
- resblks = XFS_REMOVE_SPACE_RES(mp);
- error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, log_count);
- if (error == ENOSPC) {
- resblks = 0;
- error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, log_count);
- }
- if (error) {
- ASSERT(error != ENOSPC);
- cancel_flags = 0;
- goto out_trans_cancel;
- }
-
- xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
-
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
- /*
- * If we're removing a directory perform some additional validation.
- */
- if (is_dir) {
- ASSERT(ip->i_d.di_nlink >= 2);
- if (ip->i_d.di_nlink != 2) {
- error = XFS_ERROR(ENOTEMPTY);
- goto out_trans_cancel;
- }
- if (!xfs_dir_isempty(ip)) {
- error = XFS_ERROR(ENOTEMPTY);
- goto out_trans_cancel;
- }
- }
-
- xfs_bmap_init(&free_list, &first_block);
- error = xfs_dir_removename(tp, dp, name, ip->i_ino,
- &first_block, &free_list, resblks);
- if (error) {
- ASSERT(error != ENOENT);
- goto out_bmap_cancel;
- }
- xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
- if (is_dir) {
- /*
- * Drop the link from ip's "..".
- */
- error = xfs_droplink(tp, dp);
- if (error)
- goto out_bmap_cancel;
-
- /*
- * Drop the "." link from ip to self.
- */
- error = xfs_droplink(tp, ip);
- if (error)
- goto out_bmap_cancel;
- } else {
- /*
- * When removing a non-directory we need to log the parent
- * inode here. For a directory this is done implicitly
- * by the xfs_droplink call for the ".." entry.
- */
- xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
- }
-
- /*
- * Drop the link from dp to ip.
- */
- error = xfs_droplink(tp, ip);
- if (error)
- goto out_bmap_cancel;
-
- /*
- * Determine if this is the last link while
- * we are in the transaction.
- */
- link_zero = (ip->i_d.di_nlink == 0);
-
- /*
- * If this is a synchronous mount, make sure that the
- * remove transaction goes to disk before returning to
- * the user.
- */
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
- xfs_trans_set_sync(tp);
-
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error)
- goto out_bmap_cancel;
-
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- if (error)
- goto std_return;
-
- /*
- * If we are using filestreams, kill the stream association.
- * If the file is still open it may get a new one but that
- * will get killed on last close in xfs_close() so we don't
- * have to worry about that.
- */
- if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
- xfs_filestream_deassociate(ip);
-
- return 0;
-
- out_bmap_cancel:
- xfs_bmap_cancel(&free_list);
- cancel_flags |= XFS_TRANS_ABORT;
- out_trans_cancel:
- xfs_trans_cancel(tp, cancel_flags);
- std_return:
- return error;
-}
-
-int
-xfs_link(
- xfs_inode_t *tdp,
- xfs_inode_t *sip,
- struct xfs_name *target_name)
-{
- xfs_mount_t *mp = tdp->i_mount;
- xfs_trans_t *tp;
- int error;
- xfs_bmap_free_t free_list;
- xfs_fsblock_t first_block;
- int cancel_flags;
- int committed;
- int resblks;
-
- trace_xfs_link(tdp, target_name);
-
- ASSERT(!S_ISDIR(sip->i_d.di_mode));
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
-
- error = xfs_qm_dqattach(sip, 0);
- if (error)
- goto std_return;
-
- error = xfs_qm_dqattach(tdp, 0);
- if (error)
- goto std_return;
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
- resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
- error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
- if (error == ENOSPC) {
- resblks = 0;
- error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
- }
- if (error) {
- cancel_flags = 0;
- goto error_return;
- }
-
- xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
-
- xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
-
- /*
- * If we are using project inheritance, we only allow hard link
- * creation in our tree when the project IDs are the same; else
- * the tree quota mechanism could be circumvented.
- */
- if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
- (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
- error = XFS_ERROR(EXDEV);
- goto error_return;
- }
-
- error = xfs_dir_canenter(tp, tdp, target_name, resblks);
- if (error)
- goto error_return;
-
- xfs_bmap_init(&free_list, &first_block);
-
- error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
- &first_block, &free_list, resblks);
- if (error)
- goto abort_return;
- xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
-
- error = xfs_bumplink(tp, sip);
- if (error)
- goto abort_return;
-
- /*
- * If this is a synchronous mount, make sure that the
- * link transaction goes to disk before returning to
- * the user.
- */
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
- xfs_trans_set_sync(tp);
- }
-
- error = xfs_bmap_finish (&tp, &free_list, &committed);
- if (error) {
- xfs_bmap_cancel(&free_list);
- goto abort_return;
- }
-
- return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-
- abort_return:
- cancel_flags |= XFS_TRANS_ABORT;
- error_return:
- xfs_trans_cancel(tp, cancel_flags);
- std_return:
- return error;
-}
-
-int
-xfs_symlink(
- xfs_inode_t *dp,
- struct xfs_name *link_name,
- const char *target_path,
- umode_t mode,
- xfs_inode_t **ipp)
-{
- xfs_mount_t *mp = dp->i_mount;
- xfs_trans_t *tp;
- xfs_inode_t *ip;
- int error;
- int pathlen;
- xfs_bmap_free_t free_list;
- xfs_fsblock_t first_block;
- boolean_t unlock_dp_on_error = B_FALSE;
- uint cancel_flags;
- int committed;
- xfs_fileoff_t first_fsb;
- xfs_filblks_t fs_blocks;
- int nmaps;
- xfs_bmbt_irec_t mval[SYMLINK_MAPS];
- xfs_daddr_t d;
- const char *cur_chunk;
- int byte_cnt;
- int n;
- xfs_buf_t *bp;
- prid_t prid;
- struct xfs_dquot *udqp, *gdqp;
- uint resblks;
-
- *ipp = NULL;
- error = 0;
- ip = NULL;
- tp = NULL;
-
- trace_xfs_symlink(dp, link_name);
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
-
- /*
- * Check component lengths of the target path name.
- */
- pathlen = strlen(target_path);
- if (pathlen >= MAXPATHLEN) /* total string too long */
- return XFS_ERROR(ENAMETOOLONG);
-
- udqp = gdqp = NULL;
- if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
- prid = xfs_get_projid(dp);
- else
- prid = XFS_PROJID_DEFAULT;
-
- /*
- * Make sure that we have allocated dquot(s) on disk.
- */
- error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
- XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
- if (error)
- goto std_return;
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
- /*
- * The symlink will fit into the inode data fork?
- * There can't be any attributes so we get the whole variable part.
- */
- if (pathlen <= XFS_LITINO(mp))
- fs_blocks = 0;
- else
- fs_blocks = XFS_B_TO_FSB(mp, pathlen);
- resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
- error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
- if (error == ENOSPC && fs_blocks == 0) {
- resblks = 0;
- error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
- }
- if (error) {
- cancel_flags = 0;
- goto error_return;
- }
-
- xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
- unlock_dp_on_error = B_TRUE;
-
- /*
- * Check whether the directory allows new symlinks or not.
- */
- if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
- error = XFS_ERROR(EPERM);
- goto error_return;
- }
-
- /*
- * Reserve disk quota : blocks and inode.
- */
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
- if (error)
- goto error_return;
-
- /*
- * Check for ability to enter directory entry, if no space reserved.
- */
- error = xfs_dir_canenter(tp, dp, link_name, resblks);
- if (error)
- goto error_return;
- /*
- * Initialize the bmap freelist prior to calling either
- * bmapi or the directory create code.
- */
- xfs_bmap_init(&free_list, &first_block);
-
- /*
- * Allocate an inode for the symlink.
- */
- error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
- prid, resblks > 0, &ip, NULL);
- if (error) {
- if (error == ENOSPC)
- goto error_return;
- goto error1;
- }
-
- /*
- * An error after we've joined dp to the transaction will result in the
- * transaction cancel unlocking dp so don't do it explicitly in the
- * error path.
- */
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
- unlock_dp_on_error = B_FALSE;
-
- /*
- * Also attach the dquot(s) to it, if applicable.
- */
- xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
-
- if (resblks)
- resblks -= XFS_IALLOC_SPACE_RES(mp);
- /*
- * If the symlink will fit into the inode, write it inline.
- */
- if (pathlen <= XFS_IFORK_DSIZE(ip)) {
- xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
- memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
- ip->i_d.di_size = pathlen;
-
- /*
- * The inode was initially created in extent format.
- */
- ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
- ip->i_df.if_flags |= XFS_IFINLINE;
-
- ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
-
- } else {
- first_fsb = 0;
- nmaps = SYMLINK_MAPS;
-
- error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
- XFS_BMAPI_METADATA, &first_block, resblks,
- mval, &nmaps, &free_list);
- if (error)
- goto error2;
-
- if (resblks)
- resblks -= fs_blocks;
- ip->i_d.di_size = pathlen;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
- cur_chunk = target_path;
- for (n = 0; n < nmaps; n++) {
- d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
- byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- BTOBB(byte_cnt), 0);
- if (!bp) {
- error = ENOMEM;
- goto error2;
- }
- if (pathlen < byte_cnt) {
- byte_cnt = pathlen;
- }
- pathlen -= byte_cnt;
-
- memcpy(bp->b_addr, cur_chunk, byte_cnt);
- cur_chunk += byte_cnt;
-
- xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
- }
- }
-
- /*
- * Create the directory entry for the symlink.
- */
- error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
- &first_block, &free_list, resblks);
- if (error)
- goto error2;
- xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-
- /*
- * If this is a synchronous mount, make sure that the
- * symlink transaction goes to disk before returning to
- * the user.
- */
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
- xfs_trans_set_sync(tp);
- }
-
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error) {
- goto error2;
- }
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- xfs_qm_dqrele(udqp);
- xfs_qm_dqrele(gdqp);
-
- *ipp = ip;
- return 0;
-
- error2:
- IRELE(ip);
- error1:
- xfs_bmap_cancel(&free_list);
- cancel_flags |= XFS_TRANS_ABORT;
- error_return:
- xfs_trans_cancel(tp, cancel_flags);
- xfs_qm_dqrele(udqp);
- xfs_qm_dqrele(gdqp);
-
- if (unlock_dp_on_error)
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- std_return:
- return error;
-}
-
-int
-xfs_set_dmattrs(
- xfs_inode_t *ip,
- u_int evmask,
- u_int16_t state)
-{
- xfs_mount_t *mp = ip->i_mount;
- xfs_trans_t *tp;
- int error;
-
- if (!capable(CAP_SYS_ADMIN))
- return XFS_ERROR(EPERM);
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
- error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
- if (error) {
- xfs_trans_cancel(tp, 0);
- return error;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
- ip->i_d.di_dmevmask = evmask;
- ip->i_d.di_dmstate = state;
-
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_trans_commit(tp, 0);
-
- return error;
-}
-
-/*
- * xfs_alloc_file_space()
- * This routine allocates disk space for the given file.
- *
- * If alloc_type == 0, this request is for an ALLOCSP type
- * request which will change the file size. In this case, no
- * DMAPI event will be generated by the call. A TRUNCATE event
- * will be generated later by xfs_setattr.
- *
- * If alloc_type != 0, this request is for a RESVSP type
- * request, and a DMAPI DM_EVENT_WRITE will be generated if the
- * lower block boundary byte address is less than the file's
- * length.
- *
- * RETURNS:
- * 0 on success
- * errno on error
- *
- */
-STATIC int
-xfs_alloc_file_space(
- xfs_inode_t *ip,
- xfs_off_t offset,
- xfs_off_t len,
- int alloc_type,
- int attr_flags)
-{
- xfs_mount_t *mp = ip->i_mount;
- xfs_off_t count;
- xfs_filblks_t allocated_fsb;
- xfs_filblks_t allocatesize_fsb;
- xfs_extlen_t extsz, temp;
- xfs_fileoff_t startoffset_fsb;
- xfs_fsblock_t firstfsb;
- int nimaps;
- int quota_flag;
- int rt;
- xfs_trans_t *tp;
- xfs_bmbt_irec_t imaps[1], *imapp;
- xfs_bmap_free_t free_list;
- uint qblocks, resblks, resrtextents;
- int committed;
- int error;
-
- trace_xfs_alloc_file_space(ip);
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
-
- error = xfs_qm_dqattach(ip, 0);
- if (error)
- return error;
-
- if (len <= 0)
- return XFS_ERROR(EINVAL);
-
- rt = XFS_IS_REALTIME_INODE(ip);
- extsz = xfs_get_extsz_hint(ip);
-
- count = len;
- imapp = &imaps[0];
- nimaps = 1;
- startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
- allocatesize_fsb = XFS_B_TO_FSB(mp, count);
-
- /*
- * Allocate file space until done or until there is an error
- */
- while (allocatesize_fsb && !error) {
- xfs_fileoff_t s, e;
-
- /*
- * Determine space reservations for data/realtime.
- */
- if (unlikely(extsz)) {
- s = startoffset_fsb;
- do_div(s, extsz);
- s *= extsz;
- e = startoffset_fsb + allocatesize_fsb;
- if ((temp = do_mod(startoffset_fsb, extsz)))
- e += temp;
- if ((temp = do_mod(e, extsz)))
- e += extsz - temp;
- } else {
- s = 0;
- e = allocatesize_fsb;
- }
-
- /*
- * The transaction reservation is limited to a 32-bit block
- * count, hence we need to limit the number of blocks we are
- * trying to reserve to avoid an overflow. We can't allocate
- * more than @nimaps extents, and an extent is limited on disk
- * to MAXEXTLEN (21 bits), so use that to enforce the limit.
- */
- resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
- if (unlikely(rt)) {
- resrtextents = qblocks = resblks;
- resrtextents /= mp->m_sb.sb_rextsize;
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
- quota_flag = XFS_QMOPT_RES_RTBLKS;
- } else {
- resrtextents = 0;
- resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
- quota_flag = XFS_QMOPT_RES_REGBLKS;
- }
-
- /*
- * Allocate and setup the transaction.
- */
- tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
- error = xfs_trans_reserve(tp, resblks,
- XFS_WRITE_LOG_RES(mp), resrtextents,
- XFS_TRANS_PERM_LOG_RES,
- XFS_WRITE_LOG_COUNT);
- /*
- * Check for running out of space
- */
- if (error) {
- /*
- * Free the transaction structure.
- */
- ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp, 0);
- break;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
- 0, quota_flag);
- if (error)
- goto error1;
-
- xfs_trans_ijoin(tp, ip, 0);
-
- xfs_bmap_init(&free_list, &firstfsb);
- error = xfs_bmapi_write(tp, ip, startoffset_fsb,
- allocatesize_fsb, alloc_type, &firstfsb,
- 0, imapp, &nimaps, &free_list);
- if (error) {
- goto error0;
- }
-
- /*
- * Complete the transaction
- */
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error) {
- goto error0;
- }
-
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error) {
- break;
- }
-
- allocated_fsb = imapp->br_blockcount;
-
- if (nimaps == 0) {
- error = XFS_ERROR(ENOSPC);
- break;
- }
-
- startoffset_fsb += allocated_fsb;
- allocatesize_fsb -= allocated_fsb;
- }
-
- return error;
-
-error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
- xfs_bmap_cancel(&free_list);
- xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
-
-error1: /* Just cancel transaction */
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return error;
-}
-
-/*
- * Zero file bytes between startoff and endoff inclusive.
- * The iolock is held exclusive and no blocks are buffered.
- *
- * This function is used by xfs_free_file_space() to zero
- * partial blocks when the range to free is not block aligned.
- * When unreserving space with boundaries that are not block
- * aligned we round up the start and round down the end
- * boundaries and then use this function to zero the parts of
- * the blocks that got dropped during the rounding.
- */
-STATIC int
-xfs_zero_remaining_bytes(
- xfs_inode_t *ip,
- xfs_off_t startoff,
- xfs_off_t endoff)
-{
- xfs_bmbt_irec_t imap;
- xfs_fileoff_t offset_fsb;
- xfs_off_t lastoffset;
- xfs_off_t offset;
- xfs_buf_t *bp;
- xfs_mount_t *mp = ip->i_mount;
- int nimap;
- int error = 0;
-
- /*
- * Avoid doing I/O beyond eof - it's not necessary
- * since nothing can read beyond eof. The space will
- * be zeroed when the file is extended anyway.
- */
- if (startoff >= XFS_ISIZE(ip))
- return 0;
-
- if (endoff > XFS_ISIZE(ip))
- endoff = XFS_ISIZE(ip);
-
- bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp,
- BTOBB(mp->m_sb.sb_blocksize), 0);
- if (!bp)
- return XFS_ERROR(ENOMEM);
-
- xfs_buf_unlock(bp);
-
- for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- nimap = 1;
- error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
- if (error || nimap < 1)
- break;
- ASSERT(imap.br_blockcount >= 1);
- ASSERT(imap.br_startoff == offset_fsb);
- lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
- if (lastoffset > endoff)
- lastoffset = endoff;
- if (imap.br_startblock == HOLESTARTBLOCK)
- continue;
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- if (imap.br_state == XFS_EXT_UNWRITTEN)
- continue;
- XFS_BUF_UNDONE(bp);
- XFS_BUF_UNWRITE(bp);
- XFS_BUF_READ(bp);
- XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
- xfsbdstrat(mp, bp);
- error = xfs_buf_iowait(bp);
- if (error) {
- xfs_buf_ioerror_alert(bp,
- "xfs_zero_remaining_bytes(read)");
- break;
- }
- memset(bp->b_addr +
- (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
- 0, lastoffset - offset + 1);
- XFS_BUF_UNDONE(bp);
- XFS_BUF_UNREAD(bp);
- XFS_BUF_WRITE(bp);
- xfsbdstrat(mp, bp);
- error = xfs_buf_iowait(bp);
- if (error) {
- xfs_buf_ioerror_alert(bp,
- "xfs_zero_remaining_bytes(write)");
- break;
- }
- }
- xfs_buf_free(bp);
- return error;
-}
-
-/*
- * xfs_free_file_space()
- * This routine frees disk space for the given file.
- *
- * This routine is only called by xfs_change_file_space
- * for an UNRESVSP type call.
- *
- * RETURNS:
- * 0 on success
- * errno on error
- *
- */
-STATIC int
-xfs_free_file_space(
- xfs_inode_t *ip,
- xfs_off_t offset,
- xfs_off_t len,
- int attr_flags)
-{
- int committed;
- int done;
- xfs_fileoff_t endoffset_fsb;
- int error;
- xfs_fsblock_t firstfsb;
- xfs_bmap_free_t free_list;
- xfs_bmbt_irec_t imap;
- xfs_off_t ioffset;
- xfs_extlen_t mod=0;
- xfs_mount_t *mp;
- int nimap;
- uint resblks;
- uint rounding;
- int rt;
- xfs_fileoff_t startoffset_fsb;
- xfs_trans_t *tp;
- int need_iolock = 1;
-
- mp = ip->i_mount;
-
- trace_xfs_free_file_space(ip);
-
- error = xfs_qm_dqattach(ip, 0);
- if (error)
- return error;
-
- error = 0;
- if (len <= 0) /* if nothing being freed */
- return error;
- rt = XFS_IS_REALTIME_INODE(ip);
- startoffset_fsb = XFS_B_TO_FSB(mp, offset);
- endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
-
- if (attr_flags & XFS_ATTR_NOLOCK)
- need_iolock = 0;
- if (need_iolock) {
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
- /* wait for the completion of any pending DIOs */
- inode_dio_wait(VFS_I(ip));
- }
-
- rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
- ioffset = offset & ~(rounding - 1);
- error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- ioffset, -1);
- if (error)
- goto out_unlock_iolock;
- truncate_pagecache_range(VFS_I(ip), ioffset, -1);
-
- /*
- * Need to zero the stuff we're not freeing, on disk.
- * If it's a realtime file & can't use unwritten extents then we
- * actually need to zero the extent edges. Otherwise xfs_bunmapi
- * will take care of it for us.
- */
- if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
- nimap = 1;
- error = xfs_bmapi_read(ip, startoffset_fsb, 1,
- &imap, &nimap, 0);
- if (error)
- goto out_unlock_iolock;
- ASSERT(nimap == 0 || nimap == 1);
- if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
- xfs_daddr_t block;
-
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- block = imap.br_startblock;
- mod = do_div(block, mp->m_sb.sb_rextsize);
- if (mod)
- startoffset_fsb += mp->m_sb.sb_rextsize - mod;
- }
- nimap = 1;
- error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
- &imap, &nimap, 0);
- if (error)
- goto out_unlock_iolock;
- ASSERT(nimap == 0 || nimap == 1);
- if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- mod++;
- if (mod && (mod != mp->m_sb.sb_rextsize))
- endoffset_fsb -= mod;
- }
- }
- if ((done = (endoffset_fsb <= startoffset_fsb)))
- /*
- * One contiguous piece to clear
- */
- error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
- else {
- /*
- * Some full blocks, possibly two pieces to clear
- */
- if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
- error = xfs_zero_remaining_bytes(ip, offset,
- XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
- if (!error &&
- XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
- error = xfs_zero_remaining_bytes(ip,
- XFS_FSB_TO_B(mp, endoffset_fsb),
- offset + len - 1);
- }
-
- /*
- * free file space until done or until there is an error
- */
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
- while (!error && !done) {
-
- /*
- * allocate and setup the transaction. Allow this
- * transaction to dip into the reserve blocks to ensure
- * the freeing of the space succeeds at ENOSPC.
- */
- tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
- tp->t_flags |= XFS_TRANS_RESERVE;
- error = xfs_trans_reserve(tp,
- resblks,
- XFS_WRITE_LOG_RES(mp),
- 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_WRITE_LOG_COUNT);
-
- /*
- * check for running out of space
- */
- if (error) {
- /*
- * Free the transaction structure.
- */
- ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp, 0);
- break;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota(tp, mp,
- ip->i_udquot, ip->i_gdquot,
- resblks, 0, XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto error1;
-
- xfs_trans_ijoin(tp, ip, 0);
-
- /*
- * issue the bunmapi() call to free the blocks
- */
- xfs_bmap_init(&free_list, &firstfsb);
- error = xfs_bunmapi(tp, ip, startoffset_fsb,
- endoffset_fsb - startoffset_fsb,
- 0, 2, &firstfsb, &free_list, &done);
- if (error) {
- goto error0;
- }
-
- /*
- * complete the transaction
- */
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error) {
- goto error0;
- }
-
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
-
- out_unlock_iolock:
- if (need_iolock)
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return error;
-
- error0:
- xfs_bmap_cancel(&free_list);
- error1:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
- xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
- XFS_ILOCK_EXCL);
- return error;
-}
-
-
-STATIC int
-xfs_zero_file_space(
- struct xfs_inode *ip,
- xfs_off_t offset,
- xfs_off_t len,
- int attr_flags)
-{
- struct xfs_mount *mp = ip->i_mount;
- uint granularity;
- xfs_off_t start_boundary;
- xfs_off_t end_boundary;
- int error;
-
- granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
-
- /*
- * Round the range of extents we are going to convert inwards. If the
- * offset is aligned, then it doesn't get changed so we zero from the
- * start of the block offset points to.
- */
- start_boundary = round_up(offset, granularity);
- end_boundary = round_down(offset + len, granularity);
-
- ASSERT(start_boundary >= offset);
- ASSERT(end_boundary <= offset + len);
-
- if (!(attr_flags & XFS_ATTR_NOLOCK))
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
- if (start_boundary < end_boundary - 1) {
- /* punch out the page cache over the conversion range */
- truncate_pagecache_range(VFS_I(ip), start_boundary,
- end_boundary - 1);
- /* convert the blocks */
- error = xfs_alloc_file_space(ip, start_boundary,
- end_boundary - start_boundary - 1,
- XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
- attr_flags);
- if (error)
- goto out_unlock;
-
- /* We've handled the interior of the range, now for the edges */
- if (start_boundary != offset)
- error = xfs_iozero(ip, offset, start_boundary - offset);
- if (error)
- goto out_unlock;
-
- if (end_boundary != offset + len)
- error = xfs_iozero(ip, end_boundary,
- offset + len - end_boundary);
-
- } else {
- /*
- * It's either a sub-granularity range or the range spanned lies
- * partially across two adjacent blocks.
- */
- error = xfs_iozero(ip, offset, len);
- }
-
-out_unlock:
- if (!(attr_flags & XFS_ATTR_NOLOCK))
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return error;
-
-}
-
-/*
- * xfs_change_file_space()
- * This routine allocates or frees disk space for the given file.
- * The user specified parameters are checked for alignment and size
- * limitations.
- *
- * RETURNS:
- * 0 on success
- * errno on error
- *
- */
-int
-xfs_change_file_space(
- xfs_inode_t *ip,
- int cmd,
- xfs_flock64_t *bf,
- xfs_off_t offset,
- int attr_flags)
-{
- xfs_mount_t *mp = ip->i_mount;
- int clrprealloc;
- int error;
- xfs_fsize_t fsize;
- int setprealloc;
- xfs_off_t startoffset;
- xfs_trans_t *tp;
- struct iattr iattr;
-
- if (!S_ISREG(ip->i_d.di_mode))
- return XFS_ERROR(EINVAL);
-
- switch (bf->l_whence) {
- case 0: /*SEEK_SET*/
- break;
- case 1: /*SEEK_CUR*/
- bf->l_start += offset;
- break;
- case 2: /*SEEK_END*/
- bf->l_start += XFS_ISIZE(ip);
- break;
- default:
- return XFS_ERROR(EINVAL);
- }
-
- /*
- * length of <= 0 for resv/unresv/zero is invalid. length for
- * alloc/free is ignored completely and we have no idea what userspace
- * might have set it to, so set it to zero to allow range
- * checks to pass.
- */
- switch (cmd) {
- case XFS_IOC_ZERO_RANGE:
- case XFS_IOC_RESVSP:
- case XFS_IOC_RESVSP64:
- case XFS_IOC_UNRESVSP:
- case XFS_IOC_UNRESVSP64:
- if (bf->l_len <= 0)
- return XFS_ERROR(EINVAL);
- break;
- default:
- bf->l_len = 0;
- break;
- }
-
- if (bf->l_start < 0 ||
- bf->l_start > mp->m_super->s_maxbytes ||
- bf->l_start + bf->l_len < 0 ||
- bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
- return XFS_ERROR(EINVAL);
-
- bf->l_whence = 0;
-
- startoffset = bf->l_start;
- fsize = XFS_ISIZE(ip);
-
- setprealloc = clrprealloc = 0;
- switch (cmd) {
- case XFS_IOC_ZERO_RANGE:
- error = xfs_zero_file_space(ip, startoffset, bf->l_len,
- attr_flags);
- if (error)
- return error;
- setprealloc = 1;
- break;
-
- case XFS_IOC_RESVSP:
- case XFS_IOC_RESVSP64:
- error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
- XFS_BMAPI_PREALLOC, attr_flags);
- if (error)
- return error;
- setprealloc = 1;
- break;
-
- case XFS_IOC_UNRESVSP:
- case XFS_IOC_UNRESVSP64:
- if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
- attr_flags)))
- return error;
- break;
-
- case XFS_IOC_ALLOCSP:
- case XFS_IOC_ALLOCSP64:
- case XFS_IOC_FREESP:
- case XFS_IOC_FREESP64:
- /*
- * These operations actually do IO when extending the file, but
- * the allocation is done seperately to the zeroing that is
- * done. This set of operations need to be serialised against
- * other IO operations, such as truncate and buffered IO. We
- * need to take the IOLOCK here to serialise the allocation and
- * zeroing IO to prevent other IOLOCK holders (e.g. getbmap,
- * truncate, direct IO) from racing against the transient
- * allocated but not written state we can have here.
- */
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
- if (startoffset > fsize) {
- error = xfs_alloc_file_space(ip, fsize,
- startoffset - fsize, 0,
- attr_flags | XFS_ATTR_NOLOCK);
- if (error) {
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- break;
- }
- }
-
- iattr.ia_valid = ATTR_SIZE;
- iattr.ia_size = startoffset;
-
- error = xfs_setattr_size(ip, &iattr,
- attr_flags | XFS_ATTR_NOLOCK);
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-
- if (error)
- return error;
-
- clrprealloc = 1;
- break;
-
- default:
- ASSERT(0);
- return XFS_ERROR(EINVAL);
- }
-
- /*
- * update the inode timestamp, mode, and prealloc flag bits
- */
- tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
-
- if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
- 0, 0, 0))) {
- /* ASSERT(0); */
- xfs_trans_cancel(tp, 0);
- return error;
- }
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
- if ((attr_flags & XFS_ATTR_DMI) == 0) {
- ip->i_d.di_mode &= ~S_ISUID;
-
- /*
- * Note that we don't have to worry about mandatory
- * file locking being disabled here because we only
- * clear the S_ISGID bit if the Group execute bit is
- * on, but if it was on then mandatory locking wouldn't
- * have been enabled.
- */
- if (ip->i_d.di_mode & S_IXGRP)
- ip->i_d.di_mode &= ~S_ISGID;
-
- xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- }
- if (setprealloc)
- ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
- else if (clrprealloc)
- ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
-
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (attr_flags & XFS_ATTR_SYNC)
- xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp, 0);
-}
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
deleted file mode 100644
index 5163022d980..00000000000
--- a/fs/xfs/xfs_vnodeops.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef _XFS_VNODEOPS_H
-#define _XFS_VNODEOPS_H 1
-
-struct attrlist_cursor_kern;
-struct file;
-struct iattr;
-struct inode;
-struct iovec;
-struct kiocb;
-struct pipe_inode_info;
-struct uio;
-struct xfs_inode;
-
-
-int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags);
-int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap, int flags);
-#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */
-#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */
-#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */
-#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */
-#define XFS_ATTR_SYNC 0x10 /* synchronous operation required */
-
-int xfs_readlink(struct xfs_inode *ip, char *link);
-int xfs_release(struct xfs_inode *ip);
-int xfs_inactive(struct xfs_inode *ip);
-int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
- struct xfs_inode **ipp, struct xfs_name *ci_name);
-int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode,
- xfs_dev_t rdev, struct xfs_inode **ipp);
-int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
- struct xfs_inode *ip);
-int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
- struct xfs_name *target_name);
-int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
- xfs_off_t *offset, filldir_t filldir);
-int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
- const char *target_path, umode_t mode, struct xfs_inode **ipp);
-int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
-int xfs_change_file_space(struct xfs_inode *ip, int cmd,
- xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
-int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
- struct xfs_inode *src_ip, struct xfs_inode *target_dp,
- struct xfs_name *target_name, struct xfs_inode *target_ip);
-int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
- unsigned char *value, int *valuelenp, int flags);
-int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
- unsigned char *value, int valuelen, int flags);
-int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
-int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
- int flags, struct attrlist_cursor_kern *cursor);
-
-int xfs_iozero(struct xfs_inode *, loff_t, size_t);
-int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
-int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
-
-#endif /* _XFS_VNODEOPS_H */
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 87d3e03878c..78ed92a46fd 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -17,13 +17,17 @@
*/
#include "xfs.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_inode.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
#include "xfs_acl.h"
-#include "xfs_vnodeops.h"
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
@@ -98,8 +102,8 @@ const struct xattr_handler *xfs_xattr_handlers[] = {
&xfs_xattr_trusted_handler,
&xfs_xattr_security_handler,
#ifdef CONFIG_XFS_POSIX_ACL
- &xfs_xattr_acl_access_handler,
- &xfs_xattr_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
NULL
};