diff options
Diffstat (limited to 'fs/xfs')
152 files changed, 32082 insertions, 23947 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 5a7ffe54f5d..399e8cec6e6 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -69,9 +69,22 @@ config XFS_RT If unsure, say N. +config XFS_WARN + bool "XFS Verbose Warnings" + depends on XFS_FS && !XFS_DEBUG + help + Say Y here to get an XFS build with many additional warnings. + It converts ASSERT checks to WARN, so will log any out-of-bounds + conditions that occur that would otherwise be missed. It is much + lighter weight than XFS_DEBUG and does not modify algorithms and will + not cause the kernel to panic on non-fatal errors. + + However, similar to XFS_DEBUG, it is only advisable to use this if you + are debugging a particular problem. + config XFS_DEBUG - bool "XFS Debugging support (EXPERIMENTAL)" - depends on XFS_FS && EXPERIMENTAL + bool "XFS Debugging support" + depends on XFS_FS help Say Y here to get an XFS build with many debugging features, including ASSERT checks, function wrappers around macros, diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index d02201df855..c21f4350666 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -27,9 +27,12 @@ xfs-y += xfs_trace.o # highlevel code xfs-y += xfs_aops.o \ + xfs_attr_inactive.o \ + xfs_attr_list.o \ xfs_bit.o \ + xfs_bmap_util.o \ xfs_buf.o \ - xfs_dfrag.o \ + xfs_dir2_readdir.o \ xfs_discard.o \ xfs_error.o \ xfs_export.o \ @@ -44,12 +47,12 @@ xfs-y += xfs_aops.o \ xfs_iops.o \ xfs_itable.o \ xfs_message.o \ + xfs_mount.o \ xfs_mru_cache.o \ xfs_super.o \ + xfs_symlink.o \ + xfs_trans.o \ xfs_xattr.o \ - xfs_rename.o \ - xfs_utils.o \ - xfs_vnodeops.o \ kmem.o \ uuid.o @@ -58,22 +61,30 @@ xfs-y += xfs_alloc.o \ xfs_alloc_btree.o \ xfs_attr.o \ xfs_attr_leaf.o \ + xfs_attr_remote.o \ xfs_bmap.o \ xfs_bmap_btree.o \ xfs_btree.o \ xfs_da_btree.o \ + xfs_da_format.o \ xfs_dir2.o \ xfs_dir2_block.o \ xfs_dir2_data.o \ xfs_dir2_leaf.o \ xfs_dir2_node.o \ xfs_dir2_sf.o \ + xfs_dquot_buf.o \ xfs_ialloc.o \ xfs_ialloc_btree.o \ + xfs_icreate_item.o \ xfs_inode.o \ + xfs_inode_fork.o \ + xfs_inode_buf.o \ xfs_log_recover.o \ - xfs_mount.o \ - xfs_trans.o + xfs_log_rlimit.o \ + xfs_sb.o \ + xfs_symlink_remote.o \ + xfs_trans_resv.o # low-level transaction/log code xfs-y += xfs_log.o \ @@ -94,7 +105,11 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ xfs_qm_bhv.o \ xfs_qm.o \ xfs_quotaops.o -xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o + +# xfs_rtbitmap is shared with libxfs +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \ + xfs_rtbitmap.o + xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_PROC_FS) += xfs_stats.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 4a7286c1dc8..844e288b957 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -27,8 +27,6 @@ /* * Greedy allocation. May fail and may return vmalloced memory. - * - * Must be freed using kmem_free_large. */ void * kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) @@ -36,7 +34,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) void *ptr; size_t kmsize = maxsize; - while (!(ptr = kmem_zalloc_large(kmsize))) { + while (!(ptr = vzalloc(kmsize))) { if ((kmsize >>= 1) <= minsize) kmsize = minsize; } @@ -65,13 +63,32 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) } void * -kmem_zalloc(size_t size, xfs_km_flags_t flags) +kmem_zalloc_large(size_t size, xfs_km_flags_t flags) { + unsigned noio_flag = 0; void *ptr; + gfp_t lflags; - ptr = kmem_alloc(size, flags); + ptr = kmem_zalloc(size, flags | KM_MAYFAIL); if (ptr) - memset((char *)ptr, 0, (int)size); + return ptr; + + /* + * __vmalloc() will allocate data pages and auxillary structures (e.g. + * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context + * here. Hence we need to tell memory reclaim that we are in such a + * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering + * the filesystem here and potentially deadlocking. + */ + if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + noio_flag = memalloc_noio_save(); + + lflags = kmem_flags_convert(flags); + ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + + if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + memalloc_noio_restore(noio_flag); + return ptr; } @@ -119,14 +136,3 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags) congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } - -void * -kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags) -{ - void *ptr; - - ptr = kmem_zone_alloc(zone, flags); - if (ptr) - memset((char *)ptr, 0, kmem_cache_size(zone)); - return ptr; -} diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index b2f2620f9a8..64db0e53ede 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -32,6 +32,7 @@ typedef unsigned __bitwise xfs_km_flags_t; #define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u) #define KM_NOFS ((__force xfs_km_flags_t)0x0004u) #define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u) +#define KM_ZERO ((__force xfs_km_flags_t)0x0010u) /* * We use a special process flag to avoid recursive callbacks into @@ -43,7 +44,7 @@ kmem_flags_convert(xfs_km_flags_t flags) { gfp_t lflags; - BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL)); + BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO)); if (flags & KM_NOSLEEP) { lflags = GFP_ATOMIC | __GFP_NOWARN; @@ -52,25 +53,27 @@ kmem_flags_convert(xfs_km_flags_t flags) if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) lflags &= ~__GFP_FS; } + + if (flags & KM_ZERO) + lflags |= __GFP_ZERO; + return lflags; } extern void *kmem_alloc(size_t, xfs_km_flags_t); -extern void *kmem_zalloc(size_t, xfs_km_flags_t); +extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t); extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t); extern void kmem_free(const void *); -static inline void *kmem_zalloc_large(size_t size) -{ - return vzalloc(size); -} -static inline void kmem_free_large(void *ptr) -{ - vfree(ptr); -} extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); +static inline void * +kmem_zalloc(size_t size, xfs_km_flags_t flags) +{ + return kmem_alloc(size, flags | KM_ZERO); +} + /* * Zone interfaces */ @@ -109,6 +112,11 @@ kmem_zone_destroy(kmem_zone_t *zone) } extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t); -extern void *kmem_zone_zalloc(kmem_zone_t *, xfs_km_flags_t); + +static inline void * +kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags) +{ + return kmem_zone_alloc(zone, flags | KM_ZERO); +} #endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/fs/xfs/mrlock.h b/fs/xfs/mrlock.h index ff6a19873e5..e3c92d19e54 100644 --- a/fs/xfs/mrlock.h +++ b/fs/xfs/mrlock.h @@ -22,12 +22,12 @@ typedef struct { struct rw_semaphore mr_lock; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) int mr_writer; #endif } mrlock_t; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) #define mrinit(mrp, name) \ do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) #else @@ -46,7 +46,7 @@ static inline void mraccess_nested(mrlock_t *mrp, int subclass) static inline void mrupdate_nested(mrlock_t *mrp, int subclass) { down_write_nested(&mrp->mr_lock, subclass); -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) mrp->mr_writer = 1; #endif } @@ -60,7 +60,7 @@ static inline int mrtryupdate(mrlock_t *mrp) { if (!down_write_trylock(&mrp->mr_lock)) return 0; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) mrp->mr_writer = 1; #endif return 1; @@ -68,7 +68,7 @@ static inline int mrtryupdate(mrlock_t *mrp) static inline void mrunlock_excl(mrlock_t *mrp) { -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) mrp->mr_writer = 0; #endif up_write(&mrp->mr_lock); @@ -81,7 +81,7 @@ static inline void mrunlock_shared(mrlock_t *mrp) static inline void mrdemote(mrlock_t *mrp) { -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) mrp->mr_writer = 0; #endif downgrade_write(&mrp->mr_lock); diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index d8b11b7f94a..a742c47f7d5 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -24,6 +24,11 @@ #define XFS_BUF_LOCK_TRACKING 1 #endif +#ifdef CONFIG_XFS_WARN +#define XFS_WARN 1 +#endif + + #include "xfs_linux.h" #endif /* __XFS_H__ */ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 1d32f1d5276..6888ad886ff 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -16,11 +16,15 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_ag.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" #include "xfs_acl.h" #include "xfs_attr.h" -#include "xfs_bmap_btree.h" -#include "xfs_inode.h" -#include "xfs_vnodeops.h" #include "xfs_trace.h" #include <linux/slab.h> #include <linux/xattr.h> @@ -34,7 +38,9 @@ */ STATIC struct posix_acl * -xfs_acl_from_disk(struct xfs_acl *aclp) +xfs_acl_from_disk( + struct xfs_acl *aclp, + int max_entries) { struct posix_acl_entry *acl_e; struct posix_acl *acl; @@ -42,7 +48,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp) unsigned int count, i; count = be32_to_cpu(aclp->acl_cnt); - if (count > XFS_ACL_MAX_ENTRIES) + if (count > max_entries) return ERR_PTR(-EFSCORRUPTED); acl = posix_acl_alloc(count, GFP_KERNEL); @@ -64,14 +70,15 @@ xfs_acl_from_disk(struct xfs_acl *aclp) switch (acl_e->e_tag) { case ACL_USER: + acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id)); + break; case ACL_GROUP: - acl_e->e_id = be32_to_cpu(ace->ae_id); + acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id)); break; case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: - acl_e->e_id = ACL_UNDEFINED_ID; break; default: goto fail; @@ -97,7 +104,18 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) acl_e = &acl->a_entries[i]; ace->ae_tag = cpu_to_be32(acl_e->e_tag); - ace->ae_id = cpu_to_be32(acl_e->e_id); + switch (acl_e->e_tag) { + case ACL_USER: + ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid)); + break; + case ACL_GROUP: + ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid)); + break; + default: + ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID); + break; + } + ace->ae_perm = cpu_to_be16(acl_e->e_perm); } } @@ -106,15 +124,11 @@ struct posix_acl * xfs_get_acl(struct inode *inode, int type) { struct xfs_inode *ip = XFS_I(inode); - struct posix_acl *acl; + struct posix_acl *acl = NULL; struct xfs_acl *xfs_acl; - int len = sizeof(struct xfs_acl); unsigned char *ea_name; int error; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; + int len; trace_xfs_get_acl(ip); @@ -133,8 +147,8 @@ xfs_get_acl(struct inode *inode, int type) * If we have a cached ACLs value just return it, not need to * go out to the disk. */ - - xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL); + len = XFS_ACL_MAX_SIZE(ip->i_mount); + xfs_acl = kmem_zalloc_large(len, KM_SLEEP); if (!xfs_acl) return ERR_PTR(-ENOMEM); @@ -146,34 +160,29 @@ xfs_get_acl(struct inode *inode, int type) * cache entry, for any other error assume it is transient and * leave the cache entry as ACL_NOT_CACHED. */ - if (error == -ENOATTR) { - acl = NULL; + if (error == -ENOATTR) goto out_update_cache; - } goto out; } - acl = xfs_acl_from_disk(xfs_acl); + acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount)); if (IS_ERR(acl)) goto out; - out_update_cache: +out_update_cache: set_cached_acl(inode, type, acl); - out: - kfree(xfs_acl); +out: + kmem_free(xfs_acl); return acl; } STATIC int -xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) { struct xfs_inode *ip = XFS_I(inode); unsigned char *ea_name; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: ea_name = SGI_ACL_FILE; @@ -189,21 +198,22 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) if (acl) { struct xfs_acl *xfs_acl; - int len; + int len = XFS_ACL_MAX_SIZE(ip->i_mount); - xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL); + xfs_acl = kmem_zalloc_large(len, KM_SLEEP); if (!xfs_acl) return -ENOMEM; xfs_acl_to_disk(xfs_acl, acl); - len = sizeof(struct xfs_acl) - - (sizeof(struct xfs_acl_entry) * - (XFS_ACL_MAX_ENTRIES - acl->a_count)); + + /* subtract away the unused acl entries */ + len -= sizeof(struct xfs_acl_entry) * + (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, len, ATTR_ROOT); - kfree(xfs_acl); + kmem_free(xfs_acl); } else { /* * A NULL ACL argument means we want to remove the ACL. @@ -243,7 +253,7 @@ xfs_set_mode(struct inode *inode, umode_t mode) static int xfs_acl_exists(struct inode *inode, unsigned char *name) { - int len = sizeof(struct xfs_acl); + int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb)); return (xfs_attr_get(XFS_I(inode), name, NULL, &len, ATTR_ROOT|ATTR_KERNOVAL) == 0); @@ -263,131 +273,23 @@ posix_acl_default_exists(struct inode *inode) return xfs_acl_exists(inode, SGI_ACL_DEFAULT); } -/* - * No need for i_mutex because the inode is not yet exposed to the VFS. - */ -int -xfs_inherit_acl(struct inode *inode, struct posix_acl *acl) -{ - umode_t mode = inode->i_mode; - int error = 0, inherit = 0; - - if (S_ISDIR(inode->i_mode)) { - error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, acl); - if (error) - goto out; - } - - error = posix_acl_create(&acl, GFP_KERNEL, &mode); - if (error < 0) - return error; - - /* - * If posix_acl_create returns a positive value we need to - * inherit a permission that can't be represented using the Unix - * mode bits and we actually need to set an ACL. - */ - if (error > 0) - inherit = 1; - - error = xfs_set_mode(inode, mode); - if (error) - goto out; - - if (inherit) - error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl); - -out: - posix_acl_release(acl); - return error; -} - int -xfs_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int error; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; - - error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return error; -} - -static int -xfs_xattr_acl_get(struct dentry *dentry, const char *name, - void *value, size_t size, int type) -{ - struct posix_acl *acl; - int error; - - acl = xfs_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - - error = posix_acl_to_xattr(&init_user_ns, acl, value, size); - posix_acl_release(acl); - - return error; -} - -static int -xfs_xattr_acl_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; int error = 0; - if (flags & XATTR_CREATE) - return -EINVAL; - if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) - return value ? -EACCES : 0; - if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER)) - return -EPERM; - - if (!value) + if (!acl) goto set_acl; - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (!acl) { - /* - * acl_set_file(3) may request that we set default ACLs with - * zero length -- defend (gracefully) against that here. - */ - goto out; - } - if (IS_ERR(acl)) { - error = PTR_ERR(acl); - goto out; - } - - error = posix_acl_valid(acl); - if (error) - goto out_release; - - error = -EINVAL; - if (acl->a_count > XFS_ACL_MAX_ENTRIES) - goto out_release; + error = -E2BIG; + if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) + return error; if (type == ACL_TYPE_ACCESS) { umode_t mode = inode->i_mode; error = posix_acl_equiv_mode(acl, &mode); if (error <= 0) { - posix_acl_release(acl); acl = NULL; if (error < 0) @@ -396,27 +298,9 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, error = xfs_set_mode(inode, mode); if (error) - goto out_release; + return error; } set_acl: - error = xfs_set_acl(inode, type, acl); - out_release: - posix_acl_release(acl); - out: - return error; + return __xfs_set_acl(inode, type, acl); } - -const struct xattr_handler xfs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = xfs_xattr_acl_get, - .set = xfs_xattr_acl_set, -}; - -const struct xattr_handler xfs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = xfs_xattr_acl_get, - .set = xfs_xattr_acl_set, -}; diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 39632d94135..5dc16374451 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -22,19 +22,36 @@ struct inode; struct posix_acl; struct xfs_inode; -#define XFS_ACL_MAX_ENTRIES 25 #define XFS_ACL_NOT_PRESENT (-1) /* On-disk XFS access control list structure */ +struct xfs_acl_entry { + __be32 ae_tag; + __be32 ae_id; + __be16 ae_perm; + __be16 ae_pad; /* fill the implicit hole in the structure */ +}; + struct xfs_acl { - __be32 acl_cnt; - struct xfs_acl_entry { - __be32 ae_tag; - __be32 ae_id; - __be16 ae_perm; - } acl_entry[XFS_ACL_MAX_ENTRIES]; + __be32 acl_cnt; + struct xfs_acl_entry acl_entry[0]; }; +/* + * The number of ACL entries allowed is defined by the on-disk format. + * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is + * limited only by the maximum size of the xattr that stores the information. + */ +#define XFS_ACL_MAX_ENTRIES(mp) \ + (xfs_sb_version_hascrc(&mp->m_sb) \ + ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ + sizeof(struct xfs_acl_entry) \ + : 25) + +#define XFS_ACL_MAX_SIZE(mp) \ + (sizeof(struct xfs_acl) + \ + sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) + /* On-disk XFS extended attribute names */ #define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" #define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" @@ -43,20 +60,15 @@ struct xfs_acl { #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); -extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); -extern int xfs_acl_chmod(struct inode *inode); +extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int posix_acl_access_exists(struct inode *inode); extern int posix_acl_default_exists(struct inode *inode); - -extern const struct xattr_handler xfs_xattr_acl_access_handler; -extern const struct xattr_handler xfs_xattr_acl_default_handler; #else static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) { return NULL; } -# define xfs_inherit_acl(inode, default_acl) 0 -# define xfs_acl_chmod(inode) 0 +# define xfs_set_acl NULL # define posix_acl_access_exists(inode) 0 # define posix_acl_default_exists(inode) 0 #endif /* CONFIG_XFS_POSIX_ACL */ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index f2aeedb6a57..6e247a99f5d 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -30,6 +30,7 @@ struct xfs_trans; #define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */ #define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */ +#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */ #define XFS_AGF_VERSION 1 #define XFS_AGI_VERSION 1 @@ -63,14 +64,33 @@ typedef struct xfs_agf { __be32 agf_spare0; /* spare field */ __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ __be32 agf_spare1; /* spare field */ + __be32 agf_flfirst; /* first freelist block's index */ __be32 agf_fllast; /* last freelist block's index */ __be32 agf_flcount; /* count of blocks in freelist */ __be32 agf_freeblks; /* total free blocks */ + __be32 agf_longest; /* longest free space */ __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ + uuid_t agf_uuid; /* uuid of filesystem */ + + /* + * reserve some contiguous space for future logged fields before we add + * the unlogged fields. This makes the range logging via flags and + * structure offsets much simpler. + */ + __be64 agf_spare64[16]; + + /* unlogged fields, written during buffer writeback. */ + __be64 agf_lsn; /* last write sequence */ + __be32 agf_crc; /* crc of agf sector */ + __be32 agf_spare2; + + /* structure must be padded to 64 bit alignment */ } xfs_agf_t; +#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc) + #define XFS_AGF_MAGICNUM 0x00000001 #define XFS_AGF_VERSIONNUM 0x00000002 #define XFS_AGF_SEQNO 0x00000004 @@ -83,7 +103,8 @@ typedef struct xfs_agf { #define XFS_AGF_FREEBLKS 0x00000200 #define XFS_AGF_LONGEST 0x00000400 #define XFS_AGF_BTREEBLKS 0x00000800 -#define XFS_AGF_NUM_BITS 12 +#define XFS_AGF_UUID 0x00001000 +#define XFS_AGF_NUM_BITS 13 #define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) #define XFS_AGF_FLAGS \ @@ -98,7 +119,8 @@ typedef struct xfs_agf { { XFS_AGF_FLCOUNT, "FLCOUNT" }, \ { XFS_AGF_FREEBLKS, "FREEBLKS" }, \ { XFS_AGF_LONGEST, "LONGEST" }, \ - { XFS_AGF_BTREEBLKS, "BTREEBLKS" } + { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ + { XFS_AGF_UUID, "UUID" } /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) @@ -108,8 +130,6 @@ typedef struct xfs_agf { extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); -extern const struct xfs_buf_ops xfs_agf_buf_ops; - /* * Size of the unlinked inode hash table in the agi. */ @@ -132,6 +152,7 @@ typedef struct xfs_agi { __be32 agi_root; /* root of inode btree */ __be32 agi_level; /* levels in inode btree */ __be32 agi_freecount; /* number of free inodes */ + __be32 agi_newino; /* new inode just allocated */ __be32 agi_dirino; /* last directory inode chunk */ /* @@ -139,21 +160,38 @@ typedef struct xfs_agi { * still being referenced. */ __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; + /* + * This marks the end of logging region 1 and start of logging region 2. + */ + uuid_t agi_uuid; /* uuid of filesystem */ + __be32 agi_crc; /* crc of agi sector */ + __be32 agi_pad32; + __be64 agi_lsn; /* last write sequence */ + + __be32 agi_free_root; /* root of the free inode btree */ + __be32 agi_free_level;/* levels in free inode btree */ + + /* structure must be padded to 64 bit alignment */ } xfs_agi_t; -#define XFS_AGI_MAGICNUM 0x00000001 -#define XFS_AGI_VERSIONNUM 0x00000002 -#define XFS_AGI_SEQNO 0x00000004 -#define XFS_AGI_LENGTH 0x00000008 -#define XFS_AGI_COUNT 0x00000010 -#define XFS_AGI_ROOT 0x00000020 -#define XFS_AGI_LEVEL 0x00000040 -#define XFS_AGI_FREECOUNT 0x00000080 -#define XFS_AGI_NEWINO 0x00000100 -#define XFS_AGI_DIRINO 0x00000200 -#define XFS_AGI_UNLINKED 0x00000400 -#define XFS_AGI_NUM_BITS 11 -#define XFS_AGI_ALL_BITS ((1 << XFS_AGI_NUM_BITS) - 1) +#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) + +#define XFS_AGI_MAGICNUM (1 << 0) +#define XFS_AGI_VERSIONNUM (1 << 1) +#define XFS_AGI_SEQNO (1 << 2) +#define XFS_AGI_LENGTH (1 << 3) +#define XFS_AGI_COUNT (1 << 4) +#define XFS_AGI_ROOT (1 << 5) +#define XFS_AGI_LEVEL (1 << 6) +#define XFS_AGI_FREECOUNT (1 << 7) +#define XFS_AGI_NEWINO (1 << 8) +#define XFS_AGI_DIRINO (1 << 9) +#define XFS_AGI_UNLINKED (1 << 10) +#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */ +#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) +#define XFS_AGI_FREE_ROOT (1 << 11) +#define XFS_AGI_FREE_LEVEL (1 << 12) +#define XFS_AGI_NUM_BITS_R2 13 /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) @@ -163,73 +201,40 @@ typedef struct xfs_agi { extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); -extern const struct xfs_buf_ops xfs_agi_buf_ops; - /* * The third a.g. block contains the a.g. freelist, an array * of block pointers to blocks owned by the allocation btree code. */ #define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) #define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) -#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t)) #define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) -typedef struct xfs_agfl { - __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ -} xfs_agfl_t; +#define XFS_BUF_TO_AGFL_BNO(mp, bp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \ + (__be32 *)(bp)->b_addr) /* - * Per-ag incore structure, copies of information in agf and agi, - * to improve the performance of allocation group selection. + * Size of the AGFL. For CRC-enabled filesystes we steal a couple of + * slots in the beginning of the block for a proper header with the + * location information and CRC. */ -#define XFS_PAGB_NUM_SLOTS 128 - -typedef struct xfs_perag { - struct xfs_mount *pag_mount; /* owner filesystem */ - xfs_agnumber_t pag_agno; /* AG this structure belongs to */ - atomic_t pag_ref; /* perag reference count */ - char pagf_init; /* this agf's entry is initialized */ - char pagi_init; /* this agi's entry is initialized */ - char pagf_metadata; /* the agf is preferred to be metadata */ - char pagi_inodeok; /* The agi is ok for inodes */ - __uint8_t pagf_levels[XFS_BTNUM_AGF]; - /* # of levels in bno & cnt btree */ - __uint32_t pagf_flcount; /* count of blocks in freelist */ - xfs_extlen_t pagf_freeblks; /* total free blocks */ - xfs_extlen_t pagf_longest; /* longest free space */ - __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ - xfs_agino_t pagi_freecount; /* number of free inodes */ - xfs_agino_t pagi_count; /* number of allocated inodes */ +#define XFS_AGFL_SIZE(mp) \ + (((mp)->m_sb.sb_sectsize - \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + sizeof(struct xfs_agfl) : 0)) / \ + sizeof(xfs_agblock_t)) - /* - * Inode allocation search lookup optimisation. - * If the pagino matches, the search for new inodes - * doesn't need to search the near ones again straight away - */ - xfs_agino_t pagl_pagino; - xfs_agino_t pagl_leftrec; - xfs_agino_t pagl_rightrec; -#ifdef __KERNEL__ - spinlock_t pagb_lock; /* lock for pagb_tree */ - struct rb_root pagb_tree; /* ordered tree of busy extents */ - - atomic_t pagf_fstrms; /* # of filestreams active in this AG */ - - spinlock_t pag_ici_lock; /* incore inode cache lock */ - struct radix_tree_root pag_ici_root; /* incore inode cache root */ - int pag_ici_reclaimable; /* reclaimable inodes */ - struct mutex pag_ici_reclaim_lock; /* serialisation point */ - unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ - - /* buffer cache index */ - spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ - struct rb_root pag_buf_tree; /* ordered tree of active buffers */ - - /* for rcu-safe freeing */ - struct rcu_head rcu_head; -#endif - int pagb_count; /* pagb slots in use */ -} xfs_perag_t; +typedef struct xfs_agfl { + __be32 agfl_magicnum; + __be32 agfl_seqno; + uuid_t agfl_uuid; + __be64 agfl_lsn; + __be32 agfl_crc; + __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ +} xfs_agfl_t; + +#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) /* * tags for inode radix tree diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 393055fe3ae..d43813267a8 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -17,23 +17,25 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" +#include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" #include "xfs_error.h" +#include "xfs_cksum.h" #include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_log.h" struct workqueue_struct *xfs_alloc_wq; @@ -173,6 +175,7 @@ xfs_alloc_compute_diff( xfs_agblock_t wantbno, /* target starting block */ xfs_extlen_t wantlen, /* target length */ xfs_extlen_t alignment, /* target alignment */ + char userdata, /* are we allocating data? */ xfs_agblock_t freebno, /* freespace's starting block */ xfs_extlen_t freelen, /* freespace's length */ xfs_agblock_t *newbnop) /* result: best start block from free */ @@ -187,7 +190,14 @@ xfs_alloc_compute_diff( ASSERT(freelen >= wantlen); freeend = freebno + freelen; wantend = wantbno + wantlen; - if (freebno >= wantbno) { + /* + * We want to allocate from the start of a free extent if it is past + * the desired block or if we are allocating user data and the free + * extent is before desired block. The second case is there to allow + * for contiguous allocation from the remaining free space if the file + * grows in the short term. + */ + if (freebno >= wantbno || (userdata && freeend < wantend)) { if ((newbno1 = roundup(freebno, alignment)) >= freeend) newbno1 = NULLAGBLOCK; } else if (freeend >= wantend && alignment > 1) { @@ -247,16 +257,14 @@ xfs_alloc_fix_len( k = rlen % args->prod; if (k == args->mod) return; - if (k > args->mod) { - if ((int)(rlen = rlen - k - args->mod) < (int)args->minlen) - return; - } else { - if ((int)(rlen = rlen - args->prod - (args->mod - k)) < - (int)args->minlen) - return; - } - ASSERT(rlen >= args->minlen); - ASSERT(rlen <= args->maxlen); + if (k > args->mod) + rlen = rlen - (k - args->mod); + else + rlen = rlen - args->prod + (args->mod - k); + if ((int)rlen < (int)args->minlen) + return; + ASSERT(rlen >= args->minlen && rlen <= args->maxlen); + ASSERT(rlen % args->prod == args->mod); args->len = rlen; } @@ -430,53 +438,80 @@ xfs_alloc_fixup_trees( return 0; } -static void +static bool xfs_agfl_verify( struct xfs_buf *bp) { -#ifdef WHEN_CRCS_COME_ALONG - /* - * we cannot actually do any verification of the AGFL because mkfs does - * not initialise the AGFL to zero or NULL. Hence the only valid part of - * the AGFL is what the AGF says is active. We can't get to the AGF, so - * we can't verify just those entries are valid. - * - * This problem goes away when the CRC format change comes along as that - * requires the AGFL to be initialised by mkfs. At that point, we can - * verify the blocks in the agfl -active or not- lie within the bounds - * of the AG. Until then, just leave this check ifdef'd out. - */ struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); - int agfl_ok = 1; - int i; + if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) + return false; + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) + return false; + for (i = 0; i < XFS_AGFL_SIZE(mp); i++) { - if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK || + if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK && be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) - agfl_ok = 0; + return false; } - - if (!agfl_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } -#endif + return true; } static void -xfs_agfl_write_verify( +xfs_agfl_read_verify( struct xfs_buf *bp) { - xfs_agfl_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + /* + * There is no verification of non-crc AGFLs because mkfs does not + * initialise the AGFL to zero or NULL. Hence the only valid part of the + * AGFL is what the AGF says is active. We can't get to the AGF, so we + * can't verify just those entries are valid. + */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_agfl_verify(bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); } static void -xfs_agfl_read_verify( +xfs_agfl_write_verify( struct xfs_buf *bp) { - xfs_agfl_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + /* no verification of non-crc AGFLs */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_agfl_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (bip) + XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF); } const struct xfs_buf_ops xfs_agfl_buf_ops = { @@ -504,7 +539,6 @@ xfs_alloc_read_agfl( XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); if (error) return error; - ASSERT(!xfs_buf_geterror(bp)); xfs_buf_set_ref(bp, XFS_AGFL_REF); *bpp = bp; return 0; @@ -772,7 +806,8 @@ xfs_alloc_find_best_extent( xfs_alloc_fix_len(args); sdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, *sbnoa, + args->alignment, + args->userdata, *sbnoa, *slena, &new); /* @@ -836,13 +871,13 @@ xfs_alloc_ag_vextent_near( xfs_agblock_t ltnew; /* useful start bno of left side */ xfs_extlen_t rlen; /* length of returned extent */ int forced = 0; -#if defined(DEBUG) && defined(__KERNEL__) +#ifdef DEBUG /* * Randomly don't execute the first algorithm. */ int dofirst; /* set to do first algorithm */ - dofirst = random32() & 1; + dofirst = prandom_u32() & 1; #endif restart: @@ -896,8 +931,8 @@ restart: xfs_extlen_t blen=0; xfs_agblock_t bnew=0; -#if defined(DEBUG) && defined(__KERNEL__) - if (!dofirst) +#ifdef DEBUG + if (dofirst) break; #endif /* @@ -943,7 +978,8 @@ restart: if (args->len < blen) continue; ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, ltbnoa, ltlena, <new); + args->alignment, args->userdata, ltbnoa, + ltlena, <new); if (ltnew != NULLAGBLOCK && (args->len > blen || ltdiff < bdiff)) { bdiff = ltdiff; @@ -1095,7 +1131,8 @@ restart: args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, ltbnoa, ltlena, <new); + args->alignment, args->userdata, ltbnoa, + ltlena, <new); error = xfs_alloc_find_best_extent(args, &bno_cur_lt, &bno_cur_gt, @@ -1111,7 +1148,8 @@ restart: args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); xfs_alloc_fix_len(args); gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, gtbnoa, gtlena, >new); + args->alignment, args->userdata, gtbnoa, + gtlena, >new); error = xfs_alloc_find_best_extent(args, &bno_cur_gt, &bno_cur_lt, @@ -1170,7 +1208,7 @@ restart: } rlen = args->len; (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, - ltbnoa, ltlena, <new); + args->userdata, ltbnoa, ltlena, <new); ASSERT(ltnew >= ltbno); ASSERT(ltnew + rlen <= ltbnoa + ltlena); ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); @@ -1925,8 +1963,6 @@ xfs_alloc_fix_freelist( targs.mp = mp; targs.agbp = agbp; targs.agno = args->agno; - targs.mod = targs.minleft = targs.wasdel = targs.userdata = - targs.minalignslop = 0; targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; @@ -1984,18 +2020,18 @@ xfs_alloc_get_freelist( int btreeblk) /* destination is a AGF btree */ { xfs_agf_t *agf; /* a.g. freespace structure */ - xfs_agfl_t *agfl; /* a.g. freelist structure */ xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ xfs_agblock_t bno; /* block number returned */ + __be32 *agfl_bno; int error; int logflags; - xfs_mount_t *mp; /* mount structure */ + xfs_mount_t *mp = tp->t_mountp; xfs_perag_t *pag; /* per allocation group data */ - agf = XFS_BUF_TO_AGF(agbp); /* * Freelist is empty, give up. */ + agf = XFS_BUF_TO_AGF(agbp); if (!agf->agf_flcount) { *bnop = NULLAGBLOCK; return 0; @@ -2003,15 +2039,17 @@ xfs_alloc_get_freelist( /* * Read the array of free blocks. */ - mp = tp->t_mountp; - if ((error = xfs_alloc_read_agfl(mp, tp, - be32_to_cpu(agf->agf_seqno), &agflbp))) + error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), + &agflbp); + if (error) return error; - agfl = XFS_BUF_TO_AGFL(agflbp); + + /* * Get the block number and update the data structures. */ - bno = be32_to_cpu(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)]); + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); be32_add_cpu(&agf->agf_flfirst, 1); xfs_trans_brelse(tp, agflbp); if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) @@ -2060,11 +2098,14 @@ xfs_alloc_log_agf( offsetof(xfs_agf_t, agf_freeblks), offsetof(xfs_agf_t, agf_longest), offsetof(xfs_agf_t, agf_btreeblks), + offsetof(xfs_agf_t, agf_uuid), sizeof(xfs_agf_t) }; trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_); + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); + xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); } @@ -2101,12 +2142,13 @@ xfs_alloc_put_freelist( int btreeblk) /* block came from a AGF btree */ { xfs_agf_t *agf; /* a.g. freespace structure */ - xfs_agfl_t *agfl; /* a.g. free block array */ __be32 *blockp;/* pointer to array entry */ int error; int logflags; xfs_mount_t *mp; /* mount structure */ xfs_perag_t *pag; /* per allocation group data */ + __be32 *agfl_bno; + int startoff; agf = XFS_BUF_TO_AGF(agbp); mp = tp->t_mountp; @@ -2114,7 +2156,6 @@ xfs_alloc_put_freelist( if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), &agflbp))) return error; - agfl = XFS_BUF_TO_AGFL(agflbp); be32_add_cpu(&agf->agf_fllast, 1); if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp)) agf->agf_fllast = 0; @@ -2135,32 +2176,38 @@ xfs_alloc_put_freelist( xfs_alloc_log_agf(tp, agbp, logflags); ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)); - blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)]; + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; *blockp = cpu_to_be32(bno); + startoff = (char *)blockp - (char *)agflbp->b_addr; + xfs_alloc_log_agf(tp, agbp, logflags); - xfs_trans_log_buf(tp, agflbp, - (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl), - (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl + - sizeof(xfs_agblock_t) - 1)); + + xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF); + xfs_trans_log_buf(tp, agflbp, startoff, + startoff + sizeof(xfs_agblock_t) - 1); return 0; } -static void +static bool xfs_agf_verify( + struct xfs_mount *mp, struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_agf *agf; - int agf_ok; + struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); - agf = XFS_BUF_TO_AGF(bp); + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid)) + return false; - agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && - XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && - be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && - be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp); + if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && + XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && + be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) + return false; /* * during growfs operations, the perag is not fully initialised, @@ -2168,33 +2215,55 @@ xfs_agf_verify( * use it by using uncached buffers that don't have the perag attached * so we can detect and avoid this problem. */ - if (bp->b_pag) - agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) == - bp->b_pag->pag_agno; + if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) + return false; - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) - agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= - be32_to_cpu(agf->agf_length); + if (xfs_sb_version_haslazysbcount(&mp->m_sb) && + be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) + return false; + + return true;; - if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, - XFS_RANDOM_ALLOC_READ_AGF))) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } } static void xfs_agf_read_verify( struct xfs_buf *bp) { - xfs_agf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp, + XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); } static void xfs_agf_write_verify( struct xfs_buf *bp) { - xfs_agf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_agf_verify(mp, bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); } const struct xfs_buf_ops xfs_agf_buf_ops = { @@ -2215,6 +2284,8 @@ xfs_read_agf( { int error; + trace_xfs_read_agf(mp, agno); + ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, @@ -2245,8 +2316,9 @@ xfs_alloc_read_agf( struct xfs_perag *pag; /* per allocation group data */ int error; - ASSERT(agno != NULLAGNUMBER); + trace_xfs_alloc_read_agf(mp, agno); + ASSERT(agno != NULLAGNUMBER); error = xfs_read_agf(mp, tp, agno, (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, bpp); diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 99d0a610155..feacb061bab 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -231,7 +231,4 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ -extern const struct xfs_buf_ops xfs_agf_buf_ops; -extern const struct xfs_buf_ops xfs_agfl_buf_ops; - #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index b1ddef6b268..8358f1ded94 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -17,22 +17,21 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" #include "xfs_btree.h" +#include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" STATIC struct xfs_btree_cur * @@ -71,7 +70,6 @@ xfs_allocbt_alloc_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *start, union xfs_btree_ptr *new, - int length, int *stat) { int error; @@ -272,7 +270,7 @@ xfs_allocbt_key_diff( return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } -static void +static bool xfs_allocbt_verify( struct xfs_buf *bp) { @@ -280,66 +278,105 @@ xfs_allocbt_verify( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; unsigned int level; - int sblock_ok; /* block passes checks */ /* * magic number and level verification * - * During growfs operations, we can't verify the exact level as the - * perag is not fully initialised and hence not attached to the buffer. - * In this case, check against the maximum tree depth. + * During growfs operations, we can't verify the exact level or owner as + * the perag is not fully initialised and hence not attached to the + * buffer. In this case, check against the maximum tree depth. + * + * Similarly, during log recovery we will have a perag structure + * attached, but the agf information will not yet have been initialised + * from the on disk AGF. Again, we can only check against maximum limits + * in this case. */ level = be16_to_cpu(block->bb_level); switch (block->bb_magic) { + case cpu_to_be32(XFS_ABTB_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ case cpu_to_be32(XFS_ABTB_MAGIC): - if (pag) - sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi]; - else - sblock_ok = level < mp->m_ag_maxlevels; + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_BNOi]) + return false; + } else if (level >= mp->m_ag_maxlevels) + return false; break; + case cpu_to_be32(XFS_ABTC_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ case cpu_to_be32(XFS_ABTC_MAGIC): - if (pag) - sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi]; - else - sblock_ok = level < mp->m_ag_maxlevels; + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_CNTi]) + return false; + } else if (level >= mp->m_ag_maxlevels) + return false; break; default: - sblock_ok = 0; - break; + return false; } /* numrecs verification */ - sblock_ok = sblock_ok && - be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0]; + if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0]) + return false; /* sibling pointer verification */ - sblock_ok = sblock_ok && - (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_leftsib && - (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_rightsib; - - if (!sblock_ok) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; } static void xfs_allocbt_read_verify( struct xfs_buf *bp) { - xfs_allocbt_verify(bp); + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_allocbt_verify(bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); + } } static void xfs_allocbt_write_verify( struct xfs_buf *bp) { - xfs_allocbt_verify(bp); + if (!xfs_allocbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + xfs_btree_sblock_calc_crc(bp); + } const struct xfs_buf_ops xfs_allocbt_buf_ops = { @@ -348,7 +385,7 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = { }; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_allocbt_keys_inorder( struct xfs_btree_cur *cur, @@ -404,7 +441,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, .buf_ops = &xfs_allocbt_buf_ops, -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_allocbt_keys_inorder, .recs_inorder = xfs_allocbt_recs_inorder, #endif @@ -444,6 +481,9 @@ xfs_allocbt_init_cursor( cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + return cur; } diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index 7e89a2b429d..45e189e7e81 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h @@ -27,42 +27,11 @@ struct xfs_btree_cur; struct xfs_mount; /* - * There are two on-disk btrees, one sorted by blockno and one sorted - * by blockcount and blockno. All blocks look the same to make the code - * simpler; if we have time later, we'll make the optimizations. - */ -#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ -#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ - -/* - * Data record/key structure - */ -typedef struct xfs_alloc_rec { - __be32 ar_startblock; /* starting block number */ - __be32 ar_blockcount; /* count of free blocks */ -} xfs_alloc_rec_t, xfs_alloc_key_t; - -typedef struct xfs_alloc_rec_incore { - xfs_agblock_t ar_startblock; /* starting block number */ - xfs_extlen_t ar_blockcount; /* count of free blocks */ -} xfs_alloc_rec_incore_t; - -/* btree pointer type */ -typedef __be32 xfs_alloc_ptr_t; - -/* - * Block numbers in the AG: - * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3. - */ -#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1)) -#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1)) - -/* * Btree block header size depends on a superblock flag. - * - * (not quite yet, but soon) */ -#define XFS_ALLOC_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN +#define XFS_ALLOC_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) /* * Record, key, and pointer address macros for btree blocks. @@ -93,6 +62,4 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, xfs_agnumber_t, xfs_btnum_t); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); -extern const struct xfs_buf_ops xfs_allocbt_buf_ops; - #endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4111a40ebe1..faaf716e208 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -16,21 +16,25 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" -#include "xfs_log.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_trans.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_iomap.h" -#include "xfs_vnodeops.h" #include "xfs_trace.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include <linux/aio.h> #include <linux/gfp.h> #include <linux/mpage.h> #include <linux/pagevec.h> @@ -85,14 +89,6 @@ xfs_destroy_ioend( bh->b_end_io(bh, !ioend->io_error); } - if (ioend->io_iocb) { - if (ioend->io_isasync) { - aio_complete(ioend->io_iocb, ioend->io_error ? - ioend->io_error : ioend->io_result, 0); - } - inode_dio_done(ioend->io_inode); - } - mempool_free(ioend, xfs_ioend_pool); } @@ -115,7 +111,7 @@ xfs_setfilesize_trans_alloc( tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); if (error) { xfs_trans_cancel(tp, 0); return error; @@ -280,7 +276,6 @@ xfs_alloc_ioend( * all the I/O from calling the completion routine too early. */ atomic_set(&ioend->io_remaining, 1); - ioend->io_isasync = 0; ioend->io_isdirect = 0; ioend->io_error = 0; ioend->io_list = NULL; @@ -290,8 +285,6 @@ xfs_alloc_ioend( ioend->io_buffer_tail = NULL; ioend->io_offset = 0; ioend->io_size = 0; - ioend->io_iocb = NULL; - ioend->io_result = 0; ioend->io_append_trans = NULL; INIT_WORK(&ioend->io_work, xfs_end_io); @@ -343,7 +336,7 @@ xfs_map_blocks( if (type == XFS_IO_DELALLOC && (!nimaps || isnullstartblock(imap->br_startblock))) { - error = xfs_iomap_write_allocate(ip, offset, count, imap); + error = xfs_iomap_write_allocate(ip, offset, imap); if (!error) trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); return -XFS_ERROR(error); @@ -414,7 +407,7 @@ xfs_alloc_ioend_bio( struct bio *bio = bio_alloc(GFP_NOIO, nvecs); ASSERT(bio->bi_private == NULL); - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; return bio; } @@ -450,7 +443,7 @@ xfs_start_page_writeback( end_page_writeback(page); } -static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) +static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) { return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); } @@ -524,7 +517,7 @@ xfs_submit_ioend( goto retry; } - if (bio_add_buffer(bio, bh) != bh->b_size) { + if (xfs_bio_add_buffer(bio, bh) != bh->b_size) { xfs_submit_ioend_bio(wbc, ioend, bio); goto retry; } @@ -639,38 +632,46 @@ xfs_map_at_offset( } /* - * Test if a given page is suitable for writing as part of an unwritten - * or delayed allocate extent. + * Test if a given page contains at least one buffer of a given @type. + * If @check_all_buffers is true, then we walk all the buffers in the page to + * try to find one of the type passed in. If it is not set, then the caller only + * needs to check the first buffer on the page for a match. */ -STATIC int +STATIC bool xfs_check_page_type( struct page *page, - unsigned int type) + unsigned int type, + bool check_all_buffers) { - if (PageWriteback(page)) - return 0; + struct buffer_head *bh; + struct buffer_head *head; - if (page->mapping && page_has_buffers(page)) { - struct buffer_head *bh, *head; - int acceptable = 0; + if (PageWriteback(page)) + return false; + if (!page->mapping) + return false; + if (!page_has_buffers(page)) + return false; - bh = head = page_buffers(page); - do { - if (buffer_unwritten(bh)) - acceptable += (type == XFS_IO_UNWRITTEN); - else if (buffer_delay(bh)) - acceptable += (type == XFS_IO_DELALLOC); - else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable += (type == XFS_IO_OVERWRITE); - else - break; - } while ((bh = bh->b_this_page) != head); + bh = head = page_buffers(page); + do { + if (buffer_unwritten(bh)) { + if (type == XFS_IO_UNWRITTEN) + return true; + } else if (buffer_delay(bh)) { + if (type == XFS_IO_DELALLOC) + return true; + } else if (buffer_dirty(bh) && buffer_mapped(bh)) { + if (type == XFS_IO_OVERWRITE) + return true; + } - if (acceptable) - return 1; - } + /* If we are only checking the first buffer, we are done now. */ + if (!check_all_buffers) + break; + } while ((bh = bh->b_this_page) != head); - return 0; + return false; } /* @@ -704,7 +705,7 @@ xfs_convert_page( goto fail_unlock_page; if (page->mapping != inode->i_mapping) goto fail_unlock_page; - if (!xfs_check_page_type(page, (*ioendp)->io_type)) + if (!xfs_check_page_type(page, (*ioendp)->io_type, false)) goto fail_unlock_page; /* @@ -724,12 +725,40 @@ xfs_convert_page( (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, i_size_read(inode)); + /* + * If the current map does not span the entire page we are about to try + * to write, then give up. The only way we can write a page that spans + * multiple mappings in a single writeback iteration is via the + * xfs_vm_writepage() function. Data integrity writeback requires the + * entire page to be written in a single attempt, otherwise the part of + * the page we don't write here doesn't get written as part of the data + * integrity sync. + * + * For normal writeback, we also don't attempt to write partial pages + * here as it simply means that write_cache_pages() will see it under + * writeback and ignore the page until some point in the future, at + * which time this will be the only page in the file that needs + * writeback. Hence for more optimal IO patterns, we should always + * avoid partial page writeback due to multiple mappings on a page here. + */ + if (!xfs_imap_valid(inode, imap, end_offset)) + goto fail_unlock_page; + len = 1 << inode->i_blkbits; p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), PAGE_CACHE_SIZE); p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; page_dirty = p_offset / len; + /* + * The moment we find a buffer that doesn't match our current type + * specification or can't be written, abort the loop and start + * writeback. As per the above xfs_imap_valid() check, only + * xfs_vm_writepage() can handle partial page writeback fully - we are + * limited here to the buffers that are contiguous with the current + * ioend, and hence a buffer we can't write breaks that contiguity and + * we have to defer the rest of the IO to xfs_vm_writepage(). + */ bh = head = page_buffers(page); do { if (offset >= end_offset) @@ -738,7 +767,7 @@ xfs_convert_page( uptodate = 0; if (!(PageUptodate(page) || buffer_uptodate(bh))) { done = 1; - continue; + break; } if (buffer_unwritten(bh) || buffer_delay(bh) || @@ -750,10 +779,11 @@ xfs_convert_page( else type = XFS_IO_OVERWRITE; - if (!xfs_imap_valid(inode, imap, offset)) { - done = 1; - continue; - } + /* + * imap should always be valid because of the above + * partial page end_offset check on the imap. + */ + ASSERT(xfs_imap_valid(inode, imap, offset)); lock_buffer(bh); if (type != XFS_IO_OVERWRITE) @@ -765,6 +795,7 @@ xfs_convert_page( count++; } else { done = 1; + break; } } while (offset += len, (bh = bh->b_this_page) != head); @@ -823,10 +854,12 @@ xfs_cluster_write( STATIC void xfs_vm_invalidatepage( struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { - trace_xfs_invalidatepage(page->mapping->host, page, offset); - block_invalidatepage(page, offset); + trace_xfs_invalidatepage(page->mapping->host, page, offset, + length); + block_invalidatepage(page, offset, length); } /* @@ -854,7 +887,7 @@ xfs_aops_discard_page( struct buffer_head *bh, *head; loff_t offset = page_offset(page); - if (!xfs_check_page_type(page, XFS_IO_DELALLOC)) + if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true)) goto out_invalidate; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -890,7 +923,7 @@ next_buffer: xfs_iunlock(ip, XFS_ILOCK_EXCL); out_invalidate: - xfs_vm_invalidatepage(page, 0); + xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE); return; } @@ -920,7 +953,7 @@ xfs_vm_writepage( int count = 0; int nonblocking = 0; - trace_xfs_writepage(inode, page, 0); + trace_xfs_writepage(inode, page, 0, 0); ASSERT(page_has_buffers(page)); @@ -942,39 +975,76 @@ xfs_vm_writepage( * Given that we do not allow direct reclaim to call us, we should * never be called while in a filesystem transaction. */ - if (WARN_ON(current->flags & PF_FSTRANS)) + if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) goto redirty; /* Is this page beyond the end of the file? */ offset = i_size_read(inode); end_index = offset >> PAGE_CACHE_SHIFT; last_index = (offset - 1) >> PAGE_CACHE_SHIFT; - if (page->index >= end_index) { + + /* + * The page index is less than the end_index, adjust the end_offset + * to the highest offset that this page should represent. + * ----------------------------------------------------- + * | file mapping | <EOF> | + * ----------------------------------------------------- + * | Page ... | Page N-2 | Page N-1 | Page N | | + * ^--------------------------------^----------|-------- + * | desired writeback range | see else | + * ---------------------------------^------------------| + */ + if (page->index < end_index) + end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT; + else { + /* + * Check whether the page to write out is beyond or straddles + * i_size or not. + * ------------------------------------------------------- + * | file mapping | <EOF> | + * ------------------------------------------------------- + * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | + * ^--------------------------------^-----------|--------- + * | | Straddles | + * ---------------------------------^-----------|--------| + */ unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1); /* - * Just skip the page if it is fully outside i_size, e.g. due - * to a truncate operation that is in progress. + * Skip the page if it is fully outside i_size, e.g. due to a + * truncate operation that is in progress. We must redirty the + * page so that reclaim stops reclaiming it. Otherwise + * xfs_vm_releasepage() is called on it and gets confused. + * + * Note that the end_index is unsigned long, it would overflow + * if the given offset is greater than 16TB on 32-bit system + * and if we do check the page is fully outside i_size or not + * via "if (page->index >= end_index + 1)" as "end_index + 1" + * will be evaluated to 0. Hence this page will be redirtied + * and be written out repeatedly which would result in an + * infinite loop, the user program that perform this operation + * will hang. Instead, we can verify this situation by checking + * if the page to write is totally beyond the i_size or if it's + * offset is just equal to the EOF. */ - if (page->index >= end_index + 1 || offset_into_page == 0) { - unlock_page(page); - return 0; - } + if (page->index > end_index || + (page->index == end_index && offset_into_page == 0)) + goto redirty; /* * The page straddles i_size. It must be zeroed out on each * and every writepage invocation because it may be mmapped. * "A file is mapped in multiples of the page size. For a file - * that is not a multiple of the page size, the remaining + * that is not a multiple of the page size, the remaining * memory is zeroed when mapped, and writes to that region are * not written out to the file." */ zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE); + + /* Adjust the end_offset to the end of file */ + end_offset = offset; } - end_offset = min_t(unsigned long long, - (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, - offset); len = 1 << inode->i_blkbits; bh = head = page_buffers(page); @@ -1151,13 +1221,13 @@ xfs_vm_releasepage( { int delalloc, unwritten; - trace_xfs_releasepage(page->mapping->host, page, 0); + trace_xfs_releasepage(page->mapping->host, page, 0, 0); xfs_count_page_state(page, &delalloc, &unwritten); - if (WARN_ON(delalloc)) + if (WARN_ON_ONCE(delalloc)) return 0; - if (WARN_ON(unwritten)) + if (WARN_ON_ONCE(unwritten)) return 0; return try_to_free_buffers(page); @@ -1203,7 +1273,7 @@ __xfs_get_blocks( lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, lockmode); } else { - lockmode = xfs_ilock_map_shared(ip); + lockmode = xfs_ilock_data_map_shared(ip); } ASSERT(offset <= mp->m_super->s_maxbytes); @@ -1270,8 +1340,10 @@ __xfs_get_blocks( if (create || !ISUNWRITTEN(&imap)) xfs_map_buffer(inode, bh_result, &imap, offset); if (create && ISUNWRITTEN(&imap)) { - if (direct) + if (direct) { bh_result->b_private = inode; + set_buffer_defer_completion(bh_result); + } set_buffer_unwritten(bh_result); } } @@ -1309,6 +1381,14 @@ __xfs_get_blocks( /* * If this is O_DIRECT or the mpage code calling tell them how large * the mapping is, so that we can avoid repeated get_blocks calls. + * + * If the mapping spans EOF, then we have to break the mapping up as the + * mapping for blocks beyond EOF must be marked new so that sub block + * regions can be correctly zeroed. We can't do this for mappings within + * EOF unless the mapping was just allocated or is unwritten, otherwise + * the callers would overwrite existing data with zeros. Hence we have + * to split the mapping into a range up to and including EOF, and a + * second mapping for beyond EOF. */ if (direct || size > (1 << inode->i_blkbits)) { xfs_off_t mapping_size; @@ -1319,6 +1399,12 @@ __xfs_get_blocks( ASSERT(mapping_size > 0); if (mapping_size > size) mapping_size = size; + if (offset < i_size_read(inode) && + offset + mapping_size >= i_size_read(inode)) { + /* limit mapping to block that spans EOF */ + mapping_size = roundup_64(i_size_read(inode) - offset, + 1 << inode->i_blkbits); + } if (mapping_size > LONG_MAX) mapping_size = LONG_MAX; @@ -1368,9 +1454,7 @@ xfs_end_io_direct_write( struct kiocb *iocb, loff_t offset, ssize_t size, - void *private, - int ret, - bool is_async) + void *private) { struct xfs_ioend *ioend = iocb->private; @@ -1392,26 +1476,18 @@ xfs_end_io_direct_write( ioend->io_offset = offset; ioend->io_size = size; - ioend->io_iocb = iocb; - ioend->io_result = ret; if (private && size > 0) ioend->io_type = XFS_IO_UNWRITTEN; - if (is_async) { - ioend->io_isasync = 1; - xfs_finish_ioend(ioend); - } else { - xfs_finish_ioend_sync(ioend); - } + xfs_finish_ioend_sync(ioend); } STATIC ssize_t xfs_vm_direct_IO( int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, + loff_t offset) { struct inode *inode = iocb->ki_filp->f_mapping->host; struct block_device *bdev = xfs_find_bdev_for_inode(inode); @@ -1419,7 +1495,7 @@ xfs_vm_direct_IO( ssize_t ret; if (rw & WRITE) { - size_t size = iov_length(iov, nr_segs); + size_t size = iov_iter_count(iter); /* * We cannot preallocate a size update transaction here as we @@ -1431,16 +1507,15 @@ xfs_vm_direct_IO( if (offset + size > XFS_I(inode)->i_d.di_size) ioend->io_isdirect = 1; - ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, - offset, nr_segs, - xfs_get_blocks_direct, - xfs_end_io_direct_write, NULL, 0); + ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter, + offset, xfs_get_blocks_direct, + xfs_end_io_direct_write, NULL, + DIO_ASYNC_EXTEND); if (ret != -EIOCBQUEUED && iocb->private) goto out_destroy_ioend; } else { - ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, - offset, nr_segs, - xfs_get_blocks_direct, + ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter, + offset, xfs_get_blocks_direct, NULL, NULL, 0); } @@ -1494,13 +1569,26 @@ xfs_vm_write_failed( loff_t pos, unsigned len) { - loff_t block_offset = pos & PAGE_MASK; + loff_t block_offset; loff_t block_start; loff_t block_end; loff_t from = pos & (PAGE_CACHE_SIZE - 1); loff_t to = from + len; struct buffer_head *bh, *head; + /* + * The request pos offset might be 32 or 64 bit, this is all fine + * on 64-bit platform. However, for 64-bit pos request on 32-bit + * platform, the high 32-bit will be masked off if we evaluate the + * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is + * 0xfffff000 as an unsigned long, hence the result is incorrect + * which could cause the following ASSERT failed in most cases. + * In order to avoid this, we can evaluate the block_offset of the + * start of the page by using shifts rather than masks the mismatch + * problem. + */ + block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT; + ASSERT(block_offset + from == pos); head = page_buffers(page); @@ -1526,6 +1614,16 @@ xfs_vm_write_failed( xfs_vm_kill_delalloc_range(inode, block_offset, block_offset + bh->b_size); + + /* + * This buffer does not contain data anymore. make sure anyone + * who finds it knows that for certain. + */ + clear_buffer_delay(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_dirty(bh); } } @@ -1552,20 +1650,28 @@ xfs_vm_write_begin( ASSERT(len <= PAGE_CACHE_SIZE); - page = grab_cache_page_write_begin(mapping, index, - flags | AOP_FLAG_NOFS); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; status = __block_write_begin(page, pos, len, xfs_get_blocks); if (unlikely(status)) { struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); xfs_vm_write_failed(inode, page, pos, len); unlock_page(page); - if (pos + len > i_size_read(inode)) - truncate_pagecache(inode, pos + len, i_size_read(inode)); + /* + * If the write is beyond EOF, we only want to kill blocks + * allocated in this write, not blocks that were previously + * written successfully. + */ + if (pos + len > isize) { + ssize_t start = max_t(ssize_t, pos, isize); + + truncate_pagecache_range(inode, start, pos + len); + } page_cache_release(page); page = NULL; @@ -1576,9 +1682,12 @@ xfs_vm_write_begin( } /* - * On failure, we only need to kill delalloc blocks beyond EOF because they - * will never be written. For blocks within EOF, generic_write_end() zeros them - * so they are safe to leave alone and be written with all the other valid data. + * On failure, we only need to kill delalloc blocks beyond EOF in the range of + * this specific write because they will never be written. Previous writes + * beyond EOF where block allocation succeeded do not need to be trashed, so + * only new blocks from this write should be trashed. For blocks within + * EOF, generic_write_end() zeros them so they are safe to leave alone and be + * written with all the other valid data. */ STATIC int xfs_vm_write_end( @@ -1601,8 +1710,11 @@ xfs_vm_write_end( loff_t to = pos + len; if (to > isize) { - truncate_pagecache(inode, to, isize); + /* only kill blocks in this write beyond EOF */ + if (pos > isize) + isize = pos; xfs_vm_kill_delalloc_range(inode, isize, to); + truncate_pagecache_range(inode, isize, to); } } return ret; diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index c325abb8d61..f94dd459dff 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -45,7 +45,6 @@ typedef struct xfs_ioend { unsigned int io_type; /* delalloc / unwritten */ int io_error; /* I/O error code */ atomic_t io_remaining; /* hold count */ - unsigned int io_isasync : 1; /* needs aio_complete */ unsigned int io_isdirect : 1;/* direct I/O */ struct inode *io_inode; /* file being written to */ struct buffer_head *io_buffer_head;/* buffer linked list head */ @@ -54,8 +53,6 @@ typedef struct xfs_ioend { xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ struct xfs_trans *io_append_trans;/* xact. for size update */ - struct kiocb *io_iocb; - int io_result; } xfs_ioend_t; extern const struct address_space_operations xfs_address_space_operations; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index aaf472532b3..bfe36fc2cdc 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -15,31 +15,34 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ - #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" #include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_alloc.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" +#include "xfs_attr_remote.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trans_space.h" -#include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_dinode.h" /* * xfs_attr.c @@ -62,7 +65,6 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); -STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context); /* * Internal routines when attribute list is more than one block. @@ -70,34 +72,36 @@ STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context); STATIC int xfs_attr_node_get(xfs_da_args_t *args); STATIC int xfs_attr_node_addname(xfs_da_args_t *args); STATIC int xfs_attr_node_removename(xfs_da_args_t *args); -STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context); STATIC int xfs_attr_fillstate(xfs_da_state_t *state); STATIC int xfs_attr_refillstate(xfs_da_state_t *state); -/* - * Routines to manipulate out-of-line attribute values. - */ -STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args); -STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args); - -#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ STATIC int -xfs_attr_name_to_xname( - struct xfs_name *xname, - const unsigned char *aname) +xfs_attr_args_init( + struct xfs_da_args *args, + struct xfs_inode *dp, + const unsigned char *name, + int flags) { - if (!aname) + + if (!name) return EINVAL; - xname->name = aname; - xname->len = strlen((char *)aname); - if (xname->len >= MAXNAMELEN) + + memset(args, 0, sizeof(*args)); + args->geo = dp->i_mount->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->dp = dp; + args->flags = flags; + args->name = name; + args->namelen = strlen((const char *)name); + if (args->namelen >= MAXNAMELEN) return EFAULT; /* match IRIX behaviour */ + args->hashval = xfs_da_hashname(args->name, args->namelen); return 0; } -STATIC int +int xfs_inode_hasattr( struct xfs_inode *ip) { @@ -112,78 +116,46 @@ xfs_inode_hasattr( * Overall external interface routines. *========================================================================*/ -STATIC int -xfs_attr_get_int( +int +xfs_attr_get( struct xfs_inode *ip, - struct xfs_name *name, + const unsigned char *name, unsigned char *value, int *valuelenp, int flags) { - xfs_da_args_t args; - int error; + struct xfs_da_args args; + uint lock_mode; + int error; + + XFS_STATS_INC(xs_attr_get); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return EIO; if (!xfs_inode_hasattr(ip)) return ENOATTR; - /* - * Fill in the arg structure for this request. - */ - memset((char *)&args, 0, sizeof(args)); - args.name = name->name; - args.namelen = name->len; + error = xfs_attr_args_init(&args, ip, name, flags); + if (error) + return error; + args.value = value; args.valuelen = *valuelenp; - args.flags = flags; - args.hashval = xfs_da_hashname(args.name, args.namelen); - args.dp = ip; - args.whichfork = XFS_ATTR_FORK; - /* - * Decide on what work routines to call based on the inode size. - */ - if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + lock_mode = xfs_ilock_attr_map_shared(ip); + if (!xfs_inode_hasattr(ip)) + error = ENOATTR; + else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) error = xfs_attr_shortform_getvalue(&args); - } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) { + else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) error = xfs_attr_leaf_get(&args); - } else { + else error = xfs_attr_node_get(&args); - } + xfs_iunlock(ip, lock_mode); - /* - * Return the number of bytes in the value to the caller. - */ *valuelenp = args.valuelen; - - if (error == EEXIST) - error = 0; - return(error); -} - -int -xfs_attr_get( - xfs_inode_t *ip, - const unsigned char *name, - unsigned char *value, - int *valuelenp, - int flags) -{ - int error; - struct xfs_name xname; - - XFS_STATS_INC(xs_attr_get); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return(EIO); - - error = xfs_attr_name_to_xname(&xname, name); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return(error); + return error == EEXIST ? 0 : error; } /* @@ -191,12 +163,10 @@ xfs_attr_get( */ STATIC int xfs_attr_calc_size( - struct xfs_inode *ip, - int namelen, - int valuelen, + struct xfs_da_args *args, int *local) { - struct xfs_mount *mp = ip->i_mount; + struct xfs_mount *mp = args->dp->i_mount; int size; int nblks; @@ -204,12 +174,10 @@ xfs_attr_calc_size( * Determine space new attribute will use, and if it would be * "local" or "remote" (note: local != inline). */ - size = xfs_attr_leaf_newentsize(namelen, valuelen, - mp->m_sb.sb_blocksize, local); - + size = xfs_attr_leaf_newentsize(args, local); nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); if (*local) { - if (size > (mp->m_sb.sb_blocksize >> 1)) { + if (size > (args->geo->blksize / 2)) { /* Double split possible */ nblks *= 2; } @@ -218,7 +186,7 @@ xfs_attr_calc_size( * Out of line attribute, cannot double split, but * make room for the attribute value itself. */ - uint dblocks = XFS_B_TO_FSB(mp, valuelen); + uint dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen); nblks += dblocks; nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); } @@ -226,25 +194,38 @@ xfs_attr_calc_size( return nblks; } -STATIC int -xfs_attr_set_int( - struct xfs_inode *dp, - struct xfs_name *name, - unsigned char *value, - int valuelen, - int flags) +int +xfs_attr_set( + struct xfs_inode *dp, + const unsigned char *name, + unsigned char *value, + int valuelen, + int flags) { - xfs_da_args_t args; - xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; - int error, err2, committed; - xfs_mount_t *mp = dp->i_mount; - int rsvd = (flags & ATTR_ROOT) != 0; - int local; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_args args; + struct xfs_bmap_free flist; + struct xfs_trans_res tres; + xfs_fsblock_t firstblock; + int rsvd = (flags & ATTR_ROOT) != 0; + int error, err2, committed, local; + + XFS_STATS_INC(xs_attr_set); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return EIO; + + error = xfs_attr_args_init(&args, dp, name, flags); + if (error) + return error; + + args.value = value; + args.valuelen = valuelen; + args.firstblock = &firstblock; + args.flist = &flist; + args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + args.total = xfs_attr_calc_size(&args, &local); - /* - * Attach the dquots to the inode. - */ error = xfs_qm_dqattach(dp, 0); if (error) return error; @@ -255,32 +236,14 @@ xfs_attr_set_int( */ if (XFS_IFORK_Q(dp) == 0) { int sf_size = sizeof(xfs_attr_sf_hdr_t) + - XFS_ATTR_SF_ENTSIZE_BYNAME(name->len, valuelen); + XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen); - if ((error = xfs_bmap_add_attrfork(dp, sf_size, rsvd))) - return(error); + error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); + if (error) + return error; } /* - * Fill in the arg structure for this request. - */ - memset((char *)&args, 0, sizeof(args)); - args.name = name->name; - args.namelen = name->len; - args.value = value; - args.valuelen = valuelen; - args.flags = flags; - args.hashval = xfs_da_hashname(args.name, args.namelen); - args.dp = dp; - args.firstblock = &firstblock; - args.flist = &flist; - args.whichfork = XFS_ATTR_FORK; - args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; - - /* Size is now blocks for attribute data */ - args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local); - - /* * Start our first transaction of the day. * * All future transactions during this code must be "chained" off @@ -300,11 +263,14 @@ xfs_attr_set_int( if (rsvd) args.trans->t_flags |= XFS_TRANS_RESERVE; - if ((error = xfs_trans_reserve(args.trans, args.total, - XFS_ATTRSET_LOG_RES(mp, args.total), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) { + tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * args.total; + tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + error = xfs_trans_reserve(args.trans, &tres, args.total, 0); + if (error) { xfs_trans_cancel(args.trans, 0); - return(error); + return error; } xfs_ilock(dp, XFS_ILOCK_EXCL); @@ -314,7 +280,7 @@ xfs_attr_set_int( if (error) { xfs_iunlock(dp, XFS_ILOCK_EXCL); xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); - return (error); + return error; } xfs_trans_ijoin(args.trans, dp, 0); @@ -323,9 +289,9 @@ xfs_attr_set_int( * If the attribute list is non-existent or a shortform list, * upgrade it to a single-leaf-block attribute list. */ - if ((dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || - ((dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) && - (dp->i_d.di_anextents == 0))) { + if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { /* * Build initial attribute list (if required). @@ -350,9 +316,8 @@ xfs_attr_set_int( * the transaction goes to disk before returning * to the user. */ - if (mp->m_flags & XFS_MOUNT_WSYNC) { + if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(args.trans); - } if (!error && (flags & ATTR_KERNOTIME) == 0) { xfs_trans_ichgtime(args.trans, dp, @@ -362,7 +327,7 @@ xfs_attr_set_int( XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error == 0 ? err2 : error); + return error ? error : err2; } /* @@ -400,22 +365,19 @@ xfs_attr_set_int( } - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) error = xfs_attr_leaf_addname(&args); - } else { + else error = xfs_attr_node_addname(&args); - } - if (error) { + if (error) goto out; - } /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ - if (mp->m_flags & XFS_MOUNT_WSYNC) { + if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(args.trans); - } if ((flags & ATTR_KERNOTIME) == 0) xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); @@ -427,65 +389,47 @@ xfs_attr_set_int( error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); + return error; out: - if (args.trans) + if (args.trans) { xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + } xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); + return error; } +/* + * Generic handler routine to remove a name from an attribute list. + * Transitions attribute list from Btree to shortform as necessary. + */ int -xfs_attr_set( - xfs_inode_t *dp, - const unsigned char *name, - unsigned char *value, - int valuelen, - int flags) +xfs_attr_remove( + struct xfs_inode *dp, + const unsigned char *name, + int flags) { - int error; - struct xfs_name xname; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_args args; + struct xfs_bmap_free flist; + xfs_fsblock_t firstblock; + int error; - XFS_STATS_INC(xs_attr_set); + XFS_STATS_INC(xs_attr_remove); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return (EIO); + return EIO; - error = xfs_attr_name_to_xname(&xname, name); + if (!xfs_inode_hasattr(dp)) + return ENOATTR; + + error = xfs_attr_args_init(&args, dp, name, flags); if (error) return error; - return xfs_attr_set_int(dp, &xname, value, valuelen, flags); -} - -/* - * Generic handler routine to remove a name from an attribute list. - * Transitions attribute list from Btree to shortform as necessary. - */ -STATIC int -xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) -{ - xfs_da_args_t args; - xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; - int error; - xfs_mount_t *mp = dp->i_mount; - - /* - * Fill in the arg structure for this request. - */ - memset((char *)&args, 0, sizeof(args)); - args.name = name->name; - args.namelen = name->len; - args.flags = flags; - args.hashval = xfs_da_hashname(args.name, args.namelen); - args.dp = dp; args.firstblock = &firstblock; args.flist = &flist; - args.total = 0; - args.whichfork = XFS_ATTR_FORK; /* * we have no control over the attribute names that userspace passes us @@ -494,9 +438,6 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) */ args.op_flags = XFS_DA_OP_OKNOENT; - /* - * Attach the dquots to the inode. - */ error = xfs_qm_dqattach(dp, 0); if (error) return error; @@ -521,13 +462,11 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) if (flags & ATTR_ROOT) args.trans->t_flags |= XFS_TRANS_RESERVE; - if ((error = xfs_trans_reserve(args.trans, - XFS_ATTRRM_SPACE_RES(mp), - XFS_ATTRRM_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_ATTRRM_LOG_COUNT))) { + error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm, + XFS_ATTRRM_SPACE_RES(mp), 0); + if (error) { xfs_trans_cancel(args.trans, 0); - return(error); + return error; } xfs_ilock(dp, XFS_ILOCK_EXCL); @@ -537,35 +476,26 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) */ xfs_trans_ijoin(args.trans, dp, 0); - /* - * Decide on what work routines to call based on the inode size. - */ if (!xfs_inode_hasattr(dp)) { error = XFS_ERROR(ENOATTR); - goto out; - } - if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); error = xfs_attr_shortform_remove(&args); - if (error) { - goto out; - } } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { error = xfs_attr_leaf_removename(&args); } else { error = xfs_attr_node_removename(&args); } - if (error) { + + if (error) goto out; - } /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ - if (mp->m_flags & XFS_MOUNT_WSYNC) { + if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(args.trans); - } if ((flags & ATTR_KERNOTIME) == 0) xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); @@ -577,267 +507,17 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); + return error; out: - if (args.trans) + if (args.trans) { xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); -} - -int -xfs_attr_remove( - xfs_inode_t *dp, - const unsigned char *name, - int flags) -{ - int error; - struct xfs_name xname; - - XFS_STATS_INC(xs_attr_remove); - - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return (EIO); - - error = xfs_attr_name_to_xname(&xname, name); - if (error) - return error; - - xfs_ilock(dp, XFS_ILOCK_SHARED); - if (!xfs_inode_hasattr(dp)) { - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return XFS_ERROR(ENOATTR); - } - xfs_iunlock(dp, XFS_ILOCK_SHARED); - - return xfs_attr_remove_int(dp, &xname, flags); -} - -int -xfs_attr_list_int(xfs_attr_list_context_t *context) -{ - int error; - xfs_inode_t *dp = context->dp; - - XFS_STATS_INC(xs_attr_list); - - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return EIO; - - xfs_ilock(dp, XFS_ILOCK_SHARED); - - /* - * Decide on what work routines to call based on the inode size. - */ - if (!xfs_inode_hasattr(dp)) { - error = 0; - } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - error = xfs_attr_shortform_list(context); - } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { - error = xfs_attr_leaf_list(context); - } else { - error = xfs_attr_node_list(context); - } - - xfs_iunlock(dp, XFS_ILOCK_SHARED); - - return error; -} - -#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ - (((struct attrlist_ent *) 0)->a_name - (char *) 0) -#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ - ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \ - & ~(sizeof(u_int32_t)-1)) - -/* - * Format an attribute and copy it out to the user's buffer. - * Take care to check values and protect against them changing later, - * we may be reading them directly out of a user buffer. - */ -/*ARGSUSED*/ -STATIC int -xfs_attr_put_listent( - xfs_attr_list_context_t *context, - int flags, - unsigned char *name, - int namelen, - int valuelen, - unsigned char *value) -{ - struct attrlist *alist = (struct attrlist *)context->alist; - attrlist_ent_t *aep; - int arraytop; - - ASSERT(!(context->flags & ATTR_KERNOVAL)); - ASSERT(context->count >= 0); - ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); - ASSERT(context->firstu >= sizeof(*alist)); - ASSERT(context->firstu <= context->bufsize); - - /* - * Only list entries in the right namespace. - */ - if (((context->flags & ATTR_SECURE) == 0) != - ((flags & XFS_ATTR_SECURE) == 0)) - return 0; - if (((context->flags & ATTR_ROOT) == 0) != - ((flags & XFS_ATTR_ROOT) == 0)) - return 0; - - arraytop = sizeof(*alist) + - context->count * sizeof(alist->al_offset[0]); - context->firstu -= ATTR_ENTSIZE(namelen); - if (context->firstu < arraytop) { - trace_xfs_attr_list_full(context); - alist->al_more = 1; - context->seen_enough = 1; - return 1; - } - - aep = (attrlist_ent_t *)&context->alist[context->firstu]; - aep->a_valuelen = valuelen; - memcpy(aep->a_name, name, namelen); - aep->a_name[namelen] = 0; - alist->al_offset[context->count++] = context->firstu; - alist->al_count = context->count; - trace_xfs_attr_list_add(context); - return 0; -} - -/* - * Generate a list of extended attribute names and optionally - * also value lengths. Positive return value follows the XFS - * convention of being an error, zero or negative return code - * is the length of the buffer returned (negated), indicating - * success. - */ -int -xfs_attr_list( - xfs_inode_t *dp, - char *buffer, - int bufsize, - int flags, - attrlist_cursor_kern_t *cursor) -{ - xfs_attr_list_context_t context; - struct attrlist *alist; - int error; - - /* - * Validate the cursor. - */ - if (cursor->pad1 || cursor->pad2) - return(XFS_ERROR(EINVAL)); - if ((cursor->initted == 0) && - (cursor->hashval || cursor->blkno || cursor->offset)) - return XFS_ERROR(EINVAL); - - /* - * Check for a properly aligned buffer. - */ - if (((long)buffer) & (sizeof(int)-1)) - return XFS_ERROR(EFAULT); - if (flags & ATTR_KERNOVAL) - bufsize = 0; - - /* - * Initialize the output buffer. - */ - memset(&context, 0, sizeof(context)); - context.dp = dp; - context.cursor = cursor; - context.resynch = 1; - context.flags = flags; - context.alist = buffer; - context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ - context.firstu = context.bufsize; - context.put_listent = xfs_attr_put_listent; - - alist = (struct attrlist *)context.alist; - alist->al_count = 0; - alist->al_more = 0; - alist->al_offset[0] = context.bufsize; - - error = xfs_attr_list_int(&context); - ASSERT(error >= 0); - return error; -} - -int /* error */ -xfs_attr_inactive(xfs_inode_t *dp) -{ - xfs_trans_t *trans; - xfs_mount_t *mp; - int error; - - mp = dp->i_mount; - ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); - - xfs_ilock(dp, XFS_ILOCK_SHARED); - if (!xfs_inode_hasattr(dp) || - dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return 0; - } - xfs_iunlock(dp, XFS_ILOCK_SHARED); - - /* - * Start our first transaction of the day. - * - * All future transactions during this code must be "chained" off - * this one via the trans_dup() call. All transactions will contain - * the inode, and the inode will always be marked with trans_ihold(). - * Since the inode will be locked in all transactions, we must log - * the inode in every transaction to let it float upward through - * the log. - */ - trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); - if ((error = xfs_trans_reserve(trans, 0, XFS_ATTRINVAL_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ATTRINVAL_LOG_COUNT))) { - xfs_trans_cancel(trans, 0); - return(error); - } - xfs_ilock(dp, XFS_ILOCK_EXCL); - - /* - * No need to make quota reservations here. We expect to release some - * blocks, not allocate, in the common case. - */ - xfs_trans_ijoin(trans, dp, 0); - - /* - * Decide on what work routines to call based on the inode size. - */ - if (!xfs_inode_hasattr(dp) || - dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - error = 0; - goto out; } - error = xfs_attr_root_inactive(&trans, dp); - if (error) - goto out; - - error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); - if (error) - goto out; - - error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); - - return(error); - -out: - xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); + return error; } - - /*======================================================================== * External routines when attribute list is inside the inode *========================================================================*/ @@ -903,7 +583,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ dp = args->dp; args->blkno = 0; - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return error; @@ -911,30 +591,41 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Look up the given attribute in the leaf block. Figure out if * the given flags produce an error or call for an atomic rename. */ - retval = xfs_attr_leaf_lookup_int(bp, args); + retval = xfs_attr3_leaf_lookup_int(bp, args); if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { xfs_trans_brelse(args->trans, bp); - return(retval); + return retval; } else if (retval == EEXIST) { if (args->flags & ATTR_CREATE) { /* pure create op */ xfs_trans_brelse(args->trans, bp); - return(retval); + return retval; } trace_xfs_attr_leaf_replace(args); + /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } /* * Add the attribute to the leaf block, transitioning to a Btree * if required. */ - retval = xfs_attr_leaf_add(bp, args); + retval = xfs_attr3_leaf_add(bp, args); if (retval == ENOSPC) { /* * Promote the attribute list to the Btree format, then @@ -942,7 +633,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * can manage its own transactions. */ xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_node(args); + error = xfs_attr3_leaf_to_node(args); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); @@ -1007,7 +698,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * In a separate transaction, set the incomplete flag on the * "old" attr and clear the incomplete flag on the "new" attr. */ - error = xfs_attr_leaf_flipflags(args); + error = xfs_attr3_leaf_flipflags(args); if (error) return(error); @@ -1019,6 +710,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) args->blkno = args->blkno2; args->rmtblkno = args->rmtblkno2; args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) @@ -1029,19 +721,19 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Read in the block containing the "old" attr, then * remove the "old" attr from that block (neat, huh!) */ - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return error; - xfs_attr_leaf_remove(bp, args); + xfs_attr3_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. */ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { error = xfs_bmap_finish(&args->trans, @@ -1073,9 +765,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) /* * Added a "remote" value, just clear the incomplete flag. */ - error = xfs_attr_leaf_clearflag(args); + error = xfs_attr3_leaf_clearflag(args); } - return(error); + return error; } /* @@ -1098,24 +790,24 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) */ dp = args->dp; args->blkno = 0; - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return error; - error = xfs_attr_leaf_lookup_int(bp, args); + error = xfs_attr3_leaf_lookup_int(bp, args); if (error == ENOATTR) { xfs_trans_brelse(args->trans, bp); - return(error); + return error; } - xfs_attr_leaf_remove(bp, args); + xfs_attr3_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. */ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, @@ -1125,7 +817,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); - return(error); + return error; } /* @@ -1135,7 +827,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) if (committed) xfs_trans_ijoin(args->trans, dp, 0); } - return(0); + return 0; } /* @@ -1153,47 +845,25 @@ xfs_attr_leaf_get(xfs_da_args_t *args) trace_xfs_attr_leaf_get(args); args->blkno = 0; - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return error; - error = xfs_attr_leaf_lookup_int(bp, args); + error = xfs_attr3_leaf_lookup_int(bp, args); if (error != EEXIST) { xfs_trans_brelse(args->trans, bp); - return(error); + return error; } - error = xfs_attr_leaf_getvalue(bp, args); + error = xfs_attr3_leaf_getvalue(bp, args); xfs_trans_brelse(args->trans, bp); if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { error = xfs_attr_rmtval_get(args); } - return(error); -} - -/* - * Copy out attribute entries for attr_list(), for leaf attribute lists. - */ -STATIC int -xfs_attr_leaf_list(xfs_attr_list_context_t *context) -{ - int error; - struct xfs_buf *bp; - - trace_xfs_attr_leaf_list(context); - - context->cursor->blkno = 0; - error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp); - if (error) - return XFS_ERROR(error); - - error = xfs_attr_leaf_list_int(bp, context); - xfs_trans_brelse(NULL, bp); - return XFS_ERROR(error); + return error; } - /*======================================================================== - * External routines when attribute list size > XFS_LBSIZE(mp). + * External routines when attribute list size > geo->blksize *========================================================================*/ /* @@ -1226,14 +896,12 @@ restart: state = xfs_da_state_alloc(); state->args = args; state->mp = mp; - state->blocksize = state->mp->m_sb.sb_blocksize; - state->node_ents = state->mp->m_attr_node_ents; /* * Search to see if name already exists, and get back a pointer * to where it should go. */ - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error) goto out; blk = &state->path.blk[ state->path.active-1 ]; @@ -1246,16 +914,25 @@ restart: trace_xfs_attr_node_replace(args); + /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ args->rmtblkno = 0; args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } - retval = xfs_attr_leaf_add(blk->bp, state->args); + retval = xfs_attr3_leaf_add(blk->bp, state->args); if (retval == ENOSPC) { if (state->path.active == 1) { /* @@ -1264,8 +941,9 @@ restart: * have been a b-tree. */ xfs_da_state_free(state); + state = NULL; xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_node(args); + error = xfs_attr3_leaf_to_node(args); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, @@ -1304,7 +982,7 @@ restart: * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields. */ xfs_bmap_init(args->flist, args->firstblock); - error = xfs_da_split(state); + error = xfs_da3_split(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); @@ -1326,7 +1004,7 @@ restart: /* * Addition succeeded, update Btree hashvals. */ - xfs_da_fixhashpath(state, &state->path); + xfs_da3_fixhashpath(state, &state->path); } /* @@ -1367,7 +1045,7 @@ restart: * In a separate transaction, set the incomplete flag on the * "old" attr and clear the incomplete flag on the "new" attr. */ - error = xfs_attr_leaf_flipflags(args); + error = xfs_attr3_leaf_flipflags(args); if (error) goto out; @@ -1379,6 +1057,7 @@ restart: args->blkno = args->blkno2; args->rmtblkno = args->rmtblkno2; args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) @@ -1394,10 +1073,8 @@ restart: state = xfs_da_state_alloc(); state->args = args; state->mp = mp; - state->blocksize = state->mp->m_sb.sb_blocksize; - state->node_ents = state->mp->m_attr_node_ents; state->inleaf = 0; - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error) goto out; @@ -1406,15 +1083,15 @@ restart: */ blk = &state->path.blk[ state->path.active-1 ]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - error = xfs_attr_leaf_remove(blk->bp, args); - xfs_da_fixhashpath(state, &state->path); + error = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); /* * Check to see if the tree needs to be collapsed. */ if (retval && (state->path.active > 1)) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_da_join(state); + error = xfs_da3_join(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, @@ -1447,7 +1124,7 @@ restart: /* * Added a "remote" value, just clear the incomplete flag. */ - error = xfs_attr_leaf_clearflag(args); + error = xfs_attr3_leaf_clearflag(args); if (error) goto out; } @@ -1486,13 +1163,11 @@ xfs_attr_node_removename(xfs_da_args_t *args) state = xfs_da_state_alloc(); state->args = args; state->mp = dp->i_mount; - state->blocksize = state->mp->m_sb.sb_blocksize; - state->node_ents = state->mp->m_attr_node_ents; /* * Search to see if name exists, and get back a pointer to it. */ - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error || (retval != EEXIST)) { if (error == 0) error = retval; @@ -1521,7 +1196,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) * Mark the attribute as INCOMPLETE, then bunmapi() the * remote value. */ - error = xfs_attr_leaf_setflag(args); + error = xfs_attr3_leaf_setflag(args); if (error) goto out; error = xfs_attr_rmtval_remove(args); @@ -1542,15 +1217,15 @@ xfs_attr_node_removename(xfs_da_args_t *args) */ blk = &state->path.blk[ state->path.active-1 ]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - retval = xfs_attr_leaf_remove(blk->bp, args); - xfs_da_fixhashpath(state, &state->path); + retval = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); /* * Check to see if the tree needs to be collapsed. */ if (retval && (state->path.active > 1)) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_da_join(state); + error = xfs_da3_join(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); @@ -1588,13 +1263,13 @@ xfs_attr_node_removename(xfs_da_args_t *args) ASSERT(state->path.blk[0].bp); state->path.blk[0].bp = NULL; - error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp); if (error) goto out; if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { error = xfs_bmap_finish(&args->trans, @@ -1696,7 +1371,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_node_read(state->args->trans, + error = xfs_da3_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, &blk->bp, XFS_ATTR_FORK); @@ -1715,7 +1390,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_node_read(state->args->trans, + error = xfs_da3_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, &blk->bp, XFS_ATTR_FORK); @@ -1749,13 +1424,11 @@ xfs_attr_node_get(xfs_da_args_t *args) state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_sb.sb_blocksize; - state->node_ents = state->mp->m_attr_node_ents; /* * Search to see if name exists, and get back a pointer to it. */ - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error) { retval = error; } else if (retval == EEXIST) { @@ -1766,7 +1439,7 @@ xfs_attr_node_get(xfs_da_args_t *args) /* * Get the value, local or "remote" */ - retval = xfs_attr_leaf_getvalue(blk->bp, args); + retval = xfs_attr3_leaf_getvalue(blk->bp, args); if (!retval && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { retval = xfs_attr_rmtval_get(args); @@ -1784,420 +1457,3 @@ xfs_attr_node_get(xfs_da_args_t *args) xfs_da_state_free(state); return(retval); } - -STATIC int /* error */ -xfs_attr_node_list(xfs_attr_list_context_t *context) -{ - attrlist_cursor_kern_t *cursor; - xfs_attr_leafblock_t *leaf; - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - int error, i; - struct xfs_buf *bp; - - trace_xfs_attr_node_list(context); - - cursor = context->cursor; - cursor->initted = 1; - - /* - * Do all sorts of validation on the passed-in cursor structure. - * If anything is amiss, ignore the cursor and look up the hashval - * starting from the btree root. - */ - bp = NULL; - if (cursor->blkno > 0) { - error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK); - if ((error != 0) && (error != EFSCORRUPTED)) - return(error); - if (bp) { - node = bp->b_addr; - switch (be16_to_cpu(node->hdr.info.magic)) { - case XFS_DA_NODE_MAGIC: - trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(NULL, bp); - bp = NULL; - break; - case XFS_ATTR_LEAF_MAGIC: - leaf = bp->b_addr; - if (cursor->hashval > be32_to_cpu(leaf->entries[ - be16_to_cpu(leaf->hdr.count)-1].hashval)) { - trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(NULL, bp); - bp = NULL; - } else if (cursor->hashval <= - be32_to_cpu(leaf->entries[0].hashval)) { - trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(NULL, bp); - bp = NULL; - } - break; - default: - trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(NULL, bp); - bp = NULL; - } - } - } - - /* - * We did not find what we expected given the cursor's contents, - * so we start from the top and work down based on the hash value. - * Note that start of node block is same as start of leaf block. - */ - if (bp == NULL) { - cursor->blkno = 0; - for (;;) { - error = xfs_da_node_read(NULL, context->dp, - cursor->blkno, -1, &bp, - XFS_ATTR_FORK); - if (error) - return(error); - node = bp->b_addr; - if (node->hdr.info.magic == - cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) - break; - if (unlikely(node->hdr.info.magic != - cpu_to_be16(XFS_DA_NODE_MAGIC))) { - XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount, - node); - xfs_trans_brelse(NULL, bp); - return(XFS_ERROR(EFSCORRUPTED)); - } - btree = node->btree; - for (i = 0; i < be16_to_cpu(node->hdr.count); - btree++, i++) { - if (cursor->hashval - <= be32_to_cpu(btree->hashval)) { - cursor->blkno = be32_to_cpu(btree->before); - trace_xfs_attr_list_node_descend(context, - btree); - break; - } - } - if (i == be16_to_cpu(node->hdr.count)) { - xfs_trans_brelse(NULL, bp); - return(0); - } - xfs_trans_brelse(NULL, bp); - } - } - ASSERT(bp != NULL); - - /* - * Roll upward through the blocks, processing each leaf block in - * order. As long as there is space in the result buffer, keep - * adding the information. - */ - for (;;) { - leaf = bp->b_addr; - error = xfs_attr_leaf_list_int(bp, context); - if (error) { - xfs_trans_brelse(NULL, bp); - return error; - } - if (context->seen_enough || leaf->hdr.info.forw == 0) - break; - cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); - xfs_trans_brelse(NULL, bp); - error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1, - &bp); - if (error) - return error; - } - xfs_trans_brelse(NULL, bp); - return(0); -} - - -/*======================================================================== - * External routines for manipulating out-of-line attribute values. - *========================================================================*/ - -/* - * Read the value associated with an attribute from the out-of-line buffer - * that we stored it in. - */ -int -xfs_attr_rmtval_get(xfs_da_args_t *args) -{ - xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE]; - xfs_mount_t *mp; - xfs_daddr_t dblkno; - void *dst; - xfs_buf_t *bp; - int nmap, error, tmp, valuelen, blkcnt, i; - xfs_dablk_t lblkno; - - trace_xfs_attr_rmtval_get(args); - - ASSERT(!(args->flags & ATTR_KERNOVAL)); - - mp = args->dp->i_mount; - dst = args->value; - valuelen = args->valuelen; - lblkno = args->rmtblkno; - while (valuelen > 0) { - nmap = ATTR_RMTVALUE_MAPSIZE; - error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, map, &nmap, - XFS_BMAPI_ATTRFORK); - if (error) - return(error); - ASSERT(nmap >= 1); - - for (i = 0; (i < nmap) && (valuelen > 0); i++) { - ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && - (map[i].br_startblock != HOLESTARTBLOCK)); - dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); - blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, - dblkno, blkcnt, 0, &bp, NULL); - if (error) - return(error); - - tmp = min_t(int, valuelen, BBTOB(bp->b_length)); - xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ); - xfs_buf_relse(bp); - dst += tmp; - valuelen -= tmp; - - lblkno += map[i].br_blockcount; - } - } - ASSERT(valuelen == 0); - return(0); -} - -/* - * Write the value associated with an attribute into the out-of-line buffer - * that we have defined for it. - */ -STATIC int -xfs_attr_rmtval_set(xfs_da_args_t *args) -{ - xfs_mount_t *mp; - xfs_fileoff_t lfileoff; - xfs_inode_t *dp; - xfs_bmbt_irec_t map; - xfs_daddr_t dblkno; - void *src; - xfs_buf_t *bp; - xfs_dablk_t lblkno; - int blkcnt, valuelen, nmap, error, tmp, committed; - - trace_xfs_attr_rmtval_set(args); - - dp = args->dp; - mp = dp->i_mount; - src = args->value; - - /* - * Find a "hole" in the attribute address space large enough for - * us to drop the new attribute's value into. - */ - blkcnt = XFS_B_TO_FSB(mp, args->valuelen); - lfileoff = 0; - error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, - XFS_ATTR_FORK); - if (error) { - return(error); - } - args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; - args->rmtblkcnt = blkcnt; - - /* - * Roll through the "value", allocating blocks on disk as required. - */ - while (blkcnt > 0) { - /* - * Allocate a single extent, up to the size of the value. - */ - xfs_bmap_init(args->flist, args->firstblock); - nmap = 1; - error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, - blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, args->total, &map, &nmap, - args->flist); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } - if (error) { - ASSERT(committed); - args->trans = NULL; - xfs_bmap_cancel(args->flist); - return(error); - } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - lblkno += map.br_blockcount; - blkcnt -= map.br_blockcount; - - /* - * Start the next trans in the chain. - */ - error = xfs_trans_roll(&args->trans, dp); - if (error) - return (error); - } - - /* - * Roll through the "value", copying the attribute value to the - * already-allocated blocks. Blocks are written synchronously - * so that we can know they are all on disk before we turn off - * the INCOMPLETE flag. - */ - lblkno = args->rmtblkno; - valuelen = args->valuelen; - while (valuelen > 0) { - int buflen; - - /* - * Try to remember where we decided to put the value. - */ - xfs_bmap_init(args->flist, args->firstblock); - nmap = 1; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, &map, &nmap, - XFS_BMAPI_ATTRFORK); - if (error) - return(error); - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - - bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0); - if (!bp) - return ENOMEM; - - buflen = BBTOB(bp->b_length); - tmp = min_t(int, valuelen, buflen); - xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); - if (tmp < buflen) - xfs_buf_zero(bp, tmp, buflen - tmp); - - error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ - xfs_buf_relse(bp); - if (error) - return error; - src += tmp; - valuelen -= tmp; - - lblkno += map.br_blockcount; - } - ASSERT(valuelen == 0); - return(0); -} - -/* - * Remove the value associated with an attribute by deleting the - * out-of-line buffer that it is stored on. - */ -STATIC int -xfs_attr_rmtval_remove(xfs_da_args_t *args) -{ - xfs_mount_t *mp; - xfs_bmbt_irec_t map; - xfs_buf_t *bp; - xfs_daddr_t dblkno; - xfs_dablk_t lblkno; - int valuelen, blkcnt, nmap, error, done, committed; - - trace_xfs_attr_rmtval_remove(args); - - mp = args->dp->i_mount; - - /* - * Roll through the "value", invalidating the attribute value's - * blocks. - */ - lblkno = args->rmtblkno; - valuelen = args->rmtblkcnt; - while (valuelen > 0) { - /* - * Try to remember where we decided to put the value. - */ - nmap = 1; - error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, &map, &nmap, - XFS_BMAPI_ATTRFORK); - if (error) - return(error); - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - - /* - * If the "remote" value is in the cache, remove it. - */ - bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); - if (bp) { - xfs_buf_stale(bp); - xfs_buf_relse(bp); - bp = NULL; - } - - valuelen -= map.br_blockcount; - - lblkno += map.br_blockcount; - } - - /* - * Keep de-allocating extents until the remote-value region is gone. - */ - lblkno = args->rmtblkno; - blkcnt = args->rmtblkcnt; - done = 0; - while (!done) { - xfs_bmap_init(args->flist, args->firstblock); - error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - 1, args->firstblock, args->flist, - &done); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } - if (error) { - ASSERT(committed); - args->trans = NULL; - xfs_bmap_cancel(args->flist); - return(error); - } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, args->dp, 0); - - /* - * Close out trans and start the next one in the chain. - */ - error = xfs_trans_roll(&args->trans, args->dp); - if (error) - return (error); - } - return(0); -} diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index e920d68ef50..dd482458947 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -140,7 +140,15 @@ typedef struct xfs_attr_list_context { * Overall external interface routines. */ int xfs_attr_inactive(struct xfs_inode *dp); -int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_list_int(struct xfs_attr_list_context *); +int xfs_inode_hasattr(struct xfs_inode *ip); +int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, + unsigned char *value, int *valuelenp, int flags); +int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, + unsigned char *value, int valuelen, int flags); +int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); +int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, + int flags, struct attrlist_cursor_kern *cursor); + #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c new file mode 100644 index 00000000000..09480c57f06 --- /dev/null +++ b/fs/xfs/xfs_attr_inactive.c @@ -0,0 +1,452 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_attr_remote.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trace.h" +#include "xfs_dinode.h" +#include "xfs_dir2.h" + +/* + * Look at all the extents for this logical region, + * invalidate any buffers that are incore/in transactions. + */ +STATIC int +xfs_attr3_leaf_freextent( + struct xfs_trans **trans, + struct xfs_inode *dp, + xfs_dablk_t blkno, + int blkcnt) +{ + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_dablk_t tblkno; + xfs_daddr_t dblkno; + int tblkcnt; + int dblkcnt; + int nmap; + int error; + + /* + * Roll through the "value", invalidating the attribute value's + * blocks. + */ + tblkno = blkno; + tblkcnt = blkcnt; + while (tblkcnt > 0) { + /* + * Try to remember where we decided to put the value. + */ + nmap = 1; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, + &map, &nmap, XFS_BMAPI_ATTRFORK); + if (error) { + return(error); + } + ASSERT(nmap == 1); + ASSERT(map.br_startblock != DELAYSTARTBLOCK); + + /* + * If it's a hole, these are already unmapped + * so there's nothing to invalidate. + */ + if (map.br_startblock != HOLESTARTBLOCK) { + + dblkno = XFS_FSB_TO_DADDR(dp->i_mount, + map.br_startblock); + dblkcnt = XFS_FSB_TO_BB(dp->i_mount, + map.br_blockcount); + bp = xfs_trans_get_buf(*trans, + dp->i_mount->m_ddev_targp, + dblkno, dblkcnt, 0); + if (!bp) + return ENOMEM; + xfs_trans_binval(*trans, bp); + /* + * Roll to next transaction. + */ + error = xfs_trans_roll(trans, dp); + if (error) + return (error); + } + + tblkno += map.br_blockcount; + tblkcnt -= map.br_blockcount; + } + + return(0); +} + +/* + * Invalidate all of the "remote" value regions pointed to by a particular + * leaf block. + * Note that we must release the lock on the buffer so that we are not + * caught holding something that the logging code wants to flush to disk. + */ +STATIC int +xfs_attr3_leaf_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_attr_inactive_list *list; + struct xfs_attr_inactive_list *lp; + int error; + int count; + int size; + int tmp; + int i; + + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + + /* + * Count the number of "remote" value extents. + */ + count = 0; + entry = xfs_attr3_leaf_entryp(leaf); + for (i = 0; i < ichdr.count; entry++, i++) { + if (be16_to_cpu(entry->nameidx) && + ((entry->flags & XFS_ATTR_LOCAL) == 0)) { + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + if (name_rmt->valueblk) + count++; + } + } + + /* + * If there are no "remote" values, we're done. + */ + if (count == 0) { + xfs_trans_brelse(*trans, bp); + return 0; + } + + /* + * Allocate storage for a list of all the "remote" value extents. + */ + size = count * sizeof(xfs_attr_inactive_list_t); + list = kmem_alloc(size, KM_SLEEP); + + /* + * Identify each of the "remote" value extents. + */ + lp = list; + entry = xfs_attr3_leaf_entryp(leaf); + for (i = 0; i < ichdr.count; entry++, i++) { + if (be16_to_cpu(entry->nameidx) && + ((entry->flags & XFS_ATTR_LOCAL) == 0)) { + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + if (name_rmt->valueblk) { + lp->valueblk = be32_to_cpu(name_rmt->valueblk); + lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, + be32_to_cpu(name_rmt->valuelen)); + lp++; + } + } + } + xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ + + /* + * Invalidate each of the "remote" value extents. + */ + error = 0; + for (lp = list, i = 0; i < count; i++, lp++) { + tmp = xfs_attr3_leaf_freextent(trans, dp, + lp->valueblk, lp->valuelen); + + if (error == 0) + error = tmp; /* save only the 1st errno */ + } + + kmem_free(list); + return error; +} + +/* + * Recurse (gasp!) through the attribute nodes until we find leaves. + * We're doing a depth-first traversal in order to invalidate everything. + */ +STATIC int +xfs_attr3_node_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp, + int level) +{ + xfs_da_blkinfo_t *info; + xfs_da_intnode_t *node; + xfs_dablk_t child_fsb; + xfs_daddr_t parent_blkno, child_blkno; + int error, i; + struct xfs_buf *child_bp; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr ichdr; + + /* + * Since this code is recursive (gasp!) we must protect ourselves. + */ + if (level > XFS_DA_NODE_MAXDEPTH) { + xfs_trans_brelse(*trans, bp); /* no locks for later trans */ + return XFS_ERROR(EIO); + } + + node = bp->b_addr; + dp->d_ops->node_hdr_from_disk(&ichdr, node); + parent_blkno = bp->b_bn; + if (!ichdr.count) { + xfs_trans_brelse(*trans, bp); + return 0; + } + btree = dp->d_ops->node_tree_p(node); + child_fsb = be32_to_cpu(btree[0].before); + xfs_trans_brelse(*trans, bp); /* no locks for later trans */ + + /* + * If this is the node level just above the leaves, simply loop + * over the leaves removing all of them. If this is higher up + * in the tree, recurse downward. + */ + for (i = 0; i < ichdr.count; i++) { + /* + * Read the subsidiary block to see what we have to work with. + * Don't do this in a transaction. This is a depth-first + * traversal of the tree so we may deal with many blocks + * before we come back to this one. + */ + error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp, + XFS_ATTR_FORK); + if (error) + return(error); + if (child_bp) { + /* save for re-read later */ + child_blkno = XFS_BUF_ADDR(child_bp); + + /* + * Invalidate the subtree, however we have to. + */ + info = child_bp->b_addr; + switch (info->magic) { + case cpu_to_be16(XFS_DA_NODE_MAGIC): + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + error = xfs_attr3_node_inactive(trans, dp, + child_bp, level + 1); + break; + case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + error = xfs_attr3_leaf_inactive(trans, dp, + child_bp); + break; + default: + error = XFS_ERROR(EIO); + xfs_trans_brelse(*trans, child_bp); + break; + } + if (error) + return error; + + /* + * Remove the subsidiary block from the cache + * and from the log. + */ + error = xfs_da_get_buf(*trans, dp, 0, child_blkno, + &child_bp, XFS_ATTR_FORK); + if (error) + return error; + xfs_trans_binval(*trans, child_bp); + } + + /* + * If we're not done, re-read the parent to get the next + * child block number. + */ + if (i + 1 < ichdr.count) { + error = xfs_da3_node_read(*trans, dp, 0, parent_blkno, + &bp, XFS_ATTR_FORK); + if (error) + return error; + child_fsb = be32_to_cpu(btree[i + 1].before); + xfs_trans_brelse(*trans, bp); + } + /* + * Atomically commit the whole invalidate stuff. + */ + error = xfs_trans_roll(trans, dp); + if (error) + return error; + } + + return 0; +} + +/* + * Indiscriminately delete the entire attribute fork + * + * Recurse (gasp!) through the attribute nodes until we find leaves. + * We're doing a depth-first traversal in order to invalidate everything. + */ +int +xfs_attr3_root_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp) +{ + struct xfs_da_blkinfo *info; + struct xfs_buf *bp; + xfs_daddr_t blkno; + int error; + + /* + * Read block 0 to see what we have to work with. + * We only get here if we have extents, since we remove + * the extents in reverse order the extent containing + * block 0 must still be there. + */ + error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); + if (error) + return error; + blkno = bp->b_bn; + + /* + * Invalidate the tree, even if the "tree" is only a single leaf block. + * This is a depth-first traversal! + */ + info = bp->b_addr; + switch (info->magic) { + case cpu_to_be16(XFS_DA_NODE_MAGIC): + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + error = xfs_attr3_node_inactive(trans, dp, bp, 1); + break; + case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + error = xfs_attr3_leaf_inactive(trans, dp, bp); + break; + default: + error = XFS_ERROR(EIO); + xfs_trans_brelse(*trans, bp); + break; + } + if (error) + return error; + + /* + * Invalidate the incore copy of the root block. + */ + error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); + if (error) + return error; + xfs_trans_binval(*trans, bp); /* remove from cache */ + /* + * Commit the invalidate and start the next transaction. + */ + error = xfs_trans_roll(trans, dp); + + return error; +} + +int +xfs_attr_inactive(xfs_inode_t *dp) +{ + xfs_trans_t *trans; + xfs_mount_t *mp; + int error; + + mp = dp->i_mount; + ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); + + xfs_ilock(dp, XFS_ILOCK_SHARED); + if (!xfs_inode_hasattr(dp) || + dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + xfs_iunlock(dp, XFS_ILOCK_SHARED); + return 0; + } + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. + */ + trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); + error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0); + if (error) { + xfs_trans_cancel(trans, 0); + return(error); + } + xfs_ilock(dp, XFS_ILOCK_EXCL); + + /* + * No need to make quota reservations here. We expect to release some + * blocks, not allocate, in the common case. + */ + xfs_trans_ijoin(trans, dp, 0); + + /* + * Decide on what work routines to call based on the inode size. + */ + if (!xfs_inode_hasattr(dp) || + dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = 0; + goto out; + } + error = xfs_attr3_root_inactive(&trans, dp); + if (error) + goto out; + + error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); + if (error) + goto out; + + error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return(error); + +out: + xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return(error); +} diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index ee24993c7d1..28712d29e43 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -17,28 +18,32 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_alloc.h" -#include "xfs_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" +#include "xfs_bmap_btree.h" #include "xfs_bmap.h" +#include "xfs_attr_sf.h" +#include "xfs_attr_remote.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" +#include "xfs_dinode.h" +#include "xfs_dir2.h" + /* * xfs_attr_leaf.c @@ -53,85 +58,218 @@ /* * Routines used for growing the Btree. */ -STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block, - struct xfs_buf **bpp); -STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer, - xfs_da_args_t *args, int freemap_index); -STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args, - struct xfs_buf *leaf_buffer); -STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, +STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args, + xfs_dablk_t which_block, struct xfs_buf **bpp); +STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_da_args *args, int freemap_index); +STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_buf *leaf_buffer); +STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); -STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state, - xfs_da_state_blk_t *leaf_blk_1, - xfs_da_state_blk_t *leaf_blk_2, - int *number_entries_in_blk1, - int *number_usedbytes_in_blk1); - -/* - * Routines used for shrinking the Btree. - */ -STATIC int xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, - struct xfs_buf *bp, int level); -STATIC int xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, - struct xfs_buf *bp); -STATIC int xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, - xfs_dablk_t blkno, int blkcnt); +STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state, + xfs_da_state_blk_t *leaf_blk_1, + struct xfs_attr3_icleaf_hdr *ichdr1, + xfs_da_state_blk_t *leaf_blk_2, + struct xfs_attr3_icleaf_hdr *ichdr2, + int *number_entries_in_blk1, + int *number_usedbytes_in_blk1); /* * Utility routines. */ -STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, - int src_start, - xfs_attr_leafblock_t *dst_leaf, - int dst_start, int move_count, - xfs_mount_t *mp); +STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args, + struct xfs_attr_leafblock *src_leaf, + struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start, + struct xfs_attr_leafblock *dst_leaf, + struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start, + int move_count); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); -static void -xfs_attr_leaf_verify( +void +xfs_attr3_leaf_hdr_from_disk( + struct xfs_attr3_icleaf_hdr *to, + struct xfs_attr_leafblock *from) +{ + int i; + + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + + if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) { + struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from; + + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->count); + to->usedbytes = be16_to_cpu(hdr3->usedbytes); + to->firstused = be16_to_cpu(hdr3->firstused); + to->holes = hdr3->holes; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base); + to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size); + } + return; + } + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.count); + to->usedbytes = be16_to_cpu(from->hdr.usedbytes); + to->firstused = be16_to_cpu(from->hdr.firstused); + to->holes = from->hdr.holes; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base); + to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size); + } +} + +void +xfs_attr3_leaf_hdr_to_disk( + struct xfs_attr_leafblock *to, + struct xfs_attr3_icleaf_hdr *from) +{ + int i; + + ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC || + from->magic == XFS_ATTR3_LEAF_MAGIC); + + if (from->magic == XFS_ATTR3_LEAF_MAGIC) { + struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to; + + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->count = cpu_to_be16(from->count); + hdr3->usedbytes = cpu_to_be16(from->usedbytes); + hdr3->firstused = cpu_to_be16(from->firstused); + hdr3->holes = from->holes; + hdr3->pad1 = 0; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base); + hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size); + } + return; + } + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.count = cpu_to_be16(from->count); + to->hdr.usedbytes = cpu_to_be16(from->usedbytes); + to->hdr.firstused = cpu_to_be16(from->firstused); + to->hdr.holes = from->holes; + to->hdr.pad1 = 0; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base); + to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size); + } +} + +static bool +xfs_attr3_leaf_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_attr_leaf_hdr *hdr = bp->b_addr; - int block_ok = 0; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr3_icleaf_hdr ichdr; - block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC); - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC) + return false; + + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) + return false; + } else { + if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) + return false; } + if (ichdr.count == 0) + return false; + + /* XXX: need to range check rest of attr header values */ + /* XXX: hash order check? */ + + return true; } static void -xfs_attr_leaf_read_verify( +xfs_attr3_leaf_write_verify( struct xfs_buf *bp) { - xfs_attr_leaf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr; + + if (!xfs_attr3_leaf_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF); } +/* + * leaf/node format detection on trees is sketchy, so a node read can be done on + * leaf level blocks when detection identifies the tree as a node format tree + * incorrectly. In this case, we need to swap the verifier to match the correct + * format of the block being read. + */ static void -xfs_attr_leaf_write_verify( - struct xfs_buf *bp) +xfs_attr3_leaf_read_verify( + struct xfs_buf *bp) { - xfs_attr_leaf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_attr3_leaf_verify(bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); } -const struct xfs_buf_ops xfs_attr_leaf_buf_ops = { - .verify_read = xfs_attr_leaf_read_verify, - .verify_write = xfs_attr_leaf_write_verify, +const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { + .verify_read = xfs_attr3_leaf_read_verify, + .verify_write = xfs_attr3_leaf_write_verify, }; int -xfs_attr_leaf_read( +xfs_attr3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); + return err; } /*======================================================================== @@ -172,7 +310,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) int dsize; xfs_mount_t *mp = dp->i_mount; - offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ + /* rounded down */ + offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3; switch (dp->i_d.di_format) { case XFS_DINODE_FMT_DEV: @@ -231,7 +370,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) return 0; return dp->i_d.di_forkoff; } - dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); + dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot); break; } @@ -243,7 +382,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ - maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); + maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) - + XFS_BMDR_SPACE_CALC(MINABTPTRS); maxforkoff = maxforkoff >> 3; /* rounded down */ if (offset >= maxforkoff) @@ -487,7 +627,7 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args) xfs_attr_sf_entry_t *sfe; int i; - ASSERT(args->dp->i_d.di_aformat == XFS_IFINLINE); + ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE); sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; @@ -542,6 +682,8 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) sf = (xfs_attr_shortform_t *)tmpbuffer; xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); + xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK); + bp = NULL; error = xfs_da_grow_inode(args, &blkno); if (error) { @@ -557,7 +699,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) } ASSERT(blkno == 0); - error = xfs_attr_leaf_create(args, blkno, &bp); + error = xfs_attr3_leaf_create(args, blkno, &bp); if (error) { error = xfs_da_shrink_inode(args, 0, bp); bp = NULL; @@ -570,6 +712,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) memset((char *)&nargs, 0, sizeof(nargs)); nargs.dp = dp; + nargs.geo = args->geo; nargs.firstblock = args->firstblock; nargs.flist = args->flist; nargs.total = args->total; @@ -586,9 +729,9 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) nargs.hashval = xfs_da_hashname(sfe->nameval, sfe->namelen); nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); - error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */ + error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == ENOATTR); - error = xfs_attr_leaf_add(bp, &nargs); + error = xfs_attr3_leaf_add(bp, &nargs); ASSERT(error != ENOSPC); if (error) goto out; @@ -601,250 +744,81 @@ out: return(error); } -STATIC int -xfs_attr_shortform_compare(const void *a, const void *b) -{ - xfs_attr_sf_sort_t *sa, *sb; - - sa = (xfs_attr_sf_sort_t *)a; - sb = (xfs_attr_sf_sort_t *)b; - if (sa->hash < sb->hash) { - return(-1); - } else if (sa->hash > sb->hash) { - return(1); - } else { - return(sa->entno - sb->entno); - } -} - - -#define XFS_ISRESET_CURSOR(cursor) \ - (!((cursor)->initted) && !((cursor)->hashval) && \ - !((cursor)->blkno) && !((cursor)->offset)) -/* - * Copy out entries of shortform attribute lists for attr_list(). - * Shortform attribute lists are not stored in hashval sorted order. - * If the output buffer is not large enough to hold them all, then we - * we have to calculate each entries' hashvalue and sort them before - * we can begin returning them to the user. - */ -/*ARGSUSED*/ -int -xfs_attr_shortform_list(xfs_attr_list_context_t *context) -{ - attrlist_cursor_kern_t *cursor; - xfs_attr_sf_sort_t *sbuf, *sbp; - xfs_attr_shortform_t *sf; - xfs_attr_sf_entry_t *sfe; - xfs_inode_t *dp; - int sbsize, nsbuf, count, i; - int error; - - ASSERT(context != NULL); - dp = context->dp; - ASSERT(dp != NULL); - ASSERT(dp->i_afp != NULL); - sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; - ASSERT(sf != NULL); - if (!sf->hdr.count) - return(0); - cursor = context->cursor; - ASSERT(cursor != NULL); - - trace_xfs_attr_list_sf(context); - - /* - * If the buffer is large enough and the cursor is at the start, - * do not bother with sorting since we will return everything in - * one buffer and another call using the cursor won't need to be - * made. - * Note the generous fudge factor of 16 overhead bytes per entry. - * If bufsize is zero then put_listent must be a search function - * and can just scan through what we have. - */ - if (context->bufsize == 0 || - (XFS_ISRESET_CURSOR(cursor) && - (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { - for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { - error = context->put_listent(context, - sfe->flags, - sfe->nameval, - (int)sfe->namelen, - (int)sfe->valuelen, - &sfe->nameval[sfe->namelen]); - - /* - * Either search callback finished early or - * didn't fit it all in the buffer after all. - */ - if (context->seen_enough) - break; - - if (error) - return error; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); - } - trace_xfs_attr_list_sf_all(context); - return(0); - } - - /* do no more for a search callback */ - if (context->bufsize == 0) - return 0; - - /* - * It didn't all fit, so we have to sort everything on hashval. - */ - sbsize = sf->hdr.count * sizeof(*sbuf); - sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); - - /* - * Scan the attribute list for the rest of the entries, storing - * the relevant info from only those that match into a buffer. - */ - nsbuf = 0; - for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { - if (unlikely( - ((char *)sfe < (char *)sf) || - ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) { - XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", - XFS_ERRLEVEL_LOW, - context->dp->i_mount, sfe); - kmem_free(sbuf); - return XFS_ERROR(EFSCORRUPTED); - } - - sbp->entno = i; - sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen); - sbp->name = sfe->nameval; - sbp->namelen = sfe->namelen; - /* These are bytes, and both on-disk, don't endian-flip */ - sbp->valuelen = sfe->valuelen; - sbp->flags = sfe->flags; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); - sbp++; - nsbuf++; - } - - /* - * Sort the entries on hash then entno. - */ - xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare); - - /* - * Re-find our place IN THE SORTED LIST. - */ - count = 0; - cursor->initted = 1; - cursor->blkno = 0; - for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) { - if (sbp->hash == cursor->hashval) { - if (cursor->offset == count) { - break; - } - count++; - } else if (sbp->hash > cursor->hashval) { - break; - } - } - if (i == nsbuf) { - kmem_free(sbuf); - return(0); - } - - /* - * Loop putting entries into the user buffer. - */ - for ( ; i < nsbuf; i++, sbp++) { - if (cursor->hashval != sbp->hash) { - cursor->hashval = sbp->hash; - cursor->offset = 0; - } - error = context->put_listent(context, - sbp->flags, - sbp->name, - sbp->namelen, - sbp->valuelen, - &sbp->name[sbp->namelen]); - if (error) - return error; - if (context->seen_enough) - break; - cursor->offset++; - } - - kmem_free(sbuf); - return(0); -} - /* * Check a leaf attribute block to see if all the entries would fit into * a shortform attribute list. */ int xfs_attr_shortform_allfit( - struct xfs_buf *bp, - struct xfs_inode *dp) + struct xfs_buf *bp, + struct xfs_inode *dp) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; xfs_attr_leaf_name_local_t *name_loc; - int bytes, i; + struct xfs_attr3_icleaf_hdr leafhdr; + int bytes; + int i; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + entry = xfs_attr3_leaf_entryp(leaf); - entry = &leaf->entries[0]; bytes = sizeof(struct xfs_attr_sf_hdr); - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + for (i = 0; i < leafhdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* don't copy partial entries */ if (!(entry->flags & XFS_ATTR_LOCAL)) return(0); - name_loc = xfs_attr_leaf_name_local(leaf, i); + name_loc = xfs_attr3_leaf_name_local(leaf, i); if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX) return(0); if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) return(0); - bytes += sizeof(struct xfs_attr_sf_entry)-1 + bytes += sizeof(struct xfs_attr_sf_entry) - 1 + name_loc->namelen + be16_to_cpu(name_loc->valuelen); } if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) - return(-1); - return(xfs_attr_shortform_bytesfit(dp, bytes)); + return -1; + return xfs_attr_shortform_bytesfit(dp, bytes); } /* * Convert a leaf attribute list to shortform attribute list */ int -xfs_attr_leaf_to_shortform( - struct xfs_buf *bp, - xfs_da_args_t *args, - int forkoff) +xfs_attr3_leaf_to_shortform( + struct xfs_buf *bp, + struct xfs_da_args *args, + int forkoff) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_da_args_t nargs; - xfs_inode_t *dp; - char *tmpbuffer; - int error, i; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_da_args nargs; + struct xfs_inode *dp = args->dp; + char *tmpbuffer; + int error; + int i; trace_xfs_attr_leaf_to_sf(args); - dp = args->dp; - tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP); - ASSERT(tmpbuffer != NULL); + tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); + if (!tmpbuffer) + return ENOMEM; + + memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); - ASSERT(bp != NULL); - memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(dp->i_mount)); leaf = (xfs_attr_leafblock_t *)tmpbuffer; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - memset(bp->b_addr, 0, XFS_LBSIZE(dp->i_mount)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + entry = xfs_attr3_leaf_entryp(leaf); + + /* XXX (dgc): buffer is about to be marked stale - why zero it? */ + memset(bp->b_addr, 0, args->geo->blksize); /* * Clean out the prior contents of the attribute list. @@ -866,6 +840,7 @@ xfs_attr_leaf_to_shortform( * Copy the attributes */ memset((char *)&nargs, 0, sizeof(nargs)); + nargs.geo = args->geo; nargs.dp = dp; nargs.firstblock = args->firstblock; nargs.flist = args->flist; @@ -873,14 +848,14 @@ xfs_attr_leaf_to_shortform( nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; nargs.op_flags = XFS_DA_OP_OKNOENT; - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + + for (i = 0; i < ichdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* don't copy partial entries */ if (!entry->nameidx) continue; ASSERT(entry->flags & XFS_ATTR_LOCAL); - name_loc = xfs_attr_leaf_name_local(leaf, i); + name_loc = xfs_attr3_leaf_name_local(leaf, i); nargs.name = name_loc->nameval; nargs.namelen = name_loc->namelen; nargs.value = &name_loc->nameval[nargs.namelen]; @@ -893,64 +868,77 @@ xfs_attr_leaf_to_shortform( out: kmem_free(tmpbuffer); - return(error); + return error; } /* * Convert from using a single leaf to a root node and a leaf. */ int -xfs_attr_leaf_to_node(xfs_da_args_t *args) +xfs_attr3_leaf_to_node( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_da_intnode_t *node; - xfs_inode_t *dp; - struct xfs_buf *bp1, *bp2; - xfs_dablk_t blkno; - int error; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr icleafhdr; + struct xfs_attr_leaf_entry *entries; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr icnodehdr; + struct xfs_da_intnode *node; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp1 = NULL; + struct xfs_buf *bp2 = NULL; + xfs_dablk_t blkno; + int error; trace_xfs_attr_leaf_to_node(args); - dp = args->dp; - bp1 = bp2 = NULL; error = xfs_da_grow_inode(args, &blkno); if (error) goto out; - error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1); + error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1); if (error) goto out; - bp2 = NULL; - error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, - XFS_ATTR_FORK); + error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK); if (error) goto out; + + /* copy leaf to new buffer, update identifiers */ + xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF); bp2->b_ops = bp1->b_ops; - memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); - bp1 = NULL; - xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); + memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_blkinfo *hdr3 = bp2->b_addr; + hdr3->blkno = cpu_to_be64(bp2->b_bn); + } + xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1); /* * Set up the new root node. */ - error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK); + error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK); if (error) goto out; node = bp1->b_addr; + dp->d_ops->node_hdr_from_disk(&icnodehdr, node); + btree = dp->d_ops->node_tree_p(node); + leaf = bp2->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + /* both on-disk, don't endian-flip twice */ - node->btree[0].hashval = - leaf->entries[be16_to_cpu(leaf->hdr.count)-1 ].hashval; - node->btree[0].before = cpu_to_be32(blkno); - node->hdr.count = cpu_to_be16(1); - xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1); + btree[0].hashval = entries[icleafhdr.count - 1].hashval; + btree[0].before = cpu_to_be32(blkno); + icnodehdr.count = 1; + dp->d_ops->node_hdr_to_disk(node, &icnodehdr); + xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1); error = 0; out: - return(error); + return error; } - /*======================================================================== * Routines used for growing the Btree. *========================================================================*/ @@ -960,52 +948,63 @@ out: * or a leaf in a node attribute list. */ STATIC int -xfs_attr_leaf_create( - xfs_da_args_t *args, - xfs_dablk_t blkno, - struct xfs_buf **bpp) +xfs_attr3_leaf_create( + struct xfs_da_args *args, + xfs_dablk_t blkno, + struct xfs_buf **bpp) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_inode_t *dp; - struct xfs_buf *bp; - int error; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; trace_xfs_attr_leaf_create(args); - dp = args->dp; - ASSERT(dp != NULL); error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp, XFS_ATTR_FORK); if (error) - return(error); - bp->b_ops = &xfs_attr_leaf_buf_ops; + return error; + bp->b_ops = &xfs_attr3_leaf_buf_ops; + xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF); leaf = bp->b_addr; - memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); - hdr = &leaf->hdr; - hdr->info.magic = cpu_to_be16(XFS_ATTR_LEAF_MAGIC); - hdr->firstused = cpu_to_be16(XFS_LBSIZE(dp->i_mount)); - if (!hdr->firstused) { - hdr->firstused = cpu_to_be16( - XFS_LBSIZE(dp->i_mount) - XFS_ATTR_LEAF_NAME_ALIGN); - } + memset(leaf, 0, args->geo->blksize); - hdr->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); - hdr->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr->firstused) - - sizeof(xfs_attr_leaf_hdr_t)); + memset(&ichdr, 0, sizeof(ichdr)); + ichdr.firstused = args->geo->blksize; - xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_blkinfo *hdr3 = bp->b_addr; + + ichdr.magic = XFS_ATTR3_LEAF_MAGIC; + + hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + + ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr); + } else { + ichdr.magic = XFS_ATTR_LEAF_MAGIC; + ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr); + } + ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base; + + xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1); *bpp = bp; - return(0); + return 0; } /* * Split the leaf node, rebalance, then add the new entry. */ int -xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, - xfs_da_state_blk_t *newblk) +xfs_attr3_leaf_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk) { xfs_dablk_t blkno; int error; @@ -1019,7 +1018,7 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, error = xfs_da_grow_inode(state->args, &blkno); if (error) return(error); - error = xfs_attr_leaf_create(state->args, blkno, &newblk->bp); + error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp); if (error) return(error); newblk->blkno = blkno; @@ -1029,8 +1028,8 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * Rebalance the entries across the two leaves. * NOTE: rebalance() currently depends on the 2nd block being empty. */ - xfs_attr_leaf_rebalance(state, oldblk, newblk); - error = xfs_da_blk_link(state, oldblk, newblk); + xfs_attr3_leaf_rebalance(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); if (error) return(error); @@ -1043,10 +1042,10 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, */ if (state->inleaf) { trace_xfs_attr_leaf_add_old(state->args); - error = xfs_attr_leaf_add(oldblk->bp, state->args); + error = xfs_attr3_leaf_add(oldblk->bp, state->args); } else { trace_xfs_attr_leaf_add_new(state->args); - error = xfs_attr_leaf_add(newblk->bp, state->args); + error = xfs_attr3_leaf_add(newblk->bp, state->args); } /* @@ -1061,48 +1060,46 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * Add a name to the leaf attribute list structure. */ int -xfs_attr_leaf_add( +xfs_attr3_leaf_add( struct xfs_buf *bp, struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_attr_leaf_map_t *map; - int tablesize, entsize, sum, tmp, i; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + int tablesize; + int entsize; + int sum; + int tmp; + int i; trace_xfs_attr_leaf_add(args); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT((args->index >= 0) - && (args->index <= be16_to_cpu(leaf->hdr.count))); - hdr = &leaf->hdr; - entsize = xfs_attr_leaf_newentsize(args->namelen, args->valuelen, - args->trans->t_mountp->m_sb.sb_blocksize, NULL); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(args->index >= 0 && args->index <= ichdr.count); + entsize = xfs_attr_leaf_newentsize(args, NULL); /* * Search through freemap for first-fit on new name length. * (may need to figure in size of entry struct too) */ - tablesize = (be16_to_cpu(hdr->count) + 1) - * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - map = &hdr->freemap[XFS_ATTR_LEAF_MAPSIZE-1]; - for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE-1; i >= 0; map--, i--) { - if (tablesize > be16_to_cpu(hdr->firstused)) { - sum += be16_to_cpu(map->size); + tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) { + if (tablesize > ichdr.firstused) { + sum += ichdr.freemap[i].size; continue; } - if (!map->size) + if (!ichdr.freemap[i].size) continue; /* no space in this map */ tmp = entsize; - if (be16_to_cpu(map->base) < be16_to_cpu(hdr->firstused)) + if (ichdr.freemap[i].base < ichdr.firstused) tmp += sizeof(xfs_attr_leaf_entry_t); - if (be16_to_cpu(map->size) >= tmp) { - tmp = xfs_attr_leaf_add_work(bp, args, i); - return(tmp); + if (ichdr.freemap[i].size >= tmp) { + tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i); + goto out_log_hdr; } - sum += be16_to_cpu(map->size); + sum += ichdr.freemap[i].size; } /* @@ -1110,82 +1107,86 @@ xfs_attr_leaf_add( * and we don't have enough freespace, then compaction will do us * no good and we should just give up. */ - if (!hdr->holes && (sum < entsize)) - return(XFS_ERROR(ENOSPC)); + if (!ichdr.holes && sum < entsize) + return XFS_ERROR(ENOSPC); /* * Compact the entries to coalesce free space. * This may change the hdr->count via dropping INCOMPLETE entries. */ - xfs_attr_leaf_compact(args, bp); + xfs_attr3_leaf_compact(args, &ichdr, bp); /* * After compaction, the block is guaranteed to have only one * free region, in freemap[0]. If it is not big enough, give up. */ - if (be16_to_cpu(hdr->freemap[0].size) - < (entsize + sizeof(xfs_attr_leaf_entry_t))) - return(XFS_ERROR(ENOSPC)); + if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) { + tmp = ENOSPC; + goto out_log_hdr; + } + + tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); - return(xfs_attr_leaf_add_work(bp, args, 0)); +out_log_hdr: + xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, &leaf->hdr, + xfs_attr3_leaf_hdr_size(leaf))); + return tmp; } /* * Add a name to a leaf attribute list structure. */ STATIC int -xfs_attr_leaf_add_work( - struct xfs_buf *bp, - xfs_da_args_t *args, - int mapindex) +xfs_attr3_leaf_add_work( + struct xfs_buf *bp, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_da_args *args, + int mapindex) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_attr_leaf_name_remote_t *name_rmt; - xfs_attr_leaf_map_t *map; - xfs_mount_t *mp; - int tmp, i; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_mount *mp; + int tmp; + int i; trace_xfs_attr_leaf_add_work(args); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - hdr = &leaf->hdr; - ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE)); - ASSERT((args->index >= 0) && (args->index <= be16_to_cpu(hdr->count))); + ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE); + ASSERT(args->index >= 0 && args->index <= ichdr->count); /* * Force open some space in the entry array and fill it in. */ - entry = &leaf->entries[args->index]; - if (args->index < be16_to_cpu(hdr->count)) { - tmp = be16_to_cpu(hdr->count) - args->index; + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + if (args->index < ichdr->count) { + tmp = ichdr->count - args->index; tmp *= sizeof(xfs_attr_leaf_entry_t); - memmove((char *)(entry+1), (char *)entry, tmp); + memmove(entry + 1, entry, tmp); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); } - be16_add_cpu(&hdr->count, 1); + ichdr->count++; /* * Allocate space for the new string (at the end of the run). */ - map = &hdr->freemap[mapindex]; mp = args->trans->t_mountp; - ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp)); - ASSERT((be16_to_cpu(map->base) & 0x3) == 0); - ASSERT(be16_to_cpu(map->size) >= - xfs_attr_leaf_newentsize(args->namelen, args->valuelen, - mp->m_sb.sb_blocksize, NULL)); - ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp)); - ASSERT((be16_to_cpu(map->size) & 0x3) == 0); - be16_add_cpu(&map->size, - -xfs_attr_leaf_newentsize(args->namelen, args->valuelen, - mp->m_sb.sb_blocksize, &tmp)); - entry->nameidx = cpu_to_be16(be16_to_cpu(map->base) + - be16_to_cpu(map->size)); + ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize); + ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0); + ASSERT(ichdr->freemap[mapindex].size >= + xfs_attr_leaf_newentsize(args, NULL)); + ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize); + ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0); + + ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp); + + entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base + + ichdr->freemap[mapindex].size); entry->hashval = cpu_to_be32(args->hashval); entry->flags = tmp ? XFS_ATTR_LOCAL : 0; entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); @@ -1200,7 +1201,7 @@ xfs_attr_leaf_add_work( XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); ASSERT((args->index == 0) || (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval))); - ASSERT((args->index == be16_to_cpu(hdr->count)-1) || + ASSERT((args->index == ichdr->count - 1) || (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval))); /* @@ -1211,14 +1212,14 @@ xfs_attr_leaf_add_work( * as part of this transaction (a split operation for example). */ if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); name_loc->namelen = args->namelen; name_loc->valuelen = cpu_to_be16(args->valuelen); memcpy((char *)name_loc->nameval, args->name, args->namelen); memcpy((char *)&name_loc->nameval[args->namelen], args->value, be16_to_cpu(name_loc->valuelen)); } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->namelen = args->namelen; memcpy((char *)name_rmt->name, args->name, args->namelen); entry->flags |= XFS_ATTR_INCOMPLETE; @@ -1226,91 +1227,132 @@ xfs_attr_leaf_add_work( name_rmt->valuelen = 0; name_rmt->valueblk = 0; args->rmtblkno = 1; - args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + args->rmtvaluelen = args->valuelen; } xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index), + XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), xfs_attr_leaf_entsize(leaf, args->index))); /* * Update the control info for this leaf node */ - if (be16_to_cpu(entry->nameidx) < be16_to_cpu(hdr->firstused)) { - /* both on-disk, don't endian-flip twice */ - hdr->firstused = entry->nameidx; - } - ASSERT(be16_to_cpu(hdr->firstused) >= - ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr))); - tmp = (be16_to_cpu(hdr->count)-1) * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - map = &hdr->freemap[0]; - for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) { - if (be16_to_cpu(map->base) == tmp) { - be16_add_cpu(&map->base, sizeof(xfs_attr_leaf_entry_t)); - be16_add_cpu(&map->size, - -((int)sizeof(xfs_attr_leaf_entry_t))); + if (be16_to_cpu(entry->nameidx) < ichdr->firstused) + ichdr->firstused = be16_to_cpu(entry->nameidx); + + ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf)); + tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + if (ichdr->freemap[i].base == tmp) { + ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t); + ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t); } } - be16_add_cpu(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index)); - xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); - return(0); + ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index); + return 0; } /* * Garbage collect a leaf attribute list block by copying it to a new buffer. */ STATIC void -xfs_attr_leaf_compact( +xfs_attr3_leaf_compact( struct xfs_da_args *args, + struct xfs_attr3_icleaf_hdr *ichdr_dst, struct xfs_buf *bp) { - xfs_attr_leafblock_t *leaf_s, *leaf_d; - xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; + struct xfs_attr_leafblock *leaf_src; + struct xfs_attr_leafblock *leaf_dst; + struct xfs_attr3_icleaf_hdr ichdr_src; struct xfs_trans *trans = args->trans; - struct xfs_mount *mp = trans->t_mountp; char *tmpbuffer; trace_xfs_attr_leaf_compact(args); - tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); - ASSERT(tmpbuffer != NULL); - memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); - memset(bp->b_addr, 0, XFS_LBSIZE(mp)); + tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); + memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); + memset(bp->b_addr, 0, args->geo->blksize); + leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; + leaf_dst = bp->b_addr; /* - * Copy basic information + * Copy the on-disk header back into the destination buffer to ensure + * all the information in the header that is not part of the incore + * header structure is preserved. */ - leaf_s = (xfs_attr_leafblock_t *)tmpbuffer; - leaf_d = bp->b_addr; - hdr_s = &leaf_s->hdr; - hdr_d = &leaf_d->hdr; - hdr_d->info = hdr_s->info; /* struct copy */ - hdr_d->firstused = cpu_to_be16(XFS_LBSIZE(mp)); - /* handle truncation gracefully */ - if (!hdr_d->firstused) { - hdr_d->firstused = cpu_to_be16( - XFS_LBSIZE(mp) - XFS_ATTR_LEAF_NAME_ALIGN); - } - hdr_d->usedbytes = 0; - hdr_d->count = 0; - hdr_d->holes = 0; - hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); - hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) - - sizeof(xfs_attr_leaf_hdr_t)); + memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src)); + + /* Initialise the incore headers */ + ichdr_src = *ichdr_dst; /* struct copy */ + ichdr_dst->firstused = args->geo->blksize; + ichdr_dst->usedbytes = 0; + ichdr_dst->count = 0; + ichdr_dst->holes = 0; + ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src); + ichdr_dst->freemap[0].size = ichdr_dst->firstused - + ichdr_dst->freemap[0].base; + + /* write the header back to initialise the underlying buffer */ + xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst); /* * Copy all entry's in the same (sorted) order, * but allocate name/value pairs packed and in sequence. */ - xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0, - be16_to_cpu(hdr_s->count), mp); - xfs_trans_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); + xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0, + leaf_dst, ichdr_dst, 0, ichdr_src.count); + /* + * this logs the entire buffer, but the caller must write the header + * back to the buffer when it is finished modifying it. + */ + xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1); kmem_free(tmpbuffer); } /* + * Compare two leaf blocks "order". + * Return 0 unless leaf2 should go before leaf1. + */ +static int +xfs_attr3_leaf_order( + struct xfs_buf *leaf1_bp, + struct xfs_attr3_icleaf_hdr *leaf1hdr, + struct xfs_buf *leaf2_bp, + struct xfs_attr3_icleaf_hdr *leaf2hdr) +{ + struct xfs_attr_leaf_entry *entries1; + struct xfs_attr_leaf_entry *entries2; + + entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr); + entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr); + if (leaf1hdr->count > 0 && leaf2hdr->count > 0 && + ((be32_to_cpu(entries2[0].hashval) < + be32_to_cpu(entries1[0].hashval)) || + (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) < + be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) { + return 1; + } + return 0; +} + +int +xfs_attr_leaf_order( + struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp) +{ + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; + + xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr); + return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2); +} + +/* * Redistribute the attribute list entries between two leaf nodes, * taking into account the size of the new entry. * @@ -1323,14 +1365,23 @@ xfs_attr_leaf_compact( * the "new" and "old" values can end up in different blocks. */ STATIC void -xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2) +xfs_attr3_leaf_rebalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) { - xfs_da_args_t *args; - xfs_da_state_blk_t *tmp_blk; - xfs_attr_leafblock_t *leaf1, *leaf2; - xfs_attr_leaf_hdr_t *hdr1, *hdr2; - int count, totallen, max, space, swap; + struct xfs_da_args *args; + struct xfs_attr_leafblock *leaf1; + struct xfs_attr_leafblock *leaf2; + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; + struct xfs_attr_leaf_entry *entries1; + struct xfs_attr_leaf_entry *entries2; + int count; + int totallen; + int max; + int space; + int swap; /* * Set up environment. @@ -1339,9 +1390,9 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; - ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(leaf2->hdr.count == 0); + xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); + xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); + ASSERT(ichdr2.count == 0); args = state->args; trace_xfs_attr_leaf_rebalance(args); @@ -1353,16 +1404,23 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * second block, this code should never set "swap". */ swap = 0; - if (xfs_attr_leaf_order(blk1->bp, blk2->bp)) { + if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) { + struct xfs_da_state_blk *tmp_blk; + struct xfs_attr3_icleaf_hdr tmp_ichdr; + tmp_blk = blk1; blk1 = blk2; blk2 = tmp_blk; + + /* struct copies to swap them rather than reconverting */ + tmp_ichdr = ichdr1; + ichdr1 = ichdr2; + ichdr2 = tmp_ichdr; + leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; swap = 1; } - hdr1 = &leaf1->hdr; - hdr2 = &leaf2->hdr; /* * Examine entries until we reduce the absolute difference in @@ -1372,41 +1430,39 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * "inleaf" is true if the new entry should be inserted into blk1. * If "swap" is also true, then reverse the sense of "inleaf". */ - state->inleaf = xfs_attr_leaf_figure_balance(state, blk1, blk2, - &count, &totallen); + state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1, + blk2, &ichdr2, + &count, &totallen); if (swap) state->inleaf = !state->inleaf; /* * Move any entries required from leaf to leaf: */ - if (count < be16_to_cpu(hdr1->count)) { + if (count < ichdr1.count) { /* * Figure the total bytes to be added to the destination leaf. */ /* number entries being moved */ - count = be16_to_cpu(hdr1->count) - count; - space = be16_to_cpu(hdr1->usedbytes) - totallen; + count = ichdr1.count - count; + space = ichdr1.usedbytes - totallen; space += count * sizeof(xfs_attr_leaf_entry_t); /* * leaf2 is the destination, compact it if it looks tight. */ - max = be16_to_cpu(hdr2->firstused) - - sizeof(xfs_attr_leaf_hdr_t); - max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t); + max = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1); + max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t); if (space > max) - xfs_attr_leaf_compact(args, blk2->bp); + xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp); /* * Move high entries from leaf1 to low end of leaf2. */ - xfs_attr_leaf_moveents(leaf1, be16_to_cpu(hdr1->count) - count, - leaf2, 0, count, state->mp); + xfs_attr3_leaf_moveents(args, leaf1, &ichdr1, + ichdr1.count - count, leaf2, &ichdr2, 0, count); - xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); - xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); - } else if (count > be16_to_cpu(hdr1->count)) { + } else if (count > ichdr1.count) { /* * I assert that since all callers pass in an empty * second buffer, this code should never execute. @@ -1417,36 +1473,37 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * Figure the total bytes to be added to the destination leaf. */ /* number entries being moved */ - count -= be16_to_cpu(hdr1->count); - space = totallen - be16_to_cpu(hdr1->usedbytes); + count -= ichdr1.count; + space = totallen - ichdr1.usedbytes; space += count * sizeof(xfs_attr_leaf_entry_t); /* * leaf1 is the destination, compact it if it looks tight. */ - max = be16_to_cpu(hdr1->firstused) - - sizeof(xfs_attr_leaf_hdr_t); - max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t); + max = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1); + max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t); if (space > max) - xfs_attr_leaf_compact(args, blk1->bp); + xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp); /* * Move low entries from leaf2 to high end of leaf1. */ - xfs_attr_leaf_moveents(leaf2, 0, leaf1, - be16_to_cpu(hdr1->count), count, state->mp); - - xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); - xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1, + ichdr1.count, count); } + xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1); + xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2); + xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1); + xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1); + /* * Copy out last hashval in each block for B-tree code. */ - blk1->hashval = be32_to_cpu( - leaf1->entries[be16_to_cpu(leaf1->hdr.count)-1].hashval); - blk2->hashval = be32_to_cpu( - leaf2->entries[be16_to_cpu(leaf2->hdr.count)-1].hashval); + entries1 = xfs_attr3_leaf_entryp(leaf1); + entries2 = xfs_attr3_leaf_entryp(leaf2); + blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval); + blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval); /* * Adjust the expected index for insertion. @@ -1460,12 +1517,12 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * inserting. The index/blkno fields refer to the "old" entry, * while the index2/blkno2 fields refer to the "new" entry. */ - if (blk1->index > be16_to_cpu(leaf1->hdr.count)) { + if (blk1->index > ichdr1.count) { ASSERT(state->inleaf == 0); - blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); + blk2->index = blk1->index - ichdr1.count; args->index = args->index2 = blk2->index; args->blkno = args->blkno2 = blk2->blkno; - } else if (blk1->index == be16_to_cpu(leaf1->hdr.count)) { + } else if (blk1->index == ichdr1.count) { if (state->inleaf) { args->index = blk1->index; args->blkno = blk1->blkno; @@ -1477,8 +1534,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * is already stored in blkno2/index2, so don't * overwrite it overwise we corrupt the tree. */ - blk2->index = blk1->index - - be16_to_cpu(leaf1->hdr.count); + blk2->index = blk1->index - ichdr1.count; args->index = blk2->index; args->blkno = blk2->blkno; if (!state->extravalid) { @@ -1506,42 +1562,38 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * GROT: Do a double-split for this case? */ STATIC int -xfs_attr_leaf_figure_balance(xfs_da_state_t *state, - xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2, - int *countarg, int *usedbytesarg) +xfs_attr3_leaf_figure_balance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_attr3_icleaf_hdr *ichdr1, + struct xfs_da_state_blk *blk2, + struct xfs_attr3_icleaf_hdr *ichdr2, + int *countarg, + int *usedbytesarg) { - xfs_attr_leafblock_t *leaf1, *leaf2; - xfs_attr_leaf_hdr_t *hdr1, *hdr2; - xfs_attr_leaf_entry_t *entry; - int count, max, index, totallen, half; - int lastdelta, foundit, tmp; - - /* - * Set up environment. - */ - leaf1 = blk1->bp->b_addr; - leaf2 = blk2->bp->b_addr; - hdr1 = &leaf1->hdr; - hdr2 = &leaf2->hdr; - foundit = 0; - totallen = 0; + struct xfs_attr_leafblock *leaf1 = blk1->bp->b_addr; + struct xfs_attr_leafblock *leaf2 = blk2->bp->b_addr; + struct xfs_attr_leaf_entry *entry; + int count; + int max; + int index; + int totallen = 0; + int half; + int lastdelta; + int foundit = 0; + int tmp; /* * Examine entries until we reduce the absolute difference in * byte usage between the two blocks to a minimum. */ - max = be16_to_cpu(hdr1->count) + be16_to_cpu(hdr2->count); - half = (max+1) * sizeof(*entry); - half += be16_to_cpu(hdr1->usedbytes) + - be16_to_cpu(hdr2->usedbytes) + - xfs_attr_leaf_newentsize( - state->args->namelen, - state->args->valuelen, - state->blocksize, NULL); + max = ichdr1->count + ichdr2->count; + half = (max + 1) * sizeof(*entry); + half += ichdr1->usedbytes + ichdr2->usedbytes + + xfs_attr_leaf_newentsize(state->args, NULL); half /= 2; - lastdelta = state->blocksize; - entry = &leaf1->entries[0]; + lastdelta = state->args->geo->blksize; + entry = xfs_attr3_leaf_entryp(leaf1); for (count = index = 0; count < max; entry++, index++, count++) { #define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A)) @@ -1550,10 +1602,7 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, */ if (count == blk1->index) { tmp = totallen + sizeof(*entry) + - xfs_attr_leaf_newentsize( - state->args->namelen, - state->args->valuelen, - state->blocksize, NULL); + xfs_attr_leaf_newentsize(state->args, NULL); if (XFS_ATTR_ABS(half - tmp) > lastdelta) break; lastdelta = XFS_ATTR_ABS(half - tmp); @@ -1564,9 +1613,9 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, /* * Wrap around into the second block if necessary. */ - if (count == be16_to_cpu(hdr1->count)) { + if (count == ichdr1->count) { leaf1 = leaf2; - entry = &leaf1->entries[0]; + entry = xfs_attr3_leaf_entryp(leaf1); index = 0; } @@ -1589,15 +1638,12 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, totallen -= count * sizeof(*entry); if (foundit) { totallen -= sizeof(*entry) + - xfs_attr_leaf_newentsize( - state->args->namelen, - state->args->valuelen, - state->blocksize, NULL); + xfs_attr_leaf_newentsize(state->args, NULL); } *countarg = count; *usedbytesarg = totallen; - return(foundit); + return foundit; } /*======================================================================== @@ -1616,14 +1662,20 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, * GROT: allow for INCOMPLETE entries in calculation. */ int -xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) +xfs_attr3_leaf_toosmall( + struct xfs_da_state *state, + int *action) { - xfs_attr_leafblock_t *leaf; - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *info; - int count, bytes, forward, error, retval, i; - xfs_dablk_t blkno; - struct xfs_buf *bp; + struct xfs_attr_leafblock *leaf; + struct xfs_da_state_blk *blk; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_buf *bp; + xfs_dablk_t blkno; + int bytes; + int forward; + int error; + int retval; + int i; trace_xfs_attr_leaf_toosmall(state->args); @@ -1633,14 +1685,12 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * to coalesce with a sibling. */ blk = &state->path.blk[ state->path.active-1 ]; - info = blk->bp->b_addr; - ASSERT(info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - leaf = (xfs_attr_leafblock_t *)info; - count = be16_to_cpu(leaf->hdr.count); - bytes = sizeof(xfs_attr_leaf_hdr_t) + - count * sizeof(xfs_attr_leaf_entry_t) + - be16_to_cpu(leaf->hdr.usedbytes); - if (bytes > (state->blocksize >> 1)) { + leaf = blk->bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + bytes = xfs_attr3_leaf_hdr_size(leaf) + + ichdr.count * sizeof(xfs_attr_leaf_entry_t) + + ichdr.usedbytes; + if (bytes > (state->args->geo->blksize >> 1)) { *action = 0; /* blk over 50%, don't try to join */ return(0); } @@ -1651,14 +1701,14 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * coalesce it with a sibling block. We choose (arbitrarily) * to merge with the forward block unless it is NULL. */ - if (count == 0) { + if (ichdr.count == 0) { /* * Make altpath point to the block we want to keep and * path point to the block we want to drop (this one). */ - forward = (info->forw != 0); + forward = (ichdr.forw != 0); memcpy(&state->altpath, &state->path, sizeof(state->path)); - error = xfs_da_path_shift(state, &state->altpath, forward, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); if (error) return(error); @@ -1667,7 +1717,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) } else { *action = 2; } - return(0); + return 0; } /* @@ -1678,28 +1728,29 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * to shrink an attribute list over time. */ /* start with smaller blk num */ - forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back)); + forward = ichdr.forw < ichdr.back; for (i = 0; i < 2; forward = !forward, i++) { + struct xfs_attr3_icleaf_hdr ichdr2; if (forward) - blkno = be32_to_cpu(info->forw); + blkno = ichdr.forw; else - blkno = be32_to_cpu(info->back); + blkno = ichdr.back; if (blkno == 0) continue; - error = xfs_attr_leaf_read(state->args->trans, state->args->dp, + error = xfs_attr3_leaf_read(state->args->trans, state->args->dp, blkno, -1, &bp); if (error) return(error); - leaf = (xfs_attr_leafblock_t *)info; - count = be16_to_cpu(leaf->hdr.count); - bytes = state->blocksize - (state->blocksize>>2); - bytes -= be16_to_cpu(leaf->hdr.usedbytes); - leaf = bp->b_addr; - count += be16_to_cpu(leaf->hdr.count); - bytes -= be16_to_cpu(leaf->hdr.usedbytes); - bytes -= count * sizeof(xfs_attr_leaf_entry_t); - bytes -= sizeof(xfs_attr_leaf_hdr_t); + xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr); + + bytes = state->args->geo->blksize - + (state->args->geo->blksize >> 2) - + ichdr.usedbytes - ichdr2.usedbytes - + ((ichdr.count + ichdr2.count) * + sizeof(xfs_attr_leaf_entry_t)) - + xfs_attr3_leaf_hdr_size(leaf); + xfs_trans_brelse(state->args->trans, bp); if (bytes >= 0) break; /* fits with at least 25% to spare */ @@ -1715,10 +1766,10 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) */ memcpy(&state->altpath, &state->path, sizeof(state->path)); if (blkno < blk->blkno) { - error = xfs_da_path_shift(state, &state->altpath, forward, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); } else { - error = xfs_da_path_shift(state, &state->path, forward, + error = xfs_da3_path_shift(state, &state->path, forward, 0, &retval); } if (error) @@ -1738,33 +1789,35 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * If two leaves are 37% full, when combined they will leave 25% free. */ int -xfs_attr_leaf_remove( - struct xfs_buf *bp, - xfs_da_args_t *args) +xfs_attr3_leaf_remove( + struct xfs_buf *bp, + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_attr_leaf_map_t *map; - xfs_attr_leaf_entry_t *entry; - int before, after, smallest, entsize; - int tablesize, tmp, i; - xfs_mount_t *mp; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + int before; + int after; + int smallest; + int entsize; + int tablesize; + int tmp; + int i; trace_xfs_attr_leaf_remove(args); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - hdr = &leaf->hdr; - mp = args->trans->t_mountp; - ASSERT((be16_to_cpu(hdr->count) > 0) - && (be16_to_cpu(hdr->count) < (XFS_LBSIZE(mp)/8))); - ASSERT((args->index >= 0) - && (args->index < be16_to_cpu(hdr->count))); - ASSERT(be16_to_cpu(hdr->firstused) >= - ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr))); - entry = &leaf->entries[args->index]; - ASSERT(be16_to_cpu(entry->nameidx) >= be16_to_cpu(hdr->firstused)); - ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + + ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8); + ASSERT(args->index >= 0 && args->index < ichdr.count); + ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) + + xfs_attr3_leaf_hdr_size(leaf)); + + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + + ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused); + ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize); /* * Scan through free region table: @@ -1772,30 +1825,28 @@ xfs_attr_leaf_remove( * find smallest free region in case we need to replace it, * adjust any map that borders the entry table, */ - tablesize = be16_to_cpu(hdr->count) * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - map = &hdr->freemap[0]; - tmp = be16_to_cpu(map->size); + tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + tmp = ichdr.freemap[0].size; before = after = -1; smallest = XFS_ATTR_LEAF_MAPSIZE - 1; entsize = xfs_attr_leaf_entsize(leaf, args->index); - for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) { - ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp)); - ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp)); - if (be16_to_cpu(map->base) == tablesize) { - be16_add_cpu(&map->base, - -((int)sizeof(xfs_attr_leaf_entry_t))); - be16_add_cpu(&map->size, sizeof(xfs_attr_leaf_entry_t)); + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + ASSERT(ichdr.freemap[i].base < args->geo->blksize); + ASSERT(ichdr.freemap[i].size < args->geo->blksize); + if (ichdr.freemap[i].base == tablesize) { + ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t); + ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t); } - if ((be16_to_cpu(map->base) + be16_to_cpu(map->size)) - == be16_to_cpu(entry->nameidx)) { + if (ichdr.freemap[i].base + ichdr.freemap[i].size == + be16_to_cpu(entry->nameidx)) { before = i; - } else if (be16_to_cpu(map->base) - == (be16_to_cpu(entry->nameidx) + entsize)) { + } else if (ichdr.freemap[i].base == + (be16_to_cpu(entry->nameidx) + entsize)) { after = i; - } else if (be16_to_cpu(map->size) < tmp) { - tmp = be16_to_cpu(map->size); + } else if (ichdr.freemap[i].size < tmp) { + tmp = ichdr.freemap[i].size; smallest = i; } } @@ -1806,36 +1857,30 @@ xfs_attr_leaf_remove( */ if ((before >= 0) || (after >= 0)) { if ((before >= 0) && (after >= 0)) { - map = &hdr->freemap[before]; - be16_add_cpu(&map->size, entsize); - be16_add_cpu(&map->size, - be16_to_cpu(hdr->freemap[after].size)); - hdr->freemap[after].base = 0; - hdr->freemap[after].size = 0; + ichdr.freemap[before].size += entsize; + ichdr.freemap[before].size += ichdr.freemap[after].size; + ichdr.freemap[after].base = 0; + ichdr.freemap[after].size = 0; } else if (before >= 0) { - map = &hdr->freemap[before]; - be16_add_cpu(&map->size, entsize); + ichdr.freemap[before].size += entsize; } else { - map = &hdr->freemap[after]; - /* both on-disk, don't endian flip twice */ - map->base = entry->nameidx; - be16_add_cpu(&map->size, entsize); + ichdr.freemap[after].base = be16_to_cpu(entry->nameidx); + ichdr.freemap[after].size += entsize; } } else { /* * Replace smallest region (if it is smaller than free'd entry) */ - map = &hdr->freemap[smallest]; - if (be16_to_cpu(map->size) < entsize) { - map->base = cpu_to_be16(be16_to_cpu(entry->nameidx)); - map->size = cpu_to_be16(entsize); + if (ichdr.freemap[smallest].size < entsize) { + ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx); + ichdr.freemap[smallest].size = entsize; } } /* * Did we remove the first entry? */ - if (be16_to_cpu(entry->nameidx) == be16_to_cpu(hdr->firstused)) + if (be16_to_cpu(entry->nameidx) == ichdr.firstused) smallest = 1; else smallest = 0; @@ -1843,20 +1888,20 @@ xfs_attr_leaf_remove( /* * Compress the remaining entries and zero out the removed stuff. */ - memset(xfs_attr_leaf_name(leaf, args->index), 0, entsize); - be16_add_cpu(&hdr->usedbytes, -entsize); + memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize); + ichdr.usedbytes -= entsize; xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index), + XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), entsize)); - tmp = (be16_to_cpu(hdr->count) - args->index) - * sizeof(xfs_attr_leaf_entry_t); - memmove((char *)entry, (char *)(entry+1), tmp); - be16_add_cpu(&hdr->count, -1); + tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t); + memmove(entry, entry + 1, tmp); + ichdr.count--; xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); - entry = &leaf->entries[be16_to_cpu(hdr->count)]; - memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t)); + XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t))); + + entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count]; + memset(entry, 0, sizeof(xfs_attr_leaf_entry_t)); /* * If we removed the first entry, re-find the first used byte @@ -1865,131 +1910,146 @@ xfs_attr_leaf_remove( * removing the name. */ if (smallest) { - tmp = XFS_LBSIZE(mp); - entry = &leaf->entries[0]; - for (i = be16_to_cpu(hdr->count)-1; i >= 0; entry++, i--) { - ASSERT(be16_to_cpu(entry->nameidx) >= - be16_to_cpu(hdr->firstused)); - ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp)); + tmp = args->geo->blksize; + entry = xfs_attr3_leaf_entryp(leaf); + for (i = ichdr.count - 1; i >= 0; entry++, i--) { + ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused); + ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize); if (be16_to_cpu(entry->nameidx) < tmp) tmp = be16_to_cpu(entry->nameidx); } - hdr->firstused = cpu_to_be16(tmp); - if (!hdr->firstused) { - hdr->firstused = cpu_to_be16( - tmp - XFS_ATTR_LEAF_NAME_ALIGN); - } + ichdr.firstused = tmp; + if (!ichdr.firstused) + ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN; } else { - hdr->holes = 1; /* mark as needing compaction */ + ichdr.holes = 1; /* mark as needing compaction */ } + xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); + XFS_DA_LOGRANGE(leaf, &leaf->hdr, + xfs_attr3_leaf_hdr_size(leaf))); /* * Check if leaf is less than 50% full, caller may want to * "join" the leaf with a sibling if so. */ - tmp = sizeof(xfs_attr_leaf_hdr_t); - tmp += be16_to_cpu(leaf->hdr.count) * sizeof(xfs_attr_leaf_entry_t); - tmp += be16_to_cpu(leaf->hdr.usedbytes); - return(tmp < mp->m_attr_magicpct); /* leaf is < 37% full */ + tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) + + ichdr.count * sizeof(xfs_attr_leaf_entry_t); + + return tmp < args->geo->magicpct; /* leaf is < 37% full */ } /* * Move all the attribute list entries from drop_leaf into save_leaf. */ void -xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, - xfs_da_state_blk_t *save_blk) +xfs_attr3_leaf_unbalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) { - xfs_attr_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf; - xfs_attr_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr; - xfs_mount_t *mp; - char *tmpbuffer; + struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr; + struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr; + struct xfs_attr3_icleaf_hdr drophdr; + struct xfs_attr3_icleaf_hdr savehdr; + struct xfs_attr_leaf_entry *entry; trace_xfs_attr_leaf_unbalance(state->args); - /* - * Set up environment. - */ - mp = state->mp; - ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC); - ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC); drop_leaf = drop_blk->bp->b_addr; save_leaf = save_blk->bp->b_addr; - ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - drop_hdr = &drop_leaf->hdr; - save_hdr = &save_leaf->hdr; + xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf); + xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf); + entry = xfs_attr3_leaf_entryp(drop_leaf); /* * Save last hashval from dying block for later Btree fixup. */ - drop_blk->hashval = be32_to_cpu( - drop_leaf->entries[be16_to_cpu(drop_leaf->hdr.count)-1].hashval); + drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval); /* * Check if we need a temp buffer, or can we do it in place. * Note that we don't check "leaf" for holes because we will * always be dropping it, toosmall() decided that for us already. */ - if (save_hdr->holes == 0) { + if (savehdr.holes == 0) { /* * dest leaf has no holes, so we add there. May need * to make some room in the entry array. */ - if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) { - xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, 0, - be16_to_cpu(drop_hdr->count), mp); + if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, + drop_blk->bp, &drophdr)) { + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, + save_leaf, &savehdr, 0, + drophdr.count); } else { - xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, - be16_to_cpu(save_hdr->count), - be16_to_cpu(drop_hdr->count), mp); + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, + save_leaf, &savehdr, + savehdr.count, drophdr.count); } } else { /* * Destination has holes, so we make a temporary copy * of the leaf and add them both to that. */ - tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP); - ASSERT(tmpbuffer != NULL); - memset(tmpbuffer, 0, state->blocksize); - tmp_leaf = (xfs_attr_leafblock_t *)tmpbuffer; - tmp_hdr = &tmp_leaf->hdr; - tmp_hdr->info = save_hdr->info; /* struct copy */ - tmp_hdr->count = 0; - tmp_hdr->firstused = cpu_to_be16(state->blocksize); - if (!tmp_hdr->firstused) { - tmp_hdr->firstused = cpu_to_be16( - state->blocksize - XFS_ATTR_LEAF_NAME_ALIGN); - } - tmp_hdr->usedbytes = 0; - if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) { - xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, 0, - be16_to_cpu(drop_hdr->count), mp); - xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, - be16_to_cpu(tmp_leaf->hdr.count), - be16_to_cpu(save_hdr->count), mp); + struct xfs_attr_leafblock *tmp_leaf; + struct xfs_attr3_icleaf_hdr tmphdr; + + tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP); + + /* + * Copy the header into the temp leaf so that all the stuff + * not in the incore header is present and gets copied back in + * once we've moved all the entries. + */ + memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf)); + + memset(&tmphdr, 0, sizeof(tmphdr)); + tmphdr.magic = savehdr.magic; + tmphdr.forw = savehdr.forw; + tmphdr.back = savehdr.back; + tmphdr.firstused = state->args->geo->blksize; + + /* write the header to the temp buffer to initialise it */ + xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr); + + if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, + drop_blk->bp, &drophdr)) { + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, + tmp_leaf, &tmphdr, 0, + drophdr.count); + xfs_attr3_leaf_moveents(state->args, + save_leaf, &savehdr, 0, + tmp_leaf, &tmphdr, tmphdr.count, + savehdr.count); } else { - xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, 0, - be16_to_cpu(save_hdr->count), mp); - xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, - be16_to_cpu(tmp_leaf->hdr.count), - be16_to_cpu(drop_hdr->count), mp); + xfs_attr3_leaf_moveents(state->args, + save_leaf, &savehdr, 0, + tmp_leaf, &tmphdr, 0, + savehdr.count); + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, + tmp_leaf, &tmphdr, tmphdr.count, + drophdr.count); } - memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize); - kmem_free(tmpbuffer); + memcpy(save_leaf, tmp_leaf, state->args->geo->blksize); + savehdr = tmphdr; /* struct copy */ + kmem_free(tmp_leaf); } + xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr); xfs_trans_log_buf(state->args->trans, save_blk->bp, 0, - state->blocksize - 1); + state->args->geo->blksize - 1); /* * Copy out last hashval in each block for B-tree code. */ - save_blk->hashval = be32_to_cpu( - save_leaf->entries[be16_to_cpu(save_leaf->hdr.count)-1].hashval); + entry = xfs_attr3_leaf_entryp(save_leaf); + save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval); } /*======================================================================== @@ -2010,31 +2070,33 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * Don't change the args->value unless we find the attribute. */ int -xfs_attr_leaf_lookup_int( - struct xfs_buf *bp, - xfs_da_args_t *args) +xfs_attr3_leaf_lookup_int( + struct xfs_buf *bp, + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_attr_leaf_name_remote_t *name_rmt; - int probe, span; - xfs_dahash_t hashval; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_entry *entries; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + xfs_dahash_t hashval; + int probe; + int span; trace_xfs_attr_leaf_lookup(args); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(be16_to_cpu(leaf->hdr.count) - < (XFS_LBSIZE(args->dp->i_mount)/8)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + ASSERT(ichdr.count < args->geo->blksize / 8); /* * Binary search. (note: small blocks will skip this loop) */ hashval = args->hashval; - probe = span = be16_to_cpu(leaf->hdr.count) / 2; - for (entry = &leaf->entries[probe]; span > 4; - entry = &leaf->entries[probe]) { + probe = span = ichdr.count / 2; + for (entry = &entries[probe]; span > 4; entry = &entries[probe]) { span /= 2; if (be32_to_cpu(entry->hashval) < hashval) probe += span; @@ -2043,35 +2105,31 @@ xfs_attr_leaf_lookup_int( else break; } - ASSERT((probe >= 0) && - (!leaf->hdr.count - || (probe < be16_to_cpu(leaf->hdr.count)))); - ASSERT((span <= 4) || (be32_to_cpu(entry->hashval) == hashval)); + ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count)); + ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval); /* * Since we may have duplicate hashval's, find the first matching * hashval in the leaf. */ - while ((probe > 0) && (be32_to_cpu(entry->hashval) >= hashval)) { + while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) { entry--; probe--; } - while ((probe < be16_to_cpu(leaf->hdr.count)) && - (be32_to_cpu(entry->hashval) < hashval)) { + while (probe < ichdr.count && + be32_to_cpu(entry->hashval) < hashval) { entry++; probe++; } - if ((probe == be16_to_cpu(leaf->hdr.count)) || - (be32_to_cpu(entry->hashval) != hashval)) { + if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) { args->index = probe; - return(XFS_ERROR(ENOATTR)); + return XFS_ERROR(ENOATTR); } /* * Duplicate keys may be present, so search all of them for a match. */ - for ( ; (probe < be16_to_cpu(leaf->hdr.count)) && - (be32_to_cpu(entry->hashval) == hashval); + for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval); entry++, probe++) { /* * GROT: Add code to remove incomplete entries. @@ -2085,33 +2143,36 @@ xfs_attr_leaf_lookup_int( continue; } if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, probe); + name_loc = xfs_attr3_leaf_name_local(leaf, probe); if (name_loc->namelen != args->namelen) continue; - if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0) + if (memcmp(args->name, name_loc->nameval, + args->namelen) != 0) continue; if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; - return(XFS_ERROR(EEXIST)); + return XFS_ERROR(EEXIST); } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, probe); + name_rmt = xfs_attr3_leaf_name_remote(leaf, probe); if (name_rmt->namelen != args->namelen) continue; - if (memcmp(args->name, (char *)name_rmt->name, - args->namelen) != 0) + if (memcmp(args->name, name_rmt->name, + args->namelen) != 0) continue; if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, - be32_to_cpu(name_rmt->valuelen)); - return(XFS_ERROR(EEXIST)); + args->rmtblkcnt = xfs_attr3_rmt_blocks( + args->dp->i_mount, + args->rmtvaluelen); + return XFS_ERROR(EEXIST); } } args->index = probe; - return(XFS_ERROR(ENOATTR)); + return XFS_ERROR(ENOATTR); } /* @@ -2119,56 +2180,57 @@ xfs_attr_leaf_lookup_int( * list structure. */ int -xfs_attr_leaf_getvalue( - struct xfs_buf *bp, - xfs_da_args_t *args) +xfs_attr3_leaf_getvalue( + struct xfs_buf *bp, + struct xfs_da_args *args) { - int valuelen; - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_attr_leaf_name_remote_t *name_rmt; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + int valuelen; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(be16_to_cpu(leaf->hdr.count) - < (XFS_LBSIZE(args->dp->i_mount)/8)); - ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(ichdr.count < args->geo->blksize / 8); + ASSERT(args->index < ichdr.count); - entry = &leaf->entries[args->index]; + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); ASSERT(name_loc->namelen == args->namelen); ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); valuelen = be16_to_cpu(name_loc->valuelen); if (args->flags & ATTR_KERNOVAL) { args->valuelen = valuelen; - return(0); + return 0; } if (args->valuelen < valuelen) { args->valuelen = valuelen; - return(XFS_ERROR(ERANGE)); + return XFS_ERROR(ERANGE); } args->valuelen = valuelen; memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); ASSERT(name_rmt->namelen == args->namelen); ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); - valuelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, + args->rmtvaluelen); if (args->flags & ATTR_KERNOVAL) { - args->valuelen = valuelen; - return(0); + args->valuelen = args->rmtvaluelen; + return 0; } - if (args->valuelen < valuelen) { - args->valuelen = valuelen; - return(XFS_ERROR(ERANGE)); + if (args->valuelen < args->rmtvaluelen) { + args->valuelen = args->rmtvaluelen; + return XFS_ERROR(ERANGE); } - args->valuelen = valuelen; + args->valuelen = args->rmtvaluelen; } - return(0); + return 0; } /*======================================================================== @@ -2181,13 +2243,21 @@ xfs_attr_leaf_getvalue( */ /*ARGSUSED*/ STATIC void -xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, - xfs_attr_leafblock_t *leaf_d, int start_d, - int count, xfs_mount_t *mp) +xfs_attr3_leaf_moveents( + struct xfs_da_args *args, + struct xfs_attr_leafblock *leaf_s, + struct xfs_attr3_icleaf_hdr *ichdr_s, + int start_s, + struct xfs_attr_leafblock *leaf_d, + struct xfs_attr3_icleaf_hdr *ichdr_d, + int start_d, + int count) { - xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; - xfs_attr_leaf_entry_t *entry_s, *entry_d; - int desti, tmp, i; + struct xfs_attr_leaf_entry *entry_s; + struct xfs_attr_leaf_entry *entry_d; + int desti; + int tmp; + int i; /* * Check for nothing to do. @@ -2198,45 +2268,41 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, /* * Set up environment. */ - ASSERT(leaf_s->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(leaf_d->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - hdr_s = &leaf_s->hdr; - hdr_d = &leaf_d->hdr; - ASSERT((be16_to_cpu(hdr_s->count) > 0) && - (be16_to_cpu(hdr_s->count) < (XFS_LBSIZE(mp)/8))); - ASSERT(be16_to_cpu(hdr_s->firstused) >= - ((be16_to_cpu(hdr_s->count) - * sizeof(*entry_s))+sizeof(*hdr_s))); - ASSERT(be16_to_cpu(hdr_d->count) < (XFS_LBSIZE(mp)/8)); - ASSERT(be16_to_cpu(hdr_d->firstused) >= - ((be16_to_cpu(hdr_d->count) - * sizeof(*entry_d))+sizeof(*hdr_d))); - - ASSERT(start_s < be16_to_cpu(hdr_s->count)); - ASSERT(start_d <= be16_to_cpu(hdr_d->count)); - ASSERT(count <= be16_to_cpu(hdr_s->count)); + ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC || + ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC); + ASSERT(ichdr_s->magic == ichdr_d->magic); + ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8); + ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s)) + + xfs_attr3_leaf_hdr_size(leaf_s)); + ASSERT(ichdr_d->count < args->geo->blksize / 8); + ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d)) + + xfs_attr3_leaf_hdr_size(leaf_d)); + + ASSERT(start_s < ichdr_s->count); + ASSERT(start_d <= ichdr_d->count); + ASSERT(count <= ichdr_s->count); + /* * Move the entries in the destination leaf up to make a hole? */ - if (start_d < be16_to_cpu(hdr_d->count)) { - tmp = be16_to_cpu(hdr_d->count) - start_d; + if (start_d < ichdr_d->count) { + tmp = ichdr_d->count - start_d; tmp *= sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_d->entries[start_d]; - entry_d = &leaf_d->entries[start_d + count]; - memmove((char *)entry_d, (char *)entry_s, tmp); + entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d]; + entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count]; + memmove(entry_d, entry_s, tmp); } /* * Copy all entry's in the same (sorted) order, * but allocate attribute info packed and in sequence. */ - entry_s = &leaf_s->entries[start_s]; - entry_d = &leaf_d->entries[start_d]; + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; + entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d]; desti = start_d; for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) { - ASSERT(be16_to_cpu(entry_s->nameidx) - >= be16_to_cpu(hdr_s->firstused)); + ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused); tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i); #ifdef GROT /* @@ -2245,36 +2311,34 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, * off for 6.2, should be revisited later. */ if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */ - memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp); - be16_add_cpu(&hdr_s->usedbytes, -tmp); - be16_add_cpu(&hdr_s->count, -1); + memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp); + ichdr_s->usedbytes -= tmp; + ichdr_s->count -= 1; entry_d--; /* to compensate for ++ in loop hdr */ desti--; if ((start_s + i) < offset) result++; /* insertion index adjustment */ } else { #endif /* GROT */ - be16_add_cpu(&hdr_d->firstused, -tmp); + ichdr_d->firstused -= tmp; /* both on-disk, don't endian flip twice */ entry_d->hashval = entry_s->hashval; - /* both on-disk, don't endian flip twice */ - entry_d->nameidx = hdr_d->firstused; + entry_d->nameidx = cpu_to_be16(ichdr_d->firstused); entry_d->flags = entry_s->flags; ASSERT(be16_to_cpu(entry_d->nameidx) + tmp - <= XFS_LBSIZE(mp)); - memmove(xfs_attr_leaf_name(leaf_d, desti), - xfs_attr_leaf_name(leaf_s, start_s + i), tmp); + <= args->geo->blksize); + memmove(xfs_attr3_leaf_name(leaf_d, desti), + xfs_attr3_leaf_name(leaf_s, start_s + i), tmp); ASSERT(be16_to_cpu(entry_s->nameidx) + tmp - <= XFS_LBSIZE(mp)); - memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp); - be16_add_cpu(&hdr_s->usedbytes, -tmp); - be16_add_cpu(&hdr_d->usedbytes, tmp); - be16_add_cpu(&hdr_s->count, -1); - be16_add_cpu(&hdr_d->count, 1); - tmp = be16_to_cpu(hdr_d->count) - * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - ASSERT(be16_to_cpu(hdr_d->firstused) >= tmp); + <= args->geo->blksize); + memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp); + ichdr_s->usedbytes -= tmp; + ichdr_d->usedbytes += tmp; + ichdr_s->count -= 1; + ichdr_d->count += 1; + tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf_d); + ASSERT(ichdr_d->firstused >= tmp); #ifdef GROT } #endif /* GROT */ @@ -2283,71 +2347,40 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, /* * Zero out the entries we just copied. */ - if (start_s == be16_to_cpu(hdr_s->count)) { + if (start_s == ichdr_s->count) { tmp = count * sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_s->entries[start_s]; + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; ASSERT(((char *)entry_s + tmp) <= - ((char *)leaf_s + XFS_LBSIZE(mp))); - memset((char *)entry_s, 0, tmp); + ((char *)leaf_s + args->geo->blksize)); + memset(entry_s, 0, tmp); } else { /* * Move the remaining entries down to fill the hole, * then zero the entries at the top. */ - tmp = be16_to_cpu(hdr_s->count) - count; - tmp *= sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_s->entries[start_s + count]; - entry_d = &leaf_s->entries[start_s]; - memmove((char *)entry_d, (char *)entry_s, tmp); + tmp = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t); + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count]; + entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; + memmove(entry_d, entry_s, tmp); tmp = count * sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_s->entries[be16_to_cpu(hdr_s->count)]; + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count]; ASSERT(((char *)entry_s + tmp) <= - ((char *)leaf_s + XFS_LBSIZE(mp))); - memset((char *)entry_s, 0, tmp); + ((char *)leaf_s + args->geo->blksize)); + memset(entry_s, 0, tmp); } /* * Fill in the freemap information */ - hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); - be16_add_cpu(&hdr_d->freemap[0].base, be16_to_cpu(hdr_d->count) * - sizeof(xfs_attr_leaf_entry_t)); - hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) - - be16_to_cpu(hdr_d->freemap[0].base)); - hdr_d->freemap[1].base = 0; - hdr_d->freemap[2].base = 0; - hdr_d->freemap[1].size = 0; - hdr_d->freemap[2].size = 0; - hdr_s->holes = 1; /* leaf may not be compact */ -} - -/* - * Compare two leaf blocks "order". - * Return 0 unless leaf2 should go before leaf1. - */ -int -xfs_attr_leaf_order( - struct xfs_buf *leaf1_bp, - struct xfs_buf *leaf2_bp) -{ - xfs_attr_leafblock_t *leaf1, *leaf2; - - leaf1 = leaf1_bp->b_addr; - leaf2 = leaf2_bp->b_addr; - ASSERT((leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) && - (leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC))); - if ((be16_to_cpu(leaf1->hdr.count) > 0) && - (be16_to_cpu(leaf2->hdr.count) > 0) && - ((be32_to_cpu(leaf2->entries[0].hashval) < - be32_to_cpu(leaf1->entries[0].hashval)) || - (be32_to_cpu(leaf2->entries[ - be16_to_cpu(leaf2->hdr.count)-1].hashval) < - be32_to_cpu(leaf1->entries[ - be16_to_cpu(leaf1->hdr.count)-1].hashval)))) { - return(1); - } - return(0); + ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d); + ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t); + ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base; + ichdr_d->freemap[1].base = 0; + ichdr_d->freemap[2].base = 0; + ichdr_d->freemap[1].size = 0; + ichdr_d->freemap[2].size = 0; + ichdr_s->holes = 1; /* leaf may not be compact */ } /* @@ -2358,15 +2391,16 @@ xfs_attr_leaf_lasthash( struct xfs_buf *bp, int *count) { - xfs_attr_leafblock_t *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entries; - leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr); + entries = xfs_attr3_leaf_entryp(bp->b_addr); if (count) - *count = be16_to_cpu(leaf->hdr.count); - if (!leaf->hdr.count) - return(0); - return be32_to_cpu(leaf->entries[be16_to_cpu(leaf->hdr.count)-1].hashval); + *count = ichdr.count; + if (!ichdr.count) + return 0; + return be32_to_cpu(entries[ichdr.count - 1].hashval); } /* @@ -2376,20 +2410,21 @@ xfs_attr_leaf_lasthash( STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index) { + struct xfs_attr_leaf_entry *entries; xfs_attr_leaf_name_local_t *name_loc; xfs_attr_leaf_name_remote_t *name_rmt; int size; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - if (leaf->entries[index].flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, index); + entries = xfs_attr3_leaf_entryp(leaf); + if (entries[index].flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr3_leaf_name_local(leaf, index); size = xfs_attr_leaf_entsize_local(name_loc->namelen, be16_to_cpu(name_loc->valuelen)); } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, index); size = xfs_attr_leaf_entsize_remote(name_rmt->namelen); } - return(size); + return size; } /* @@ -2399,140 +2434,21 @@ xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index) * a "local" or a "remote" attribute. */ int -xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local) +xfs_attr_leaf_newentsize( + struct xfs_da_args *args, + int *local) { - int size; + int size; - size = xfs_attr_leaf_entsize_local(namelen, valuelen); - if (size < xfs_attr_leaf_entsize_local_max(blocksize)) { - if (local) { + size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen); + if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) { + if (local) *local = 1; - } - } else { - size = xfs_attr_leaf_entsize_remote(namelen); - if (local) { - *local = 0; - } + return size; } - return(size); -} - -/* - * Copy out attribute list entries for attr_list(), for leaf attribute lists. - */ -int -xfs_attr_leaf_list_int( - struct xfs_buf *bp, - xfs_attr_list_context_t *context) -{ - attrlist_cursor_kern_t *cursor; - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - int retval, i; - - ASSERT(bp != NULL); - leaf = bp->b_addr; - cursor = context->cursor; - cursor->initted = 1; - - trace_xfs_attr_list_leaf(context); - - /* - * Re-find our place in the leaf block if this is a new syscall. - */ - if (context->resynch) { - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { - if (be32_to_cpu(entry->hashval) == cursor->hashval) { - if (cursor->offset == context->dupcnt) { - context->dupcnt = 0; - break; - } - context->dupcnt++; - } else if (be32_to_cpu(entry->hashval) > - cursor->hashval) { - context->dupcnt = 0; - break; - } - } - if (i == be16_to_cpu(leaf->hdr.count)) { - trace_xfs_attr_list_notfound(context); - return(0); - } - } else { - entry = &leaf->entries[0]; - i = 0; - } - context->resynch = 0; - - /* - * We have found our place, start copying out the new attributes. - */ - retval = 0; - for ( ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) { - if (be32_to_cpu(entry->hashval) != cursor->hashval) { - cursor->hashval = be32_to_cpu(entry->hashval); - cursor->offset = 0; - } - - if (entry->flags & XFS_ATTR_INCOMPLETE) - continue; /* skip incomplete entries */ - - if (entry->flags & XFS_ATTR_LOCAL) { - xfs_attr_leaf_name_local_t *name_loc = - xfs_attr_leaf_name_local(leaf, i); - - retval = context->put_listent(context, - entry->flags, - name_loc->nameval, - (int)name_loc->namelen, - be16_to_cpu(name_loc->valuelen), - &name_loc->nameval[name_loc->namelen]); - if (retval) - return retval; - } else { - xfs_attr_leaf_name_remote_t *name_rmt = - xfs_attr_leaf_name_remote(leaf, i); - - int valuelen = be32_to_cpu(name_rmt->valuelen); - - if (context->put_value) { - xfs_da_args_t args; - - memset((char *)&args, 0, sizeof(args)); - args.dp = context->dp; - args.whichfork = XFS_ATTR_FORK; - args.valuelen = valuelen; - args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); - args.rmtblkno = be32_to_cpu(name_rmt->valueblk); - args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); - retval = xfs_attr_rmtval_get(&args); - if (retval) - return retval; - retval = context->put_listent(context, - entry->flags, - name_rmt->name, - (int)name_rmt->namelen, - valuelen, - args.value); - kmem_free(args.value); - } else { - retval = context->put_listent(context, - entry->flags, - name_rmt->name, - (int)name_rmt->namelen, - valuelen, - NULL); - } - if (retval) - return retval; - } - if (context->seen_enough) - break; - cursor->offset++; - } - trace_xfs_attr_list_leaf_end(context); - return(retval); + if (local) + *local = 0; + return xfs_attr_leaf_entsize_remote(args->namelen); } @@ -2544,14 +2460,16 @@ xfs_attr_leaf_list_int( * Clear the INCOMPLETE flag on an entry in a leaf block. */ int -xfs_attr_leaf_clearflag(xfs_da_args_t *args) +xfs_attr3_leaf_clearflag( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_remote_t *name_rmt; - struct xfs_buf *bp; - int error; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp; + int error; #ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr; xfs_attr_leaf_name_local_t *name_loc; int namelen; char *name; @@ -2561,23 +2479,25 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) /* * Set up the operation. */ - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return(error); leaf = bp->b_addr; - ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); - ASSERT(args->index >= 0); - entry = &leaf->entries[ args->index ]; + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; ASSERT(entry->flags & XFS_ATTR_INCOMPLETE); #ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(args->index < ichdr.count); + ASSERT(args->index >= 0); + if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); namelen = name_loc->namelen; name = (char *)name_loc->nameval; } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); namelen = name_rmt->namelen; name = (char *)name_rmt->name; } @@ -2592,9 +2512,9 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) if (args->rmtblkno) { ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); - name_rmt->valuelen = cpu_to_be32(args->valuelen); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } @@ -2609,34 +2529,41 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) * Set the INCOMPLETE flag on an entry in a leaf block. */ int -xfs_attr_leaf_setflag(xfs_da_args_t *args) +xfs_attr3_leaf_setflag( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_remote_t *name_rmt; - struct xfs_buf *bp; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp; int error; +#ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr; +#endif trace_xfs_attr_leaf_setflag(args); /* * Set up the operation. */ - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return(error); leaf = bp->b_addr; - ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); +#ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(args->index < ichdr.count); ASSERT(args->index >= 0); - entry = &leaf->entries[ args->index ]; +#endif + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0); entry->flags |= XFS_ATTR_INCOMPLETE; xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); if ((entry->flags & XFS_ATTR_LOCAL) == 0) { - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = 0; name_rmt->valuelen = 0; xfs_trans_log_buf(args->trans, bp, @@ -2657,14 +2584,20 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) * Note that they could be in different blocks, or in the same block. */ int -xfs_attr_leaf_flipflags(xfs_da_args_t *args) +xfs_attr3_leaf_flipflags( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf1, *leaf2; - xfs_attr_leaf_entry_t *entry1, *entry2; - xfs_attr_leaf_name_remote_t *name_rmt; - struct xfs_buf *bp1, *bp2; + struct xfs_attr_leafblock *leaf1; + struct xfs_attr_leafblock *leaf2; + struct xfs_attr_leaf_entry *entry1; + struct xfs_attr_leaf_entry *entry2; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp1; + struct xfs_buf *bp2; int error; #ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; xfs_attr_leaf_name_local_t *name_loc; int namelen1, namelen2; char *name1, *name2; @@ -2675,7 +2608,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) /* * Read the block containing the "old" attr */ - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1); if (error) return error; @@ -2683,7 +2616,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) * Read the block containing the "new" attr, if it is different */ if (args->blkno2 != args->blkno) { - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2, + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2, -1, &bp2); if (error) return error; @@ -2692,31 +2625,35 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) } leaf1 = bp1->b_addr; - ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); - ASSERT(args->index >= 0); - entry1 = &leaf1->entries[ args->index ]; + entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index]; leaf2 = bp2->b_addr; - ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); - ASSERT(args->index2 >= 0); - entry2 = &leaf2->entries[ args->index2 ]; + entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2]; #ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); + ASSERT(args->index < ichdr1.count); + ASSERT(args->index >= 0); + + xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); + ASSERT(args->index2 < ichdr2.count); + ASSERT(args->index2 >= 0); + if (entry1->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf1, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf1, args->index); namelen1 = name_loc->namelen; name1 = (char *)name_loc->nameval; } else { - name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); namelen1 = name_rmt->namelen; name1 = (char *)name_rmt->name; } if (entry2->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf2, args->index2); + name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2); namelen2 = name_loc->namelen; name2 = (char *)name_loc->nameval; } else { - name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2); + name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2); namelen2 = name_rmt->namelen; name2 = (char *)name_rmt->name; } @@ -2733,9 +2670,9 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1))); if (args->rmtblkno) { ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); - name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); - name_rmt->valuelen = cpu_to_be32(args->valuelen); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); xfs_trans_log_buf(args->trans, bp1, XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); } @@ -2744,7 +2681,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) xfs_trans_log_buf(args->trans, bp2, XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2))); if ((entry2->flags & XFS_ATTR_LOCAL) == 0) { - name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2); + name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2); name_rmt->valueblk = 0; name_rmt->valuelen = 0; xfs_trans_log_buf(args->trans, bp2, @@ -2756,319 +2693,5 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) */ error = xfs_trans_roll(&args->trans, args->dp); - return(error); -} - -/*======================================================================== - * Indiscriminately delete the entire attribute fork - *========================================================================*/ - -/* - * Recurse (gasp!) through the attribute nodes until we find leaves. - * We're doing a depth-first traversal in order to invalidate everything. - */ -int -xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) -{ - xfs_da_blkinfo_t *info; - xfs_daddr_t blkno; - struct xfs_buf *bp; - int error; - - /* - * Read block 0 to see what we have to work with. - * We only get here if we have extents, since we remove - * the extents in reverse order the extent containing - * block 0 must still be there. - */ - error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); - if (error) - return(error); - blkno = XFS_BUF_ADDR(bp); - - /* - * Invalidate the tree, even if the "tree" is only a single leaf block. - * This is a depth-first traversal! - */ - info = bp->b_addr; - if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { - error = xfs_attr_node_inactive(trans, dp, bp, 1); - } else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) { - error = xfs_attr_leaf_inactive(trans, dp, bp); - } else { - error = XFS_ERROR(EIO); - xfs_trans_brelse(*trans, bp); - } - if (error) - return(error); - - /* - * Invalidate the incore copy of the root block. - */ - error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); - if (error) - return(error); - xfs_trans_binval(*trans, bp); /* remove from cache */ - /* - * Commit the invalidate and start the next transaction. - */ - error = xfs_trans_roll(trans, dp); - - return (error); -} - -/* - * Recurse (gasp!) through the attribute nodes until we find leaves. - * We're doing a depth-first traversal in order to invalidate everything. - */ -STATIC int -xfs_attr_node_inactive( - struct xfs_trans **trans, - struct xfs_inode *dp, - struct xfs_buf *bp, - int level) -{ - xfs_da_blkinfo_t *info; - xfs_da_intnode_t *node; - xfs_dablk_t child_fsb; - xfs_daddr_t parent_blkno, child_blkno; - int error, count, i; - struct xfs_buf *child_bp; - - /* - * Since this code is recursive (gasp!) we must protect ourselves. - */ - if (level > XFS_DA_NODE_MAXDEPTH) { - xfs_trans_brelse(*trans, bp); /* no locks for later trans */ - return(XFS_ERROR(EIO)); - } - - node = bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - parent_blkno = XFS_BUF_ADDR(bp); /* save for re-read later */ - count = be16_to_cpu(node->hdr.count); - if (!count) { - xfs_trans_brelse(*trans, bp); - return(0); - } - child_fsb = be32_to_cpu(node->btree[0].before); - xfs_trans_brelse(*trans, bp); /* no locks for later trans */ - - /* - * If this is the node level just above the leaves, simply loop - * over the leaves removing all of them. If this is higher up - * in the tree, recurse downward. - */ - for (i = 0; i < count; i++) { - /* - * Read the subsidiary block to see what we have to work with. - * Don't do this in a transaction. This is a depth-first - * traversal of the tree so we may deal with many blocks - * before we come back to this one. - */ - error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp, - XFS_ATTR_FORK); - if (error) - return(error); - if (child_bp) { - /* save for re-read later */ - child_blkno = XFS_BUF_ADDR(child_bp); - - /* - * Invalidate the subtree, however we have to. - */ - info = child_bp->b_addr; - if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { - error = xfs_attr_node_inactive(trans, dp, - child_bp, level+1); - } else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) { - error = xfs_attr_leaf_inactive(trans, dp, - child_bp); - } else { - error = XFS_ERROR(EIO); - xfs_trans_brelse(*trans, child_bp); - } - if (error) - return(error); - - /* - * Remove the subsidiary block from the cache - * and from the log. - */ - error = xfs_da_get_buf(*trans, dp, 0, child_blkno, - &child_bp, XFS_ATTR_FORK); - if (error) - return(error); - xfs_trans_binval(*trans, child_bp); - } - - /* - * If we're not done, re-read the parent to get the next - * child block number. - */ - if ((i+1) < count) { - error = xfs_da_node_read(*trans, dp, 0, parent_blkno, - &bp, XFS_ATTR_FORK); - if (error) - return(error); - child_fsb = be32_to_cpu(node->btree[i+1].before); - xfs_trans_brelse(*trans, bp); - } - /* - * Atomically commit the whole invalidate stuff. - */ - error = xfs_trans_roll(trans, dp); - if (error) - return (error); - } - - return(0); -} - -/* - * Invalidate all of the "remote" value regions pointed to by a particular - * leaf block. - * Note that we must release the lock on the buffer so that we are not - * caught holding something that the logging code wants to flush to disk. - */ -STATIC int -xfs_attr_leaf_inactive( - struct xfs_trans **trans, - struct xfs_inode *dp, - struct xfs_buf *bp) -{ - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_remote_t *name_rmt; - xfs_attr_inactive_list_t *list, *lp; - int error, count, size, tmp, i; - - leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - - /* - * Count the number of "remote" value extents. - */ - count = 0; - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr_leaf_name_remote(leaf, i); - if (name_rmt->valueblk) - count++; - } - } - - /* - * If there are no "remote" values, we're done. - */ - if (count == 0) { - xfs_trans_brelse(*trans, bp); - return(0); - } - - /* - * Allocate storage for a list of all the "remote" value extents. - */ - size = count * sizeof(xfs_attr_inactive_list_t); - list = (xfs_attr_inactive_list_t *)kmem_alloc(size, KM_SLEEP); - - /* - * Identify each of the "remote" value extents. - */ - lp = list; - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr_leaf_name_remote(leaf, i); - if (name_rmt->valueblk) { - lp->valueblk = be32_to_cpu(name_rmt->valueblk); - lp->valuelen = XFS_B_TO_FSB(dp->i_mount, - be32_to_cpu(name_rmt->valuelen)); - lp++; - } - } - } - xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ - - /* - * Invalidate each of the "remote" value extents. - */ - error = 0; - for (lp = list, i = 0; i < count; i++, lp++) { - tmp = xfs_attr_leaf_freextent(trans, dp, - lp->valueblk, lp->valuelen); - - if (error == 0) - error = tmp; /* save only the 1st errno */ - } - - kmem_free((xfs_caddr_t)list); - return(error); -} - -/* - * Look at all the extents for this logical region, - * invalidate any buffers that are incore/in transactions. - */ -STATIC int -xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, - xfs_dablk_t blkno, int blkcnt) -{ - xfs_bmbt_irec_t map; - xfs_dablk_t tblkno; - int tblkcnt, dblkcnt, nmap, error; - xfs_daddr_t dblkno; - xfs_buf_t *bp; - - /* - * Roll through the "value", invalidating the attribute value's - * blocks. - */ - tblkno = blkno; - tblkcnt = blkcnt; - while (tblkcnt > 0) { - /* - * Try to remember where we decided to put the value. - */ - nmap = 1; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, - &map, &nmap, XFS_BMAPI_ATTRFORK); - if (error) { - return(error); - } - ASSERT(nmap == 1); - ASSERT(map.br_startblock != DELAYSTARTBLOCK); - - /* - * If it's a hole, these are already unmapped - * so there's nothing to invalidate. - */ - if (map.br_startblock != HOLESTARTBLOCK) { - - dblkno = XFS_FSB_TO_DADDR(dp->i_mount, - map.br_startblock); - dblkcnt = XFS_FSB_TO_BB(dp->i_mount, - map.br_blockcount); - bp = xfs_trans_get_buf(*trans, - dp->i_mount->m_ddev_targp, - dblkno, dblkcnt, 0); - if (!bp) - return ENOMEM; - xfs_trans_binval(*trans, bp); - /* - * Roll to next transaction. - */ - error = xfs_trans_roll(trans, dp); - if (error) - return (error); - } - - tblkno += map.br_blockcount; - tblkcnt -= map.br_blockcount; - } - - return(0); + return error; } diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 77de139a58f..e2929da7c3b 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -18,16 +19,6 @@ #ifndef __XFS_ATTR_LEAF_H__ #define __XFS_ATTR_LEAF_H__ -/* - * Attribute storage layout, internal structure, access macros, etc. - * - * Attribute lists are structured around Btrees where all the data - * elements are in the leaf nodes. Attribute names are hashed into an int, - * then that int is used as the index into the Btree. Since the hashval - * of an attribute name may not be unique, we may have duplicate keys. The - * internal links in the Btree are logical block offsets into the file. - */ - struct attrlist; struct attrlist_cursor_kern; struct xfs_attr_list_context; @@ -37,160 +28,6 @@ struct xfs_da_state_blk; struct xfs_inode; struct xfs_trans; -/*======================================================================== - * Attribute structure when equal to XFS_LBSIZE(mp) bytes. - *========================================================================*/ - -/* - * This is the structure of the leaf nodes in the Btree. - * - * Struct leaf_entry's are packed from the top. Name/values grow from the - * bottom but are not packed. The freemap contains run-length-encoded entries - * for the free bytes after the leaf_entry's, but only the N largest such, - * smaller runs are dropped. When the freemap doesn't show enough space - * for an allocation, we compact the name/value area and try again. If we - * still don't have enough space, then we have to split the block. The - * name/value structs (both local and remote versions) must be 32bit aligned. - * - * Since we have duplicate hash keys, for each key that matches, compare - * the actual name string. The root and intermediate node search always - * takes the first-in-the-block key match found, so we should only have - * to work "forw"ard. If none matches, continue with the "forw"ard leaf - * nodes until the hash key changes or the attribute name is found. - * - * We store the fact that an attribute is a ROOT/USER/SECURE attribute in - * the leaf_entry. The namespaces are independent only because we also look - * at the namespace bit when we are looking for a matching attribute name. - * - * We also store an "incomplete" bit in the leaf_entry. It shows that an - * attribute is in the middle of being created and should not be shown to - * the user if we crash during the time that the bit is set. We clear the - * bit when we have finished setting up the attribute. We do this because - * we cannot create some large attributes inside a single transaction, and we - * need some indication that we weren't finished if we crash in the middle. - */ -#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ - -typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ - __be16 base; /* base of free region */ - __be16 size; /* length of free region */ -} xfs_attr_leaf_map_t; - -typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */ - xfs_da_blkinfo_t info; /* block type, links, etc. */ - __be16 count; /* count of active leaf_entry's */ - __be16 usedbytes; /* num bytes of names/values stored */ - __be16 firstused; /* first used byte in name area */ - __u8 holes; /* != 0 if blk needs compaction */ - __u8 pad1; - xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE]; - /* N largest free regions */ -} xfs_attr_leaf_hdr_t; - -typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */ - __be32 hashval; /* hash value of name */ - __be16 nameidx; /* index into buffer of name/value */ - __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */ - __u8 pad2; /* unused pad byte */ -} xfs_attr_leaf_entry_t; - -typedef struct xfs_attr_leaf_name_local { - __be16 valuelen; /* number of bytes in value */ - __u8 namelen; /* length of name bytes */ - __u8 nameval[1]; /* name/value bytes */ -} xfs_attr_leaf_name_local_t; - -typedef struct xfs_attr_leaf_name_remote { - __be32 valueblk; /* block number of value bytes */ - __be32 valuelen; /* number of bytes in value */ - __u8 namelen; /* length of name bytes */ - __u8 name[1]; /* name bytes */ -} xfs_attr_leaf_name_remote_t; - -typedef struct xfs_attr_leafblock { - xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */ - xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */ - xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */ - xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */ -} xfs_attr_leafblock_t; - -/* - * Flags used in the leaf_entry[i].flags field. - * NOTE: the INCOMPLETE bit must not collide with the flags bits specified - * on the system call, they are "or"ed together for various operations. - */ -#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ -#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ -#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ -#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ -#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT) -#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) -#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) -#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) - -/* - * Conversion macros for converting namespace bits from argument flags - * to ondisk flags. - */ -#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE) -#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) -#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK) -#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK) -#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\ - ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0)) -#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\ - ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0)) - -/* - * Alignment for namelist and valuelist entries (since they are mixed - * there can be only one alignment value) - */ -#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t)) - -/* - * Cast typed pointers for "local" and "remote" name/value structs. - */ -static inline xfs_attr_leaf_name_remote_t * -xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx) -{ - return (xfs_attr_leaf_name_remote_t *) - &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; -} - -static inline xfs_attr_leaf_name_local_t * -xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) -{ - return (xfs_attr_leaf_name_local_t *) - &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; -} - -static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx) -{ - return &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; -} - -/* - * Calculate total bytes used (including trailing pad for alignment) for - * a "local" name/value structure, a "remote" name/value structure, and - * a pointer which might be either. - */ -static inline int xfs_attr_leaf_entsize_remote(int nlen) -{ - return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \ - XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); -} - -static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen) -{ - return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + - XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); -} - -static inline int xfs_attr_leaf_entsize_local_max(int bsize) -{ - return (((bsize) >> 1) + ((bsize) >> 2)); -} - /* * Used to keep a list of "remote value" extents when unlinking an inode. */ @@ -221,37 +58,37 @@ int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); /* * Internal routines when attribute fork size == XFS_LBSIZE(mp). */ -int xfs_attr_leaf_to_node(struct xfs_da_args *args); -int xfs_attr_leaf_to_shortform(struct xfs_buf *bp, +int xfs_attr3_leaf_to_node(struct xfs_da_args *args); +int xfs_attr3_leaf_to_shortform(struct xfs_buf *bp, struct xfs_da_args *args, int forkoff); -int xfs_attr_leaf_clearflag(struct xfs_da_args *args); -int xfs_attr_leaf_setflag(struct xfs_da_args *args); -int xfs_attr_leaf_flipflags(xfs_da_args_t *args); +int xfs_attr3_leaf_clearflag(struct xfs_da_args *args); +int xfs_attr3_leaf_setflag(struct xfs_da_args *args); +int xfs_attr3_leaf_flipflags(struct xfs_da_args *args); /* * Routines used for growing the Btree. */ -int xfs_attr_leaf_split(struct xfs_da_state *state, +int xfs_attr3_leaf_split(struct xfs_da_state *state, struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); -int xfs_attr_leaf_lookup_int(struct xfs_buf *leaf, +int xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf, struct xfs_da_args *args); -int xfs_attr_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args); -int xfs_attr_leaf_add(struct xfs_buf *leaf_buffer, +int xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args); +int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr_leaf_remove(struct xfs_buf *leaf_buffer, +int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr_leaf_list_int(struct xfs_buf *bp, +int xfs_attr3_leaf_list_int(struct xfs_buf *bp, struct xfs_attr_list_context *context); /* * Routines used for shrinking the Btree. */ -int xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval); -void xfs_attr_leaf_unbalance(struct xfs_da_state *state, +int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval); +void xfs_attr3_leaf_unbalance(struct xfs_da_state *state, struct xfs_da_state_blk *drop_blk, struct xfs_da_state_blk *save_blk); -int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); +int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); /* * Utility routines. @@ -259,12 +96,13 @@ int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count); int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp); -int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, - int *local); -int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, +int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local); +int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); - -extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops; +void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to, + struct xfs_attr_leafblock *from); +void xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to, + struct xfs_attr3_icleaf_hdr *from); #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c new file mode 100644 index 00000000000..90e2eeb2120 --- /dev/null +++ b/fs/xfs/xfs_attr_list.c @@ -0,0 +1,653 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_sf.h" +#include "xfs_attr_remote.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" +#include "xfs_dinode.h" +#include "xfs_dir2.h" + +STATIC int +xfs_attr_shortform_compare(const void *a, const void *b) +{ + xfs_attr_sf_sort_t *sa, *sb; + + sa = (xfs_attr_sf_sort_t *)a; + sb = (xfs_attr_sf_sort_t *)b; + if (sa->hash < sb->hash) { + return(-1); + } else if (sa->hash > sb->hash) { + return(1); + } else { + return(sa->entno - sb->entno); + } +} + +#define XFS_ISRESET_CURSOR(cursor) \ + (!((cursor)->initted) && !((cursor)->hashval) && \ + !((cursor)->blkno) && !((cursor)->offset)) +/* + * Copy out entries of shortform attribute lists for attr_list(). + * Shortform attribute lists are not stored in hashval sorted order. + * If the output buffer is not large enough to hold them all, then we + * we have to calculate each entries' hashvalue and sort them before + * we can begin returning them to the user. + */ +int +xfs_attr_shortform_list(xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_sf_sort_t *sbuf, *sbp; + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + xfs_inode_t *dp; + int sbsize, nsbuf, count, i; + int error; + + ASSERT(context != NULL); + dp = context->dp; + ASSERT(dp != NULL); + ASSERT(dp->i_afp != NULL); + sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + ASSERT(sf != NULL); + if (!sf->hdr.count) + return(0); + cursor = context->cursor; + ASSERT(cursor != NULL); + + trace_xfs_attr_list_sf(context); + + /* + * If the buffer is large enough and the cursor is at the start, + * do not bother with sorting since we will return everything in + * one buffer and another call using the cursor won't need to be + * made. + * Note the generous fudge factor of 16 overhead bytes per entry. + * If bufsize is zero then put_listent must be a search function + * and can just scan through what we have. + */ + if (context->bufsize == 0 || + (XFS_ISRESET_CURSOR(cursor) && + (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { + for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + error = context->put_listent(context, + sfe->flags, + sfe->nameval, + (int)sfe->namelen, + (int)sfe->valuelen, + &sfe->nameval[sfe->namelen]); + + /* + * Either search callback finished early or + * didn't fit it all in the buffer after all. + */ + if (context->seen_enough) + break; + + if (error) + return error; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + } + trace_xfs_attr_list_sf_all(context); + return(0); + } + + /* do no more for a search callback */ + if (context->bufsize == 0) + return 0; + + /* + * It didn't all fit, so we have to sort everything on hashval. + */ + sbsize = sf->hdr.count * sizeof(*sbuf); + sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); + + /* + * Scan the attribute list for the rest of the entries, storing + * the relevant info from only those that match into a buffer. + */ + nsbuf = 0; + for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + if (unlikely( + ((char *)sfe < (char *)sf) || + ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) { + XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, sfe); + kmem_free(sbuf); + return XFS_ERROR(EFSCORRUPTED); + } + + sbp->entno = i; + sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen); + sbp->name = sfe->nameval; + sbp->namelen = sfe->namelen; + /* These are bytes, and both on-disk, don't endian-flip */ + sbp->valuelen = sfe->valuelen; + sbp->flags = sfe->flags; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sbp++; + nsbuf++; + } + + /* + * Sort the entries on hash then entno. + */ + xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare); + + /* + * Re-find our place IN THE SORTED LIST. + */ + count = 0; + cursor->initted = 1; + cursor->blkno = 0; + for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) { + if (sbp->hash == cursor->hashval) { + if (cursor->offset == count) { + break; + } + count++; + } else if (sbp->hash > cursor->hashval) { + break; + } + } + if (i == nsbuf) { + kmem_free(sbuf); + return(0); + } + + /* + * Loop putting entries into the user buffer. + */ + for ( ; i < nsbuf; i++, sbp++) { + if (cursor->hashval != sbp->hash) { + cursor->hashval = sbp->hash; + cursor->offset = 0; + } + error = context->put_listent(context, + sbp->flags, + sbp->name, + sbp->namelen, + sbp->valuelen, + &sbp->name[sbp->namelen]); + if (error) + return error; + if (context->seen_enough) + break; + cursor->offset++; + } + + kmem_free(sbuf); + return(0); +} + +STATIC int +xfs_attr_node_list(xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_leafblock_t *leaf; + xfs_da_intnode_t *node; + struct xfs_attr3_icleaf_hdr leafhdr; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int error, i; + struct xfs_buf *bp; + struct xfs_inode *dp = context->dp; + + trace_xfs_attr_node_list(context); + + cursor = context->cursor; + cursor->initted = 1; + + /* + * Do all sorts of validation on the passed-in cursor structure. + * If anything is amiss, ignore the cursor and look up the hashval + * starting from the btree root. + */ + bp = NULL; + if (cursor->blkno > 0) { + error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1, + &bp, XFS_ATTR_FORK); + if ((error != 0) && (error != EFSCORRUPTED)) + return(error); + if (bp) { + struct xfs_attr_leaf_entry *entries; + + node = bp->b_addr; + switch (be16_to_cpu(node->hdr.info.magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + if (cursor->hashval > be32_to_cpu( + entries[leafhdr.count - 1].hashval)) { + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + } else if (cursor->hashval <= be32_to_cpu( + entries[0].hashval)) { + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + } + break; + default: + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + } + } + } + + /* + * We did not find what we expected given the cursor's contents, + * so we start from the top and work down based on the hash value. + * Note that start of node block is same as start of leaf block. + */ + if (bp == NULL) { + cursor->blkno = 0; + for (;;) { + __uint16_t magic; + + error = xfs_da3_node_read(NULL, dp, + cursor->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return(error); + node = bp->b_addr; + magic = be16_to_cpu(node->hdr.info.magic); + if (magic == XFS_ATTR_LEAF_MAGIC || + magic == XFS_ATTR3_LEAF_MAGIC) + break; + if (magic != XFS_DA_NODE_MAGIC && + magic != XFS_DA3_NODE_MAGIC) { + XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, + node); + xfs_trans_brelse(NULL, bp); + return XFS_ERROR(EFSCORRUPTED); + } + + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + for (i = 0; i < nodehdr.count; btree++, i++) { + if (cursor->hashval + <= be32_to_cpu(btree->hashval)) { + cursor->blkno = be32_to_cpu(btree->before); + trace_xfs_attr_list_node_descend(context, + btree); + break; + } + } + if (i == nodehdr.count) { + xfs_trans_brelse(NULL, bp); + return 0; + } + xfs_trans_brelse(NULL, bp); + } + } + ASSERT(bp != NULL); + + /* + * Roll upward through the blocks, processing each leaf block in + * order. As long as there is space in the result buffer, keep + * adding the information. + */ + for (;;) { + leaf = bp->b_addr; + error = xfs_attr3_leaf_list_int(bp, context); + if (error) { + xfs_trans_brelse(NULL, bp); + return error; + } + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + if (context->seen_enough || leafhdr.forw == 0) + break; + cursor->blkno = leafhdr.forw; + xfs_trans_brelse(NULL, bp); + error = xfs_attr3_leaf_read(NULL, dp, cursor->blkno, -1, &bp); + if (error) + return error; + } + xfs_trans_brelse(NULL, bp); + return 0; +} + +/* + * Copy out attribute list entries for attr_list(), for leaf attribute lists. + */ +int +xfs_attr3_leaf_list_int( + struct xfs_buf *bp, + struct xfs_attr_list_context *context) +{ + struct attrlist_cursor_kern *cursor; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entries; + struct xfs_attr_leaf_entry *entry; + int retval; + int i; + + trace_xfs_attr_list_leaf(context); + + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + + cursor = context->cursor; + cursor->initted = 1; + + /* + * Re-find our place in the leaf block if this is a new syscall. + */ + if (context->resynch) { + entry = &entries[0]; + for (i = 0; i < ichdr.count; entry++, i++) { + if (be32_to_cpu(entry->hashval) == cursor->hashval) { + if (cursor->offset == context->dupcnt) { + context->dupcnt = 0; + break; + } + context->dupcnt++; + } else if (be32_to_cpu(entry->hashval) > + cursor->hashval) { + context->dupcnt = 0; + break; + } + } + if (i == ichdr.count) { + trace_xfs_attr_list_notfound(context); + return 0; + } + } else { + entry = &entries[0]; + i = 0; + } + context->resynch = 0; + + /* + * We have found our place, start copying out the new attributes. + */ + retval = 0; + for (; i < ichdr.count; entry++, i++) { + if (be32_to_cpu(entry->hashval) != cursor->hashval) { + cursor->hashval = be32_to_cpu(entry->hashval); + cursor->offset = 0; + } + + if (entry->flags & XFS_ATTR_INCOMPLETE) + continue; /* skip incomplete entries */ + + if (entry->flags & XFS_ATTR_LOCAL) { + xfs_attr_leaf_name_local_t *name_loc = + xfs_attr3_leaf_name_local(leaf, i); + + retval = context->put_listent(context, + entry->flags, + name_loc->nameval, + (int)name_loc->namelen, + be16_to_cpu(name_loc->valuelen), + &name_loc->nameval[name_loc->namelen]); + if (retval) + return retval; + } else { + xfs_attr_leaf_name_remote_t *name_rmt = + xfs_attr3_leaf_name_remote(leaf, i); + + int valuelen = be32_to_cpu(name_rmt->valuelen); + + if (context->put_value) { + xfs_da_args_t args; + + memset((char *)&args, 0, sizeof(args)); + args.geo = context->dp->i_mount->m_attr_geo; + args.dp = context->dp; + args.whichfork = XFS_ATTR_FORK; + args.valuelen = valuelen; + args.rmtvaluelen = valuelen; + args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); + args.rmtblkno = be32_to_cpu(name_rmt->valueblk); + args.rmtblkcnt = xfs_attr3_rmt_blocks( + args.dp->i_mount, valuelen); + retval = xfs_attr_rmtval_get(&args); + if (retval) + return retval; + retval = context->put_listent(context, + entry->flags, + name_rmt->name, + (int)name_rmt->namelen, + valuelen, + args.value); + kmem_free(args.value); + } else { + retval = context->put_listent(context, + entry->flags, + name_rmt->name, + (int)name_rmt->namelen, + valuelen, + NULL); + } + if (retval) + return retval; + } + if (context->seen_enough) + break; + cursor->offset++; + } + trace_xfs_attr_list_leaf_end(context); + return retval; +} + +/* + * Copy out attribute entries for attr_list(), for leaf attribute lists. + */ +STATIC int +xfs_attr_leaf_list(xfs_attr_list_context_t *context) +{ + int error; + struct xfs_buf *bp; + + trace_xfs_attr_leaf_list(context); + + context->cursor->blkno = 0; + error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp); + if (error) + return XFS_ERROR(error); + + error = xfs_attr3_leaf_list_int(bp, context); + xfs_trans_brelse(NULL, bp); + return XFS_ERROR(error); +} + +int +xfs_attr_list_int( + xfs_attr_list_context_t *context) +{ + int error; + xfs_inode_t *dp = context->dp; + uint lock_mode; + + XFS_STATS_INC(xs_attr_list); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return EIO; + + /* + * Decide on what work routines to call based on the inode size. + */ + lock_mode = xfs_ilock_attr_map_shared(dp); + if (!xfs_inode_hasattr(dp)) { + error = 0; + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = xfs_attr_shortform_list(context); + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_list(context); + } else { + error = xfs_attr_node_list(context); + } + xfs_iunlock(dp, lock_mode); + return error; +} + +#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ + (((struct attrlist_ent *) 0)->a_name - (char *) 0) +#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ + ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \ + & ~(sizeof(u_int32_t)-1)) + +/* + * Format an attribute and copy it out to the user's buffer. + * Take care to check values and protect against them changing later, + * we may be reading them directly out of a user buffer. + */ +STATIC int +xfs_attr_put_listent( + xfs_attr_list_context_t *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) +{ + struct attrlist *alist = (struct attrlist *)context->alist; + attrlist_ent_t *aep; + int arraytop; + + ASSERT(!(context->flags & ATTR_KERNOVAL)); + ASSERT(context->count >= 0); + ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); + ASSERT(context->firstu >= sizeof(*alist)); + ASSERT(context->firstu <= context->bufsize); + + /* + * Only list entries in the right namespace. + */ + if (((context->flags & ATTR_SECURE) == 0) != + ((flags & XFS_ATTR_SECURE) == 0)) + return 0; + if (((context->flags & ATTR_ROOT) == 0) != + ((flags & XFS_ATTR_ROOT) == 0)) + return 0; + + arraytop = sizeof(*alist) + + context->count * sizeof(alist->al_offset[0]); + context->firstu -= ATTR_ENTSIZE(namelen); + if (context->firstu < arraytop) { + trace_xfs_attr_list_full(context); + alist->al_more = 1; + context->seen_enough = 1; + return 1; + } + + aep = (attrlist_ent_t *)&context->alist[context->firstu]; + aep->a_valuelen = valuelen; + memcpy(aep->a_name, name, namelen); + aep->a_name[namelen] = 0; + alist->al_offset[context->count++] = context->firstu; + alist->al_count = context->count; + trace_xfs_attr_list_add(context); + return 0; +} + +/* + * Generate a list of extended attribute names and optionally + * also value lengths. Positive return value follows the XFS + * convention of being an error, zero or negative return code + * is the length of the buffer returned (negated), indicating + * success. + */ +int +xfs_attr_list( + xfs_inode_t *dp, + char *buffer, + int bufsize, + int flags, + attrlist_cursor_kern_t *cursor) +{ + xfs_attr_list_context_t context; + struct attrlist *alist; + int error; + + /* + * Validate the cursor. + */ + if (cursor->pad1 || cursor->pad2) + return(XFS_ERROR(EINVAL)); + if ((cursor->initted == 0) && + (cursor->hashval || cursor->blkno || cursor->offset)) + return XFS_ERROR(EINVAL); + + /* + * Check for a properly aligned buffer. + */ + if (((long)buffer) & (sizeof(int)-1)) + return XFS_ERROR(EFAULT); + if (flags & ATTR_KERNOVAL) + bufsize = 0; + + /* + * Initialize the output buffer. + */ + memset(&context, 0, sizeof(context)); + context.dp = dp; + context.cursor = cursor; + context.resynch = 1; + context.flags = flags; + context.alist = buffer; + context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ + context.firstu = context.bufsize; + context.put_listent = xfs_attr_put_listent; + + alist = (struct attrlist *)context.alist; + alist->al_count = 0; + alist->al_more = 0; + alist->al_offset[0] = context.bufsize; + + error = xfs_attr_list_int(&context); + ASSERT(error >= 0); + return error; +} diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c new file mode 100644 index 00000000000..b5adfecbb8e --- /dev/null +++ b/fs/xfs/xfs_attr_remote.c @@ -0,0 +1,628 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_remote.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" +#include "xfs_error.h" + +#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ + +/* + * Each contiguous block has a header, so it is not just a simple attribute + * length to FSB conversion. + */ +int +xfs_attr3_rmt_blocks( + struct xfs_mount *mp, + int attrlen) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize); + return (attrlen + buflen - 1) / buflen; + } + return XFS_B_TO_FSB(mp, attrlen); +} + +/* + * Checking of the remote attribute header is split into two parts. The verifier + * does CRC, location and bounds checking, the unpacking function checks the + * attribute parameters and owner. + */ +static bool +xfs_attr3_rmt_hdr_ok( + void *ptr, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (bno != be64_to_cpu(rmt->rm_blkno)) + return false; + if (offset != be32_to_cpu(rmt->rm_offset)) + return false; + if (size != be32_to_cpu(rmt->rm_bytes)) + return false; + if (ino != be64_to_cpu(rmt->rm_owner)) + return false; + + /* ok */ + return true; +} + +static bool +xfs_attr3_rmt_verify( + struct xfs_mount *mp, + void *ptr, + int fsbsize, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC)) + return false; + if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(rmt->rm_blkno) != bno) + return false; + if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) + return false; + if (be32_to_cpu(rmt->rm_offset) + + be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX) + return false; + if (rmt->rm_owner == 0) + return false; + + return true; +} + +static void +xfs_attr3_rmt_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + char *ptr; + int len; + xfs_daddr_t bno; + int blksize = mp->m_attr_geo->blksize; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= blksize); + + while (len > 0) { + if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) { + xfs_buf_ioerror(bp, EFSBADCRC); + break; + } + if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } + len -= blksize; + ptr += blksize; + bno += BTOBB(blksize); + } + + if (bp->b_error) + xfs_verifier_error(bp); + else + ASSERT(len == 0); +} + +static void +xfs_attr3_rmt_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + char *ptr; + int len; + xfs_daddr_t bno; + int blksize = mp->m_attr_geo->blksize; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= blksize); + + while (len > 0) { + if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + if (bip) { + struct xfs_attr3_rmt_hdr *rmt; + + rmt = (struct xfs_attr3_rmt_hdr *)ptr; + rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF); + + len -= blksize; + ptr += blksize; + bno += BTOBB(blksize); + } + ASSERT(len == 0); +} + +const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { + .verify_read = xfs_attr3_rmt_read_verify, + .verify_write = xfs_attr3_rmt_write_verify, +}; + +STATIC int +xfs_attr3_rmt_hdr_set( + struct xfs_mount *mp, + void *ptr, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return 0; + + rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC); + rmt->rm_offset = cpu_to_be32(offset); + rmt->rm_bytes = cpu_to_be32(size); + uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid); + rmt->rm_owner = cpu_to_be64(ino); + rmt->rm_blkno = cpu_to_be64(bno); + + return sizeof(struct xfs_attr3_rmt_hdr); +} + +/* + * Helper functions to copy attribute data in and out of the one disk extents + */ +STATIC int +xfs_attr_rmtval_copyout( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + __uint8_t **dst) +{ + char *src = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); + int blksize = mp->m_attr_geo->blksize; + + ASSERT(len >= blksize); + + while (len > 0 && *valuelen > 0) { + int hdr_size = 0; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize); + + byte_cnt = min(*valuelen, byte_cnt); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset, + byte_cnt, bno)) { + xfs_alert(mp, +"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", + bno, *offset, byte_cnt, ino); + return EFSCORRUPTED; + } + hdr_size = sizeof(struct xfs_attr3_rmt_hdr); + } + + memcpy(*dst, src + hdr_size, byte_cnt); + + /* roll buffer forwards */ + len -= blksize; + src += blksize; + bno += BTOBB(blksize); + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *dst += byte_cnt; + *offset += byte_cnt; + } + return 0; +} + +STATIC void +xfs_attr_rmtval_copyin( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + __uint8_t **src) +{ + char *dst = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); + int blksize = mp->m_attr_geo->blksize; + + ASSERT(len >= blksize); + + while (len > 0 && *valuelen > 0) { + int hdr_size; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize); + + byte_cnt = min(*valuelen, byte_cnt); + hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset, + byte_cnt, bno); + + memcpy(dst + hdr_size, *src, byte_cnt); + + /* + * If this is the last block, zero the remainder of it. + * Check that we are actually the last block, too. + */ + if (byte_cnt + hdr_size < blksize) { + ASSERT(*valuelen - byte_cnt == 0); + ASSERT(len == blksize); + memset(dst + hdr_size + byte_cnt, 0, + blksize - hdr_size - byte_cnt); + } + + /* roll buffer forwards */ + len -= blksize; + dst += blksize; + bno += BTOBB(blksize); + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *src += byte_cnt; + *offset += byte_cnt; + } +} + +/* + * Read the value associated with an attribute from the out-of-line buffer + * that we stored it in. + */ +int +xfs_attr_rmtval_get( + struct xfs_da_args *args) +{ + struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE]; + struct xfs_mount *mp = args->dp->i_mount; + struct xfs_buf *bp; + xfs_dablk_t lblkno = args->rmtblkno; + __uint8_t *dst = args->value; + int valuelen; + int nmap; + int error; + int blkcnt = args->rmtblkcnt; + int i; + int offset = 0; + + trace_xfs_attr_rmtval_get(args); + + ASSERT(!(args->flags & ATTR_KERNOVAL)); + ASSERT(args->rmtvaluelen == args->valuelen); + + valuelen = args->rmtvaluelen; + while (valuelen > 0) { + nmap = ATTR_RMTVALUE_MAPSIZE; + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + blkcnt, map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) + return error; + ASSERT(nmap >= 1); + + for (i = 0; (i < nmap) && (valuelen > 0); i++) { + xfs_daddr_t dblkno; + int dblkcnt; + + ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && + (map[i].br_startblock != HOLESTARTBLOCK)); + dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); + dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + dblkno, dblkcnt, 0, &bp, + &xfs_attr3_rmt_buf_ops); + if (error) + return error; + + error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, + &offset, &valuelen, + &dst); + xfs_buf_relse(bp); + if (error) + return error; + + /* roll attribute extent map forwards */ + lblkno += map[i].br_blockcount; + blkcnt -= map[i].br_blockcount; + } + } + ASSERT(valuelen == 0); + return 0; +} + +/* + * Write the value associated with an attribute into the out-of-line buffer + * that we have defined for it. + */ +int +xfs_attr_rmtval_set( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_bmbt_irec map; + xfs_dablk_t lblkno; + xfs_fileoff_t lfileoff = 0; + __uint8_t *src = args->value; + int blkcnt; + int valuelen; + int nmap; + int error; + int offset = 0; + + trace_xfs_attr_rmtval_set(args); + + /* + * Find a "hole" in the attribute address space large enough for + * us to drop the new attribute's value into. Because CRC enable + * attributes have headers, we can't just do a straight byte to FSB + * conversion and have to take the header space into account. + */ + blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen); + error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, + XFS_ATTR_FORK); + if (error) + return error; + + args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; + args->rmtblkcnt = blkcnt; + + /* + * Roll through the "value", allocating blocks on disk as required. + */ + while (blkcnt > 0) { + int committed; + + /* + * Allocate a single extent, up to the size of the value. + */ + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, + blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + args->firstblock, args->total, &map, &nmap, + args->flist); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + + /* + * Start the next trans in the chain. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + return (error); + } + + /* + * Roll through the "value", copying the attribute value to the + * already-allocated blocks. Blocks are written synchronously + * so that we can know they are all on disk before we turn off + * the INCOMPLETE flag. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + valuelen = args->rmtvaluelen; + while (valuelen > 0) { + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + + ASSERT(blkcnt > 0); + + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, + blkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) + return(error); + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0); + if (!bp) + return ENOMEM; + bp->b_ops = &xfs_attr3_rmt_buf_ops; + + xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, + &valuelen, &src); + + error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ + xfs_buf_relse(bp); + if (error) + return error; + + + /* roll attribute extent map forwards */ + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + } + ASSERT(valuelen == 0); + return 0; +} + +/* + * Remove the value associated with an attribute by deleting the + * out-of-line buffer that it is stored on. + */ +int +xfs_attr_rmtval_remove( + struct xfs_da_args *args) +{ + struct xfs_mount *mp = args->dp->i_mount; + xfs_dablk_t lblkno; + int blkcnt; + int error; + int done; + + trace_xfs_attr_rmtval_remove(args); + + /* + * Roll through the "value", invalidating the attribute value's blocks. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + while (blkcnt > 0) { + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + int nmap; + + /* + * Try to remember where we decided to put the value. + */ + nmap = 1; + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); + if (error) + return(error); + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + /* + * If the "remote" value is in the cache, remove it. + */ + bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); + if (bp) { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + bp = NULL; + } + + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + } + + /* + * Keep de-allocating extents until the remote-value region is gone. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + done = 0; + while (!done) { + int committed; + + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + 1, args->firstblock, args->flist, + &done); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return error; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, args->dp, 0); + + /* + * Close out trans and start the next one in the chain. + */ + error = xfs_trans_roll(&args->trans, args->dp); + if (error) + return (error); + } + return(0); +} diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_attr_remote.h index 5eeab4690cf..5a9acfa156d 100644 --- a/fs/xfs/xfs_utils.h +++ b/fs/xfs/xfs_attr_remote.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -15,13 +15,13 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef __XFS_UTILS_H__ -#define __XFS_UTILS_H__ +#ifndef __XFS_ATTR_REMOTE_H__ +#define __XFS_ATTR_REMOTE_H__ -extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, umode_t, xfs_nlink_t, - xfs_dev_t, prid_t, int, xfs_inode_t **, int *); -extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *); -extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *); -extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *); +int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); -#endif /* __XFS_UTILS_H__ */ +int xfs_attr_rmtval_get(struct xfs_da_args *args); +int xfs_attr_rmtval_set(struct xfs_da_args *args); +int xfs_attr_rmtval_remove(struct xfs_da_args *args); + +#endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c index 48228848f5a..0e8885a5964 100644 --- a/fs/xfs/xfs_bit.c +++ b/fs/xfs/xfs_bit.c @@ -16,10 +16,8 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" +#include "xfs_log_format.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_buf_item.h" /* * XFS bit manipulation routines, used in non-realtime code. diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h index f1e3c907044..e1649c0d3e0 100644 --- a/fs/xfs/xfs_bit.h +++ b/fs/xfs/xfs_bit.h @@ -66,8 +66,11 @@ static inline int xfs_lowbit64(__uint64_t v) n = ffs(w); } else { /* upper bits */ w = (__uint32_t)(v >> 32); - if (w && (n = ffs(w))) - n += 32; + if (w) { + n = ffs(w); + if (n) + n += 32; + } } return n - 1; } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 0e92d12765d..75c3fe5f3d9 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -17,207 +17,95 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" +#include "xfs_dir2.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_mount.h" -#include "xfs_itable.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_extfree_item.h" #include "xfs_alloc.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" #include "xfs_rtalloc.h" #include "xfs_error.h" -#include "xfs_attr_leaf.h" #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_buf_item.h" -#include "xfs_filestream.h" -#include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_symlink.h" +#include "xfs_attr_leaf.h" +#include "xfs_dinode.h" +#include "xfs_filestream.h" kmem_zone_t *xfs_bmap_free_item_zone; /* - * Prototypes for internal bmap routines. - */ - -#ifdef DEBUG -STATIC void -xfs_bmap_check_leaf_extents( - struct xfs_btree_cur *cur, - struct xfs_inode *ip, - int whichfork); -#else -#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) -#endif - - -/* - * Called from xfs_bmap_add_attrfork to handle extents format files. - */ -STATIC int /* error */ -xfs_bmap_add_attrfork_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated */ - xfs_bmap_free_t *flist, /* blocks to free at commit */ - int *flags); /* inode logging flags */ - -/* - * Called from xfs_bmap_add_attrfork to handle local format files. - */ -STATIC int /* error */ -xfs_bmap_add_attrfork_local( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated */ - xfs_bmap_free_t *flist, /* blocks to free at commit */ - int *flags); /* inode logging flags */ - -/* - * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. - * It figures out where to ask the underlying allocator to put the new extent. - */ -STATIC int /* error */ -xfs_bmap_alloc( - xfs_bmalloca_t *ap); /* bmap alloc argument struct */ - -/* - * Transform a btree format file with only one leaf node, where the - * extents list will fit in the inode, into an extents format file. - * Since the file extents are already in-core, all we have to do is - * give up the space for the btree root and pitch the leaf block. - */ -STATIC int /* error */ -xfs_bmap_btree_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_btree_cur_t *cur, /* btree cursor */ - int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ - -/* - * Remove the entry "free" from the free item list. Prev points to the - * previous entry, unless "free" is the head of the list. - */ -STATIC void -xfs_bmap_del_free( - xfs_bmap_free_t *flist, /* free item list header */ - xfs_bmap_free_item_t *prev, /* previous item on list, if any */ - xfs_bmap_free_item_t *free); /* list item to be freed */ - -/* - * Convert an extents-format file into a btree-format file. - * The new file will have a root block (in the inode) and a single child block. - */ -STATIC int /* error */ -xfs_bmap_extents_to_btree( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first-block-allocated */ - xfs_bmap_free_t *flist, /* blocks freed in xaction */ - xfs_btree_cur_t **curp, /* cursor returned to caller */ - int wasdel, /* converting a delayed alloc */ - int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ - -/* - * Convert a local file to an extents file. - * This code is sort of bogus, since the file data needs to get - * logged so it won't be lost. The bmap-level manipulations are ok, though. - */ -STATIC int /* error */ -xfs_bmap_local_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated in xaction */ - xfs_extlen_t total, /* total blocks needed by transaction */ - int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ - -/* - * Search the extents list for the inode, for the extent containing bno. - * If bno lies in a hole, point to the next entry. If bno lies past eof, - * *eofp will be set, and *prevp will contain the last entry (null if none). - * Else, *lastxp will be set to the index of the found - * entry; *gotp will contain the entry. + * Miscellaneous helper functions */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_extents( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int whichfork, /* data or attr fork */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ /* - * Compute the worst-case number of indirect blocks that will be used - * for ip's delayed extent of length "len". - */ -STATIC xfs_filblks_t -xfs_bmap_worst_indlen( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_filblks_t len); /* delayed extent length */ - -#ifdef DEBUG -/* - * Perform various validation checks on the values being returned - * from xfs_bmapi(). + * Compute and fill in the value of the maximum depth of a bmap btree + * in this filesystem. Done once, during mount. */ -STATIC void -xfs_bmap_validate_ret( - xfs_fileoff_t bno, - xfs_filblks_t len, - int flags, - xfs_bmbt_irec_t *mval, - int nmap, - int ret_nmap); -#else -#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) -#endif /* DEBUG */ - -STATIC int -xfs_bmap_count_tree( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ifork_t *ifp, - xfs_fsblock_t blockno, - int levelin, - int *count); - -STATIC void -xfs_bmap_count_leaves( - xfs_ifork_t *ifp, - xfs_extnum_t idx, - int numrecs, - int *count); - -STATIC void -xfs_bmap_disk_count_leaves( - struct xfs_mount *mp, - struct xfs_btree_block *block, - int numrecs, - int *count); +void +xfs_bmap_compute_maxlevels( + xfs_mount_t *mp, /* file system mount structure */ + int whichfork) /* data or attr fork */ +{ + int level; /* btree level */ + uint maxblocks; /* max blocks at this level */ + uint maxleafents; /* max leaf entries possible */ + int maxrootrecs; /* max records in root block */ + int minleafrecs; /* min records in leaf block */ + int minnoderecs; /* min records in node block */ + int sz; /* root block size */ -/* - * Bmap internal routines. - */ + /* + * The maximum number of extents in a file, hence the maximum + * number of leaf entries, is controlled by the type of di_nextents + * (a signed 32-bit number, xfs_extnum_t), or by di_anextents + * (a signed 16-bit number, xfs_aextnum_t). + * + * Note that we can no longer assume that if we are in ATTR1 that + * the fork offset of all the inodes will be + * (xfs_default_attroffset(ip) >> 3) because we could have mounted + * with ATTR2 and then mounted back with ATTR1, keeping the + * di_forkoff's fixed but probably at various positions. Therefore, + * for both ATTR1 and ATTR2 we have to assume the worst case scenario + * of a minimum size available. + */ + if (whichfork == XFS_DATA_FORK) { + maxleafents = MAXEXTNUM; + sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + } else { + maxleafents = MAXAEXTNUM; + sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); + } + maxrootrecs = xfs_bmdr_maxrecs(sz, 0); + minleafrecs = mp->m_bmap_dmnr[0]; + minnoderecs = mp->m_bmap_dmnr[1]; + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) { + if (maxblocks <= maxrootrecs) + maxblocks = 1; + else + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + } + mp->m_bm_maxlevels[whichfork] = level; +} STATIC int /* error */ xfs_bmbt_lookup_eq( @@ -287,6 +175,834 @@ xfs_bmbt_update( } /* + * Compute the worst-case number of indirect blocks that will be used + * for ip's delayed extent of length "len". + */ +STATIC xfs_filblks_t +xfs_bmap_worst_indlen( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_filblks_t len) /* delayed extent length */ +{ + int level; /* btree level number */ + int maxrecs; /* maximum record count at this level */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t rval; /* return value */ + + mp = ip->i_mount; + maxrecs = mp->m_bmap_dmxr[0]; + for (level = 0, rval = 0; + level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); + level++) { + len += maxrecs - 1; + do_div(len, maxrecs); + rval += len; + if (len == 1) + return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - + level - 1; + if (level == 0) + maxrecs = mp->m_bmap_dmxr[1]; + } + return rval; +} + +/* + * Calculate the default attribute fork offset for newly created inodes. + */ +uint +xfs_default_attroffset( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + uint offset; + + if (mp->m_sb.sb_inodesize == 256) { + offset = XFS_LITINO(mp, ip->i_d.di_version) - + XFS_BMDR_SPACE_CALC(MINABTPTRS); + } else { + offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); + } + + ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version)); + return offset; +} + +/* + * Helper routine to reset inode di_forkoff field when switching + * attribute fork from local to extent format - we reset it where + * possible to make space available for inline data fork extents. + */ +STATIC void +xfs_bmap_forkoff_reset( + xfs_inode_t *ip, + int whichfork) +{ + if (whichfork == XFS_ATTR_FORK && + ip->i_d.di_format != XFS_DINODE_FMT_DEV && + ip->i_d.di_format != XFS_DINODE_FMT_UUID && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { + uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; + + if (dfl_forkoff > ip->i_d.di_forkoff) + ip->i_d.di_forkoff = dfl_forkoff; + } +} + +/* + * Debug/sanity checking code + */ + +STATIC int +xfs_bmap_sanity_check( + struct xfs_mount *mp, + struct xfs_buf *bp, + int level) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + + if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) && + block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC)) + return 0; + + if (be16_to_cpu(block->bb_level) != level || + be16_to_cpu(block->bb_numrecs) == 0 || + be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) + return 0; + + return 1; +} + +#ifdef DEBUG +STATIC struct xfs_buf * +xfs_bmap_get_bp( + struct xfs_btree_cur *cur, + xfs_fsblock_t bno) +{ + struct xfs_log_item_desc *lidp; + int i; + + if (!cur) + return NULL; + + for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { + if (!cur->bc_bufs[i]) + break; + if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) + return cur->bc_bufs[i]; + } + + /* Chase down all the log items to see if the bp is there */ + list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) { + struct xfs_buf_log_item *bip; + bip = (struct xfs_buf_log_item *)lidp->lid_item; + if (bip->bli_item.li_type == XFS_LI_BUF && + XFS_BUF_ADDR(bip->bli_buf) == bno) + return bip->bli_buf; + } + + return NULL; +} + +STATIC void +xfs_check_block( + struct xfs_btree_block *block, + xfs_mount_t *mp, + int root, + short sz) +{ + int i, j, dmxr; + __be64 *pp, *thispa; /* pointer to block address */ + xfs_bmbt_key_t *prevp, *keyp; + + ASSERT(be16_to_cpu(block->bb_level) > 0); + + prevp = NULL; + for( i = 1; i <= xfs_btree_get_numrecs(block); i++) { + dmxr = mp->m_bmap_dmxr[0]; + keyp = XFS_BMBT_KEY_ADDR(mp, block, i); + + if (prevp) { + ASSERT(be64_to_cpu(prevp->br_startoff) < + be64_to_cpu(keyp->br_startoff)); + } + prevp = keyp; + + /* + * Compare the block numbers to see if there are dups. + */ + if (root) + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz); + else + pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr); + + for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { + if (root) + thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz); + else + thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); + if (*thispa == *pp) { + xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", + __func__, j, i, + (unsigned long long)be64_to_cpu(*thispa)); + panic("%s: ptrs are equal in node\n", + __func__); + } + } + } +} + +/* + * Check that the extents for the inode ip are in the right order in all + * btree leaves. + */ + +STATIC void +xfs_bmap_check_leaf_extents( + xfs_btree_cur_t *cur, /* btree cursor or null */ + xfs_inode_t *ip, /* incore inode pointer */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ + int error; /* error return value */ + xfs_extnum_t i=0, j; /* index into the extents list */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + xfs_bmbt_rec_t *ep; /* pointer to current extent */ + xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ + xfs_bmbt_rec_t *nextp; /* pointer to next extent */ + int bp_release = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { + return; + } + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + block = ifp->if_broot; + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + xfs_check_block(block, mp, 1, ifp->if_broot_bytes); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. + */ + while (level-- > 0) { + /* See if buf is in cur first */ + bp_release = 0; + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (!bp) { + bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + goto error_norelse; + } + block = XFS_BUF_TO_BLOCK(bp); + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, level), + error0); + if (level == 0) + break; + + /* + * Check this block for basic sanity (increasing keys and + * no duplicate blocks). + */ + + xfs_check_block(block, mp, 0, 0); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + } + + /* + * Here with bp and block set to the leftmost leaf node in the tree. + */ + i = 0; + + /* + * Loop over all leaf nodes checking that all extents are in the right order. + */ + for (;;) { + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + + + num_recs = xfs_btree_get_numrecs(block); + + /* + * Read-ahead the next leaf block, if any. + */ + + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + + /* + * Check all the extents to make sure they are OK. + * If we had a previous block, the last entry should + * conform with the first entry in this one. + */ + + ep = XFS_BMBT_REC_ADDR(mp, block, 1); + if (i) { + ASSERT(xfs_bmbt_disk_get_startoff(&last) + + xfs_bmbt_disk_get_blockcount(&last) <= + xfs_bmbt_disk_get_startoff(ep)); + } + for (j = 1; j < num_recs; j++) { + nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1); + ASSERT(xfs_bmbt_disk_get_startoff(ep) + + xfs_bmbt_disk_get_blockcount(ep) <= + xfs_bmbt_disk_get_startoff(nextp)); + ep = nextp; + } + + last = *ep; + i += num_recs; + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + bno = nextbno; + /* + * If we've reached the end, stop. + */ + if (bno == NULLFSBLOCK) + break; + + bp_release = 0; + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (!bp) { + bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + goto error_norelse; + } + block = XFS_BUF_TO_BLOCK(bp); + } + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + return; + +error0: + xfs_warn(mp, "%s: at error0", __func__); + if (bp_release) + xfs_trans_brelse(NULL, bp); +error_norelse: + xfs_warn(mp, "%s: BAD after btree leaves for %d extents", + __func__, i); + panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); + return; +} + +/* + * Add bmap trace insert entries for all the contents of the extent records. + */ +void +xfs_bmap_trace_exlist( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t cnt, /* count of entries in the list */ + int whichfork, /* data or attr fork */ + unsigned long caller_ip) +{ + xfs_extnum_t idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int state = 0; + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + for (idx = 0; idx < cnt; idx++) + trace_xfs_extlist(ip, idx, whichfork, caller_ip); +} + +/* + * Validate that the bmbt_irecs being returned from bmapi are valid + * given the caller's original parameters. Specifically check the + * ranges of the returned irecs to ensure that they only extend beyond + * the given parameters if the XFS_BMAPI_ENTIRE flag was set. + */ +STATIC void +xfs_bmap_validate_ret( + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + xfs_bmbt_irec_t *mval, + int nmap, + int ret_nmap) +{ + int i; /* index to map values */ + + ASSERT(ret_nmap <= nmap); + + for (i = 0; i < ret_nmap; i++) { + ASSERT(mval[i].br_blockcount > 0); + if (!(flags & XFS_BMAPI_ENTIRE)) { + ASSERT(mval[i].br_startoff >= bno); + ASSERT(mval[i].br_blockcount <= len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount <= + bno + len); + } else { + ASSERT(mval[i].br_startoff < bno + len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount > + bno); + } + ASSERT(i == 0 || + mval[i - 1].br_startoff + mval[i - 1].br_blockcount == + mval[i].br_startoff); + ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && + mval[i].br_startblock != HOLESTARTBLOCK); + ASSERT(mval[i].br_state == XFS_EXT_NORM || + mval[i].br_state == XFS_EXT_UNWRITTEN); + } +} + +#else +#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) +#endif /* DEBUG */ + +/* + * bmap free list manipulation functions + */ + +/* + * Add the extent to the list of extents to be free at transaction end. + * The list is maintained sorted (by block number). + */ +void +xfs_bmap_add_free( + xfs_fsblock_t bno, /* fs block number of extent */ + xfs_filblks_t len, /* length of extent */ + xfs_bmap_free_t *flist, /* list of extents */ + xfs_mount_t *mp) /* mount point structure */ +{ + xfs_bmap_free_item_t *cur; /* current (next) element */ + xfs_bmap_free_item_t *new; /* new element */ + xfs_bmap_free_item_t *prev; /* previous element */ +#ifdef DEBUG + xfs_agnumber_t agno; + xfs_agblock_t agbno; + + ASSERT(bno != NULLFSBLOCK); + ASSERT(len > 0); + ASSERT(len <= MAXEXTLEN); + ASSERT(!isnullstartblock(bno)); + agno = XFS_FSB_TO_AGNO(mp, bno); + agbno = XFS_FSB_TO_AGBNO(mp, bno); + ASSERT(agno < mp->m_sb.sb_agcount); + ASSERT(agbno < mp->m_sb.sb_agblocks); + ASSERT(len < mp->m_sb.sb_agblocks); + ASSERT(agbno + len <= mp->m_sb.sb_agblocks); +#endif + ASSERT(xfs_bmap_free_item_zone != NULL); + new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new->xbfi_startblock = bno; + new->xbfi_blockcount = (xfs_extlen_t)len; + for (prev = NULL, cur = flist->xbf_first; + cur != NULL; + prev = cur, cur = cur->xbfi_next) { + if (cur->xbfi_startblock >= bno) + break; + } + if (prev) + prev->xbfi_next = new; + else + flist->xbf_first = new; + new->xbfi_next = cur; + flist->xbf_count++; +} + +/* + * Remove the entry "free" from the free item list. Prev points to the + * previous entry, unless "free" is the head of the list. + */ +void +xfs_bmap_del_free( + xfs_bmap_free_t *flist, /* free item list header */ + xfs_bmap_free_item_t *prev, /* previous item on list, if any */ + xfs_bmap_free_item_t *free) /* list item to be freed */ +{ + if (prev) + prev->xbfi_next = free->xbfi_next; + else + flist->xbf_first = free->xbfi_next; + flist->xbf_count--; + kmem_zone_free(xfs_bmap_free_item_zone, free); +} + +/* + * Free up any items left in the list. + */ +void +xfs_bmap_cancel( + xfs_bmap_free_t *flist) /* list of bmap_free_items */ +{ + xfs_bmap_free_item_t *free; /* free list item */ + xfs_bmap_free_item_t *next; + + if (flist->xbf_count == 0) + return; + ASSERT(flist->xbf_first != NULL); + for (free = flist->xbf_first; free; free = next) { + next = free->xbfi_next; + xfs_bmap_del_free(flist, NULL, free); + } + ASSERT(flist->xbf_count == 0); +} + +/* + * Inode fork format manipulation functions + */ + +/* + * Transform a btree format file with only one leaf node, where the + * extents list will fit in the inode, into an extents format file. + * Since the file extents are already in-core, all we have to do is + * give up the space for the btree root and pitch the leaf block. + */ +STATIC int /* error */ +xfs_bmap_btree_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_btree_cur_t *cur, /* btree cursor */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + /* REFERENCED */ + struct xfs_btree_block *cblock;/* child btree block */ + xfs_fsblock_t cbno; /* child block number */ + xfs_buf_t *cbp; /* child block's buffer */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork data */ + xfs_mount_t *mp; /* mount point structure */ + __be64 *pp; /* ptr to block address */ + struct xfs_btree_block *rblock;/* root btree block */ + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + rblock = ifp->if_broot; + ASSERT(be16_to_cpu(rblock->bb_level) == 1); + ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); + ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); + cbno = be64_to_cpu(*pp); + *logflagsp = 0; +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, cbno, 1))) + return error; +#endif + error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + cblock = XFS_BUF_TO_BLOCK(cbp); + if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) + return error; + xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); + ip->i_d.di_nblocks--; + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); + xfs_trans_binval(tp, cbp); + if (cur->bc_bufs[0] == cbp) + cur->bc_bufs[0] = NULL; + xfs_iroot_realloc(ip, -1, whichfork); + ASSERT(ifp->if_broot == NULL); + ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + return 0; +} + +/* + * Convert an extents-format file into a btree-format file. + * The new file will have a root block (in the inode) and a single child block. + */ +STATIC int /* error */ +xfs_bmap_extents_to_btree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first-block-allocated */ + xfs_bmap_free_t *flist, /* blocks freed in xaction */ + xfs_btree_cur_t **curp, /* cursor returned to caller */ + int wasdel, /* converting a delayed alloc */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *ablock; /* allocated (child) bt block */ + xfs_buf_t *abp; /* buffer for ablock */ + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_bmbt_rec_t *arp; /* child record pointer */ + struct xfs_btree_block *block; /* btree root block */ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + int error; /* error return value */ + xfs_extnum_t i, cnt; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_key_t *kp; /* root block key pointer */ + xfs_mount_t *mp; /* mount structure */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_bmbt_ptr_t *pp; /* root block address pointer */ + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); + + /* + * Make space in the inode incore. + */ + xfs_iroot_realloc(ip, 1, whichfork); + ifp->if_flags |= XFS_IFBROOT; + + /* + * Fill in the root. + */ + block = ifp->if_broot; + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, + XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, + XFS_BMAP_MAGIC, 1, 1, ip->i_ino, + XFS_BTREE_LONG_PTRS); + + /* + * Need a cursor. Can't allocate until bb_level is filled in. + */ + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + /* + * Convert to a btree with two levels, one record in root. + */ + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + memset(&args, 0, sizeof(args)); + args.tp = tp; + args.mp = mp; + args.firstblock = *firstblock; + if (*firstblock == NULLFSBLOCK) { + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); + } else if (flist->xbf_low) { + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = *firstblock; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.fsbno = *firstblock; + } + args.minlen = args.maxlen = args.prod = 1; + args.wasdel = wasdel; + *logflagsp = 0; + if ((error = xfs_alloc_vextent(&args))) { + xfs_iroot_realloc(ip, -1, whichfork); + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + /* + * Allocation can't fail, the space was reserved. + */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(*firstblock == NULLFSBLOCK || + args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || + (flist->xbf_low && + args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); + *firstblock = cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + ip->i_d.di_nblocks++; + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); + abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); + /* + * Fill in the child block. + */ + abp->b_ops = &xfs_bmbt_buf_ops; + ablock = XFS_BUF_TO_BLOCK(abp); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, ablock, abp->b_bn, + XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, ablock, abp->b_bn, + XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS); + + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (cnt = i = 0; i < nextents; i++) { + ep = xfs_iext_get_ext(ifp, i); + if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { + arp->l0 = cpu_to_be64(ep->l0); + arp->l1 = cpu_to_be64(ep->l1); + arp++; cnt++; + } + } + ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); + xfs_btree_set_numrecs(ablock, cnt); + + /* + * Fill in the root key and pointer. + */ + kp = XFS_BMBT_KEY_ADDR(mp, block, 1); + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); + kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur, + be16_to_cpu(block->bb_level))); + *pp = cpu_to_be64(args.fsbno); + + /* + * Do all this logging at the end so that + * the root is at the right level. + */ + xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS); + xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); + ASSERT(*curp == NULL); + *curp = cur; + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); + return 0; +} + +/* + * Convert a local file to an extents file. + * This code is out of bounds for data forks of regular files, + * since the file data needs to get logged so things will stay consistent. + * (The bmap-level manipulations are ok, though). + */ +void +xfs_bmap_local_to_extents_empty( + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_bytes == 0); + ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); + + xfs_bmap_forkoff_reset(ip, whichfork); + ifp->if_flags &= ~XFS_IFINLINE; + ifp->if_flags |= XFS_IFEXTENTS; + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); +} + + +STATIC int /* error */ +xfs_bmap_local_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated in xaction */ + xfs_extlen_t total, /* total blocks needed by transaction */ + int *logflagsp, /* inode logging flags */ + int whichfork, + void (*init_fn)(struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp)) +{ + int error = 0; + int flags; /* logging flags returned */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_buf_t *bp; /* buffer for extent block */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + + /* + * We don't want to deal with the case of keeping inode data inline yet. + * So sending the data fork of a regular inode is invalid. + */ + ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + + if (!ifp->if_bytes) { + xfs_bmap_local_to_extents_empty(ip, whichfork); + flags = XFS_ILOG_CORE; + goto done; + } + + flags = 0; + error = 0; + ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == + XFS_IFINLINE); + memset(&args, 0, sizeof(args)); + args.tp = tp; + args.mp = ip->i_mount; + args.firstblock = *firstblock; + /* + * Allocate a block. We know we need only one, since the + * file currently fits in an inode. + */ + if (*firstblock == NULLFSBLOCK) { + args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); + args.type = XFS_ALLOCTYPE_START_BNO; + } else { + args.fsbno = *firstblock; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + } + args.total = total; + args.minlen = args.maxlen = args.prod = 1; + error = xfs_alloc_vextent(&args); + if (error) + goto done; + + /* Can't fail, the space was reserved. */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(args.len == 1); + *firstblock = args.fsbno; + bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); + + /* initialise the block and copy the data */ + init_fn(tp, bp, ip, ifp); + + /* account for the change in fork size and log everything */ + xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); + xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); + xfs_bmap_local_to_extents_empty(ip, whichfork); + flags |= XFS_ILOG_CORE; + + xfs_iext_add(ifp, 0, 1); + ep = xfs_iext_get_ext(ifp, 0); + xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); + trace_xfs_bmap_post_update(ip, 0, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0, + _THIS_IP_); + XFS_IFORK_NEXT_SET(ip, whichfork, 1); + ip->i_d.di_nblocks = 1; + xfs_trans_mod_dquot_byino(tp, ip, + XFS_TRANS_DQ_BCOUNT, 1L); + flags |= xfs_ilog_fext(whichfork); + +done: + *logflagsp = flags; + return error; +} + +/* * Called from xfs_bmap_add_attrfork to handle btree format files. */ STATIC int /* error */ @@ -357,7 +1073,15 @@ xfs_bmap_add_attrfork_extents( } /* - * Called from xfs_bmap_add_attrfork to handle local format files. + * Called from xfs_bmap_add_attrfork to handle local format files. Each + * different data fork content type needs a different callout to do the + * conversion. Some are basic and only require special block initialisation + * callouts for the data formating, others (directories) are so specialised they + * handle everything themselves. + * + * XXX (dgc): investigate whether directory conversion can use the generic + * formatting callout. It should be possible - it's just a very complex + * formatter. */ STATIC int /* error */ xfs_bmap_add_attrfork_local( @@ -368,27 +1092,651 @@ xfs_bmap_add_attrfork_local( int *flags) /* inode logging flags */ { xfs_da_args_t dargs; /* args for dir/attr code */ - int error; /* error return value */ - xfs_mount_t *mp; /* mount structure pointer */ if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) return 0; + if (S_ISDIR(ip->i_d.di_mode)) { - mp = ip->i_mount; memset(&dargs, 0, sizeof(dargs)); + dargs.geo = ip->i_mount->m_dir_geo; dargs.dp = ip; dargs.firstblock = firstblock; dargs.flist = flist; - dargs.total = mp->m_dirblkfsbs; + dargs.total = dargs.geo->fsbcount; dargs.whichfork = XFS_DATA_FORK; dargs.trans = tp; - error = xfs_dir2_sf_to_block(&dargs); - } else - error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, - XFS_DATA_FORK); + return xfs_dir2_sf_to_block(&dargs); + } + + if (S_ISLNK(ip->i_d.di_mode)) + return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, + flags, XFS_DATA_FORK, + xfs_symlink_local_to_remote); + + /* should only be called for types that support local format data */ + ASSERT(0); + return EFSCORRUPTED; +} + +/* + * Convert inode from non-attributed to attributed. + * Must not be in a transaction, ip must not be locked. + */ +int /* error code */ +xfs_bmap_add_attrfork( + xfs_inode_t *ip, /* incore inode pointer */ + int size, /* space new attribute needs */ + int rsvd) /* xact may use reserved blks */ +{ + xfs_fsblock_t firstblock; /* 1st block/ag allocated */ + xfs_bmap_free_t flist; /* freed extent records */ + xfs_mount_t *mp; /* mount structure */ + xfs_trans_t *tp; /* transaction pointer */ + int blks; /* space reservation */ + int version = 1; /* superblock attr version */ + int committed; /* xaction was committed */ + int logflags; /* logging flags */ + int error; /* error return value */ + int cancel_flags = 0; + + ASSERT(XFS_IFORK_Q(ip) == 0); + + mp = ip->i_mount; + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK); + blks = XFS_ADDAFORK_SPACE_RES(mp); + if (rsvd) + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? + XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : + XFS_QMOPT_RES_REGBLKS); + if (error) + goto trans_cancel; + cancel_flags |= XFS_TRANS_ABORT; + if (XFS_IFORK_Q(ip)) + goto trans_cancel; + if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { + /* + * For inodes coming from pre-6.2 filesystems. + */ + ASSERT(ip->i_d.di_aformat == 0); + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + } + ASSERT(ip->i_d.di_anextents == 0); + + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_DEV: + ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; + break; + case XFS_DINODE_FMT_UUID: + ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3; + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); + if (!ip->i_d.di_forkoff) + ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; + else if (mp->m_flags & XFS_MOUNT_ATTR2) + version = 2; + break; + default: + ASSERT(0); + error = XFS_ERROR(EINVAL); + goto trans_cancel; + } + + ASSERT(ip->i_afp == NULL); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); + ip->i_afp->if_flags = XFS_IFEXTENTS; + logflags = 0; + xfs_bmap_init(&flist, &firstblock); + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_LOCAL: + error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, + &logflags); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock, + &flist, &logflags); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist, + &logflags); + break; + default: + error = 0; + break; + } + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + if (error) + goto bmap_cancel; + if (!xfs_sb_version_hasattr(&mp->m_sb) || + (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { + __int64_t sbfields = 0; + + spin_lock(&mp->m_sb_lock); + if (!xfs_sb_version_hasattr(&mp->m_sb)) { + xfs_sb_version_addattr(&mp->m_sb); + sbfields |= XFS_SB_VERSIONNUM; + } + if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { + xfs_sb_version_addattr2(&mp->m_sb); + sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); + } + if (sbfields) { + spin_unlock(&mp->m_sb_lock); + xfs_mod_sb(tp, sbfields); + } else + spin_unlock(&mp->m_sb_lock); + } + + error = xfs_bmap_finish(&tp, &flist, &committed); + if (error) + goto bmap_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; + +bmap_cancel: + xfs_bmap_cancel(&flist); +trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * Internal and external extent tree search functions. + */ + +/* + * Read in the extents to if_extents. + * All inode fields are set up by caller, we just traverse the btree + * and copy the records in. If the file system cannot contain unwritten + * extents, the records are checked for no "state" flags. + */ +int /* error */ +xfs_bmap_read_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ + int error; /* error return value */ + xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ + xfs_extnum_t i, j; /* index into the extents list */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + /* REFERENCED */ + xfs_extnum_t room; /* number of entries there's room for */ + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : + XFS_EXTFMT_INODE(ip); + block = ifp->if_broot; + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. + */ + while (level-- > 0) { + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (error) + return error; + block = XFS_BUF_TO_BLOCK(bp); + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, level), + error0); + if (level == 0) + break; + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + xfs_trans_brelse(tp, bp); + } + /* + * Here with bp and block set to the leftmost leaf node in the tree. + */ + room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + i = 0; + /* + * Loop over all leaf nodes. Copy information to the extent records. + */ + for (;;) { + xfs_bmbt_rec_t *frp; + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + xfs_extnum_t start; + + num_recs = xfs_btree_get_numrecs(block); + if (unlikely(i + num_recs > room)) { + ASSERT(i + num_recs <= room); + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, (btree extents).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)", + XFS_ERRLEVEL_LOW, ip->i_mount, block); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, 0), + error0); + /* + * Read-ahead the next leaf block, if any. + */ + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + if (nextbno != NULLFSBLOCK) + xfs_btree_reada_bufl(mp, nextbno, 1, + &xfs_bmbt_buf_ops); + /* + * Copy records into the extent records. + */ + frp = XFS_BMBT_REC_ADDR(mp, block, 1); + start = i; + for (j = 0; j < num_recs; j++, i++, frp++) { + xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); + trp->l0 = be64_to_cpu(frp->l0); + trp->l1 = be64_to_cpu(frp->l1); + } + if (exntf == XFS_EXTFMT_NOSTATE) { + /* + * Check all attribute bmap btree records and + * any "older" data bmap btree records for a + * set bit in the "extent flag" position. + */ + if (unlikely(xfs_check_nostate_extents(ifp, + start, num_recs))) { + XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", + XFS_ERRLEVEL_LOW, + ip->i_mount); + goto error0; + } + } + xfs_trans_brelse(tp, bp); + bno = nextbno; + /* + * If we've reached the end, stop. + */ + if (bno == NULLFSBLOCK) + break; + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (error) + return error; + block = XFS_BUF_TO_BLOCK(bp); + } + ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); + XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); + return 0; +error0: + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EFSCORRUPTED); } + +/* + * Search the extent records for the entry containing block bno. + * If bno lies in a hole, point to the next entry. If bno lies + * past eof, *eofp will be set, and *prevp will contain the last + * entry (null if none). Else, *lastxp will be set to the index + * of the found entry; *gotp will contain the entry. + */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmap_search_multi_extents( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + xfs_extnum_t lastx; /* last extent index */ + + /* + * Initialize the extent entry structure to catch access to + * uninitialized br_startblock field. + */ + gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; + gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; + gotp->br_state = XFS_EXT_INVALID; +#if XFS_BIG_BLKNOS + gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; +#else + gotp->br_startblock = 0xffffa5a5; +#endif + prevp->br_startoff = NULLFILEOFF; + + ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); + if (lastx > 0) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); + } + if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { + xfs_bmbt_get_all(ep, gotp); + *eofp = 0; + } else { + if (lastx > 0) { + *gotp = *prevp; + } + *eofp = 1; + ep = NULL; + } + *lastxp = lastx; + return ep; +} + +/* + * Search the extents list for the inode, for the extent containing bno. + * If bno lies in a hole, point to the next entry. If bno lies past eof, + * *eofp will be set, and *prevp will contain the last entry (null if none). + * Else, *lastxp will be set to the index of the found + * entry; *gotp will contain the entry. + */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmap_search_extents( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int fork, /* data or attr fork */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + + XFS_STATS_INC(xs_look_exlist); + ifp = XFS_IFORK_PTR(ip, fork); + + ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); + + if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && + !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x lastx: %x", + (unsigned long long)ip->i_ino, + (unsigned long long)gotp->br_startblock, + (unsigned long long)gotp->br_startoff, + (unsigned long long)gotp->br_blockcount, + gotp->br_state, *lastxp); + *lastxp = NULLEXTNUM; + *eofp = 1; + return NULL; + } + return ep; +} + +/* + * Returns the file-relative block number of the first unused block(s) + * in the file with at least "len" logically contiguous blocks free. + * This is the lowest-address hole if the file has holes, else the first block + * past the end of file. + * Return 0 if the file is currently local (in-inode). + */ +int /* error */ +xfs_bmap_first_unused( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_extlen_t len, /* size of hole to find */ + xfs_fileoff_t *first_unused, /* unused block */ + int whichfork) /* data or attr fork */ +{ + int error; /* error return value */ + int idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_fileoff_t lastaddr; /* last block number seen */ + xfs_fileoff_t lowest; /* lowest useful block */ + xfs_fileoff_t max; /* starting useful block */ + xfs_fileoff_t off; /* offset for this block */ + xfs_extnum_t nextents; /* number of extent entries */ + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *first_unused = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + lowest = *first_unused; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); + off = xfs_bmbt_get_startoff(ep); + /* + * See if the hole before this extent will work. + */ + if (off >= lowest + len && off - max >= len) { + *first_unused = max; + return 0; + } + lastaddr = off + xfs_bmbt_get_blockcount(ep); + max = XFS_FILEOFF_MAX(lastaddr, lowest); + } + *first_unused = max; + return 0; +} + +/* + * Returns the file-relative block number of the last block - 1 before + * last_block (input value) in the file. + * This is not based on i_size, it is based on the extent records. + * Returns 0 for local files, as they do not have extent records. + */ +int /* error */ +xfs_bmap_last_before( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork) /* data or attr fork */ +{ + xfs_fileoff_t bno; /* input file offset */ + int eof; /* hit end of file */ + xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ + int error; /* error return value */ + xfs_bmbt_irec_t got; /* current extent value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last extent used */ + xfs_bmbt_irec_t prev; /* previous extent value */ + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EIO); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *last_block = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + bno = *last_block - 1; + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + if (eof || xfs_bmbt_get_startoff(ep) > bno) { + if (prev.br_startoff == NULLFILEOFF) + *last_block = 0; + else + *last_block = prev.br_startoff + prev.br_blockcount; + } + /* + * Otherwise *last_block is already the right answer. + */ + return 0; +} + +int +xfs_bmap_last_extent( + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *rec, + int *is_empty) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int error; + int nextents; + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *is_empty = 1; + return 0; + } + + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec); + *is_empty = 0; + return 0; +} + +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. + * + * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be + * at, or past the EOF. + */ +STATIC int +xfs_bmap_isaeof( + struct xfs_bmalloca *bma, + int whichfork) +{ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + bma->aeof = 0; + error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, + &is_empty); + if (error) + return error; + + if (is_empty) { + bma->aeof = 1; + return 0; + } + + /* + * Check if we are allocation or past the last extent, or at least into + * the last delayed allocated extent. + */ + bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount || + (bma->offset >= rec.br_startoff && + isnullstartblock(rec.br_startblock)); + return 0; +} + +/* + * Returns the file-relative block number of the first block past eof in + * the file. This is not based on i_size, it is based on the extent records. + * Returns 0 for local files, as they do not have extent records. + */ +int +xfs_bmap_last_offset( + struct xfs_inode *ip, + xfs_fileoff_t *last_block, + int whichfork) +{ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + *last_block = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) + return 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + return XFS_ERROR(EIO); + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); + if (error || is_empty) + return error; + + *last_block = rec.br_startoff + rec.br_blockcount; + return 0; +} + +/* + * Returns whether the selected fork of the inode has exactly one + * block or not. For the data fork we check this matches di_size, + * implying the file's range is 0..bsize-1. + */ +int /* 1=>1 block, 0=>otherwise */ +xfs_bmap_one_block( + xfs_inode_t *ip, /* incore inode */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int rval; /* return value */ + xfs_bmbt_irec_t s; /* internal version of extent */ + +#ifndef DEBUG + if (whichfork == XFS_DATA_FORK) + return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; +#endif /* !DEBUG */ + if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) + return 0; + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + return 0; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ep = xfs_iext_get_ext(ifp, 0); + xfs_bmbt_get_all(ep, &s); + rval = s.br_startoff == 0 && s.br_blockcount == 1; + if (rval && whichfork == XFS_DATA_FORK) + ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); + return rval; +} + +/* + * Extent tree manipulation functions used during allocation. + */ + /* * Convert a delayed allocation to a real allocation. */ @@ -1852,9 +3200,13 @@ done: } /* + * Functions used in the extent read, allocate and remove paths + */ + +/* * Adjust the size of the new extent based on di_extsize and rt extsize. */ -STATIC int +int xfs_bmap_extsize_align( xfs_mount_t *mp, xfs_bmbt_irec_t *gotp, /* next extent pointer */ @@ -2016,9 +3368,9 @@ xfs_bmap_extsize_align( #define XFS_ALLOC_GAP_UNITS 4 -STATIC void +void xfs_bmap_adjacent( - xfs_bmalloca_t *ap) /* bmap alloc argument struct */ + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { xfs_fsblock_t adjust; /* adjustment to block numbers */ xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ @@ -2164,107 +3516,65 @@ xfs_bmap_adjacent( #undef ISVALID } -STATIC int -xfs_bmap_rtalloc( - xfs_bmalloca_t *ap) /* bmap alloc argument struct */ +static int +xfs_bmap_longest_free_extent( + struct xfs_trans *tp, + xfs_agnumber_t ag, + xfs_extlen_t *blen, + int *notinit) { - xfs_alloctype_t atype = 0; /* type for allocation routines */ - int error; /* error return value */ - xfs_mount_t *mp; /* mount point structure */ - xfs_extlen_t prod = 0; /* product factor for allocators */ - xfs_extlen_t ralen = 0; /* realtime allocation length */ - xfs_extlen_t align; /* minimum allocation alignment */ - xfs_rtblock_t rtb; - - mp = ap->ip->i_mount; - align = xfs_get_extsz_hint(ap->ip); - prod = align / mp->m_sb.sb_rextsize; - error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, - align, 1, ap->eof, 0, - ap->conv, &ap->offset, &ap->length); - if (error) - return error; - ASSERT(ap->length); - ASSERT(ap->length % mp->m_sb.sb_rextsize == 0); - - /* - * If the offset & length are not perfectly aligned - * then kill prod, it will just get us in trouble. - */ - if (do_mod(ap->offset, align) || ap->length % align) - prod = 1; - /* - * Set ralen to be the actual requested length in rtextents. - */ - ralen = ap->length / mp->m_sb.sb_rextsize; - /* - * If the old value was close enough to MAXEXTLEN that - * we rounded up to it, cut it back so it's valid again. - * Note that if it's a really large request (bigger than - * MAXEXTLEN), we don't hear about that number, and can't - * adjust the starting point to match it. - */ - if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) - ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; - - /* - * Lock out other modifications to the RT bitmap inode. - */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); - - /* - * If it's an allocation to an empty file at offset 0, - * pick an extent that will space things out in the rt area. - */ - if (ap->eof && ap->offset == 0) { - xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; + xfs_extlen_t longest; + int error = 0; - error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); + pag = xfs_perag_get(mp, ag); + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK); if (error) - return error; - ap->blkno = rtx * mp->m_sb.sb_rextsize; - } else { - ap->blkno = 0; + goto out; + + if (!pag->pagf_init) { + *notinit = 1; + goto out; + } } - xfs_bmap_adjacent(ap); + longest = xfs_alloc_longest_free_extent(mp, pag); + if (*blen < longest) + *blen = longest; - /* - * Realtime allocation, done through xfs_rtallocate_extent. - */ - atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; - do_div(ap->blkno, mp->m_sb.sb_rextsize); - rtb = ap->blkno; - ap->length = ralen; - if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length, - &ralen, atype, ap->wasdel, prod, &rtb))) - return error; - if (rtb == NULLFSBLOCK && prod > 1 && - (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, - ap->length, &ralen, atype, - ap->wasdel, 1, &rtb))) - return error; - ap->blkno = rtb; - if (ap->blkno != NULLFSBLOCK) { - ap->blkno *= mp->m_sb.sb_rextsize; - ralen *= mp->m_sb.sb_rextsize; - ap->length = ralen; - ap->ip->i_d.di_nblocks += ralen; - xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); - if (ap->wasdel) - ap->ip->i_delayed_blks -= ralen; +out: + xfs_perag_put(pag); + return error; +} + +static void +xfs_bmap_select_minlen( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen, + int notinit) +{ + if (notinit || *blen < ap->minlen) { /* - * Adjust the disk quota also. This was reserved - * earlier. + * Since we did a BUF_TRYLOCK above, it is possible that + * there is space for this request. */ - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, - ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : - XFS_TRANS_DQ_RTBCOUNT, (long) ralen); + args->minlen = ap->minlen; + } else if (*blen < args->maxlen) { + /* + * If the best seen length is less than the request length, + * use the best as the minimum. + */ + args->minlen = *blen; } else { - ap->length = 0; + /* + * Otherwise we've seen an extent as big as maxlen, use that + * as the minimum. + */ + args->minlen = args->maxlen; } - return 0; } STATIC int @@ -2274,117 +3584,80 @@ xfs_bmap_btalloc_nullfb( xfs_extlen_t *blen) { struct xfs_mount *mp = ap->ip->i_mount; - struct xfs_perag *pag; xfs_agnumber_t ag, startag; int notinit = 0; int error; - if (ap->userdata && xfs_inode_is_filestream(ap->ip)) - args->type = XFS_ALLOCTYPE_NEAR_BNO; - else - args->type = XFS_ALLOCTYPE_START_BNO; + args->type = XFS_ALLOCTYPE_START_BNO; args->total = ap->total; - /* - * Search for an allocation group with a single extent large enough - * for the request. If one isn't found, then adjust the minimum - * allocation size to the largest space found. - */ startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno); if (startag == NULLAGNUMBER) startag = ag = 0; - pag = xfs_perag_get(mp, ag); while (*blen < args->maxlen) { - if (!pag->pagf_init) { - error = xfs_alloc_pagf_init(mp, args->tp, ag, - XFS_ALLOC_FLAG_TRYLOCK); - if (error) { - xfs_perag_put(pag); - return error; - } - } - - /* - * See xfs_alloc_fix_freelist... - */ - if (pag->pagf_init) { - xfs_extlen_t longest; - longest = xfs_alloc_longest_free_extent(mp, pag); - if (*blen < longest) - *blen = longest; - } else - notinit = 1; - - if (xfs_inode_is_filestream(ap->ip)) { - if (*blen >= args->maxlen) - break; - - if (ap->userdata) { - /* - * If startag is an invalid AG, we've - * come here once before and - * xfs_filestream_new_ag picked the - * best currently available. - * - * Don't continue looping, since we - * could loop forever. - */ - if (startag == NULLAGNUMBER) - break; - - error = xfs_filestream_new_ag(ap, &ag); - xfs_perag_put(pag); - if (error) - return error; + error = xfs_bmap_longest_free_extent(args->tp, ag, blen, + ¬init); + if (error) + return error; - /* loop again to set 'blen'*/ - startag = NULLAGNUMBER; - pag = xfs_perag_get(mp, ag); - continue; - } - } if (++ag == mp->m_sb.sb_agcount) ag = 0; if (ag == startag) break; - xfs_perag_put(pag); - pag = xfs_perag_get(mp, ag); } - xfs_perag_put(pag); - /* - * Since the above loop did a BUF_TRYLOCK, it is - * possible that there is space for this request. - */ - if (notinit || *blen < ap->minlen) - args->minlen = ap->minlen; - /* - * If the best seen length is less than the request - * length, use the best as the minimum. - */ - else if (*blen < args->maxlen) - args->minlen = *blen; - /* - * Otherwise we've seen an extent as big as maxlen, - * use that as the minimum. - */ - else - args->minlen = args->maxlen; + xfs_bmap_select_minlen(ap, args, blen, notinit); + return 0; +} + +STATIC int +xfs_bmap_btalloc_filestreams( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen) +{ + struct xfs_mount *mp = ap->ip->i_mount; + xfs_agnumber_t ag; + int notinit = 0; + int error; + + args->type = XFS_ALLOCTYPE_NEAR_BNO; + args->total = ap->total; + + ag = XFS_FSB_TO_AGNO(mp, args->fsbno); + if (ag == NULLAGNUMBER) + ag = 0; + + error = xfs_bmap_longest_free_extent(args->tp, ag, blen, ¬init); + if (error) + return error; + + if (*blen < args->maxlen) { + error = xfs_filestream_new_ag(ap, &ag); + if (error) + return error; + + error = xfs_bmap_longest_free_extent(args->tp, ag, blen, + ¬init); + if (error) + return error; + + } + + xfs_bmap_select_minlen(ap, args, blen, notinit); /* - * set the failure fallback case to look in the selected - * AG as the stream may have moved. + * Set the failure fallback case to look in the selected AG as stream + * may have moved. */ - if (xfs_inode_is_filestream(ap->ip)) - ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); - + ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); return 0; } STATIC int xfs_bmap_btalloc( - xfs_bmalloca_t *ap) /* bmap alloc argument struct */ + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { xfs_mount_t *mp; /* mount point structure */ xfs_alloctype_t atype = 0; /* type for allocation routines */ @@ -2398,10 +3671,19 @@ xfs_bmap_btalloc( int isaligned; int tryagain; int error; + int stripe_align; ASSERT(ap->length); mp = ap->ip->i_mount; + + /* stripe alignment for allocation is determined by mount parameters */ + stripe_align = 0; + if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + stripe_align = mp->m_swidth; + else if (mp->m_dalign) + stripe_align = mp->m_dalign; + align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; if (unlikely(align)) { error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, @@ -2410,6 +3692,8 @@ xfs_bmap_btalloc( ASSERT(!error); ASSERT(ap->length); } + + nullfb = *ap->firstblock == NULLFSBLOCK; fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); if (nullfb) { @@ -2447,7 +3731,15 @@ xfs_bmap_btalloc( args.firstblock = *ap->firstblock; blen = 0; if (nullfb) { - error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); + /* + * Search for an allocation group with a single extent large + * enough for the request. If one isn't found, then adjust + * the minimum allocation size to the largest space found. + */ + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) + error = xfs_bmap_btalloc_filestreams(ap, &args, &blen); + else + error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); if (error) return error; } else if (ap->flist->xbf_low) { @@ -2485,7 +3777,7 @@ xfs_bmap_btalloc( */ if (!ap->flist->xbf_low && ap->aeof) { if (!ap->offset) { - args.alignment = mp->m_dalign; + args.alignment = stripe_align; atype = args.type; isaligned = 1; /* @@ -2510,13 +3802,13 @@ xfs_bmap_btalloc( * of minlen+alignment+slop doesn't go up * between the calls. */ - if (blen > mp->m_dalign && blen <= args.maxlen) - nextminlen = blen - mp->m_dalign; + if (blen > stripe_align && blen <= args.maxlen) + nextminlen = blen - stripe_align; else nextminlen = args.minlen; - if (nextminlen + mp->m_dalign > args.minlen + 1) + if (nextminlen + stripe_align > args.minlen + 1) args.minalignslop = - nextminlen + mp->m_dalign - + nextminlen + stripe_align - args.minlen - 1; else args.minalignslop = 0; @@ -2538,7 +3830,7 @@ xfs_bmap_btalloc( */ args.type = atype; args.fsbno = ap->blkno; - args.alignment = mp->m_dalign; + args.alignment = stripe_align; args.minlen = nextminlen; args.minalignslop = 0; isaligned = 1; @@ -2616,7 +3908,7 @@ xfs_bmap_btalloc( */ STATIC int xfs_bmap_alloc( - xfs_bmalloca_t *ap) /* bmap alloc argument struct */ + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata) return xfs_bmap_rtalloc(ap); @@ -2624,1626 +3916,6 @@ xfs_bmap_alloc( } /* - * Transform a btree format file with only one leaf node, where the - * extents list will fit in the inode, into an extents format file. - * Since the file extents are already in-core, all we have to do is - * give up the space for the btree root and pitch the leaf block. - */ -STATIC int /* error */ -xfs_bmap_btree_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_btree_cur_t *cur, /* btree cursor */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - /* REFERENCED */ - struct xfs_btree_block *cblock;/* child btree block */ - xfs_fsblock_t cbno; /* child block number */ - xfs_buf_t *cbp; /* child block's buffer */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork data */ - xfs_mount_t *mp; /* mount point structure */ - __be64 *pp; /* ptr to block address */ - struct xfs_btree_block *rblock;/* root btree block */ - - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); - rblock = ifp->if_broot; - ASSERT(be16_to_cpu(rblock->bb_level) == 1); - ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); - ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); - cbno = be64_to_cpu(*pp); - *logflagsp = 0; -#ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, cbno, 1))) - return error; -#endif - error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - cblock = XFS_BUF_TO_BLOCK(cbp); - if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) - return error; - xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); - ip->i_d.di_nblocks--; - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); - xfs_trans_binval(tp, cbp); - if (cur->bc_bufs[0] == cbp) - cur->bc_bufs[0] = NULL; - xfs_iroot_realloc(ip, -1, whichfork); - ASSERT(ifp->if_broot == NULL); - ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); - *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); - return 0; -} - -/* - * Called by xfs_bmapi to update file extent records and the btree - * after removing space (or undoing a delayed allocation). - */ -STATIC int /* error */ -xfs_bmap_del_extent( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_trans_t *tp, /* current transaction pointer */ - xfs_extnum_t *idx, /* extent number to update/delete */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - xfs_btree_cur_t *cur, /* if null, not a btree */ - xfs_bmbt_irec_t *del, /* data to remove from extents */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ - xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ - xfs_fsblock_t del_endblock=0; /* first block past del */ - xfs_fileoff_t del_endoff; /* first offset past del */ - int delay; /* current block is delayed allocated */ - int do_fx; /* free extent at end of routine */ - xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */ - int error; /* error return value */ - int flags; /* inode logging flags */ - xfs_bmbt_irec_t got; /* current extent entry */ - xfs_fileoff_t got_endoff; /* first offset past got */ - int i; /* temp state */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_mount_t *mp; /* mount structure */ - xfs_filblks_t nblks; /* quota/sb block count */ - xfs_bmbt_irec_t new; /* new record to be inserted */ - /* REFERENCED */ - uint qfield; /* quota field to update */ - xfs_filblks_t temp; /* for indirect length calculations */ - xfs_filblks_t temp2; /* for indirect length calculations */ - int state = 0; - - XFS_STATS_INC(xs_del_exlist); - - if (whichfork == XFS_ATTR_FORK) - state |= BMAP_ATTRFORK; - - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t))); - ASSERT(del->br_blockcount > 0); - ep = xfs_iext_get_ext(ifp, *idx); - xfs_bmbt_get_all(ep, &got); - ASSERT(got.br_startoff <= del->br_startoff); - del_endoff = del->br_startoff + del->br_blockcount; - got_endoff = got.br_startoff + got.br_blockcount; - ASSERT(got_endoff >= del_endoff); - delay = isnullstartblock(got.br_startblock); - ASSERT(isnullstartblock(del->br_startblock) == delay); - flags = 0; - qfield = 0; - error = 0; - /* - * If deleting a real allocation, must free up the disk space. - */ - if (!delay) { - flags = XFS_ILOG_CORE; - /* - * Realtime allocation. Free it and record di_nblocks update. - */ - if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { - xfs_fsblock_t bno; - xfs_filblks_t len; - - ASSERT(do_mod(del->br_blockcount, - mp->m_sb.sb_rextsize) == 0); - ASSERT(do_mod(del->br_startblock, - mp->m_sb.sb_rextsize) == 0); - bno = del->br_startblock; - len = del->br_blockcount; - do_div(bno, mp->m_sb.sb_rextsize); - do_div(len, mp->m_sb.sb_rextsize); - error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); - if (error) - goto done; - do_fx = 0; - nblks = len * mp->m_sb.sb_rextsize; - qfield = XFS_TRANS_DQ_RTBCOUNT; - } - /* - * Ordinary allocation. - */ - else { - do_fx = 1; - nblks = del->br_blockcount; - qfield = XFS_TRANS_DQ_BCOUNT; - } - /* - * Set up del_endblock and cur for later. - */ - del_endblock = del->br_startblock + del->br_blockcount; - if (cur) { - if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff, - got.br_startblock, got.br_blockcount, - &i))) - goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); - } - da_old = da_new = 0; - } else { - da_old = startblockval(got.br_startblock); - da_new = 0; - nblks = 0; - do_fx = 0; - } - /* - * Set flag value to use in switch statement. - * Left-contig is 2, right-contig is 1. - */ - switch (((got.br_startoff == del->br_startoff) << 1) | - (got_endoff == del_endoff)) { - case 3: - /* - * Matches the whole extent. Delete the entry. - */ - xfs_iext_remove(ip, *idx, 1, - whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); - --*idx; - if (delay) - break; - - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); - flags |= XFS_ILOG_CORE; - if (!cur) { - flags |= xfs_ilog_fext(whichfork); - break; - } - if ((error = xfs_btree_delete(cur, &i))) - goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); - break; - - case 2: - /* - * Deleting the first part of the extent. - */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_startoff(ep, del_endoff); - temp = got.br_blockcount - del->br_blockcount; - xfs_bmbt_set_blockcount(ep, temp); - if (delay) { - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - da_old); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - da_new = temp; - break; - } - xfs_bmbt_set_startblock(ep, del_endblock); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - if (!cur) { - flags |= xfs_ilog_fext(whichfork); - break; - } - if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, - got.br_blockcount - del->br_blockcount, - got.br_state))) - goto done; - break; - - case 1: - /* - * Deleting the last part of the extent. - */ - temp = got.br_blockcount - del->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - if (delay) { - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - da_old); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - da_new = temp; - break; - } - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - if (!cur) { - flags |= xfs_ilog_fext(whichfork); - break; - } - if ((error = xfs_bmbt_update(cur, got.br_startoff, - got.br_startblock, - got.br_blockcount - del->br_blockcount, - got.br_state))) - goto done; - break; - - case 0: - /* - * Deleting the middle of the extent. - */ - temp = del->br_startoff - got.br_startoff; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - new.br_startoff = del_endoff; - temp2 = got_endoff - del_endoff; - new.br_blockcount = temp2; - new.br_state = got.br_state; - if (!delay) { - new.br_startblock = del_endblock; - flags |= XFS_ILOG_CORE; - if (cur) { - if ((error = xfs_bmbt_update(cur, - got.br_startoff, - got.br_startblock, temp, - got.br_state))) - goto done; - if ((error = xfs_btree_increment(cur, 0, &i))) - goto done; - cur->bc_rec.b = new; - error = xfs_btree_insert(cur, &i); - if (error && error != ENOSPC) - goto done; - /* - * If get no-space back from btree insert, - * it tried a split, and we have a zero - * block reservation. - * Fix up our state and return the error. - */ - if (error == ENOSPC) { - /* - * Reset the cursor, don't trust - * it after any insert operation. - */ - if ((error = xfs_bmbt_lookup_eq(cur, - got.br_startoff, - got.br_startblock, - temp, &i))) - goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); - /* - * Update the btree record back - * to the original value. - */ - if ((error = xfs_bmbt_update(cur, - got.br_startoff, - got.br_startblock, - got.br_blockcount, - got.br_state))) - goto done; - /* - * Reset the extent record back - * to the original value. - */ - xfs_bmbt_set_blockcount(ep, - got.br_blockcount); - flags = 0; - error = XFS_ERROR(ENOSPC); - goto done; - } - XFS_WANT_CORRUPTED_GOTO(i == 1, done); - } else - flags |= xfs_ilog_fext(whichfork); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); - } else { - ASSERT(whichfork == XFS_DATA_FORK); - temp = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - temp2 = xfs_bmap_worst_indlen(ip, temp2); - new.br_startblock = nullstartblock((int)temp2); - da_new = temp + temp2; - while (da_new > da_old) { - if (temp) { - temp--; - da_new--; - xfs_bmbt_set_startblock(ep, - nullstartblock((int)temp)); - } - if (da_new == da_old) - break; - if (temp2) { - temp2--; - da_new--; - new.br_startblock = - nullstartblock((int)temp2); - } - } - } - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_insert(ip, *idx + 1, 1, &new, state); - ++*idx; - break; - } - /* - * If we need to, add to list of extents to delete. - */ - if (do_fx) - xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, - mp); - /* - * Adjust inode # blocks in the file. - */ - if (nblks) - ip->i_d.di_nblocks -= nblks; - /* - * Adjust quota data. - */ - if (qfield) - xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); - - /* - * Account for change in delayed indirect blocks. - * Nothing to do for disk quota accounting here. - */ - ASSERT(da_old >= da_new); - if (da_old > da_new) { - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - da_new), 0); - } -done: - *logflagsp = flags; - return error; -} - -/* - * Remove the entry "free" from the free item list. Prev points to the - * previous entry, unless "free" is the head of the list. - */ -STATIC void -xfs_bmap_del_free( - xfs_bmap_free_t *flist, /* free item list header */ - xfs_bmap_free_item_t *prev, /* previous item on list, if any */ - xfs_bmap_free_item_t *free) /* list item to be freed */ -{ - if (prev) - prev->xbfi_next = free->xbfi_next; - else - flist->xbf_first = free->xbfi_next; - flist->xbf_count--; - kmem_zone_free(xfs_bmap_free_item_zone, free); -} - -/* - * Convert an extents-format file into a btree-format file. - * The new file will have a root block (in the inode) and a single child block. - */ -STATIC int /* error */ -xfs_bmap_extents_to_btree( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first-block-allocated */ - xfs_bmap_free_t *flist, /* blocks freed in xaction */ - xfs_btree_cur_t **curp, /* cursor returned to caller */ - int wasdel, /* converting a delayed alloc */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - struct xfs_btree_block *ablock; /* allocated (child) bt block */ - xfs_buf_t *abp; /* buffer for ablock */ - xfs_alloc_arg_t args; /* allocation arguments */ - xfs_bmbt_rec_t *arp; /* child record pointer */ - struct xfs_btree_block *block; /* btree root block */ - xfs_btree_cur_t *cur; /* bmap btree cursor */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - int error; /* error return value */ - xfs_extnum_t i, cnt; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_key_t *kp; /* root block key pointer */ - xfs_mount_t *mp; /* mount structure */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_bmbt_ptr_t *pp; /* root block address pointer */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); - - /* - * Make space in the inode incore. - */ - xfs_iroot_realloc(ip, 1, whichfork); - ifp->if_flags |= XFS_IFBROOT; - - /* - * Fill in the root. - */ - block = ifp->if_broot; - block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); - block->bb_level = cpu_to_be16(1); - block->bb_numrecs = cpu_to_be16(1); - block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); - - /* - * Need a cursor. Can't allocate until bb_level is filled in. - */ - mp = ip->i_mount; - cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.firstblock = *firstblock; - cur->bc_private.b.flist = flist; - cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; - /* - * Convert to a btree with two levels, one record in root. - */ - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); - memset(&args, 0, sizeof(args)); - args.tp = tp; - args.mp = mp; - args.firstblock = *firstblock; - if (*firstblock == NULLFSBLOCK) { - args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); - } else if (flist->xbf_low) { - args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = *firstblock; - } else { - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.fsbno = *firstblock; - } - args.minlen = args.maxlen = args.prod = 1; - args.total = args.minleft = args.alignment = args.mod = args.isfl = - args.minalignslop = 0; - args.wasdel = wasdel; - *logflagsp = 0; - if ((error = xfs_alloc_vextent(&args))) { - xfs_iroot_realloc(ip, -1, whichfork); - xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - return error; - } - /* - * Allocation can't fail, the space was reserved. - */ - ASSERT(args.fsbno != NULLFSBLOCK); - ASSERT(*firstblock == NULLFSBLOCK || - args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || - (flist->xbf_low && - args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); - *firstblock = cur->bc_private.b.firstblock = args.fsbno; - cur->bc_private.b.allocated++; - ip->i_d.di_nblocks++; - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); - abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); - /* - * Fill in the child block. - */ - abp->b_ops = &xfs_bmbt_buf_ops; - ablock = XFS_BUF_TO_BLOCK(abp); - ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); - ablock->bb_level = 0; - ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); - arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - for (cnt = i = 0; i < nextents; i++) { - ep = xfs_iext_get_ext(ifp, i); - if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { - arp->l0 = cpu_to_be64(ep->l0); - arp->l1 = cpu_to_be64(ep->l1); - arp++; cnt++; - } - } - ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); - xfs_btree_set_numrecs(ablock, cnt); - - /* - * Fill in the root key and pointer. - */ - kp = XFS_BMBT_KEY_ADDR(mp, block, 1); - arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); - kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur, - be16_to_cpu(block->bb_level))); - *pp = cpu_to_be64(args.fsbno); - - /* - * Do all this logging at the end so that - * the root is at the right level. - */ - xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS); - xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); - ASSERT(*curp == NULL); - *curp = cur; - *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); - return 0; -} - -/* - * Calculate the default attribute fork offset for newly created inodes. - */ -uint -xfs_default_attroffset( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - uint offset; - - if (mp->m_sb.sb_inodesize == 256) { - offset = XFS_LITINO(mp) - - XFS_BMDR_SPACE_CALC(MINABTPTRS); - } else { - offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); - } - - ASSERT(offset < XFS_LITINO(mp)); - return offset; -} - -/* - * Helper routine to reset inode di_forkoff field when switching - * attribute fork from local to extent format - we reset it where - * possible to make space available for inline data fork extents. - */ -STATIC void -xfs_bmap_forkoff_reset( - xfs_mount_t *mp, - xfs_inode_t *ip, - int whichfork) -{ - if (whichfork == XFS_ATTR_FORK && - ip->i_d.di_format != XFS_DINODE_FMT_DEV && - ip->i_d.di_format != XFS_DINODE_FMT_UUID && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { - uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; - - if (dfl_forkoff > ip->i_d.di_forkoff) - ip->i_d.di_forkoff = dfl_forkoff; - } -} - -/* - * Convert a local file to an extents file. - * This code is out of bounds for data forks of regular files, - * since the file data needs to get logged so things will stay consistent. - * (The bmap-level manipulations are ok, though). - */ -STATIC int /* error */ -xfs_bmap_local_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated in xaction */ - xfs_extlen_t total, /* total blocks needed by transaction */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - int error; /* error return value */ - int flags; /* logging flags returned */ - xfs_ifork_t *ifp; /* inode fork pointer */ - - /* - * We don't want to deal with the case of keeping inode data inline yet. - * So sending the data fork of a regular inode is invalid. - */ - ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); - flags = 0; - error = 0; - if (ifp->if_bytes) { - xfs_alloc_arg_t args; /* allocation arguments */ - xfs_buf_t *bp; /* buffer for extent block */ - xfs_bmbt_rec_host_t *ep;/* extent record pointer */ - - memset(&args, 0, sizeof(args)); - args.tp = tp; - args.mp = ip->i_mount; - args.firstblock = *firstblock; - ASSERT((ifp->if_flags & - (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); - /* - * Allocate a block. We know we need only one, since the - * file currently fits in an inode. - */ - if (*firstblock == NULLFSBLOCK) { - args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); - args.type = XFS_ALLOCTYPE_START_BNO; - } else { - args.fsbno = *firstblock; - args.type = XFS_ALLOCTYPE_NEAR_BNO; - } - args.total = total; - args.mod = args.minleft = args.alignment = args.wasdel = - args.isfl = args.minalignslop = 0; - args.minlen = args.maxlen = args.prod = 1; - if ((error = xfs_alloc_vextent(&args))) - goto done; - /* - * Can't fail, the space was reserved. - */ - ASSERT(args.fsbno != NULLFSBLOCK); - ASSERT(args.len == 1); - *firstblock = args.fsbno; - bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); - bp->b_ops = &xfs_bmbt_buf_ops; - memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); - xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); - xfs_bmap_forkoff_reset(args.mp, ip, whichfork); - xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); - xfs_iext_add(ifp, 0, 1); - ep = xfs_iext_get_ext(ifp, 0); - xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); - trace_xfs_bmap_post_update(ip, 0, - whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0, - _THIS_IP_); - XFS_IFORK_NEXT_SET(ip, whichfork, 1); - ip->i_d.di_nblocks = 1; - xfs_trans_mod_dquot_byino(tp, ip, - XFS_TRANS_DQ_BCOUNT, 1L); - flags |= xfs_ilog_fext(whichfork); - } else { - ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); - xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); - } - ifp->if_flags &= ~XFS_IFINLINE; - ifp->if_flags |= XFS_IFEXTENTS; - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); - flags |= XFS_ILOG_CORE; -done: - *logflagsp = flags; - return error; -} - -/* - * Search the extent records for the entry containing block bno. - * If bno lies in a hole, point to the next entry. If bno lies - * past eof, *eofp will be set, and *prevp will contain the last - * entry (null if none). Else, *lastxp will be set to the index - * of the found entry; *gotp will contain the entry. - */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_multi_extents( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - xfs_extnum_t lastx; /* last extent index */ - - /* - * Initialize the extent entry structure to catch access to - * uninitialized br_startblock field. - */ - gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; - gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; - gotp->br_state = XFS_EXT_INVALID; -#if XFS_BIG_BLKNOS - gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; -#else - gotp->br_startblock = 0xffffa5a5; -#endif - prevp->br_startoff = NULLFILEOFF; - - ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); - if (lastx > 0) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); - } - if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { - xfs_bmbt_get_all(ep, gotp); - *eofp = 0; - } else { - if (lastx > 0) { - *gotp = *prevp; - } - *eofp = 1; - ep = NULL; - } - *lastxp = lastx; - return ep; -} - -/* - * Search the extents list for the inode, for the extent containing bno. - * If bno lies in a hole, point to the next entry. If bno lies past eof, - * *eofp will be set, and *prevp will contain the last entry (null if none). - * Else, *lastxp will be set to the index of the found - * entry; *gotp will contain the entry. - */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_extents( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int fork, /* data or attr fork */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - - XFS_STATS_INC(xs_look_exlist); - ifp = XFS_IFORK_PTR(ip, fork); - - ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); - - if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && - !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { - xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, - "Access to block zero in inode %llu " - "start_block: %llx start_off: %llx " - "blkcnt: %llx extent-state: %x lastx: %x\n", - (unsigned long long)ip->i_ino, - (unsigned long long)gotp->br_startblock, - (unsigned long long)gotp->br_startoff, - (unsigned long long)gotp->br_blockcount, - gotp->br_state, *lastxp); - *lastxp = NULLEXTNUM; - *eofp = 1; - return NULL; - } - return ep; -} - -/* - * Compute the worst-case number of indirect blocks that will be used - * for ip's delayed extent of length "len". - */ -STATIC xfs_filblks_t -xfs_bmap_worst_indlen( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_filblks_t len) /* delayed extent length */ -{ - int level; /* btree level number */ - int maxrecs; /* maximum record count at this level */ - xfs_mount_t *mp; /* mount structure */ - xfs_filblks_t rval; /* return value */ - - mp = ip->i_mount; - maxrecs = mp->m_bmap_dmxr[0]; - for (level = 0, rval = 0; - level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); - level++) { - len += maxrecs - 1; - do_div(len, maxrecs); - rval += len; - if (len == 1) - return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - - level - 1; - if (level == 0) - maxrecs = mp->m_bmap_dmxr[1]; - } - return rval; -} - -/* - * Convert inode from non-attributed to attributed. - * Must not be in a transaction, ip must not be locked. - */ -int /* error code */ -xfs_bmap_add_attrfork( - xfs_inode_t *ip, /* incore inode pointer */ - int size, /* space new attribute needs */ - int rsvd) /* xact may use reserved blks */ -{ - xfs_fsblock_t firstblock; /* 1st block/ag allocated */ - xfs_bmap_free_t flist; /* freed extent records */ - xfs_mount_t *mp; /* mount structure */ - xfs_trans_t *tp; /* transaction pointer */ - int blks; /* space reservation */ - int version = 1; /* superblock attr version */ - int committed; /* xaction was committed */ - int logflags; /* logging flags */ - int error; /* error return value */ - - ASSERT(XFS_IFORK_Q(ip) == 0); - - mp = ip->i_mount; - ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); - tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK); - blks = XFS_ADDAFORK_SPACE_RES(mp); - if (rsvd) - tp->t_flags |= XFS_TRANS_RESERVE; - if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT))) - goto error0; - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? - XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : - XFS_QMOPT_RES_REGBLKS); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); - return error; - } - if (XFS_IFORK_Q(ip)) - goto error1; - if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { - /* - * For inodes coming from pre-6.2 filesystems. - */ - ASSERT(ip->i_d.di_aformat == 0); - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - } - ASSERT(ip->i_d.di_anextents == 0); - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - switch (ip->i_d.di_format) { - case XFS_DINODE_FMT_DEV: - ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; - break; - case XFS_DINODE_FMT_UUID: - ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3; - break; - case XFS_DINODE_FMT_LOCAL: - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); - if (!ip->i_d.di_forkoff) - ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; - else if (mp->m_flags & XFS_MOUNT_ATTR2) - version = 2; - break; - default: - ASSERT(0); - error = XFS_ERROR(EINVAL); - goto error1; - } - - ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); - ip->i_afp->if_flags = XFS_IFEXTENTS; - logflags = 0; - xfs_bmap_init(&flist, &firstblock); - switch (ip->i_d.di_format) { - case XFS_DINODE_FMT_LOCAL: - error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, - &logflags); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock, - &flist, &logflags); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist, - &logflags); - break; - default: - error = 0; - break; - } - if (logflags) - xfs_trans_log_inode(tp, ip, logflags); - if (error) - goto error2; - if (!xfs_sb_version_hasattr(&mp->m_sb) || - (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { - __int64_t sbfields = 0; - - spin_lock(&mp->m_sb_lock); - if (!xfs_sb_version_hasattr(&mp->m_sb)) { - xfs_sb_version_addattr(&mp->m_sb); - sbfields |= XFS_SB_VERSIONNUM; - } - if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { - xfs_sb_version_addattr2(&mp->m_sb); - sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); - } - if (sbfields) { - spin_unlock(&mp->m_sb_lock); - xfs_mod_sb(tp, sbfields); - } else - spin_unlock(&mp->m_sb_lock); - } - - error = xfs_bmap_finish(&tp, &flist, &committed); - if (error) - goto error2; - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); -error2: - xfs_bmap_cancel(&flist); -error1: - xfs_iunlock(ip, XFS_ILOCK_EXCL); -error0: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - return error; -} - -/* - * Add the extent to the list of extents to be free at transaction end. - * The list is maintained sorted (by block number). - */ -/* ARGSUSED */ -void -xfs_bmap_add_free( - xfs_fsblock_t bno, /* fs block number of extent */ - xfs_filblks_t len, /* length of extent */ - xfs_bmap_free_t *flist, /* list of extents */ - xfs_mount_t *mp) /* mount point structure */ -{ - xfs_bmap_free_item_t *cur; /* current (next) element */ - xfs_bmap_free_item_t *new; /* new element */ - xfs_bmap_free_item_t *prev; /* previous element */ -#ifdef DEBUG - xfs_agnumber_t agno; - xfs_agblock_t agbno; - - ASSERT(bno != NULLFSBLOCK); - ASSERT(len > 0); - ASSERT(len <= MAXEXTLEN); - ASSERT(!isnullstartblock(bno)); - agno = XFS_FSB_TO_AGNO(mp, bno); - agbno = XFS_FSB_TO_AGBNO(mp, bno); - ASSERT(agno < mp->m_sb.sb_agcount); - ASSERT(agbno < mp->m_sb.sb_agblocks); - ASSERT(len < mp->m_sb.sb_agblocks); - ASSERT(agbno + len <= mp->m_sb.sb_agblocks); -#endif - ASSERT(xfs_bmap_free_item_zone != NULL); - new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); - new->xbfi_startblock = bno; - new->xbfi_blockcount = (xfs_extlen_t)len; - for (prev = NULL, cur = flist->xbf_first; - cur != NULL; - prev = cur, cur = cur->xbfi_next) { - if (cur->xbfi_startblock >= bno) - break; - } - if (prev) - prev->xbfi_next = new; - else - flist->xbf_first = new; - new->xbfi_next = cur; - flist->xbf_count++; -} - -/* - * Compute and fill in the value of the maximum depth of a bmap btree - * in this filesystem. Done once, during mount. - */ -void -xfs_bmap_compute_maxlevels( - xfs_mount_t *mp, /* file system mount structure */ - int whichfork) /* data or attr fork */ -{ - int level; /* btree level */ - uint maxblocks; /* max blocks at this level */ - uint maxleafents; /* max leaf entries possible */ - int maxrootrecs; /* max records in root block */ - int minleafrecs; /* min records in leaf block */ - int minnoderecs; /* min records in node block */ - int sz; /* root block size */ - - /* - * The maximum number of extents in a file, hence the maximum - * number of leaf entries, is controlled by the type of di_nextents - * (a signed 32-bit number, xfs_extnum_t), or by di_anextents - * (a signed 16-bit number, xfs_aextnum_t). - * - * Note that we can no longer assume that if we are in ATTR1 that - * the fork offset of all the inodes will be - * (xfs_default_attroffset(ip) >> 3) because we could have mounted - * with ATTR2 and then mounted back with ATTR1, keeping the - * di_forkoff's fixed but probably at various positions. Therefore, - * for both ATTR1 and ATTR2 we have to assume the worst case scenario - * of a minimum size available. - */ - if (whichfork == XFS_DATA_FORK) { - maxleafents = MAXEXTNUM; - sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); - } else { - maxleafents = MAXAEXTNUM; - sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); - } - maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0); - minleafrecs = mp->m_bmap_dmnr[0]; - minnoderecs = mp->m_bmap_dmnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; - for (level = 1; maxblocks > 1; level++) { - if (maxblocks <= maxrootrecs) - maxblocks = 1; - else - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; - } - mp->m_bm_maxlevels[whichfork] = level; -} - -/* - * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi - * caller. Frees all the extents that need freeing, which must be done - * last due to locking considerations. We never free any extents in - * the first transaction. - * - * Return 1 if the given transaction was committed and a new one - * started, and 0 otherwise in the committed parameter. - */ -int /* error */ -xfs_bmap_finish( - xfs_trans_t **tp, /* transaction pointer addr */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *committed) /* xact committed or not */ -{ - xfs_efd_log_item_t *efd; /* extent free data */ - xfs_efi_log_item_t *efi; /* extent free intention */ - int error; /* error return value */ - xfs_bmap_free_item_t *free; /* free extent item */ - unsigned int logres; /* new log reservation */ - unsigned int logcount; /* new log count */ - xfs_mount_t *mp; /* filesystem mount structure */ - xfs_bmap_free_item_t *next; /* next item on free list */ - xfs_trans_t *ntp; /* new transaction pointer */ - - ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); - if (flist->xbf_count == 0) { - *committed = 0; - return 0; - } - ntp = *tp; - efi = xfs_trans_get_efi(ntp, flist->xbf_count); - for (free = flist->xbf_first; free; free = free->xbfi_next) - xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, - free->xbfi_blockcount); - logres = ntp->t_log_res; - logcount = ntp->t_log_count; - ntp = xfs_trans_dup(*tp); - error = xfs_trans_commit(*tp, 0); - *tp = ntp; - *committed = 1; - /* - * We have a new transaction, so we should return committed=1, - * even though we're returning an error. - */ - if (error) - return error; - - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(ntp->t_ticket); - - if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES, - logcount))) - return error; - efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); - for (free = flist->xbf_first; free != NULL; free = next) { - next = free->xbfi_next; - if ((error = xfs_free_extent(ntp, free->xbfi_startblock, - free->xbfi_blockcount))) { - /* - * The bmap free list will be cleaned up at a - * higher level. The EFI will be canceled when - * this transaction is aborted. - * Need to force shutdown here to make sure it - * happens, since this transaction may not be - * dirty yet. - */ - mp = ntp->t_mountp; - if (!XFS_FORCED_SHUTDOWN(mp)) - xfs_force_shutdown(mp, - (error == EFSCORRUPTED) ? - SHUTDOWN_CORRUPT_INCORE : - SHUTDOWN_META_IO_ERROR); - return error; - } - xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, - free->xbfi_blockcount); - xfs_bmap_del_free(flist, NULL, free); - } - return 0; -} - -/* - * Free up any items left in the list. - */ -void -xfs_bmap_cancel( - xfs_bmap_free_t *flist) /* list of bmap_free_items */ -{ - xfs_bmap_free_item_t *free; /* free list item */ - xfs_bmap_free_item_t *next; - - if (flist->xbf_count == 0) - return; - ASSERT(flist->xbf_first != NULL); - for (free = flist->xbf_first; free; free = next) { - next = free->xbfi_next; - xfs_bmap_del_free(flist, NULL, free); - } - ASSERT(flist->xbf_count == 0); -} - -/* - * Returns the file-relative block number of the first unused block(s) - * in the file with at least "len" logically contiguous blocks free. - * This is the lowest-address hole if the file has holes, else the first block - * past the end of file. - * Return 0 if the file is currently local (in-inode). - */ -int /* error */ -xfs_bmap_first_unused( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_extlen_t len, /* size of hole to find */ - xfs_fileoff_t *first_unused, /* unused block */ - int whichfork) /* data or attr fork */ -{ - int error; /* error return value */ - int idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_fileoff_t lastaddr; /* last block number seen */ - xfs_fileoff_t lowest; /* lowest useful block */ - xfs_fileoff_t max; /* starting useful block */ - xfs_fileoff_t off; /* offset for this block */ - xfs_extnum_t nextents; /* number of extent entries */ - - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - *first_unused = 0; - return 0; - } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - lowest = *first_unused; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); - off = xfs_bmbt_get_startoff(ep); - /* - * See if the hole before this extent will work. - */ - if (off >= lowest + len && off - max >= len) { - *first_unused = max; - return 0; - } - lastaddr = off + xfs_bmbt_get_blockcount(ep); - max = XFS_FILEOFF_MAX(lastaddr, lowest); - } - *first_unused = max; - return 0; -} - -/* - * Returns the file-relative block number of the last block + 1 before - * last_block (input value) in the file. - * This is not based on i_size, it is based on the extent records. - * Returns 0 for local files, as they do not have extent records. - */ -int /* error */ -xfs_bmap_last_before( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork) /* data or attr fork */ -{ - xfs_fileoff_t bno; /* input file offset */ - int eof; /* hit end of file */ - xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ - int error; /* error return value */ - xfs_bmbt_irec_t got; /* current extent value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last extent used */ - xfs_bmbt_irec_t prev; /* previous extent value */ - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) - return XFS_ERROR(EIO); - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - *last_block = 0; - return 0; - } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - bno = *last_block - 1; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - if (eof || xfs_bmbt_get_startoff(ep) > bno) { - if (prev.br_startoff == NULLFILEOFF) - *last_block = 0; - else - *last_block = prev.br_startoff + prev.br_blockcount; - } - /* - * Otherwise *last_block is already the right answer. - */ - return 0; -} - -STATIC int -xfs_bmap_last_extent( - struct xfs_trans *tp, - struct xfs_inode *ip, - int whichfork, - struct xfs_bmbt_irec *rec, - int *is_empty) -{ - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - int error; - int nextents; - - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } - - nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *is_empty = 1; - return 0; - } - - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec); - *is_empty = 0; - return 0; -} - -/* - * Check the last inode extent to determine whether this allocation will result - * in blocks being allocated at the end of the file. When we allocate new data - * blocks at the end of the file which do not start at the previous data block, - * we will try to align the new blocks at stripe unit boundaries. - * - * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be - * at, or past the EOF. - */ -STATIC int -xfs_bmap_isaeof( - struct xfs_bmalloca *bma, - int whichfork) -{ - struct xfs_bmbt_irec rec; - int is_empty; - int error; - - bma->aeof = 0; - error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, - &is_empty); - if (error || is_empty) - return error; - - /* - * Check if we are allocation or past the last extent, or at least into - * the last delayed allocated extent. - */ - bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount || - (bma->offset >= rec.br_startoff && - isnullstartblock(rec.br_startblock)); - return 0; -} - -/* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary. All offsets are considered outside - * the end of file for an empty fork, so 1 is returned in *eof in that case. - */ -int -xfs_bmap_eof( - struct xfs_inode *ip, - xfs_fileoff_t endoff, - int whichfork, - int *eof) -{ - struct xfs_bmbt_irec rec; - int error; - - error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof); - if (error || *eof) - return error; - - *eof = endoff >= rec.br_startoff + rec.br_blockcount; - return 0; -} - -/* - * Returns the file-relative block number of the first block past eof in - * the file. This is not based on i_size, it is based on the extent records. - * Returns 0 for local files, as they do not have extent records. - */ -int -xfs_bmap_last_offset( - struct xfs_trans *tp, - struct xfs_inode *ip, - xfs_fileoff_t *last_block, - int whichfork) -{ - struct xfs_bmbt_irec rec; - int is_empty; - int error; - - *last_block = 0; - - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) - return 0; - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - return XFS_ERROR(EIO); - - error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); - if (error || is_empty) - return error; - - *last_block = rec.br_startoff + rec.br_blockcount; - return 0; -} - -/* - * Returns whether the selected fork of the inode has exactly one - * block or not. For the data fork we check this matches di_size, - * implying the file's range is 0..bsize-1. - */ -int /* 1=>1 block, 0=>otherwise */ -xfs_bmap_one_block( - xfs_inode_t *ip, /* incore inode */ - int whichfork) /* data or attr fork */ -{ - xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */ - xfs_ifork_t *ifp; /* inode fork pointer */ - int rval; /* return value */ - xfs_bmbt_irec_t s; /* internal version of extent */ - -#ifndef DEBUG - if (whichfork == XFS_DATA_FORK) - return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; -#endif /* !DEBUG */ - if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) - return 0; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - return 0; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ep = xfs_iext_get_ext(ifp, 0); - xfs_bmbt_get_all(ep, &s); - rval = s.br_startoff == 0 && s.br_blockcount == 1; - if (rval && whichfork == XFS_DATA_FORK) - ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); - return rval; -} - -STATIC int -xfs_bmap_sanity_check( - struct xfs_mount *mp, - struct xfs_buf *bp, - int level) -{ - struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - - if (block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) || - be16_to_cpu(block->bb_level) != level || - be16_to_cpu(block->bb_numrecs) == 0 || - be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) - return 0; - return 1; -} - -/* - * Read in the extents to if_extents. - * All inode fields are set up by caller, we just traverse the btree - * and copy the records in. If the file system cannot contain unwritten - * extents, the records are checked for no "state" flags. - */ -int /* error */ -xfs_bmap_read_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork) /* data or attr fork */ -{ - struct xfs_btree_block *block; /* current btree block */ - xfs_fsblock_t bno; /* block # of "block" */ - xfs_buf_t *bp; /* buffer for "block" */ - int error; /* error return value */ - xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ - xfs_extnum_t i, j; /* index into the extents list */ - xfs_ifork_t *ifp; /* fork structure */ - int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ - /* REFERENCED */ - xfs_extnum_t room; /* number of entries there's room for */ - - bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : - XFS_EXTFMT_INODE(ip); - block = ifp->if_broot; - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. - */ - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - ASSERT(bno != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); - /* - * Go down the tree until leaf level is reached, following the first - * pointer (leftmost) at each level. - */ - while (level-- > 0) { - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); - if (error) - return error; - block = XFS_BUF_TO_BLOCK(bp); - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, level), - error0); - if (level == 0) - break; - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); - bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); - xfs_trans_brelse(tp, bp); - } - /* - * Here with bp and block set to the leftmost leaf node in the tree. - */ - room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - i = 0; - /* - * Loop over all leaf nodes. Copy information to the extent records. - */ - for (;;) { - xfs_bmbt_rec_t *frp; - xfs_fsblock_t nextbno; - xfs_extnum_t num_recs; - xfs_extnum_t start; - - num_recs = xfs_btree_get_numrecs(block); - if (unlikely(i + num_recs > room)) { - ASSERT(i + num_recs <= room); - xfs_warn(ip->i_mount, - "corrupt dinode %Lu, (btree extents).", - (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)", - XFS_ERRLEVEL_LOW, ip->i_mount, block); - goto error0; - } - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, 0), - error0); - /* - * Read-ahead the next leaf block, if any. - */ - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - if (nextbno != NULLFSBLOCK) - xfs_btree_reada_bufl(mp, nextbno, 1, - &xfs_bmbt_buf_ops); - /* - * Copy records into the extent records. - */ - frp = XFS_BMBT_REC_ADDR(mp, block, 1); - start = i; - for (j = 0; j < num_recs; j++, i++, frp++) { - xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); - trp->l0 = be64_to_cpu(frp->l0); - trp->l1 = be64_to_cpu(frp->l1); - } - if (exntf == XFS_EXTFMT_NOSTATE) { - /* - * Check all attribute bmap btree records and - * any "older" data bmap btree records for a - * set bit in the "extent flag" position. - */ - if (unlikely(xfs_check_nostate_extents(ifp, - start, num_recs))) { - XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", - XFS_ERRLEVEL_LOW, - ip->i_mount); - goto error0; - } - } - xfs_trans_brelse(tp, bp); - bno = nextbno; - /* - * If we've reached the end, stop. - */ - if (bno == NULLFSBLOCK) - break; - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); - if (error) - return error; - block = XFS_BUF_TO_BLOCK(bp); - } - ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); - ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); - XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); - return 0; -error0: - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); -} - -#ifdef DEBUG -/* - * Add bmap trace insert entries for all the contents of the extent records. - */ -void -xfs_bmap_trace_exlist( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t cnt, /* count of entries in the list */ - int whichfork, /* data or attr fork */ - unsigned long caller_ip) -{ - xfs_extnum_t idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - int state = 0; - - if (whichfork == XFS_ATTR_FORK) - state |= BMAP_ATTRFORK; - - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); - for (idx = 0; idx < cnt; idx++) - trace_xfs_extlist(ip, idx, whichfork, caller_ip); -} - -/* - * Validate that the bmbt_irecs being returned from bmapi are valid - * given the callers original parameters. Specifically check the - * ranges of the returned irecs to ensure that they only extent beyond - * the given parameters if the XFS_BMAPI_ENTIRE flag was set. - */ -STATIC void -xfs_bmap_validate_ret( - xfs_fileoff_t bno, - xfs_filblks_t len, - int flags, - xfs_bmbt_irec_t *mval, - int nmap, - int ret_nmap) -{ - int i; /* index to map values */ - - ASSERT(ret_nmap <= nmap); - - for (i = 0; i < ret_nmap; i++) { - ASSERT(mval[i].br_blockcount > 0); - if (!(flags & XFS_BMAPI_ENTIRE)) { - ASSERT(mval[i].br_startoff >= bno); - ASSERT(mval[i].br_blockcount <= len); - ASSERT(mval[i].br_startoff + mval[i].br_blockcount <= - bno + len); - } else { - ASSERT(mval[i].br_startoff < bno + len); - ASSERT(mval[i].br_startoff + mval[i].br_blockcount > - bno); - } - ASSERT(i == 0 || - mval[i - 1].br_startoff + mval[i - 1].br_blockcount == - mval[i].br_startoff); - ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && - mval[i].br_startblock != HOLESTARTBLOCK); - ASSERT(mval[i].br_state == XFS_EXT_NORM || - mval[i].br_state == XFS_EXT_UNWRITTEN); - } -} -#endif /* DEBUG */ - - -/* * Trim the returned map to the required bounds */ STATIC void @@ -4372,6 +4044,7 @@ xfs_bmapi_read( ASSERT(*nmap >= 1); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| XFS_BMAPI_IGSTATE))); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -4566,6 +4239,7 @@ xfs_bmapi_delay( ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && @@ -4624,8 +4298,8 @@ xfs_bmapi_delay( } -STATIC int -__xfs_bmapi_allocate( +static int +xfs_bmapi_allocate( struct xfs_bmalloca *bma) { struct xfs_mount *mp = bma->ip->i_mount; @@ -4634,12 +4308,9 @@ __xfs_bmapi_allocate( struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); int tmp_logflags = 0; int error; - int rt; ASSERT(bma->length > 0); - rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip); - /* * For the wasdelay case, we could also just allocate the stuff asked * for in this bmap call but that wouldn't be as good. @@ -4680,9 +4351,6 @@ __xfs_bmapi_allocate( return error; } - if (bma->flags & XFS_BMAPI_STACK_SWITCH) - bma->stack_switch = 1; - error = xfs_bmap_alloc(bma); if (error) return error; @@ -4745,45 +4413,6 @@ __xfs_bmapi_allocate( return 0; } -static void -xfs_bmapi_allocate_worker( - struct work_struct *work) -{ - struct xfs_bmalloca *args = container_of(work, - struct xfs_bmalloca, work); - unsigned long pflags; - - /* we are in a transaction context here */ - current_set_flags_nested(&pflags, PF_FSTRANS); - - args->result = __xfs_bmapi_allocate(args); - complete(args->done); - - current_restore_flags_nested(&pflags, PF_FSTRANS); -} - -/* - * Some allocation requests often come in with little stack to work on. Push - * them off to a worker thread so there is lots of stack to use. Otherwise just - * call directly to avoid the context switch overhead here. - */ -int -xfs_bmapi_allocate( - struct xfs_bmalloca *args) -{ - DECLARE_COMPLETION_ONSTACK(done); - - if (!args->stack_switch) - return __xfs_bmapi_allocate(args); - - - args->done = &done; - INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); - queue_work(xfs_alloc_wq, &args->work); - wait_for_completion(&done); - return args->result; -} - STATIC int xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, @@ -4872,7 +4501,7 @@ xfs_bmapi_write( { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; - struct xfs_bmalloca bma = { 0 }; /* args for xfs_bmap_alloc */ + struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */ xfs_fileoff_t end; /* end of mapped file region */ int eof; /* after the end of extents */ int error; /* error return */ @@ -4895,20 +4524,20 @@ xfs_bmapi_write( orig_mval = mval; orig_nmap = *nmap; #endif + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); ASSERT(!(flags & XFS_BMAPI_IGSTATE)); ASSERT(tp != NULL); ASSERT(len > 0); - - whichfork = (flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL), + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); @@ -4921,13 +4550,6 @@ xfs_bmapi_write( XFS_STATS_INC(xs_blk_mapw); - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, - &bma.logflags, whichfork); - if (error) - goto error0; - } - if (*firstblock == NULLFSBLOCK) { if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; @@ -5083,6 +4705,328 @@ error0: } /* + * Called by xfs_bmapi to update file extent records and the btree + * after removing space (or undoing a delayed allocation). + */ +STATIC int /* error */ +xfs_bmap_del_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_trans_t *tp, /* current transaction pointer */ + xfs_extnum_t *idx, /* extent number to update/delete */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *del, /* data to remove from extents */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ + xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ + xfs_fsblock_t del_endblock=0; /* first block past del */ + xfs_fileoff_t del_endoff; /* first offset past del */ + int delay; /* current block is delayed allocated */ + int do_fx; /* free extent at end of routine */ + xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */ + int error; /* error return value */ + int flags; /* inode logging flags */ + xfs_bmbt_irec_t got; /* current extent entry */ + xfs_fileoff_t got_endoff; /* first offset past got */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t nblks; /* quota/sb block count */ + xfs_bmbt_irec_t new; /* new record to be inserted */ + /* REFERENCED */ + uint qfield; /* quota field to update */ + xfs_filblks_t temp; /* for indirect length calculations */ + xfs_filblks_t temp2; /* for indirect length calculations */ + int state = 0; + + XFS_STATS_INC(xs_del_exlist); + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(del->br_blockcount > 0); + ep = xfs_iext_get_ext(ifp, *idx); + xfs_bmbt_get_all(ep, &got); + ASSERT(got.br_startoff <= del->br_startoff); + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got.br_startoff + got.br_blockcount; + ASSERT(got_endoff >= del_endoff); + delay = isnullstartblock(got.br_startblock); + ASSERT(isnullstartblock(del->br_startblock) == delay); + flags = 0; + qfield = 0; + error = 0; + /* + * If deleting a real allocation, must free up the disk space. + */ + if (!delay) { + flags = XFS_ILOG_CORE; + /* + * Realtime allocation. Free it and record di_nblocks update. + */ + if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { + xfs_fsblock_t bno; + xfs_filblks_t len; + + ASSERT(do_mod(del->br_blockcount, + mp->m_sb.sb_rextsize) == 0); + ASSERT(do_mod(del->br_startblock, + mp->m_sb.sb_rextsize) == 0); + bno = del->br_startblock; + len = del->br_blockcount; + do_div(bno, mp->m_sb.sb_rextsize); + do_div(len, mp->m_sb.sb_rextsize); + error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); + if (error) + goto done; + do_fx = 0; + nblks = len * mp->m_sb.sb_rextsize; + qfield = XFS_TRANS_DQ_RTBCOUNT; + } + /* + * Ordinary allocation. + */ + else { + do_fx = 1; + nblks = del->br_blockcount; + qfield = XFS_TRANS_DQ_BCOUNT; + } + /* + * Set up del_endblock and cur for later. + */ + del_endblock = del->br_startblock + del->br_blockcount; + if (cur) { + if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, got.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + da_old = da_new = 0; + } else { + da_old = startblockval(got.br_startblock); + da_new = 0; + nblks = 0; + do_fx = 0; + } + /* + * Set flag value to use in switch statement. + * Left-contig is 2, right-contig is 1. + */ + switch (((got.br_startoff == del->br_startoff) << 1) | + (got_endoff == del_endoff)) { + case 3: + /* + * Matches the whole extent. Delete the entry. + */ + xfs_iext_remove(ip, *idx, 1, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); + --*idx; + if (delay) + break; + + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + flags |= XFS_ILOG_CORE; + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + break; + + case 2: + /* + * Deleting the first part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startoff(ep, del_endoff); + temp = got.br_blockcount - del->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + da_new = temp; + break; + } + xfs_bmbt_set_startblock(ep, del_endblock); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 1: + /* + * Deleting the last part of the extent. + */ + temp = got.br_blockcount - del->br_blockcount; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + da_new = temp; + break; + } + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 0: + /* + * Deleting the middle of the extent. + */ + temp = del->br_startoff - got.br_startoff; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + new.br_startoff = del_endoff; + temp2 = got_endoff - del_endoff; + new.br_blockcount = temp2; + new.br_state = got.br_state; + if (!delay) { + new.br_startblock = del_endblock; + flags |= XFS_ILOG_CORE; + if (cur) { + if ((error = xfs_bmbt_update(cur, + got.br_startoff, + got.br_startblock, temp, + got.br_state))) + goto done; + if ((error = xfs_btree_increment(cur, 0, &i))) + goto done; + cur->bc_rec.b = new; + error = xfs_btree_insert(cur, &i); + if (error && error != ENOSPC) + goto done; + /* + * If get no-space back from btree insert, + * it tried a split, and we have a zero + * block reservation. + * Fix up our state and return the error. + */ + if (error == ENOSPC) { + /* + * Reset the cursor, don't trust + * it after any insert operation. + */ + if ((error = xfs_bmbt_lookup_eq(cur, + got.br_startoff, + got.br_startblock, + temp, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + /* + * Update the btree record back + * to the original value. + */ + if ((error = xfs_bmbt_update(cur, + got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state))) + goto done; + /* + * Reset the extent record back + * to the original value. + */ + xfs_bmbt_set_blockcount(ep, + got.br_blockcount); + flags = 0; + error = XFS_ERROR(ENOSPC); + goto done; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } else + flags |= xfs_ilog_fext(whichfork); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + } else { + ASSERT(whichfork == XFS_DATA_FORK); + temp = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + temp2 = xfs_bmap_worst_indlen(ip, temp2); + new.br_startblock = nullstartblock((int)temp2); + da_new = temp + temp2; + while (da_new > da_old) { + if (temp) { + temp--; + da_new--; + xfs_bmbt_set_startblock(ep, + nullstartblock((int)temp)); + } + if (da_new == da_old) + break; + if (temp2) { + temp2--; + da_new--; + new.br_startblock = + nullstartblock((int)temp2); + } + } + } + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_insert(ip, *idx + 1, 1, &new, state); + ++*idx; + break; + } + /* + * If we need to, add to list of extents to delete. + */ + if (do_fx) + xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, + mp); + /* + * Adjust inode # blocks in the file. + */ + if (nblks) + ip->i_d.di_nblocks -= nblks; + /* + * Adjust quota data. + */ + if (qfield) + xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); + + /* + * Account for change in delayed indirect blocks. + * Nothing to do for disk quota accounting here. + */ + ASSERT(da_old >= da_new); + if (da_old > da_new) { + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + (int64_t)(da_old - da_new), 0); + } +done: + *logflagsp = flags; + return error; +} + +/* * Unmap (remove) blocks from a file. * If nexts is nonzero then the number of extents to remove is limited to * that value. If not all extents in the block range can be removed then @@ -5138,6 +5082,7 @@ xfs_bunmapi( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(len > 0); ASSERT(nexts >= 0); @@ -5463,780 +5408,199 @@ error0: } /* - * returns 1 for success, 0 if we failed to map the extent. - */ -STATIC int -xfs_getbmapx_fix_eof_hole( - xfs_inode_t *ip, /* xfs incore inode pointer */ - struct getbmapx *out, /* output structure */ - int prealloced, /* this is a file with - * preallocated data space */ - __int64_t end, /* last block requested */ - xfs_fsblock_t startblock) -{ - __int64_t fixlen; - xfs_mount_t *mp; /* file system mount point */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last extent pointer */ - xfs_fileoff_t fileblock; - - if (startblock == HOLESTARTBLOCK) { - mp = ip->i_mount; - out->bmv_block = -1; - fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip))); - fixlen -= out->bmv_offset; - if (prealloced && out->bmv_offset + out->bmv_length == end) { - /* Came to hole at EOF. Trim it. */ - if (fixlen <= 0) - return 0; - out->bmv_length = fixlen; - } - } else { - if (startblock == DELAYSTARTBLOCK) - out->bmv_block = -2; - else - out->bmv_block = xfs_fsb_to_db(ip, startblock); - fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && - (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1)) - out->bmv_oflags |= BMV_OF_LAST; - } - - return 1; -} - -/* - * Get inode's extents as described in bmv, and format for output. - * Calls formatter to fill the user's buffer until all extents - * are mapped, until the passed-in bmv->bmv_count slots have - * been filled, or until the formatter short-circuits the loop, - * if it is tracking filled-in extents on its own. + * Shift extent records to the left to cover a hole. + * + * The maximum number of extents to be shifted in a single operation + * is @num_exts, and @current_ext keeps track of the current extent + * index we have shifted. @offset_shift_fsb is the length by which each + * extent is shifted. If there is no hole to shift the extents + * into, this will be considered invalid operation and we abort immediately. */ -int /* error code */ -xfs_getbmap( - xfs_inode_t *ip, - struct getbmapx *bmv, /* user bmap structure */ - xfs_bmap_format_t formatter, /* format to user */ - void *arg) /* formatter arg */ +int +xfs_bmap_shift_extents( + struct xfs_trans *tp, + struct xfs_inode *ip, + int *done, + xfs_fileoff_t start_fsb, + xfs_fileoff_t offset_shift_fsb, + xfs_extnum_t *current_ext, + xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, + int num_exts) { - __int64_t bmvend; /* last block requested */ - int error = 0; /* return value */ - __int64_t fixlen; /* length for -1 case */ - int i; /* extent number */ - int lock; /* lock state */ - xfs_bmbt_irec_t *map; /* buffer for user's data */ - xfs_mount_t *mp; /* file system mount point */ - int nex; /* # of user extents can do */ - int nexleft; /* # of user extents left */ - int subnex; /* # of bmapi's can do */ - int nmap; /* number of map entries */ - struct getbmapx *out; /* output structure */ - int whichfork; /* data or attr fork */ - int prealloced; /* this is a file with - * preallocated data space */ - int iflags; /* interface flags */ - int bmapi_flags; /* flags for xfs_bmapi */ - int cur_ext = 0; - - mp = ip->i_mount; - iflags = bmv->bmv_iflags; - whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; - - if (whichfork == XFS_ATTR_FORK) { - if (XFS_IFORK_Q(ip)) { - if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && - ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) - return XFS_ERROR(EINVAL); - } else if (unlikely( - ip->i_d.di_aformat != 0 && - ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) { - XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); - } - - prealloced = 0; - fixlen = 1LL << 32; - } else { - if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE && - ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) - return XFS_ERROR(EINVAL); - - if (xfs_get_extsz_hint(ip) || - ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ - prealloced = 1; - fixlen = mp->m_super->s_maxbytes; - } else { - prealloced = 0; - fixlen = XFS_ISIZE(ip); - } - } - - if (bmv->bmv_length == -1) { - fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen)); - bmv->bmv_length = - max_t(__int64_t, fixlen - bmv->bmv_offset, 0); - } else if (bmv->bmv_length == 0) { - bmv->bmv_entries = 0; - return 0; - } else if (bmv->bmv_length < 0) { - return XFS_ERROR(EINVAL); - } - - nex = bmv->bmv_count - 1; - if (nex <= 0) - return XFS_ERROR(EINVAL); - bmvend = bmv->bmv_offset + bmv->bmv_length; - - - if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) - return XFS_ERROR(ENOMEM); - out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL); - if (!out) { - out = kmem_zalloc_large(bmv->bmv_count * - sizeof(struct getbmapx)); - if (!out) - return XFS_ERROR(ENOMEM); - } + struct xfs_btree_cur *cur; + struct xfs_bmbt_rec_host *gotp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec left; + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + xfs_extnum_t nexts = 0; + xfs_fileoff_t startoff; + int error = 0; + int i; + int whichfork = XFS_DATA_FORK; + int logflags; + xfs_filblks_t blockcount = 0; + int total_extents; - xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { - if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { - error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); - if (error) - goto out_unlock_iolock; - } - /* - * even after flushing the inode, there can still be delalloc - * blocks on the inode beyond EOF due to speculative - * preallocation. These are not removed until the release - * function is called or the inode is inactivated. Hence we - * cannot assert here that ip->i_delayed_blks == 0. - */ + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmap_shift_extents", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); } - lock = xfs_ilock_map_shared(ip); - - /* - * Don't let nex be bigger than the number of extents - * we can have assuming alternating holes and real extents. - */ - if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) - nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; - - bmapi_flags = xfs_bmapi_aflag(whichfork); - if (!(iflags & BMV_IF_PREALLOC)) - bmapi_flags |= XFS_BMAPI_IGSTATE; - - /* - * Allocate enough space to handle "subnex" maps at a time. - */ - error = ENOMEM; - subnex = 16; - map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); - if (!map) - goto out_unlock_ilock; - - bmv->bmv_entries = 0; - - if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 && - (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) { - error = 0; - goto out_free_map; - } + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); - nexleft = nex; + ASSERT(current_ext != NULL); - do { - nmap = (nexleft > subnex) ? subnex : nexleft; - error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), - XFS_BB_TO_FSB(mp, bmv->bmv_length), - map, &nmap, bmapi_flags); + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + /* Read in all the extents */ + error = xfs_iread_extents(tp, ip, whichfork); if (error) - goto out_free_map; - ASSERT(nmap <= subnex); - - for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { - out[cur_ext].bmv_oflags = 0; - if (map[i].br_state == XFS_EXT_UNWRITTEN) - out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; - else if (map[i].br_startblock == DELAYSTARTBLOCK) - out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC; - out[cur_ext].bmv_offset = - XFS_FSB_TO_BB(mp, map[i].br_startoff); - out[cur_ext].bmv_length = - XFS_FSB_TO_BB(mp, map[i].br_blockcount); - out[cur_ext].bmv_unused1 = 0; - out[cur_ext].bmv_unused2 = 0; - - /* - * delayed allocation extents that start beyond EOF can - * occur due to speculative EOF allocation when the - * delalloc extent is larger than the largest freespace - * extent at conversion time. These extents cannot be - * converted by data writeback, so can exist here even - * if we are not supposed to be finding delalloc - * extents. - */ - if (map[i].br_startblock == DELAYSTARTBLOCK && - map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) - ASSERT((iflags & BMV_IF_DELALLOC) != 0); - - if (map[i].br_startblock == HOLESTARTBLOCK && - whichfork == XFS_ATTR_FORK) { - /* came to the end of attribute fork */ - out[cur_ext].bmv_oflags |= BMV_OF_LAST; - goto out_free_map; - } - - if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext], - prealloced, bmvend, - map[i].br_startblock)) - goto out_free_map; - - bmv->bmv_offset = - out[cur_ext].bmv_offset + - out[cur_ext].bmv_length; - bmv->bmv_length = - max_t(__int64_t, 0, bmvend - bmv->bmv_offset); - - /* - * In case we don't want to return the hole, - * don't increase cur_ext so that we can reuse - * it in the next loop. - */ - if ((iflags & BMV_IF_NO_HOLES) && - map[i].br_startblock == HOLESTARTBLOCK) { - memset(&out[cur_ext], 0, sizeof(out[cur_ext])); - continue; - } - - nexleft--; - bmv->bmv_entries++; - cur_ext++; - } - } while (nmap && nexleft && bmv->bmv_length); - - out_free_map: - kmem_free(map); - out_unlock_ilock: - xfs_iunlock_map_shared(ip, lock); - out_unlock_iolock: - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - - for (i = 0; i < cur_ext; i++) { - int full = 0; /* user array is full */ - - /* format results & advance arg */ - error = formatter(&arg, &out[i], &full); - if (error || full) - break; - } - - if (is_vmalloc_addr(out)) - kmem_free_large(out); - else - kmem_free(out); - return error; -} - -#ifdef DEBUG -STATIC struct xfs_buf * -xfs_bmap_get_bp( - struct xfs_btree_cur *cur, - xfs_fsblock_t bno) -{ - struct xfs_log_item_desc *lidp; - int i; - - if (!cur) - return NULL; - - for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { - if (!cur->bc_bufs[i]) - break; - if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) - return cur->bc_bufs[i]; - } - - /* Chase down all the log items to see if the bp is there */ - list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) { - struct xfs_buf_log_item *bip; - bip = (struct xfs_buf_log_item *)lidp->lid_item; - if (bip->bli_item.li_type == XFS_LI_BUF && - XFS_BUF_ADDR(bip->bli_buf) == bno) - return bip->bli_buf; - } - - return NULL; -} - -STATIC void -xfs_check_block( - struct xfs_btree_block *block, - xfs_mount_t *mp, - int root, - short sz) -{ - int i, j, dmxr; - __be64 *pp, *thispa; /* pointer to block address */ - xfs_bmbt_key_t *prevp, *keyp; - - ASSERT(be16_to_cpu(block->bb_level) > 0); - - prevp = NULL; - for( i = 1; i <= xfs_btree_get_numrecs(block); i++) { - dmxr = mp->m_bmap_dmxr[0]; - keyp = XFS_BMBT_KEY_ADDR(mp, block, i); - - if (prevp) { - ASSERT(be64_to_cpu(prevp->br_startoff) < - be64_to_cpu(keyp->br_startoff)); - } - prevp = keyp; - - /* - * Compare the block numbers to see if there are dups. - */ - if (root) - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz); - else - pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr); - - for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { - if (root) - thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz); - else - thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); - if (*thispa == *pp) { - xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", - __func__, j, i, - (unsigned long long)be64_to_cpu(*thispa)); - panic("%s: ptrs are equal in node\n", - __func__); - } - } - } -} - -/* - * Check that the extents for the inode ip are in the right order in all - * btree leaves. - */ - -STATIC void -xfs_bmap_check_leaf_extents( - xfs_btree_cur_t *cur, /* btree cursor or null */ - xfs_inode_t *ip, /* incore inode pointer */ - int whichfork) /* data or attr fork */ -{ - struct xfs_btree_block *block; /* current btree block */ - xfs_fsblock_t bno; /* block # of "block" */ - xfs_buf_t *bp; /* buffer for "block" */ - int error; /* error return value */ - xfs_extnum_t i=0, j; /* index into the extents list */ - xfs_ifork_t *ifp; /* fork structure */ - int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ - xfs_bmbt_rec_t *ep; /* pointer to current extent */ - xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ - xfs_bmbt_rec_t *nextp; /* pointer to next extent */ - int bp_release = 0; - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { - return; + return error; } - bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - block = ifp->if_broot; /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + * If *current_ext is 0, we would need to lookup the extent + * from where we would start shifting and store it in gotp. */ - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - xfs_check_block(block, mp, 1, ifp->if_broot_bytes); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - - ASSERT(bno != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); - - /* - * Go down the tree until leaf level is reached, following the first - * pointer (leftmost) at each level. - */ - while (level-- > 0) { - /* See if buf is in cur first */ - bp_release = 0; - bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (!bp) { - bp_release = 1; - error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - goto error_norelse; - } - block = XFS_BUF_TO_BLOCK(bp); - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, level), - error0); - if (level == 0) - break; - + if (!*current_ext) { + gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext); /* - * Check this block for basic sanity (increasing keys and - * no duplicate blocks). + * gotp can be null in 2 cases: 1) if there are no extents + * or 2) start_fsb lies in a hole beyond which there are + * no extents. Either way, we are done. */ - - xfs_check_block(block, mp, 0, 0); - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); - bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); - if (bp_release) { - bp_release = 0; - xfs_trans_brelse(NULL, bp); + if (!gotp) { + *done = 1; + return 0; } } - /* - * Here with bp and block set to the leftmost leaf node in the tree. - */ - i = 0; + /* We are going to change core inode */ + logflags = XFS_ILOG_CORE; + if (ifp->if_flags & XFS_IFBROOT) { + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = 0; + } else { + cur = NULL; + logflags |= XFS_ILOG_DEXT; + } /* - * Loop over all leaf nodes checking that all extents are in the right order. + * There may be delalloc extents in the data fork before the range we + * are collapsing out, so we cannot + * use the count of real extents here. Instead we have to calculate it + * from the incore fork. */ - for (;;) { - xfs_fsblock_t nextbno; - xfs_extnum_t num_recs; - - - num_recs = xfs_btree_get_numrecs(block); + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + while (nexts++ < num_exts && *current_ext < total_extents) { - /* - * Read-ahead the next leaf block, if any. - */ - - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + gotp = xfs_iext_get_ext(ifp, *current_ext); + xfs_bmbt_get_all(gotp, &got); + startoff = got.br_startoff - offset_shift_fsb; /* - * Check all the extents to make sure they are OK. - * If we had a previous block, the last entry should - * conform with the first entry in this one. + * Before shifting extent into hole, make sure that the hole + * is large enough to accomodate the shift. */ + if (*current_ext) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, + *current_ext - 1), &left); - ep = XFS_BMBT_REC_ADDR(mp, block, 1); - if (i) { - ASSERT(xfs_bmbt_disk_get_startoff(&last) + - xfs_bmbt_disk_get_blockcount(&last) <= - xfs_bmbt_disk_get_startoff(ep)); - } - for (j = 1; j < num_recs; j++) { - nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1); - ASSERT(xfs_bmbt_disk_get_startoff(ep) + - xfs_bmbt_disk_get_blockcount(ep) <= - xfs_bmbt_disk_get_startoff(nextp)); - ep = nextp; + if (startoff < left.br_startoff + left.br_blockcount) + error = XFS_ERROR(EINVAL); + } else if (offset_shift_fsb > got.br_startoff) { + /* + * When first extent is shifted, offset_shift_fsb + * should be less than the stating offset of + * the first extent. + */ + error = XFS_ERROR(EINVAL); } - last = *ep; - i += num_recs; - if (bp_release) { - bp_release = 0; - xfs_trans_brelse(NULL, bp); - } - bno = nextbno; - /* - * If we've reached the end, stop. - */ - if (bno == NULLFSBLOCK) - break; + if (error) + goto del_cursor; - bp_release = 0; - bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (!bp) { - bp_release = 1; - error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); + if (cur) { + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + &i); if (error) - goto error_norelse; + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); } - block = XFS_BUF_TO_BLOCK(bp); - } - if (bp_release) { - bp_release = 0; - xfs_trans_brelse(NULL, bp); - } - return; -error0: - xfs_warn(mp, "%s: at error0", __func__); - if (bp_release) - xfs_trans_brelse(NULL, bp); -error_norelse: - xfs_warn(mp, "%s: BAD after btree leaves for %d extents", - __func__, i); - panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); - return; -} -#endif - -/* - * Count fsblocks of the given fork. - */ -int /* error */ -xfs_bmap_count_blocks( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - int *count) /* out: count of blocks */ -{ - struct xfs_btree_block *block; /* current btree block */ - xfs_fsblock_t bno; /* block # of "block" */ - xfs_ifork_t *ifp; /* fork structure */ - int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ - - bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { - xfs_bmap_count_leaves(ifp, 0, - ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), - count); - return 0; - } - - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. - */ - block = ifp->if_broot; - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - ASSERT(bno != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); - - if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { - XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, - mp); - return XFS_ERROR(EFSCORRUPTED); - } - - return 0; -} + /* Check if we can merge 2 adjacent extents */ + if (*current_ext && + left.br_startoff + left.br_blockcount == startoff && + left.br_startblock + left.br_blockcount == + got.br_startblock && + left.br_state == got.br_state && + left.br_blockcount + got.br_blockcount <= MAXEXTLEN) { + blockcount = left.br_blockcount + + got.br_blockcount; + xfs_iext_remove(ip, *current_ext, 1, 0); + if (cur) { + error = xfs_btree_delete(cur, &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); + } + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + gotp = xfs_iext_get_ext(ifp, --*current_ext); + xfs_bmbt_get_all(gotp, &got); -/* - * Recursively walks each level of a btree - * to count total fsblocks is use. - */ -STATIC int /* error */ -xfs_bmap_count_tree( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fsblock_t blockno, /* file system block number */ - int levelin, /* level in btree */ - int *count) /* Count of blocks */ -{ - int error; - xfs_buf_t *bp, *nbp; - int level = levelin; - __be64 *pp; - xfs_fsblock_t bno = blockno; - xfs_fsblock_t nextbno; - struct xfs_btree_block *block, *nextblock; - int numrecs; - - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - block = XFS_BUF_TO_BLOCK(bp); + /* Make cursor point to the extent we will update */ + if (cur) { + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); + } - if (--level) { - /* Not at node above leaves, count this level of nodes */ - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - while (nextbno != NULLFSBLOCK) { - error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - nextblock = XFS_BUF_TO_BLOCK(nbp); - nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib); - xfs_trans_brelse(tp, nbp); + xfs_bmbt_set_blockcount(gotp, blockcount); + got.br_blockcount = blockcount; + } else { + /* We have to update the startoff */ + xfs_bmbt_set_startoff(gotp, startoff); + got.br_startoff = startoff; } - /* Dive to the next level */ - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); - bno = be64_to_cpu(*pp); - if (unlikely((error = - xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { - xfs_trans_brelse(tp, bp); - XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", - XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); - } - xfs_trans_brelse(tp, bp); - } else { - /* count all level 1 nodes and their leaves */ - for (;;) { - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - numrecs = be16_to_cpu(block->bb_numrecs); - xfs_bmap_disk_count_leaves(mp, block, numrecs, count); - xfs_trans_brelse(tp, bp); - if (nextbno == NULLFSBLOCK) - break; - bno = nextbno; - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); + if (cur) { + error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state); if (error) - return error; - *count += 1; - block = XFS_BUF_TO_BLOCK(bp); + goto del_cursor; } - } - return 0; -} -/* - * Count leaf blocks given a range of extent records. - */ -STATIC void -xfs_bmap_count_leaves( - xfs_ifork_t *ifp, - xfs_extnum_t idx, - int numrecs, - int *count) -{ - int b; - - for (b = 0; b < numrecs; b++) { - xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); - *count += xfs_bmbt_get_blockcount(frp); + (*current_ext)++; + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); } -} -/* - * Count leaf blocks given a range of extent records originally - * in btree format. - */ -STATIC void -xfs_bmap_disk_count_leaves( - struct xfs_mount *mp, - struct xfs_btree_block *block, - int numrecs, - int *count) -{ - int b; - xfs_bmbt_rec_t *frp; - - for (b = 1; b <= numrecs; b++) { - frp = XFS_BMBT_REC_ADDR(mp, block, b); - *count += xfs_bmbt_disk_get_blockcount(frp); - } -} - -/* - * dead simple method of punching delalyed allocation blocks from a range in - * the inode. Walks a block at a time so will be slow, but is only executed in - * rare error cases so the overhead is not critical. This will alays punch out - * both the start and end blocks, even if the ranges only partially overlap - * them, so it is up to the caller to ensure that partial blocks are not - * passed in. - */ -int -xfs_bmap_punch_delalloc_range( - struct xfs_inode *ip, - xfs_fileoff_t start_fsb, - xfs_fileoff_t length) -{ - xfs_fileoff_t remaining = length; - int error = 0; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - do { - int done; - xfs_bmbt_irec_t imap; - int nimaps = 1; - xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; - - /* - * Map the range first and check that it is a delalloc extent - * before trying to unmap the range. Otherwise we will be - * trying to remove a real extent (which requires a - * transaction) or a hole, which is probably a bad idea... - */ - error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps, - XFS_BMAPI_ENTIRE); - - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "Failed delalloc mapping lookup ino %lld fsb %lld.", - ip->i_ino, start_fsb); - } - break; - } - if (!nimaps) { - /* nothing there */ - goto next_block; - } - if (imap.br_startblock != DELAYSTARTBLOCK) { - /* been converted, ignore */ - goto next_block; - } - WARN_ON(imap.br_blockcount == 0); - - /* - * Note: while we initialise the firstblock/flist pair, they - * should never be used because blocks should never be - * allocated or freed for a delalloc extent and hence we need - * don't cancel or finish them after the xfs_bunmapi() call. - */ - xfs_bmap_init(&flist, &firstblock); - error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, - &flist, &done); - if (error) - break; + /* Check if we are done */ + if (*current_ext == total_extents) + *done = 1; - ASSERT(!flist.xbf_count && !flist.xbf_first); -next_block: - start_fsb++; - remaining--; - } while(remaining > 0); +del_cursor: + if (cur) + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + xfs_trans_log_inode(tp, ip, logflags); return error; } - -/* - * Convert the given file system block to a disk block. We have to treat it - * differently based on whether the file is a real time file or not, because the - * bmap code does. - */ -xfs_daddr_t -xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) -{ - return (XFS_IS_REALTIME_INODE(ip) ? \ - (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ - XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); -} diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 5f469c3516e..b879ca56a64 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -77,7 +77,6 @@ typedef struct xfs_bmap_free * from written to unwritten, otherwise convert from unwritten to written. */ #define XFS_BMAPI_CONVERT 0x040 -#define XFS_BMAPI_STACK_SWITCH 0x080 #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -86,8 +85,7 @@ typedef struct xfs_bmap_free { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ - { XFS_BMAPI_CONVERT, "CONVERT" }, \ - { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" } + { XFS_BMAPI_CONVERT, "CONVERT" } static inline int xfs_bmapi_aflag(int w) @@ -108,41 +106,6 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) } /* - * Argument structure for xfs_bmap_alloc. - */ -typedef struct xfs_bmalloca { - xfs_fsblock_t *firstblock; /* i/o first block allocated */ - struct xfs_bmap_free *flist; /* bmap freelist */ - struct xfs_trans *tp; /* transaction pointer */ - struct xfs_inode *ip; /* incore inode pointer */ - struct xfs_bmbt_irec prev; /* extent before the new one */ - struct xfs_bmbt_irec got; /* extent after, or delayed */ - - xfs_fileoff_t offset; /* offset in file filling in */ - xfs_extlen_t length; /* i/o length asked/allocated */ - xfs_fsblock_t blkno; /* starting block of new extent */ - - struct xfs_btree_cur *cur; /* btree cursor */ - xfs_extnum_t idx; /* current extent index */ - int nallocs;/* number of extents alloc'd */ - int logflags;/* flags for transaction logging */ - - xfs_extlen_t total; /* total blocks needed for xaction */ - xfs_extlen_t minlen; /* minimum allocation size (blocks) */ - xfs_extlen_t minleft; /* amount must be left after alloc */ - char eof; /* set if allocating past last extent */ - char wasdel; /* replacing a delayed allocation */ - char userdata;/* set if is user data */ - char aeof; /* allocated space at eof */ - char conv; /* overwriting unwritten extents */ - char stack_switch; - int flags; - struct completion *done; - struct work_struct work; - int result; -} xfs_bmalloca_t; - -/* * Flags for xfs_bmap_add_extent*. */ #define BMAP_LEFT_CONTIG (1 << 0) @@ -162,7 +125,17 @@ typedef struct xfs_bmalloca { { BMAP_RIGHT_FILLING, "RF" }, \ { BMAP_ATTRFORK, "ATTR" } -#if defined(__KERNEL) && defined(DEBUG) + +/* + * This macro is used to determine how many extents will be shifted + * in one write transaction. We could require two splits, + * an extent move on the first and an extent merge on the second, + * So it is proper that one extent is shifted inside write transaction + * at a time. + */ +#define XFS_BMAP_MAX_SHIFT_EXTENTS 1 + +#ifdef DEBUG void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, int whichfork, unsigned long caller_ip); #define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ @@ -172,6 +145,7 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, #endif int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); +void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, struct xfs_bmap_free *flist, struct xfs_mount *mp); void xfs_bmap_cancel(struct xfs_bmap_free *flist); @@ -180,8 +154,8 @@ int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *last_block, int whichfork); -int xfs_bmap_last_offset(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t *unused, int whichfork); +int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused, + int whichfork); int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork); int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork); @@ -203,24 +177,10 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); - -#ifdef __KERNEL__ -/* bmap to userspace formatter - copy to user & advance pointer */ -typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); - -int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, - int *committed); -int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, - xfs_bmap_format_t formatter, void *arg); -int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, - int whichfork, int *eof); -int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, - int whichfork, int *count); -int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, - xfs_fileoff_t start_fsb, xfs_fileoff_t length); - -xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); - -#endif /* __KERNEL__ */ +int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, + int *done, xfs_fileoff_t start_fsb, + xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext, + xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist, + int num_exts); #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 061b45cbe61..948836c4fd9 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -17,26 +17,26 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_alloc.h" #include "xfs_btree.h" -#include "xfs_itable.h" +#include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_dinode.h" /* * Determine the extent state. @@ -59,25 +59,32 @@ xfs_extent_state( */ void xfs_bmdr_to_bmbt( - struct xfs_mount *mp, + struct xfs_inode *ip, xfs_bmdr_block_t *dblock, int dblocklen, struct xfs_btree_block *rblock, int rblocklen) { + struct xfs_mount *mp = ip->i_mount; int dmxr; xfs_bmbt_key_t *fkp; __be64 *fpp; xfs_bmbt_key_t *tkp; __be64 *tpp; - rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, + XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, + XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS); + rblock->bb_level = dblock->bb_level; ASSERT(be16_to_cpu(rblock->bb_level) > 0); rblock->bb_numrecs = dblock->bb_numrecs; - rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); - dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); + dmxr = xfs_bmdr_maxrecs(dblocklen, 0); fkp = XFS_BMDR_KEY_ADDR(dblock, 1); tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); @@ -424,13 +431,19 @@ xfs_bmbt_to_bmdr( xfs_bmbt_key_t *tkp; __be64 *tpp; - ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC)); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)); + ASSERT(rblock->bb_u.l.bb_blkno == + cpu_to_be64(XFS_BUF_DADDR_NULL)); + } else + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC)); ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO)); ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO)); ASSERT(rblock->bb_level != 0); dblock->bb_level = rblock->bb_level; dblock->bb_numrecs = rblock->bb_numrecs; - dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); + dmxr = xfs_bmdr_maxrecs(dblocklen, 0); fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); tkp = XFS_BMDR_KEY_ADDR(dblock, 1); fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); @@ -506,7 +519,6 @@ xfs_bmbt_alloc_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *start, union xfs_btree_ptr *new, - int length, int *stat) { xfs_alloc_arg_t args; /* block allocation args */ @@ -659,8 +671,7 @@ xfs_bmbt_get_dmaxrecs( { if (level != cur->bc_nlevels - 1) return cur->bc_mp->m_bmap_dmxr[level != 0]; - return xfs_bmdr_maxrecs(cur->bc_mp, cur->bc_private.b.forksize, - level == 0); + return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0); } STATIC void @@ -708,59 +719,87 @@ xfs_bmbt_key_diff( cur->bc_rec.b.br_startoff; } -static void +static bool xfs_bmbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); unsigned int level; - int lblock_ok; /* block passes checks */ - /* magic number and level verification. + switch (block->bb_magic) { + case cpu_to_be32(XFS_BMAP_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn) + return false; + /* + * XXX: need a better way of verifying the owner here. Right now + * just make sure there has been one set. + */ + if (be64_to_cpu(block->bb_u.l.bb_owner) == 0) + return false; + /* fall through */ + case cpu_to_be32(XFS_BMAP_MAGIC): + break; + default: + return false; + } + + /* + * numrecs and level verification. * - * We don't know waht fork we belong to, so just verify that the level + * We don't know what fork we belong to, so just verify that the level * is less than the maximum of the two. Later checks will be more * precise. */ level = be16_to_cpu(block->bb_level); - lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) && - level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]); - - /* numrecs verification */ - lblock_ok = lblock_ok && - be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0]; + if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1])) + return false; + if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) + return false; /* sibling pointer verification */ - lblock_ok = lblock_ok && - block->bb_u.l.bb_leftsib && - (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || - XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_leftsib))) && - block->bb_u.l.bb_rightsib && - (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || - XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_rightsib))); - - if (!lblock_ok) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } + if (!block->bb_u.l.bb_leftsib || + (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) && + !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))) + return false; + if (!block->bb_u.l.bb_rightsib || + (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) && + !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))) + return false; + + return true; } static void xfs_bmbt_read_verify( struct xfs_buf *bp) { - xfs_bmbt_verify(bp); + if (!xfs_btree_lblock_verify_crc(bp)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_bmbt_verify(bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); + } } static void xfs_bmbt_write_verify( struct xfs_buf *bp) { - xfs_bmbt_verify(bp); + if (!xfs_bmbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + xfs_btree_lblock_calc_crc(bp); } const struct xfs_buf_ops xfs_bmbt_buf_ops = { @@ -769,7 +808,7 @@ const struct xfs_buf_ops xfs_bmbt_buf_ops = { }; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_bmbt_keys_inorder( struct xfs_btree_cur *cur, @@ -809,7 +848,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, .buf_ops = &xfs_bmbt_buf_ops, -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, #endif @@ -838,6 +877,8 @@ xfs_bmbt_init_cursor( cur->bc_ops = &xfs_bmbt_ops; cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); cur->bc_private.b.ip = ip; @@ -871,7 +912,6 @@ xfs_bmbt_maxrecs( */ int xfs_bmdr_maxrecs( - struct xfs_mount *mp, int blocklen, int leaf) { @@ -881,3 +921,47 @@ xfs_bmdr_maxrecs( return blocklen / sizeof(xfs_bmdr_rec_t); return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t)); } + +/* + * Change the owner of a btree format fork fo the inode passed in. Change it to + * the owner of that is passed in so that we can change owners before or after + * we switch forks between inodes. The operation that the caller is doing will + * determine whether is needs to change owner before or after the switch. + * + * For demand paged transactional modification, the fork switch should be done + * after reading in all the blocks, modifying them and pinning them in the + * transaction. For modification when the buffers are already pinned in memory, + * the fork switch can be done before changing the owner as we won't need to + * validate the owner until the btree buffers are unpinned and writes can occur + * again. + * + * For recovery based ownership change, there is no transactional context and + * so a buffer list must be supplied so that we can record the buffers that we + * modified for the caller to issue IO on. + */ +int +xfs_bmbt_change_owner( + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + xfs_ino_t new_owner, + struct list_head *buffer_list) +{ + struct xfs_btree_cur *cur; + int error; + + ASSERT(tp || buffer_list); + ASSERT(!(tp && buffer_list)); + if (whichfork == XFS_DATA_FORK) + ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE); + else + ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE); + + cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); + if (!cur) + return ENOMEM; + + error = xfs_btree_change_owner(cur, new_owner, buffer_list); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + return error; +} diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 88469ca0869..819a8a4dee9 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -18,8 +18,6 @@ #ifndef __XFS_BMAP_BTREE_H__ #define __XFS_BMAP_BTREE_H__ -#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ - struct xfs_btree_cur; struct xfs_btree_block; struct xfs_mount; @@ -27,85 +25,6 @@ struct xfs_inode; struct xfs_trans; /* - * Bmap root header, on-disk form only. - */ -typedef struct xfs_bmdr_block { - __be16 bb_level; /* 0 is a leaf */ - __be16 bb_numrecs; /* current # of data records */ -} xfs_bmdr_block_t; - -/* - * Bmap btree record and extent descriptor. - * l0:63 is an extent flag (value 1 indicates non-normal). - * l0:9-62 are startoff. - * l0:0-8 and l1:21-63 are startblock. - * l1:0-20 are blockcount. - */ -#define BMBT_EXNTFLAG_BITLEN 1 -#define BMBT_STARTOFF_BITLEN 54 -#define BMBT_STARTBLOCK_BITLEN 52 -#define BMBT_BLOCKCOUNT_BITLEN 21 - -typedef struct xfs_bmbt_rec { - __be64 l0, l1; -} xfs_bmbt_rec_t; - -typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ -typedef xfs_bmbt_rec_t xfs_bmdr_rec_t; - -typedef struct xfs_bmbt_rec_host { - __uint64_t l0, l1; -} xfs_bmbt_rec_host_t; - -/* - * Values and macros for delayed-allocation startblock fields. - */ -#define STARTBLOCKVALBITS 17 -#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20) -#define DSTARTBLOCKMASKBITS (15 + 20) -#define STARTBLOCKMASK \ - (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) -#define DSTARTBLOCKMASK \ - (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) - -static inline int isnullstartblock(xfs_fsblock_t x) -{ - return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK; -} - -static inline int isnulldstartblock(xfs_dfsbno_t x) -{ - return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK; -} - -static inline xfs_fsblock_t nullstartblock(int k) -{ - ASSERT(k < (1 << STARTBLOCKVALBITS)); - return STARTBLOCKMASK | (k); -} - -static inline xfs_filblks_t startblockval(xfs_fsblock_t x) -{ - return (xfs_filblks_t)((x) & ~STARTBLOCKMASK); -} - -/* - * Possible extent formats. - */ -typedef enum { - XFS_EXTFMT_NOSTATE = 0, - XFS_EXTFMT_HASSTATE -} xfs_exntfmt_t; - -/* - * Possible extent states. - */ -typedef enum { - XFS_EXT_NORM, XFS_EXT_UNWRITTEN, - XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID -} xfs_exntst_t; - -/* * Extent state and extent format macros. */ #define XFS_EXTFMT_INODE(x) \ @@ -114,32 +33,11 @@ typedef enum { #define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN) /* - * Incore version of above. - */ -typedef struct xfs_bmbt_irec -{ - xfs_fileoff_t br_startoff; /* starting file offset */ - xfs_fsblock_t br_startblock; /* starting block number */ - xfs_filblks_t br_blockcount; /* number of blocks */ - xfs_exntst_t br_state; /* extent state */ -} xfs_bmbt_irec_t; - -/* - * Key structure for non-leaf levels of the tree. - */ -typedef struct xfs_bmbt_key { - __be64 br_startoff; /* starting file offset */ -} xfs_bmbt_key_t, xfs_bmdr_key_t; - -/* btree pointer type */ -typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; - -/* * Btree block header size depends on a superblock flag. - * - * (not quite yet, but soon) */ -#define XFS_BMBT_BLOCK_LEN(mp) XFS_BTREE_LBLOCK_LEN +#define XFS_BMBT_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN) #define XFS_BMBT_REC_ADDR(mp, block, index) \ ((xfs_bmbt_rec_t *) \ @@ -186,15 +84,17 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; #define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \ XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0)) -#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \ - (int)(XFS_BTREE_LBLOCK_LEN + \ +#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \ + (int)(XFS_BMBT_BLOCK_LEN(mp) + \ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) -#define XFS_BMAP_BROOT_SPACE(bb) \ - (XFS_BMAP_BROOT_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs))) +#define XFS_BMAP_BROOT_SPACE(mp, bb) \ + (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs))) #define XFS_BMDR_SPACE_CALC(nrecs) \ (int)(sizeof(xfs_bmdr_block_t) + \ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) +#define XFS_BMAP_BMDR_SPACE(bb) \ + (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs))) /* * Maximum number of bmap btree levels. @@ -204,7 +104,7 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* * Prototypes for xfs_bmap.c to call. */ -extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int, +extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int, struct xfs_btree_block *, int); extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r); @@ -230,12 +130,14 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, xfs_bmdr_block_t *, int); extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); -extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); +extern int xfs_bmdr_maxrecs(int blocklen, int leaf); extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); +extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, xfs_ino_t new_owner, + struct list_head *buffer_list); + extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); -extern const struct xfs_buf_ops xfs_bmbt_buf_ops; - #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c new file mode 100644 index 00000000000..64731ef3324 --- /dev/null +++ b/fs/xfs/xfs_bmap_util.c @@ -0,0 +1,1897 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2012 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_trans.h" +#include "xfs_extfree_item.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_log.h" +#include "xfs_dinode.h" + +/* Kernel only BMAP related definitions and functions */ + +/* + * Convert the given file system block to a disk block. We have to treat it + * differently based on whether the file is a real time file or not, because the + * bmap code does. + */ +xfs_daddr_t +xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) +{ + return (XFS_IS_REALTIME_INODE(ip) ? \ + (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ + XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); +} + +/* + * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi + * caller. Frees all the extents that need freeing, which must be done + * last due to locking considerations. We never free any extents in + * the first transaction. + * + * Return 1 if the given transaction was committed and a new one + * started, and 0 otherwise in the committed parameter. + */ +int /* error */ +xfs_bmap_finish( + xfs_trans_t **tp, /* transaction pointer addr */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *committed) /* xact committed or not */ +{ + xfs_efd_log_item_t *efd; /* extent free data */ + xfs_efi_log_item_t *efi; /* extent free intention */ + int error; /* error return value */ + xfs_bmap_free_item_t *free; /* free extent item */ + struct xfs_trans_res tres; /* new log reservation */ + xfs_mount_t *mp; /* filesystem mount structure */ + xfs_bmap_free_item_t *next; /* next item on free list */ + xfs_trans_t *ntp; /* new transaction pointer */ + + ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); + if (flist->xbf_count == 0) { + *committed = 0; + return 0; + } + ntp = *tp; + efi = xfs_trans_get_efi(ntp, flist->xbf_count); + for (free = flist->xbf_first; free; free = free->xbfi_next) + xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, + free->xbfi_blockcount); + + tres.tr_logres = ntp->t_log_res; + tres.tr_logcount = ntp->t_log_count; + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + ntp = xfs_trans_dup(*tp); + error = xfs_trans_commit(*tp, 0); + *tp = ntp; + *committed = 1; + /* + * We have a new transaction, so we should return committed=1, + * even though we're returning an error. + */ + if (error) + return error; + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(ntp->t_ticket); + + error = xfs_trans_reserve(ntp, &tres, 0, 0); + if (error) + return error; + efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); + for (free = flist->xbf_first; free != NULL; free = next) { + next = free->xbfi_next; + if ((error = xfs_free_extent(ntp, free->xbfi_startblock, + free->xbfi_blockcount))) { + /* + * The bmap free list will be cleaned up at a + * higher level. The EFI will be canceled when + * this transaction is aborted. + * Need to force shutdown here to make sure it + * happens, since this transaction may not be + * dirty yet. + */ + mp = ntp->t_mountp; + if (!XFS_FORCED_SHUTDOWN(mp)) + xfs_force_shutdown(mp, + (error == EFSCORRUPTED) ? + SHUTDOWN_CORRUPT_INCORE : + SHUTDOWN_META_IO_ERROR); + return error; + } + xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, + free->xbfi_blockcount); + xfs_bmap_del_free(flist, NULL, free); + } + return 0; +} + +int +xfs_bmap_rtalloc( + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ +{ + xfs_alloctype_t atype = 0; /* type for allocation routines */ + int error; /* error return value */ + xfs_mount_t *mp; /* mount point structure */ + xfs_extlen_t prod = 0; /* product factor for allocators */ + xfs_extlen_t ralen = 0; /* realtime allocation length */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_rtblock_t rtb; + + mp = ap->ip->i_mount; + align = xfs_get_extsz_hint(ap->ip); + prod = align / mp->m_sb.sb_rextsize; + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, + align, 1, ap->eof, 0, + ap->conv, &ap->offset, &ap->length); + if (error) + return error; + ASSERT(ap->length); + ASSERT(ap->length % mp->m_sb.sb_rextsize == 0); + + /* + * If the offset & length are not perfectly aligned + * then kill prod, it will just get us in trouble. + */ + if (do_mod(ap->offset, align) || ap->length % align) + prod = 1; + /* + * Set ralen to be the actual requested length in rtextents. + */ + ralen = ap->length / mp->m_sb.sb_rextsize; + /* + * If the old value was close enough to MAXEXTLEN that + * we rounded up to it, cut it back so it's valid again. + * Note that if it's a really large request (bigger than + * MAXEXTLEN), we don't hear about that number, and can't + * adjust the starting point to match it. + */ + if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) + ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; + + /* + * Lock out other modifications to the RT bitmap inode. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + + /* + * If it's an allocation to an empty file at offset 0, + * pick an extent that will space things out in the rt area. + */ + if (ap->eof && ap->offset == 0) { + xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ + + error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); + if (error) + return error; + ap->blkno = rtx * mp->m_sb.sb_rextsize; + } else { + ap->blkno = 0; + } + + xfs_bmap_adjacent(ap); + + /* + * Realtime allocation, done through xfs_rtallocate_extent. + */ + atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; + do_div(ap->blkno, mp->m_sb.sb_rextsize); + rtb = ap->blkno; + ap->length = ralen; + if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length, + &ralen, atype, ap->wasdel, prod, &rtb))) + return error; + if (rtb == NULLFSBLOCK && prod > 1 && + (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, + ap->length, &ralen, atype, + ap->wasdel, 1, &rtb))) + return error; + ap->blkno = rtb; + if (ap->blkno != NULLFSBLOCK) { + ap->blkno *= mp->m_sb.sb_rextsize; + ralen *= mp->m_sb.sb_rextsize; + ap->length = ralen; + ap->ip->i_d.di_nblocks += ralen; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + if (ap->wasdel) + ap->ip->i_delayed_blks -= ralen; + /* + * Adjust the disk quota also. This was reserved + * earlier. + */ + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, + ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : + XFS_TRANS_DQ_RTBCOUNT, (long) ralen); + } else { + ap->length = 0; + } + return 0; +} + +/* + * Check if the endoff is outside the last extent. If so the caller will grow + * the allocation to a stripe unit boundary. All offsets are considered outside + * the end of file for an empty fork, so 1 is returned in *eof in that case. + */ +int +xfs_bmap_eof( + struct xfs_inode *ip, + xfs_fileoff_t endoff, + int whichfork, + int *eof) +{ + struct xfs_bmbt_irec rec; + int error; + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof); + if (error || *eof) + return error; + + *eof = endoff >= rec.br_startoff + rec.br_blockcount; + return 0; +} + +/* + * Extent tree block counting routines. + */ + +/* + * Count leaf blocks given a range of extent records. + */ +STATIC void +xfs_bmap_count_leaves( + xfs_ifork_t *ifp, + xfs_extnum_t idx, + int numrecs, + int *count) +{ + int b; + + for (b = 0; b < numrecs; b++) { + xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); + *count += xfs_bmbt_get_blockcount(frp); + } +} + +/* + * Count leaf blocks given a range of extent records originally + * in btree format. + */ +STATIC void +xfs_bmap_disk_count_leaves( + struct xfs_mount *mp, + struct xfs_btree_block *block, + int numrecs, + int *count) +{ + int b; + xfs_bmbt_rec_t *frp; + + for (b = 1; b <= numrecs; b++) { + frp = XFS_BMBT_REC_ADDR(mp, block, b); + *count += xfs_bmbt_disk_get_blockcount(frp); + } +} + +/* + * Recursively walks each level of a btree + * to count total fsblocks in use. + */ +STATIC int /* error */ +xfs_bmap_count_tree( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fsblock_t blockno, /* file system block number */ + int levelin, /* level in btree */ + int *count) /* Count of blocks */ +{ + int error; + xfs_buf_t *bp, *nbp; + int level = levelin; + __be64 *pp; + xfs_fsblock_t bno = blockno; + xfs_fsblock_t nextbno; + struct xfs_btree_block *block, *nextblock; + int numrecs; + + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + block = XFS_BUF_TO_BLOCK(bp); + + if (--level) { + /* Not at node above leaves, count this level of nodes */ + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + while (nextbno != NULLFSBLOCK) { + error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + nextblock = XFS_BUF_TO_BLOCK(nbp); + nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib); + xfs_trans_brelse(tp, nbp); + } + + /* Dive to the next level */ + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + if (unlikely((error = + xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { + xfs_trans_brelse(tp, bp); + XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + xfs_trans_brelse(tp, bp); + } else { + /* count all level 1 nodes and their leaves */ + for (;;) { + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + numrecs = be16_to_cpu(block->bb_numrecs); + xfs_bmap_disk_count_leaves(mp, block, numrecs, count); + xfs_trans_brelse(tp, bp); + if (nextbno == NULLFSBLOCK) + break; + bno = nextbno; + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + block = XFS_BUF_TO_BLOCK(bp); + } + } + return 0; +} + +/* + * Count fsblocks of the given fork. + */ +int /* error */ +xfs_bmap_count_blocks( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork, /* data or attr fork */ + int *count) /* out: count of blocks */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { + xfs_bmap_count_leaves(ifp, 0, + ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), + count); + return 0; + } + + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + block = ifp->if_broot; + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { + XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, + mp); + return XFS_ERROR(EFSCORRUPTED); + } + + return 0; +} + +/* + * returns 1 for success, 0 if we failed to map the extent. + */ +STATIC int +xfs_getbmapx_fix_eof_hole( + xfs_inode_t *ip, /* xfs incore inode pointer */ + struct getbmapx *out, /* output structure */ + int prealloced, /* this is a file with + * preallocated data space */ + __int64_t end, /* last block requested */ + xfs_fsblock_t startblock) +{ + __int64_t fixlen; + xfs_mount_t *mp; /* file system mount point */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last extent pointer */ + xfs_fileoff_t fileblock; + + if (startblock == HOLESTARTBLOCK) { + mp = ip->i_mount; + out->bmv_block = -1; + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip))); + fixlen -= out->bmv_offset; + if (prealloced && out->bmv_offset + out->bmv_length == end) { + /* Came to hole at EOF. Trim it. */ + if (fixlen <= 0) + return 0; + out->bmv_length = fixlen; + } + } else { + if (startblock == DELAYSTARTBLOCK) + out->bmv_block = -2; + else + out->bmv_block = xfs_fsb_to_db(ip, startblock); + fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && + (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1)) + out->bmv_oflags |= BMV_OF_LAST; + } + + return 1; +} + +/* + * Get inode's extents as described in bmv, and format for output. + * Calls formatter to fill the user's buffer until all extents + * are mapped, until the passed-in bmv->bmv_count slots have + * been filled, or until the formatter short-circuits the loop, + * if it is tracking filled-in extents on its own. + */ +int /* error code */ +xfs_getbmap( + xfs_inode_t *ip, + struct getbmapx *bmv, /* user bmap structure */ + xfs_bmap_format_t formatter, /* format to user */ + void *arg) /* formatter arg */ +{ + __int64_t bmvend; /* last block requested */ + int error = 0; /* return value */ + __int64_t fixlen; /* length for -1 case */ + int i; /* extent number */ + int lock; /* lock state */ + xfs_bmbt_irec_t *map; /* buffer for user's data */ + xfs_mount_t *mp; /* file system mount point */ + int nex; /* # of user extents can do */ + int nexleft; /* # of user extents left */ + int subnex; /* # of bmapi's can do */ + int nmap; /* number of map entries */ + struct getbmapx *out; /* output structure */ + int whichfork; /* data or attr fork */ + int prealloced; /* this is a file with + * preallocated data space */ + int iflags; /* interface flags */ + int bmapi_flags; /* flags for xfs_bmapi */ + int cur_ext = 0; + + mp = ip->i_mount; + iflags = bmv->bmv_iflags; + whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; + + if (whichfork == XFS_ATTR_FORK) { + if (XFS_IFORK_Q(ip)) { + if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && + ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EINVAL); + } else if (unlikely( + ip->i_d.di_aformat != 0 && + ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) { + XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + + prealloced = 0; + fixlen = 1LL << 32; + } else { + if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE && + ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EINVAL); + + if (xfs_get_extsz_hint(ip) || + ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ + prealloced = 1; + fixlen = mp->m_super->s_maxbytes; + } else { + prealloced = 0; + fixlen = XFS_ISIZE(ip); + } + } + + if (bmv->bmv_length == -1) { + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen)); + bmv->bmv_length = + max_t(__int64_t, fixlen - bmv->bmv_offset, 0); + } else if (bmv->bmv_length == 0) { + bmv->bmv_entries = 0; + return 0; + } else if (bmv->bmv_length < 0) { + return XFS_ERROR(EINVAL); + } + + nex = bmv->bmv_count - 1; + if (nex <= 0) + return XFS_ERROR(EINVAL); + bmvend = bmv->bmv_offset + bmv->bmv_length; + + + if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) + return XFS_ERROR(ENOMEM); + out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0); + if (!out) + return XFS_ERROR(ENOMEM); + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (whichfork == XFS_DATA_FORK) { + if (!(iflags & BMV_IF_DELALLOC) && + (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) { + error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (error) + goto out_unlock_iolock; + + /* + * Even after flushing the inode, there can still be + * delalloc blocks on the inode beyond EOF due to + * speculative preallocation. These are not removed + * until the release function is called or the inode + * is inactivated. Hence we cannot assert here that + * ip->i_delayed_blks == 0. + */ + } + + lock = xfs_ilock_data_map_shared(ip); + } else { + lock = xfs_ilock_attr_map_shared(ip); + } + + /* + * Don't let nex be bigger than the number of extents + * we can have assuming alternating holes and real extents. + */ + if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) + nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; + + bmapi_flags = xfs_bmapi_aflag(whichfork); + if (!(iflags & BMV_IF_PREALLOC)) + bmapi_flags |= XFS_BMAPI_IGSTATE; + + /* + * Allocate enough space to handle "subnex" maps at a time. + */ + error = ENOMEM; + subnex = 16; + map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); + if (!map) + goto out_unlock_ilock; + + bmv->bmv_entries = 0; + + if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 && + (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) { + error = 0; + goto out_free_map; + } + + nexleft = nex; + + do { + nmap = (nexleft > subnex) ? subnex : nexleft; + error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), + XFS_BB_TO_FSB(mp, bmv->bmv_length), + map, &nmap, bmapi_flags); + if (error) + goto out_free_map; + ASSERT(nmap <= subnex); + + for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { + out[cur_ext].bmv_oflags = 0; + if (map[i].br_state == XFS_EXT_UNWRITTEN) + out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; + else if (map[i].br_startblock == DELAYSTARTBLOCK) + out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC; + out[cur_ext].bmv_offset = + XFS_FSB_TO_BB(mp, map[i].br_startoff); + out[cur_ext].bmv_length = + XFS_FSB_TO_BB(mp, map[i].br_blockcount); + out[cur_ext].bmv_unused1 = 0; + out[cur_ext].bmv_unused2 = 0; + + /* + * delayed allocation extents that start beyond EOF can + * occur due to speculative EOF allocation when the + * delalloc extent is larger than the largest freespace + * extent at conversion time. These extents cannot be + * converted by data writeback, so can exist here even + * if we are not supposed to be finding delalloc + * extents. + */ + if (map[i].br_startblock == DELAYSTARTBLOCK && + map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) + ASSERT((iflags & BMV_IF_DELALLOC) != 0); + + if (map[i].br_startblock == HOLESTARTBLOCK && + whichfork == XFS_ATTR_FORK) { + /* came to the end of attribute fork */ + out[cur_ext].bmv_oflags |= BMV_OF_LAST; + goto out_free_map; + } + + if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext], + prealloced, bmvend, + map[i].br_startblock)) + goto out_free_map; + + bmv->bmv_offset = + out[cur_ext].bmv_offset + + out[cur_ext].bmv_length; + bmv->bmv_length = + max_t(__int64_t, 0, bmvend - bmv->bmv_offset); + + /* + * In case we don't want to return the hole, + * don't increase cur_ext so that we can reuse + * it in the next loop. + */ + if ((iflags & BMV_IF_NO_HOLES) && + map[i].br_startblock == HOLESTARTBLOCK) { + memset(&out[cur_ext], 0, sizeof(out[cur_ext])); + continue; + } + + nexleft--; + bmv->bmv_entries++; + cur_ext++; + } + } while (nmap && nexleft && bmv->bmv_length); + + out_free_map: + kmem_free(map); + out_unlock_ilock: + xfs_iunlock(ip, lock); + out_unlock_iolock: + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + for (i = 0; i < cur_ext; i++) { + int full = 0; /* user array is full */ + + /* format results & advance arg */ + error = formatter(&arg, &out[i], &full); + if (error || full) + break; + } + + kmem_free(out); + return error; +} + +/* + * dead simple method of punching delalyed allocation blocks from a range in + * the inode. Walks a block at a time so will be slow, but is only executed in + * rare error cases so the overhead is not critical. This will always punch out + * both the start and end blocks, even if the ranges only partially overlap + * them, so it is up to the caller to ensure that partial blocks are not + * passed in. + */ +int +xfs_bmap_punch_delalloc_range( + struct xfs_inode *ip, + xfs_fileoff_t start_fsb, + xfs_fileoff_t length) +{ + xfs_fileoff_t remaining = length; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + do { + int done; + xfs_bmbt_irec_t imap; + int nimaps = 1; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + + /* + * Map the range first and check that it is a delalloc extent + * before trying to unmap the range. Otherwise we will be + * trying to remove a real extent (which requires a + * transaction) or a hole, which is probably a bad idea... + */ + error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps, + XFS_BMAPI_ENTIRE); + + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_alert(ip->i_mount, + "Failed delalloc mapping lookup ino %lld fsb %lld.", + ip->i_ino, start_fsb); + } + break; + } + if (!nimaps) { + /* nothing there */ + goto next_block; + } + if (imap.br_startblock != DELAYSTARTBLOCK) { + /* been converted, ignore */ + goto next_block; + } + WARN_ON(imap.br_blockcount == 0); + + /* + * Note: while we initialise the firstblock/flist pair, they + * should never be used because blocks should never be + * allocated or freed for a delalloc extent and hence we need + * don't cancel or finish them after the xfs_bunmapi() call. + */ + xfs_bmap_init(&flist, &firstblock); + error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, + &flist, &done); + if (error) + break; + + ASSERT(!flist.xbf_count && !flist.xbf_first); +next_block: + start_fsb++; + remaining--; + } while(remaining > 0); + + return error; +} + +/* + * Test whether it is appropriate to check an inode for and free post EOF + * blocks. The 'force' parameter determines whether we should also consider + * regular files that are marked preallocated or append-only. + */ +bool +xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) +{ + /* prealloc/delalloc exists only on regular files */ + if (!S_ISREG(ip->i_d.di_mode)) + return false; + + /* + * Zero sized files with no cached pages and delalloc blocks will not + * have speculative prealloc/delalloc blocks to remove. + */ + if (VFS_I(ip)->i_size == 0 && + VN_CACHED(VFS_I(ip)) == 0 && + ip->i_delayed_blks == 0) + return false; + + /* If we haven't read in the extent list, then don't do it now. */ + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) + return false; + + /* + * Do not free real preallocated or append-only files unless the file + * has delalloc blocks and we are forced to remove them. + */ + if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) + if (!force || ip->i_delayed_blks == 0) + return false; + + return true; +} + +/* + * This is called by xfs_inactive to free any blocks beyond eof + * when the link count isn't zero and by xfs_dm_punch_hole() when + * punching a hole to EOF. + */ +int +xfs_free_eofblocks( + xfs_mount_t *mp, + xfs_inode_t *ip, + bool need_iolock) +{ + xfs_trans_t *tp; + int error; + xfs_fileoff_t end_fsb; + xfs_fileoff_t last_fsb; + xfs_filblks_t map_len; + int nimaps; + xfs_bmbt_irec_t imap; + + /* + * Figure out if there are any blocks beyond the end + * of the file. If not, then there is nothing to do. + */ + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); + last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + if (last_fsb <= end_fsb) + return 0; + map_len = last_fsb - end_fsb; + + nimaps = 1; + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!error && (nimaps != 0) && + (imap.br_startblock != HOLESTARTBLOCK || + ip->i_delayed_blks)) { + /* + * Attach the dquots to the inode up front. + */ + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + /* + * There are blocks after the end of file. + * Free them up now by truncating the file to + * its current size. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + + if (need_iolock) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + xfs_trans_cancel(tp, 0); + return EAGAIN; + } + } + + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Do not update the on-disk file size. If we update the + * on-disk file size and then the system crashes before the + * contents of the file are flushed to disk then the files + * may be full of holes (ie NULL files bug). + */ + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, + XFS_ISIZE(ip)); + if (error) { + /* + * If we get an error at this point we simply don't + * bother truncating the file. + */ + xfs_trans_cancel(tp, + (XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT)); + } else { + error = xfs_trans_commit(tp, + XFS_TRANS_RELEASE_LOG_RES); + if (!error) + xfs_inode_clear_eofblocks_tag(ip); + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } + return error; +} + +int +xfs_alloc_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len, + int alloc_type) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_off_t count; + xfs_filblks_t allocated_fsb; + xfs_filblks_t allocatesize_fsb; + xfs_extlen_t extsz, temp; + xfs_fileoff_t startoffset_fsb; + xfs_fsblock_t firstfsb; + int nimaps; + int quota_flag; + int rt; + xfs_trans_t *tp; + xfs_bmbt_irec_t imaps[1], *imapp; + xfs_bmap_free_t free_list; + uint qblocks, resblks, resrtextents; + int committed; + int error; + + trace_xfs_alloc_file_space(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + if (len <= 0) + return XFS_ERROR(EINVAL); + + rt = XFS_IS_REALTIME_INODE(ip); + extsz = xfs_get_extsz_hint(ip); + + count = len; + imapp = &imaps[0]; + nimaps = 1; + startoffset_fsb = XFS_B_TO_FSBT(mp, offset); + allocatesize_fsb = XFS_B_TO_FSB(mp, count); + + /* + * Allocate file space until done or until there is an error + */ + while (allocatesize_fsb && !error) { + xfs_fileoff_t s, e; + + /* + * Determine space reservations for data/realtime. + */ + if (unlikely(extsz)) { + s = startoffset_fsb; + do_div(s, extsz); + s *= extsz; + e = startoffset_fsb + allocatesize_fsb; + if ((temp = do_mod(startoffset_fsb, extsz))) + e += temp; + if ((temp = do_mod(e, extsz))) + e += extsz - temp; + } else { + s = 0; + e = allocatesize_fsb; + } + + /* + * The transaction reservation is limited to a 32-bit block + * count, hence we need to limit the number of blocks we are + * trying to reserve to avoid an overflow. We can't allocate + * more than @nimaps extents, and an extent is limited on disk + * to MAXEXTLEN (21 bits), so use that to enforce the limit. + */ + resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); + if (unlikely(rt)) { + resrtextents = qblocks = resblks; + resrtextents /= mp->m_sb.sb_rextsize; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + quota_flag = XFS_QMOPT_RES_RTBLKS; + } else { + resrtextents = 0; + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); + quota_flag = XFS_QMOPT_RES_REGBLKS; + } + + /* + * Allocate and setup the transaction. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + resblks, resrtextents); + /* + * Check for running out of space + */ + if (error) { + /* + * Free the transaction structure. + */ + ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + break; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, + 0, quota_flag); + if (error) + goto error1; + + xfs_trans_ijoin(tp, ip, 0); + + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bmapi_write(tp, ip, startoffset_fsb, + allocatesize_fsb, alloc_type, &firstfsb, + 0, imapp, &nimaps, &free_list); + if (error) { + goto error0; + } + + /* + * Complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error0; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) { + break; + } + + allocated_fsb = imapp->br_blockcount; + + if (nimaps == 0) { + error = XFS_ERROR(ENOSPC); + break; + } + + startoffset_fsb += allocated_fsb; + allocatesize_fsb -= allocated_fsb; + } + + return error; + +error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ + xfs_bmap_cancel(&free_list); + xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); + +error1: /* Just cancel transaction */ + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * Zero file bytes between startoff and endoff inclusive. + * The iolock is held exclusive and no blocks are buffered. + * + * This function is used by xfs_free_file_space() to zero + * partial blocks when the range to free is not block aligned. + * When unreserving space with boundaries that are not block + * aligned we round up the start and round down the end + * boundaries and then use this function to zero the parts of + * the blocks that got dropped during the rounding. + */ +STATIC int +xfs_zero_remaining_bytes( + xfs_inode_t *ip, + xfs_off_t startoff, + xfs_off_t endoff) +{ + xfs_bmbt_irec_t imap; + xfs_fileoff_t offset_fsb; + xfs_off_t lastoffset; + xfs_off_t offset; + xfs_buf_t *bp; + xfs_mount_t *mp = ip->i_mount; + int nimap; + int error = 0; + + /* + * Avoid doing I/O beyond eof - it's not necessary + * since nothing can read beyond eof. The space will + * be zeroed when the file is extended anyway. + */ + if (startoff >= XFS_ISIZE(ip)) + return 0; + + if (endoff > XFS_ISIZE(ip)) + endoff = XFS_ISIZE(ip); + + bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp, + BTOBB(mp->m_sb.sb_blocksize), 0); + if (!bp) + return XFS_ERROR(ENOMEM); + + xfs_buf_unlock(bp); + + for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { + uint lock_mode; + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + nimap = 1; + + lock_mode = xfs_ilock_data_map_shared(ip); + error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); + xfs_iunlock(ip, lock_mode); + + if (error || nimap < 1) + break; + ASSERT(imap.br_blockcount >= 1); + ASSERT(imap.br_startoff == offset_fsb); + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; + if (lastoffset > endoff) + lastoffset = endoff; + if (imap.br_startblock == HOLESTARTBLOCK) + continue; + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + if (imap.br_state == XFS_EXT_UNWRITTEN) + continue; + XFS_BUF_UNDONE(bp); + XFS_BUF_UNWRITE(bp); + XFS_BUF_READ(bp); + XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + break; + } + xfs_buf_iorequest(bp); + error = xfs_buf_iowait(bp); + if (error) { + xfs_buf_ioerror_alert(bp, + "xfs_zero_remaining_bytes(read)"); + break; + } + memset(bp->b_addr + + (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), + 0, lastoffset - offset + 1); + XFS_BUF_UNDONE(bp); + XFS_BUF_UNREAD(bp); + XFS_BUF_WRITE(bp); + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + break; + } + xfs_buf_iorequest(bp); + error = xfs_buf_iowait(bp); + if (error) { + xfs_buf_ioerror_alert(bp, + "xfs_zero_remaining_bytes(write)"); + break; + } + } + xfs_buf_free(bp); + return error; +} + +int +xfs_free_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + int committed; + int done; + xfs_fileoff_t endoffset_fsb; + int error; + xfs_fsblock_t firstfsb; + xfs_bmap_free_t free_list; + xfs_bmbt_irec_t imap; + xfs_off_t ioffset; + xfs_extlen_t mod=0; + xfs_mount_t *mp; + int nimap; + uint resblks; + xfs_off_t rounding; + int rt; + xfs_fileoff_t startoffset_fsb; + xfs_trans_t *tp; + + mp = ip->i_mount; + + trace_xfs_free_file_space(ip); + + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + error = 0; + if (len <= 0) /* if nothing being freed */ + return error; + rt = XFS_IS_REALTIME_INODE(ip); + startoffset_fsb = XFS_B_TO_FSB(mp, offset); + endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); + + /* wait for the completion of any pending DIOs */ + inode_dio_wait(VFS_I(ip)); + + rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + ioffset = offset & ~(rounding - 1); + error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ioffset, -1); + if (error) + goto out; + truncate_pagecache_range(VFS_I(ip), ioffset, -1); + + /* + * Need to zero the stuff we're not freeing, on disk. + * If it's a realtime file & can't use unwritten extents then we + * actually need to zero the extent edges. Otherwise xfs_bunmapi + * will take care of it for us. + */ + if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + nimap = 1; + error = xfs_bmapi_read(ip, startoffset_fsb, 1, + &imap, &nimap, 0); + if (error) + goto out; + ASSERT(nimap == 0 || nimap == 1); + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + xfs_daddr_t block; + + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + block = imap.br_startblock; + mod = do_div(block, mp->m_sb.sb_rextsize); + if (mod) + startoffset_fsb += mp->m_sb.sb_rextsize - mod; + } + nimap = 1; + error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1, + &imap, &nimap, 0); + if (error) + goto out; + ASSERT(nimap == 0 || nimap == 1); + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + mod++; + if (mod && (mod != mp->m_sb.sb_rextsize)) + endoffset_fsb -= mod; + } + } + if ((done = (endoffset_fsb <= startoffset_fsb))) + /* + * One contiguous piece to clear + */ + error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1); + else { + /* + * Some full blocks, possibly two pieces to clear + */ + if (offset < XFS_FSB_TO_B(mp, startoffset_fsb)) + error = xfs_zero_remaining_bytes(ip, offset, + XFS_FSB_TO_B(mp, startoffset_fsb) - 1); + if (!error && + XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len) + error = xfs_zero_remaining_bytes(ip, + XFS_FSB_TO_B(mp, endoffset_fsb), + offset + len - 1); + } + + /* + * free file space until done or until there is an error + */ + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + while (!error && !done) { + + /* + * allocate and setup the transaction. Allow this + * transaction to dip into the reserve blocks to ensure + * the freeing of the space succeeds at ENOSPC. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); + + /* + * check for running out of space + */ + if (error) { + /* + * Free the transaction structure. + */ + ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + break; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, + ip->i_udquot, ip->i_gdquot, ip->i_pdquot, + resblks, 0, XFS_QMOPT_RES_REGBLKS); + if (error) + goto error1; + + xfs_trans_ijoin(tp, ip, 0); + + /* + * issue the bunmapi() call to free the blocks + */ + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bunmapi(tp, ip, startoffset_fsb, + endoffset_fsb - startoffset_fsb, + 0, 2, &firstfsb, &free_list, &done); + if (error) { + goto error0; + } + + /* + * complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error0; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + out: + return error; + + error0: + xfs_bmap_cancel(&free_list); + error1: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out; +} + + +int +xfs_zero_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + struct xfs_mount *mp = ip->i_mount; + uint granularity; + xfs_off_t start_boundary; + xfs_off_t end_boundary; + int error; + + trace_xfs_zero_file_space(ip); + + granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + + /* + * Round the range of extents we are going to convert inwards. If the + * offset is aligned, then it doesn't get changed so we zero from the + * start of the block offset points to. + */ + start_boundary = round_up(offset, granularity); + end_boundary = round_down(offset + len, granularity); + + ASSERT(start_boundary >= offset); + ASSERT(end_boundary <= offset + len); + + if (start_boundary < end_boundary - 1) { + /* + * punch out delayed allocation blocks and the page cache over + * the conversion range + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, + XFS_B_TO_FSBT(mp, start_boundary), + XFS_B_TO_FSB(mp, end_boundary - start_boundary)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + truncate_pagecache_range(VFS_I(ip), start_boundary, + end_boundary - 1); + + /* convert the blocks */ + error = xfs_alloc_file_space(ip, start_boundary, + end_boundary - start_boundary - 1, + XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT); + if (error) + goto out; + + /* We've handled the interior of the range, now for the edges */ + if (start_boundary != offset) { + error = xfs_iozero(ip, offset, start_boundary - offset); + if (error) + goto out; + } + + if (end_boundary != offset + len) + error = xfs_iozero(ip, end_boundary, + offset + len - end_boundary); + + } else { + /* + * It's either a sub-granularity range or the range spanned lies + * partially across two adjacent blocks. + */ + error = xfs_iozero(ip, offset, len); + } + +out: + return error; + +} + +/* + * xfs_collapse_file_space() + * This routine frees disk space and shift extent for the given file. + * The first thing we do is to free data blocks in the specified range + * by calling xfs_free_file_space(). It would also sync dirty data + * and invalidate page cache over the region on which collapse range + * is working. And Shift extent records to the left to cover a hole. + * RETURNS: + * 0 on success + * errno on error + * + */ +int +xfs_collapse_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + int done = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + xfs_extnum_t current_ext = 0; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + int committed; + xfs_fileoff_t start_fsb; + xfs_fileoff_t shift_fsb; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + + trace_xfs_collapse_file_space(ip); + + start_fsb = XFS_B_TO_FSB(mp, offset + len); + shift_fsb = XFS_B_TO_FSB(mp, len); + + error = xfs_free_file_space(ip, offset, len); + if (error) + return error; + + while (!error && !done) { + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + /* + * We would need to reserve permanent block for transaction. + * This will come into picture when after shifting extent into + * hole we found that adjacent extents can be merged which + * may lead to freeing of a block during record update. + */ + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); + if (error) { + xfs_trans_cancel(tp, 0); + break; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, + ip->i_gdquot, ip->i_pdquot, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, + XFS_QMOPT_RES_REGBLKS); + if (error) + goto out; + + xfs_trans_ijoin(tp, ip, 0); + + xfs_bmap_init(&free_list, &first_block); + + /* + * We are using the write transaction in which max 2 bmbt + * updates are allowed + */ + error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb, + shift_fsb, ¤t_ext, + &first_block, &free_list, + XFS_BMAP_MAX_SHIFT_EXTENTS); + if (error) + goto out; + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + return error; + +out: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * We need to check that the format of the data fork in the temporary inode is + * valid for the target inode before doing the swap. This is not a problem with + * attr1 because of the fixed fork offset, but attr2 has a dynamically sized + * data fork depending on the space the attribute fork is taking so we can get + * invalid formats on the target inode. + * + * E.g. target has space for 7 extents in extent format, temp inode only has + * space for 6. If we defragment down to 7 extents, then the tmp format is a + * btree, but when swapped it needs to be in extent format. Hence we can't just + * blindly swap data forks on attr2 filesystems. + * + * Note that we check the swap in both directions so that we don't end up with + * a corrupt temporary inode, either. + * + * Note that fixing the way xfs_fsr sets up the attribute fork in the source + * inode will prevent this situation from occurring, so all we do here is + * reject and log the attempt. basically we are putting the responsibility on + * userspace to get this right. + */ +static int +xfs_swap_extents_check_format( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip) /* tmp inode */ +{ + + /* Should never get a local format */ + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || + tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) + return EINVAL; + + /* + * if the target inode has less extents that then temporary inode then + * why did userspace call us? + */ + if (ip->i_d.di_nextents < tip->i_d.di_nextents) + return EINVAL; + + /* + * if the target inode is in extent form and the temp inode is in btree + * form then we will end up with the target inode in the wrong format + * as we already know there are less extents in the temp inode. + */ + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + return EINVAL; + + /* Check temp in extent form to max in target */ + if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > + XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + return EINVAL; + + /* Check target in extent form to max in temp */ + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > + XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + return EINVAL; + + /* + * If we are in a btree format, check that the temp root block will fit + * in the target and that it has enough extents to be in btree format + * in the target. + * + * Note that we have to be careful to allow btree->extent conversions + * (a common defrag case) which will occur when the temp inode is in + * extent format... + */ + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (XFS_IFORK_BOFF(ip) && + XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip)) + return EINVAL; + if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= + XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + return EINVAL; + } + + /* Reciprocal target->temp btree format checks */ + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (XFS_IFORK_BOFF(tip) && + XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) + return EINVAL; + if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= + XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + return EINVAL; + } + + return 0; +} + +int +xfs_swap_extents( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip, /* tmp inode */ + xfs_swapext_t *sxp) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_trans_t *tp; + xfs_bstat_t *sbp = &sxp->sx_stat; + xfs_ifork_t *tempifp, *ifp, *tifp; + int src_log_flags, target_log_flags; + int error = 0; + int aforkblks = 0; + int taforkblks = 0; + __uint64_t tmp; + + tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); + if (!tempifp) { + error = XFS_ERROR(ENOMEM); + goto out; + } + + /* + * we have to do two separate lock calls here to keep lockdep + * happy. If we try to get all the locks in one call, lock will + * report false positives when we drop the ILOCK and regain them + * below. + */ + xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + + /* Verify that both files have the same format */ + if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + + /* Verify both files are either real-time or non-realtime */ + if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + + error = -filemap_write_and_wait(VFS_I(tip)->i_mapping); + if (error) + goto out_unlock; + truncate_pagecache_range(VFS_I(tip), 0, -1); + + /* Verify O_DIRECT for ftmp */ + if (VN_CACHED(VFS_I(tip)) != 0) { + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + + /* Verify all data are being swapped */ + if (sxp->sx_offset != 0 || + sxp->sx_length != ip->i_d.di_size || + sxp->sx_length != tip->i_d.di_size) { + error = XFS_ERROR(EFAULT); + goto out_unlock; + } + + trace_xfs_swap_extent_before(ip, 0); + trace_xfs_swap_extent_before(tip, 1); + + /* check inode formats now that data is flushed */ + error = xfs_swap_extents_check_format(ip, tip); + if (error) { + xfs_notice(mp, + "%s: inode 0x%llx format is incompatible for exchanging.", + __func__, ip->i_ino); + goto out_unlock; + } + + /* + * Compare the current change & modify times with that + * passed in. If they differ, we abort this swap. + * This is the mechanism used to ensure the calling + * process that the file was not changed out from + * under it. + */ + if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || + (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || + (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { + error = XFS_ERROR(EBUSY); + goto out_unlock; + } + + /* We need to fail if the file is memory mapped. Once we have tossed + * all existing pages, the page fault will have no option + * but to go to the filesystem for pages. By making the page fault call + * vop_read (or write in the case of autogrow) they block on the iolock + * until we have switched the extents. + */ + if (VN_MAPPED(VFS_I(ip))) { + error = XFS_ERROR(EBUSY); + goto out_unlock; + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL); + + /* + * There is a race condition here since we gave up the + * ilock. However, the data fork will not change since + * we have the iolock (locked for truncation too) so we + * are safe. We don't really care if non-io related + * fields change. + */ + truncate_pagecache_range(VFS_I(ip), 0, -1); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + xfs_iunlock(tip, XFS_IOLOCK_EXCL); + xfs_trans_cancel(tp, 0); + goto out; + } + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + + /* + * Count the number of extended attribute blocks + */ + if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && + (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); + if (error) + goto out_trans_cancel; + } + if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && + (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, + &taforkblks); + if (error) + goto out_trans_cancel; + } + + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + + /* + * Before we've swapped the forks, lets set the owners of the forks + * appropriately. We have to do this as we are demand paging the btree + * buffers, and so the validation done on read will expect the owner + * field to be correctly set. Once we change the owners, we can swap the + * inode forks. + * + * Note the trickiness in setting the log flags - we set the owner log + * flag on the opposite inode (i.e. the inode we are setting the new + * owner to be) because once we swap the forks and log that, log + * recovery is going to see the fork as owned by the swapped inode, + * not the pre-swapped inodes. + */ + src_log_flags = XFS_ILOG_CORE; + target_log_flags = XFS_ILOG_CORE; + if (ip->i_d.di_version == 3 && + ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + target_log_flags |= XFS_ILOG_DOWNER; + error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, + tip->i_ino, NULL); + if (error) + goto out_trans_cancel; + } + + if (tip->i_d.di_version == 3 && + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + src_log_flags |= XFS_ILOG_DOWNER; + error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, + ip->i_ino, NULL); + if (error) + goto out_trans_cancel; + } + + /* + * Swap the data forks of the inodes + */ + ifp = &ip->i_df; + tifp = &tip->i_df; + *tempifp = *ifp; /* struct copy */ + *ifp = *tifp; /* struct copy */ + *tifp = *tempifp; /* struct copy */ + + /* + * Fix the on-disk inode values + */ + tmp = (__uint64_t)ip->i_d.di_nblocks; + ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; + tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; + + tmp = (__uint64_t) ip->i_d.di_nextents; + ip->i_d.di_nextents = tip->i_d.di_nextents; + tip->i_d.di_nextents = tmp; + + tmp = (__uint64_t) ip->i_d.di_format; + ip->i_d.di_format = tip->i_d.di_format; + tip->i_d.di_format = tmp; + + /* + * The extents in the source inode could still contain speculative + * preallocation beyond EOF (e.g. the file is open but not modified + * while defrag is in progress). In that case, we need to copy over the + * number of delalloc blocks the data fork in the source inode is + * tracking beyond EOF so that when the fork is truncated away when the + * temporary inode is unlinked we don't underrun the i_delayed_blks + * counter on that inode. + */ + ASSERT(tip->i_delayed_blks == 0); + tip->i_delayed_blks = ip->i_delayed_blks; + ip->i_delayed_blks = 0; + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* If the extents fit in the inode, fix the + * pointer. Otherwise it's already NULL or + * pointing to the extent. + */ + if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { + ifp->if_u1.if_extents = + ifp->if_u2.if_inline_ext; + } + src_log_flags |= XFS_ILOG_DEXT; + break; + case XFS_DINODE_FMT_BTREE: + ASSERT(ip->i_d.di_version < 3 || + (src_log_flags & XFS_ILOG_DOWNER)); + src_log_flags |= XFS_ILOG_DBROOT; + break; + } + + switch (tip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* If the extents fit in the inode, fix the + * pointer. Otherwise it's already NULL or + * pointing to the extent. + */ + if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { + tifp->if_u1.if_extents = + tifp->if_u2.if_inline_ext; + } + target_log_flags |= XFS_ILOG_DEXT; + break; + case XFS_DINODE_FMT_BTREE: + target_log_flags |= XFS_ILOG_DBROOT; + ASSERT(tip->i_d.di_version < 3 || + (target_log_flags & XFS_ILOG_DOWNER)); + break; + } + + xfs_trans_log_inode(tp, ip, src_log_flags); + xfs_trans_log_inode(tp, tip, target_log_flags); + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + error = xfs_trans_commit(tp, 0); + + trace_xfs_swap_extent_after(ip, 0); + trace_xfs_swap_extent_after(tip, 1); +out: + kmem_free(tempifp); + return error; + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + goto out; + +out_trans_cancel: + xfs_trans_cancel(tp, 0); + goto out_unlock; +} diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h new file mode 100644 index 00000000000..2fdb72d2c90 --- /dev/null +++ b/fs/xfs/xfs_bmap_util.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BMAP_UTIL_H__ +#define __XFS_BMAP_UTIL_H__ + +/* Kernel only BMAP related definitions and functions */ + +struct xfs_bmbt_irec; +struct xfs_bmap_free_item; +struct xfs_ifork; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * Argument structure for xfs_bmap_alloc. + */ +struct xfs_bmalloca { + xfs_fsblock_t *firstblock; /* i/o first block allocated */ + struct xfs_bmap_free *flist; /* bmap freelist */ + struct xfs_trans *tp; /* transaction pointer */ + struct xfs_inode *ip; /* incore inode pointer */ + struct xfs_bmbt_irec prev; /* extent before the new one */ + struct xfs_bmbt_irec got; /* extent after, or delayed */ + + xfs_fileoff_t offset; /* offset in file filling in */ + xfs_extlen_t length; /* i/o length asked/allocated */ + xfs_fsblock_t blkno; /* starting block of new extent */ + + struct xfs_btree_cur *cur; /* btree cursor */ + xfs_extnum_t idx; /* current extent index */ + int nallocs;/* number of extents alloc'd */ + int logflags;/* flags for transaction logging */ + + xfs_extlen_t total; /* total blocks needed for xaction */ + xfs_extlen_t minlen; /* minimum allocation size (blocks) */ + xfs_extlen_t minleft; /* amount must be left after alloc */ + bool eof; /* set if allocating past last extent */ + bool wasdel; /* replacing a delayed allocation */ + bool userdata;/* set if is user data */ + bool aeof; /* allocated space at eof */ + bool conv; /* overwriting unwritten extents */ + int flags; + struct completion *done; + struct work_struct work; + int result; +}; + +int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, + int *committed); +int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); +int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, + int whichfork, int *eof); +int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, int *count); +int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, + xfs_fileoff_t start_fsb, xfs_fileoff_t length); + +/* bmap to userspace formatter - copy to user & advance pointer */ +typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); +int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, + xfs_bmap_format_t formatter, void *arg); + +/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */ +void xfs_bmap_del_free(struct xfs_bmap_free *flist, + struct xfs_bmap_free_item *prev, + struct xfs_bmap_free_item *free); +int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, + struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, + int rt, int eof, int delay, int convert, + xfs_fileoff_t *offp, xfs_extlen_t *lenp); +void xfs_bmap_adjacent(struct xfs_bmalloca *ap); +int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, struct xfs_bmbt_irec *rec, + int *is_empty); + +/* preallocation and hole punch interface */ +int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len, int alloc_type); +int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len); +int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len); +int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, + xfs_off_t len); + +/* EOF block manipulation functions */ +bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); +int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip, + bool need_iolock); + +int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, + struct xfs_swapext *sx); + +xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); + +#endif /* __XFS_BMAP_UTIL_H__ */ diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index db010408d70..cf893bc1e37 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -17,22 +17,23 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" +#include "xfs_buf_item.h" #include "xfs_btree.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_alloc.h" /* * Cursor allocation zone. @@ -42,9 +43,14 @@ kmem_zone_t *xfs_btree_cur_zone; /* * Btree magic numbers. */ -const __uint32_t xfs_magics[XFS_BTNUM_MAX] = { - XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC +static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { + { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, + XFS_FIBT_MAGIC }, + { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, + XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC } }; +#define xfs_btree_magic(cur) \ + xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] STATIC int /* error (0 or EFSCORRUPTED) */ @@ -54,30 +60,38 @@ xfs_btree_check_lblock( int level, /* level of the btree block */ struct xfs_buf *bp) /* buffer for block, if any */ { - int lblock_ok; /* block passes checks */ + int lblock_ok = 1; /* block passes checks */ struct xfs_mount *mp; /* file system mount point */ mp = cur->bc_mp; - lblock_ok = - be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + lblock_ok = lblock_ok && + uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) && + block->bb_u.l.bb_blkno == cpu_to_be64( + bp ? bp->b_bn : XFS_BUF_DADDR_NULL); + } + + lblock_ok = lblock_ok && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && be16_to_cpu(block->bb_level) == level && be16_to_cpu(block->bb_numrecs) <= cur->bc_ops->get_maxrecs(cur, level) && block->bb_u.l.bb_leftsib && (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_leftsib))) && + be64_to_cpu(block->bb_u.l.bb_leftsib))) && block->bb_u.l.bb_rightsib && (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_rightsib))); + be64_to_cpu(block->bb_u.l.bb_rightsib))); + if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK, XFS_RANDOM_BTREE_CHECK_LBLOCK))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW, - mp); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); } return 0; @@ -90,16 +104,26 @@ xfs_btree_check_sblock( int level, /* level of the btree block */ struct xfs_buf *bp) /* buffer containing block */ { + struct xfs_mount *mp; /* file system mount point */ struct xfs_buf *agbp; /* buffer for ag. freespace struct */ struct xfs_agf *agf; /* ag. freespace structure */ xfs_agblock_t agflen; /* native ag. freespace length */ - int sblock_ok; /* block passes checks */ + int sblock_ok = 1; /* block passes checks */ + mp = cur->bc_mp; agbp = cur->bc_private.a.agbp; agf = XFS_BUF_TO_AGF(agbp); agflen = be32_to_cpu(agf->agf_length); - sblock_ok = - be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + sblock_ok = sblock_ok && + uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) && + block->bb_u.s.bb_blkno == cpu_to_be64( + bp ? bp->b_bn : XFS_BUF_DADDR_NULL); + } + + sblock_ok = sblock_ok && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && be16_to_cpu(block->bb_level) == level && be16_to_cpu(block->bb_numrecs) <= cur->bc_ops->get_maxrecs(cur, level) && @@ -109,13 +133,13 @@ xfs_btree_check_sblock( (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) && block->bb_u.s.bb_rightsib; - if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp, + + if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK, XFS_RANDOM_BTREE_CHECK_SBLOCK))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR("xfs_btree_check_sblock", - XFS_ERRLEVEL_LOW, cur->bc_mp, block); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); } return 0; @@ -194,6 +218,70 @@ xfs_btree_check_ptr( #endif /* + * Calculate CRC on the whole btree block and stuff it into the + * long-form btree header. + * + * Prior to calculting the CRC, pull the LSN out of the buffer log item and put + * it into the buffer so recovery knows what the last modifcation was that made + * it to disk. + */ +void +xfs_btree_lblock_calc_crc( + struct xfs_buf *bp) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return; + if (bip) + block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); +} + +bool +xfs_btree_lblock_verify_crc( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); + + return true; +} + +/* + * Calculate CRC on the whole btree block and stuff it into the + * short-form btree header. + * + * Prior to calculting the CRC, pull the LSN out of the buffer log item and put + * it into the buffer so recovery knows what the last modifcation was that made + * it to disk. + */ +void +xfs_btree_sblock_calc_crc( + struct xfs_buf *bp) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return; + if (bip) + block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); +} + +bool +xfs_btree_sblock_verify_crc( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); + + return true; +} + +/* * Delete the btree cursor. */ void @@ -277,10 +365,8 @@ xfs_btree_dup_cursor( *ncur = NULL; return error; } - new->bc_bufs[i] = bp; - ASSERT(!xfs_buf_geterror(bp)); - } else - new->bc_bufs[i] = NULL; + } + new->bc_bufs[i] = bp; } *ncur = new; return 0; @@ -321,9 +407,14 @@ xfs_btree_dup_cursor( */ static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) { - return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? - XFS_BTREE_LBLOCK_LEN : - XFS_BTREE_SBLOCK_LEN; + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + return XFS_BTREE_LBLOCK_CRC_LEN; + return XFS_BTREE_LBLOCK_LEN; + } + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + return XFS_BTREE_SBLOCK_CRC_LEN; + return XFS_BTREE_SBLOCK_LEN; } /* @@ -417,7 +508,7 @@ xfs_btree_ptr_addr( } /* - * Get a the root block which is stored in the inode. + * Get the root block which is stored in the inode. * * For now this btree implementation assumes the btree root is always * stored in the if_broot field of an inode fork. @@ -463,14 +554,11 @@ xfs_btree_get_bufl( xfs_fsblock_t fsbno, /* file system block number */ uint lock) /* lock flags for get_buf */ { - xfs_buf_t *bp; /* buffer pointer (return value) */ xfs_daddr_t d; /* real disk block address */ ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); - ASSERT(!xfs_buf_geterror(bp)); - return bp; + return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); } /* @@ -485,15 +573,12 @@ xfs_btree_get_bufs( xfs_agblock_t agbno, /* allocation group block number */ uint lock) /* lock flags for get_buf */ { - xfs_buf_t *bp; /* buffer pointer (return value) */ xfs_daddr_t d; /* real disk block address */ ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); - ASSERT(!xfs_buf_geterror(bp)); - return bp; + return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); } /* @@ -633,7 +718,6 @@ xfs_btree_read_bufl( mp->m_bsize, lock, &bp, ops); if (error) return error; - ASSERT(!xfs_buf_geterror(bp)); if (bp) xfs_buf_set_ref(bp, refval); *bpp = bp; @@ -762,6 +846,41 @@ xfs_btree_readahead( return xfs_btree_readahead_sblock(cur, lr, block); } +STATIC xfs_daddr_t +xfs_btree_ptr_to_daddr( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO)); + + return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); + } else { + ASSERT(cur->bc_private.a.agno != NULLAGNUMBER); + ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK)); + + return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, + be32_to_cpu(ptr->s)); + } +} + +/* + * Readahead @count btree blocks at the given @ptr location. + * + * We don't need to care about long or short form btrees here as we have a + * method of converting the ptr directly to a daddr available to us. + */ +STATIC void +xfs_btree_readahead_ptr( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + xfs_extlen_t count) +{ + xfs_buf_readahead(cur->bc_mp->m_ddev_targp, + xfs_btree_ptr_to_daddr(cur, ptr), + cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops); +} + /* * Set the buffer for level "lev" in the cursor to bp, releasing * any previous buffer. @@ -863,43 +982,87 @@ xfs_btree_set_sibling( } void +xfs_btree_init_block_int( + struct xfs_mount *mp, + struct xfs_btree_block *buf, + xfs_daddr_t blkno, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags) +{ + buf->bb_magic = cpu_to_be32(magic); + buf->bb_level = cpu_to_be16(level); + buf->bb_numrecs = cpu_to_be16(numrecs); + + if (flags & XFS_BTREE_LONG_PTRS) { + buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + if (flags & XFS_BTREE_CRC_BLOCKS) { + buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); + buf->bb_u.l.bb_owner = cpu_to_be64(owner); + uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid); + buf->bb_u.l.bb_pad = 0; + buf->bb_u.l.bb_lsn = 0; + } + } else { + /* owner is a 32 bit value on short blocks */ + __u32 __owner = (__u32)owner; + + buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + if (flags & XFS_BTREE_CRC_BLOCKS) { + buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); + buf->bb_u.s.bb_owner = cpu_to_be32(__owner); + uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid); + buf->bb_u.s.bb_lsn = 0; + } + } +} + +void xfs_btree_init_block( struct xfs_mount *mp, struct xfs_buf *bp, __u32 magic, __u16 level, __u16 numrecs, + __u64 owner, unsigned int flags) { - struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp); - - new->bb_magic = cpu_to_be32(magic); - new->bb_level = cpu_to_be16(level); - new->bb_numrecs = cpu_to_be16(numrecs); - - if (flags & XFS_BTREE_LONG_PTRS) { - new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); - } else { - new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - } + xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + magic, level, numrecs, owner, flags); } STATIC void xfs_btree_init_block_cur( struct xfs_btree_cur *cur, + struct xfs_buf *bp, int level, - int numrecs, - struct xfs_buf *bp) + int numrecs) { - xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum], - level, numrecs, cur->bc_flags); + __u64 owner; + + /* + * we can pull the owner from the cursor right now as the different + * owners align directly with the pointer size of the btree. This may + * change in future, but is safe for current users of the generic btree + * code. + */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + owner = cur->bc_private.b.ip->i_ino; + else + owner = cur->bc_private.a.agno; + + xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + xfs_btree_magic(cur), level, numrecs, + owner, cur->bc_flags); } /* * Return true if ptr is the last record in the btree and - * we need to track updateÑ• to this record. The decision + * we need to track updates to this record. The decision * will be further refined in the update_lastrec method. */ STATIC int @@ -936,24 +1099,6 @@ xfs_btree_buf_to_ptr( } } -STATIC xfs_daddr_t -xfs_btree_ptr_to_daddr( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr) -{ - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO)); - - return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); - } else { - ASSERT(cur->bc_private.a.agno != NULLAGNUMBER); - ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK)); - - return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, - be32_to_cpu(ptr->s)); - } -} - STATIC void xfs_btree_set_refs( struct xfs_btree_cur *cur, @@ -965,6 +1110,7 @@ xfs_btree_set_refs( xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); break; case XFS_BTNUM_INO: + case XFS_BTNUM_FINO: xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); break; case XFS_BTNUM_BMAP: @@ -1009,7 +1155,6 @@ STATIC int xfs_btree_read_buf_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, - int level, int flags, struct xfs_btree_block **block, struct xfs_buf **bpp) @@ -1028,7 +1173,6 @@ xfs_btree_read_buf_block( if (error) return error; - ASSERT(!xfs_buf_geterror(*bpp)); xfs_btree_set_refs(cur, *bpp); *block = XFS_BUF_TO_BLOCK(*bpp); return 0; @@ -1147,6 +1291,7 @@ xfs_btree_log_keys( XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); if (bp) { + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_key_offset(cur, first), xfs_btree_key_offset(cur, last + 1) - 1); @@ -1171,6 +1316,7 @@ xfs_btree_log_recs( XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_rec_offset(cur, first), xfs_btree_rec_offset(cur, last + 1) - 1); @@ -1195,6 +1341,7 @@ xfs_btree_log_ptrs( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); int level = xfs_btree_get_level(block); + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_ptr_offset(cur, first, level), xfs_btree_ptr_offset(cur, last + 1, level) - 1); @@ -1223,7 +1370,12 @@ xfs_btree_log_block( offsetof(struct xfs_btree_block, bb_numrecs), offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib), offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib), - XFS_BTREE_SBLOCK_LEN + offsetof(struct xfs_btree_block, bb_u.s.bb_blkno), + offsetof(struct xfs_btree_block, bb_u.s.bb_lsn), + offsetof(struct xfs_btree_block, bb_u.s.bb_uuid), + offsetof(struct xfs_btree_block, bb_u.s.bb_owner), + offsetof(struct xfs_btree_block, bb_u.s.bb_crc), + XFS_BTREE_SBLOCK_CRC_LEN }; static const short loffsets[] = { /* table of offsets (long) */ offsetof(struct xfs_btree_block, bb_magic), @@ -1231,17 +1383,40 @@ xfs_btree_log_block( offsetof(struct xfs_btree_block, bb_numrecs), offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib), offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib), - XFS_BTREE_LBLOCK_LEN + offsetof(struct xfs_btree_block, bb_u.l.bb_blkno), + offsetof(struct xfs_btree_block, bb_u.l.bb_lsn), + offsetof(struct xfs_btree_block, bb_u.l.bb_uuid), + offsetof(struct xfs_btree_block, bb_u.l.bb_owner), + offsetof(struct xfs_btree_block, bb_u.l.bb_crc), + offsetof(struct xfs_btree_block, bb_u.l.bb_pad), + XFS_BTREE_LBLOCK_CRC_LEN }; XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_TRACE_ARGBI(cur, bp, fields); if (bp) { + int nbits; + + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + /* + * We don't log the CRC when updating a btree + * block but instead recreate it during log + * recovery. As the log buffers have checksums + * of their own this is safe and avoids logging a crc + * update in a lot of places. + */ + if (fields == XFS_BB_ALL_BITS) + fields = XFS_BB_ALL_BITS_CRC; + nbits = XFS_BB_NUM_BITS_CRC; + } else { + nbits = XFS_BB_NUM_BITS; + } xfs_btree_offsets(fields, (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? loffsets : soffsets, - XFS_BB_NUM_BITS, &first, &last); + nbits, &first, &last); + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, first, last); } else { xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, @@ -1336,8 +1511,8 @@ xfs_btree_increment( union xfs_btree_ptr *ptrp; ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); - error = xfs_btree_read_buf_block(cur, ptrp, --lev, - 0, &block, &bp); + --lev; + error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); if (error) goto error0; @@ -1435,8 +1610,8 @@ xfs_btree_decrement( union xfs_btree_ptr *ptrp; ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); - error = xfs_btree_read_buf_block(cur, ptrp, --lev, - 0, &block, &bp); + --lev; + error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); if (error) goto error0; xfs_btree_setbuf(cur, lev, bp); @@ -1486,7 +1661,7 @@ xfs_btree_lookup_get_block( return 0; } - error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp); + error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp); if (error) return error; @@ -1518,7 +1693,7 @@ xfs_lookup_get_search_key( /* * Lookup the record. The cursor is made to point to it, based on dir. - * Return 0 if can't find any such record, 1 for success. + * stat is set to 0 if can't find any such record, 1 for success. */ int /* error */ xfs_btree_lookup( @@ -1837,7 +2012,7 @@ xfs_btree_lshift( goto out0; /* Set up the left neighbor as "left". */ - error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp); + error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); if (error) goto error0; @@ -2021,7 +2196,7 @@ xfs_btree_rshift( goto out0; /* Set up the right neighbor as "right". */ - error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp); + error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); if (error) goto error0; @@ -2149,7 +2324,7 @@ error1: * record (to be inserted into parent). */ STATIC int /* error */ -xfs_btree_split( +__xfs_btree_split( struct xfs_btree_cur *cur, int level, union xfs_btree_ptr *ptrp, @@ -2191,7 +2366,7 @@ xfs_btree_split( xfs_btree_buf_to_ptr(cur, lbp, &lptr); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat); + error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat); if (error) goto error0; if (*stat == 0) @@ -2204,7 +2379,7 @@ xfs_btree_split( goto error0; /* Fill in the btree header for the new right block. */ - xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp); + xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0); /* * Split the entries between the old and the new block evenly. @@ -2289,7 +2464,7 @@ xfs_btree_split( * point back to right instead of to left. */ if (!xfs_btree_ptr_is_null(cur, &rrptr)) { - error = xfs_btree_read_buf_block(cur, &rrptr, level, + error = xfs_btree_read_buf_block(cur, &rrptr, 0, &rrblock, &rrbp); if (error) goto error0; @@ -2329,6 +2504,85 @@ error0: return error; } +struct xfs_btree_split_args { + struct xfs_btree_cur *cur; + int level; + union xfs_btree_ptr *ptrp; + union xfs_btree_key *key; + struct xfs_btree_cur **curp; + int *stat; /* success/failure */ + int result; + bool kswapd; /* allocation in kswapd context */ + struct completion *done; + struct work_struct work; +}; + +/* + * Stack switching interfaces for allocation + */ +static void +xfs_btree_split_worker( + struct work_struct *work) +{ + struct xfs_btree_split_args *args = container_of(work, + struct xfs_btree_split_args, work); + unsigned long pflags; + unsigned long new_pflags = PF_FSTRANS; + + /* + * we are in a transaction context here, but may also be doing work + * in kswapd context, and hence we may need to inherit that state + * temporarily to ensure that we don't block waiting for memory reclaim + * in any way. + */ + if (args->kswapd) + new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + + current_set_flags_nested(&pflags, new_pflags); + + args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, + args->key, args->curp, args->stat); + complete(args->done); + + current_restore_flags_nested(&pflags, new_pflags); +} + +/* + * BMBT split requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. For the other + * btree types, just call directly to avoid the context switch overhead here. + */ +STATIC int /* error */ +xfs_btree_split( + struct xfs_btree_cur *cur, + int level, + union xfs_btree_ptr *ptrp, + union xfs_btree_key *key, + struct xfs_btree_cur **curp, + int *stat) /* success/failure */ +{ + struct xfs_btree_split_args args; + DECLARE_COMPLETION_ONSTACK(done); + + if (cur->bc_btnum != XFS_BTNUM_BMAP) + return __xfs_btree_split(cur, level, ptrp, key, curp, stat); + + args.cur = cur; + args.level = level; + args.ptrp = ptrp; + args.key = key; + args.curp = curp; + args.stat = stat; + args.done = &done; + args.kswapd = current_is_kswapd(); + INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker); + queue_work(xfs_alloc_wq, &args.work); + wait_for_completion(&done); + destroy_work_on_stack(&args.work); + return args.result; +} + + /* * Copy the old inode root contents into a real block and make the * broot point to it. @@ -2364,7 +2618,7 @@ xfs_btree_new_iroot( pp = xfs_btree_ptr_addr(cur, 1, block); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat); + error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat); if (error) goto error0; if (*stat == 0) { @@ -2378,7 +2632,17 @@ xfs_btree_new_iroot( if (error) goto error0; + /* + * we can't just memcpy() the root in for CRC enabled btree blocks. + * In that case have to also ensure the blkno remains correct + */ memcpy(cblock, block, xfs_btree_block_len(cur)); + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn); + else + cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn); + } be16_add_cpu(&block->bb_level, 1); xfs_btree_set_numrecs(block, 1); @@ -2458,7 +2722,7 @@ xfs_btree_new_root( cur->bc_ops->init_ptr_from_cur(cur, &rptr); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat); + error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat); if (error) goto error0; if (*stat == 0) @@ -2493,8 +2757,7 @@ xfs_btree_new_root( lbp = bp; xfs_btree_buf_to_ptr(cur, lbp, &lptr); left = block; - error = xfs_btree_read_buf_block(cur, &rptr, - cur->bc_nlevels - 1, 0, &right, &rbp); + error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); if (error) goto error0; bp = rbp; @@ -2505,15 +2768,14 @@ xfs_btree_new_root( xfs_btree_buf_to_ptr(cur, rbp, &rptr); right = block; xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); - error = xfs_btree_read_buf_block(cur, &lptr, - cur->bc_nlevels - 1, 0, &left, &lbp); + error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); if (error) goto error0; bp = lbp; nptr = 2; } /* Fill in the new block's btree header and log it. */ - xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp); + xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && !xfs_btree_ptr_is_null(cur, &rptr)); @@ -2580,7 +2842,6 @@ xfs_btree_make_block_unfull( if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { /* A root block that can be made bigger. */ - xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork); } else { /* A root block that needs replacing */ @@ -3459,8 +3720,7 @@ xfs_btree_delrec( rptr = cptr; right = block; rbp = bp; - error = xfs_btree_read_buf_block(cur, &lptr, level, - 0, &left, &lbp); + error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); if (error) goto error0; @@ -3477,8 +3737,7 @@ xfs_btree_delrec( lptr = cptr; left = block; lbp = bp; - error = xfs_btree_read_buf_block(cur, &rptr, level, - 0, &right, &rbp); + error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); if (error) goto error0; @@ -3550,8 +3809,7 @@ xfs_btree_delrec( /* If there is a right sibling, point it to the remaining block. */ xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); if (!xfs_btree_ptr_is_null(cur, &cptr)) { - error = xfs_btree_read_buf_block(cur, &cptr, level, - 0, &rrblock, &rrbp); + error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp); if (error) goto error0; xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); @@ -3692,3 +3950,120 @@ xfs_btree_get_rec( *stat = 1; return 0; } + +/* + * Change the owner of a btree. + * + * The mechanism we use here is ordered buffer logging. Because we don't know + * how many buffers were are going to need to modify, we don't really want to + * have to make transaction reservations for the worst case of every buffer in a + * full size btree as that may be more space that we can fit in the log.... + * + * We do the btree walk in the most optimal manner possible - we have sibling + * pointers so we can just walk all the blocks on each level from left to right + * in a single pass, and then move to the next level and do the same. We can + * also do readahead on the sibling pointers to get IO moving more quickly, + * though for slow disks this is unlikely to make much difference to performance + * as the amount of CPU work we have to do before moving to the next block is + * relatively small. + * + * For each btree block that we load, modify the owner appropriately, set the + * buffer as an ordered buffer and log it appropriately. We need to ensure that + * we mark the region we change dirty so that if the buffer is relogged in + * a subsequent transaction the changes we make here as an ordered buffer are + * correctly relogged in that transaction. If we are in recovery context, then + * just queue the modified buffer as delayed write buffer so the transaction + * recovery completion writes the changes to disk. + */ +static int +xfs_btree_block_change_owner( + struct xfs_btree_cur *cur, + int level, + __uint64_t new_owner, + struct list_head *buffer_list) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + union xfs_btree_ptr rptr; + + /* do right sibling readahead */ + xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); + + /* modify the owner */ + block = xfs_btree_get_block(cur, level, &bp); + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + block->bb_u.l.bb_owner = cpu_to_be64(new_owner); + else + block->bb_u.s.bb_owner = cpu_to_be32(new_owner); + + /* + * If the block is a root block hosted in an inode, we might not have a + * buffer pointer here and we shouldn't attempt to log the change as the + * information is already held in the inode and discarded when the root + * block is formatted into the on-disk inode fork. We still change it, + * though, so everything is consistent in memory. + */ + if (bp) { + if (cur->bc_tp) { + xfs_trans_ordered_buf(cur->bc_tp, bp); + xfs_btree_log_block(cur, bp, XFS_BB_OWNER); + } else { + xfs_buf_delwri_queue(bp, buffer_list); + } + } else { + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(level == cur->bc_nlevels - 1); + } + + /* now read rh sibling block for next iteration */ + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &rptr)) + return ENOENT; + + return xfs_btree_lookup_get_block(cur, level, &rptr, &block); +} + +int +xfs_btree_change_owner( + struct xfs_btree_cur *cur, + __uint64_t new_owner, + struct list_head *buffer_list) +{ + union xfs_btree_ptr lptr; + int level; + struct xfs_btree_block *block = NULL; + int error = 0; + + cur->bc_ops->init_ptr_from_cur(cur, &lptr); + + /* for each level */ + for (level = cur->bc_nlevels - 1; level >= 0; level--) { + /* grab the left hand block */ + error = xfs_btree_lookup_get_block(cur, level, &lptr, &block); + if (error) + return error; + + /* readahead the left most block for the next level down */ + if (level > 0) { + union xfs_btree_ptr *ptr; + + ptr = xfs_btree_ptr_addr(cur, 1, block); + xfs_btree_readahead_ptr(cur, ptr, 1); + + /* save for the next iteration of the loop */ + lptr = *ptr; + } + + /* for each buffer in the level */ + do { + error = xfs_btree_block_change_owner(cur, level, + new_owner, + buffer_list); + } while (!error); + + if (error != ENOENT) + return error; + } + + return 0; +} diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index f932897194e..a04b69422f6 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -27,48 +27,6 @@ struct xfs_trans; extern kmem_zone_t *xfs_btree_cur_zone; /* - * This nonsense is to make -wlint happy. - */ -#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi) -#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi) -#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi) - -#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi) -#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi) -#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) -#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) - -/* - * Generic btree header. - * - * This is a combination of the actual format used on disk for short and long - * format btrees. The first three fields are shared by both format, but - * the pointers are different and should be used with care. - * - * To get the size of the actual short or long form headers please use - * the size macros below. Never use sizeof(xfs_btree_block). - */ -struct xfs_btree_block { - __be32 bb_magic; /* magic number for block type */ - __be16 bb_level; /* 0 is a leaf */ - __be16 bb_numrecs; /* current # of data records */ - union { - struct { - __be32 bb_leftsib; - __be32 bb_rightsib; - } s; /* short form pointers */ - struct { - __be64 bb_leftsib; - __be64 bb_rightsib; - } l; /* long form pointers */ - } bb_u; /* rest */ -}; - -#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ -#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ - - -/* * Generic key, ptr and record wrapper structures. * * These are disk format structures, and are converted where necessary @@ -94,20 +52,34 @@ union xfs_btree_rec { }; /* - * For logging record fields. + * This nonsense is to make -wlint happy. */ -#define XFS_BB_MAGIC 0x01 -#define XFS_BB_LEVEL 0x02 -#define XFS_BB_NUMRECS 0x04 -#define XFS_BB_LEFTSIB 0x08 -#define XFS_BB_RIGHTSIB 0x10 -#define XFS_BB_NUM_BITS 5 -#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) +#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi) +#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi) +#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi) + +#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi) +#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi) +#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) +#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) +#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi) /* - * Magic numbers for btree blocks. + * For logging record fields. */ -extern const __uint32_t xfs_magics[]; +#define XFS_BB_MAGIC (1 << 0) +#define XFS_BB_LEVEL (1 << 1) +#define XFS_BB_NUMRECS (1 << 2) +#define XFS_BB_LEFTSIB (1 << 3) +#define XFS_BB_RIGHTSIB (1 << 4) +#define XFS_BB_BLKNO (1 << 5) +#define XFS_BB_LSN (1 << 6) +#define XFS_BB_UUID (1 << 7) +#define XFS_BB_OWNER (1 << 8) +#define XFS_BB_NUM_BITS 5 +#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) +#define XFS_BB_NUM_BITS_CRC 9 +#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) /* * Generic stats interface @@ -121,6 +93,7 @@ do { \ case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \ case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \ + case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -134,6 +107,7 @@ do { \ case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \ case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \ + case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -158,7 +132,7 @@ struct xfs_btree_ops { int (*alloc_block)(struct xfs_btree_cur *cur, union xfs_btree_ptr *start_bno, union xfs_btree_ptr *new_bno, - int length, int *stat); + int *stat); int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); /* update last record information */ @@ -190,7 +164,7 @@ struct xfs_btree_ops { const struct xfs_buf_ops *buf_ops; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, union xfs_btree_key *k1, @@ -256,6 +230,7 @@ typedef struct xfs_btree_cur #define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */ #define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */ #define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ +#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ #define XFS_BTREE_NOERROR 0 @@ -393,8 +368,20 @@ xfs_btree_init_block( __u32 magic, __u16 level, __u16 numrecs, + __u64 owner, unsigned int flags); +void +xfs_btree_init_block_int( + struct xfs_mount *mp, + struct xfs_btree_block *buf, + xfs_daddr_t blkno, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags); + /* * Common btree core entry points. */ @@ -406,6 +393,16 @@ int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); int xfs_btree_insert(struct xfs_btree_cur *, int *); int xfs_btree_delete(struct xfs_btree_cur *, int *); int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); +int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner, + struct list_head *buffer_list); + +/* + * btree block CRC helpers + */ +void xfs_btree_lblock_calc_crc(struct xfs_buf *); +bool xfs_btree_lblock_verify_crc(struct xfs_buf *); +void xfs_btree_sblock_calc_crc(struct xfs_buf *); +bool xfs_btree_sblock_verify_crc(struct xfs_buf *); /* * Internal btree helpers also used by xfs_bmap.c. diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 56d1614760c..7a34a1ae655 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -34,11 +34,13 @@ #include <linux/backing-dev.h> #include <linux/freezer.h> +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trace.h" +#include "xfs_log.h" static kmem_zone_t *xfs_buf_zone; @@ -80,54 +82,6 @@ xfs_buf_vmap_len( } /* - * xfs_buf_lru_add - add a buffer to the LRU. - * - * The LRU takes a new reference to the buffer so that it will only be freed - * once the shrinker takes the buffer off the LRU. - */ -STATIC void -xfs_buf_lru_add( - struct xfs_buf *bp) -{ - struct xfs_buftarg *btp = bp->b_target; - - spin_lock(&btp->bt_lru_lock); - if (list_empty(&bp->b_lru)) { - atomic_inc(&bp->b_hold); - list_add_tail(&bp->b_lru, &btp->bt_lru); - btp->bt_lru_nr++; - bp->b_lru_flags &= ~_XBF_LRU_DISPOSE; - } - spin_unlock(&btp->bt_lru_lock); -} - -/* - * xfs_buf_lru_del - remove a buffer from the LRU - * - * The unlocked check is safe here because it only occurs when there are not - * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there - * to optimise the shrinker removing the buffer from the LRU and calling - * xfs_buf_free(). i.e. it removes an unnecessary round trip on the - * bt_lru_lock. - */ -STATIC void -xfs_buf_lru_del( - struct xfs_buf *bp) -{ - struct xfs_buftarg *btp = bp->b_target; - - if (list_empty(&bp->b_lru)) - return; - - spin_lock(&btp->bt_lru_lock); - if (!list_empty(&bp->b_lru)) { - list_del_init(&bp->b_lru); - btp->bt_lru_nr--; - } - spin_unlock(&btp->bt_lru_lock); -} - -/* * When we mark a buffer stale, we remove the buffer from the LRU and clear the * b_lru_ref count so that the buffer is freed immediately when the buffer * reference count falls to zero. If the buffer is already on the LRU, we need @@ -150,20 +104,14 @@ xfs_buf_stale( */ bp->b_flags &= ~_XBF_DELWRI_Q; - atomic_set(&(bp)->b_lru_ref, 0); - if (!list_empty(&bp->b_lru)) { - struct xfs_buftarg *btp = bp->b_target; + spin_lock(&bp->b_lock); + atomic_set(&bp->b_lru_ref, 0); + if (!(bp->b_state & XFS_BSTATE_DISPOSE) && + (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) + atomic_dec(&bp->b_hold); - spin_lock(&btp->bt_lru_lock); - if (!list_empty(&bp->b_lru) && - !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) { - list_del_init(&bp->b_lru); - btp->bt_lru_nr--; - atomic_dec(&bp->b_hold); - } - spin_unlock(&btp->bt_lru_lock); - } ASSERT(atomic_read(&bp->b_hold) >= 1); + spin_unlock(&bp->b_lock); } static int @@ -227,6 +175,7 @@ _xfs_buf_alloc( INIT_LIST_HEAD(&bp->b_list); RB_CLEAR_NODE(&bp->b_rbnode); sema_init(&bp->b_sema, 0); /* held, no waiters */ + spin_lock_init(&bp->b_lock); XB_SET_OWNER(bp); bp->b_target = target; bp->b_flags = flags; @@ -267,8 +216,7 @@ _xfs_buf_alloc( STATIC int _xfs_buf_get_pages( xfs_buf_t *bp, - int page_count, - xfs_buf_flags_t flags) + int page_count) { /* Make sure that we have a page list */ if (bp->b_pages == NULL) { @@ -303,7 +251,7 @@ _xfs_buf_free_pages( * Releases the specified buffer. * * The modification state of any associated pages is left unchanged. - * The buffer most not be on any hash - use xfs_buf_rele instead for + * The buffer must not be on any hash - use xfs_buf_rele instead for * hashed and refcounted buffers */ void @@ -381,7 +329,7 @@ use_alloc_page: end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT; page_count = end - start; - error = _xfs_buf_get_pages(bp, page_count, flags); + error = _xfs_buf_get_pages(bp, page_count); if (unlikely(error)) return error; @@ -447,7 +395,17 @@ _xfs_buf_map_pages( bp->b_addr = NULL; } else { int retried = 0; + unsigned noio_flag; + /* + * vm_map_ram() will allocate auxillary structures (e.g. + * pagetables) with GFP_KERNEL, yet we are likely to be under + * GFP_NOFS context here. Hence we need to tell memory reclaim + * that we are in such a context via PF_MEMALLOC_NOIO to prevent + * memory reclaim re-entering the filesystem here and + * potentially deadlocking. + */ + noio_flag = memalloc_noio_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, -1, PAGE_KERNEL); @@ -455,6 +413,7 @@ _xfs_buf_map_pages( break; vm_unmap_aliases(); } while (retried++ <= 1); + memalloc_noio_restore(noio_flag); if (!bp->b_addr) return -ENOMEM; @@ -487,6 +446,7 @@ _xfs_buf_find( struct rb_node *parent; xfs_buf_t *bp; xfs_daddr_t blkno = map[0].bm_bn; + xfs_daddr_t eofs; int numblks = 0; int i; @@ -495,8 +455,26 @@ _xfs_buf_find( numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(numbytes < (1 << btp->bt_sshift))); - ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); + ASSERT(!(numbytes < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); + + /* + * Corrupted block numbers can get through to here, unfortunately, so we + * have to check that the buffer falls within the filesystem bounds. + */ + eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); + if (blkno >= eofs) { + /* + * XXX (dgc): we should really be returning EFSCORRUPTED here, + * but none of the higher level infrastructure supports + * returning a specific error on buffer lookup failures. + */ + xfs_alert(btp->bt_mount, + "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", + __func__, blkno, eofs); + WARN_ON(1); + return NULL; + } /* get tree root */ pag = xfs_perag_get(btp->bt_mount, @@ -623,7 +601,7 @@ found: error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { xfs_warn(target->bt_mount, - "%s: failed to map pages\n", __func__); + "%s: failed to map pagesn", __func__); xfs_buf_relse(bp); return NULL; } @@ -730,7 +708,11 @@ xfs_buf_read_uncached( bp->b_flags |= XBF_READ; bp->b_ops = ops; - xfsbdstrat(target->bt_mount, bp); + if (XFS_FORCED_SHUTDOWN(target->bt_mount)) { + xfs_buf_relse(bp); + return NULL; + } + xfs_buf_iorequest(bp); xfs_buf_iowait(bp); return bp; } @@ -795,7 +777,7 @@ xfs_buf_associate_memory( bp->b_pages = NULL; bp->b_addr = mem; - rval = _xfs_buf_get_pages(bp, page_count, 0); + rval = _xfs_buf_get_pages(bp, page_count); if (rval) return rval; @@ -828,7 +810,7 @@ xfs_buf_get_uncached( goto fail; page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; - error = _xfs_buf_get_pages(bp, page_count, 0); + error = _xfs_buf_get_pages(bp, page_count); if (error) goto fail_free_buf; @@ -842,7 +824,7 @@ xfs_buf_get_uncached( error = _xfs_buf_map_pages(bp, 0); if (unlikely(error)) { xfs_warn(target->bt_mount, - "%s: failed to map pages\n", __func__); + "%s: failed to map pages", __func__); goto fail_free_mem; } @@ -897,12 +879,33 @@ xfs_buf_rele( ASSERT(atomic_read(&bp->b_hold) > 0); if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { - if (!(bp->b_flags & XBF_STALE) && - atomic_read(&bp->b_lru_ref)) { - xfs_buf_lru_add(bp); + spin_lock(&bp->b_lock); + if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { + /* + * If the buffer is added to the LRU take a new + * reference to the buffer for the LRU and clear the + * (now stale) dispose list state flag + */ + if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + bp->b_state &= ~XFS_BSTATE_DISPOSE; + atomic_inc(&bp->b_hold); + } + spin_unlock(&bp->b_lock); spin_unlock(&pag->pag_buf_lock); } else { - xfs_buf_lru_del(bp); + /* + * most of the time buffers will already be removed from + * the LRU, so optimise that case by checking for the + * XFS_BSTATE_DISPOSE flag indicating the last list the + * buffer was on was the disposal list + */ + if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { + list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + } else { + ASSERT(list_empty(&bp->b_lru)); + } + spin_unlock(&bp->b_lock); + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); spin_unlock(&pag->pag_buf_lock); @@ -933,8 +936,6 @@ xfs_buf_trylock( locked = down_trylock(&bp->b_sema) == 0; if (locked) XB_SET_OWNER(bp); - else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) - xfs_log_force(bp->b_target->bt_mount, 0); trace_xfs_buf_trylock(bp, _RET_IP_); return locked; @@ -1006,7 +1007,9 @@ xfs_buf_iodone_work( bool read = !!(bp->b_flags & XBF_READ); bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); - if (read && bp->b_ops) + + /* only validate buffers that were read without errors */ + if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE)) bp->b_ops->verify_read(bp); if (bp->b_iodone) @@ -1100,7 +1103,7 @@ xfs_bioerror( * This is meant for userdata errors; metadata bufs come with * iodone functions attached, so that we can track down errors. */ -STATIC int +int xfs_bioerror_relse( struct xfs_buf *bp) { @@ -1163,7 +1166,7 @@ xfs_bwrite( ASSERT(xfs_buf_islocked(bp)); bp->b_flags |= XBF_WRITE; - bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q); + bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL); xfs_bdstrat_cb(bp); @@ -1175,25 +1178,6 @@ xfs_bwrite( return error; } -/* - * Wrapper around bdstrat so that we can stop data from going to disk in case - * we are shutting down the filesystem. Typically user data goes thru this - * path; one of the exceptions is the superblock. - */ -void -xfsbdstrat( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - if (XFS_FORCED_SHUTDOWN(mp)) { - trace_xfs_bdstrat_shut(bp, _RET_IP_); - xfs_bioerror_relse(bp); - return; - } - - xfs_buf_iorequest(bp); -} - STATIC void _xfs_buf_ioend( xfs_buf_t *bp, @@ -1266,7 +1250,7 @@ next_chunk: bio = bio_alloc(GFP_NOIO, nr_pages); bio->bi_bdev = bp->b_target->bt_bdev; - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; @@ -1288,7 +1272,7 @@ next_chunk: total_nr_pages--; } - if (likely(bio->bi_size)) { + if (likely(bio->bi_iter.bi_size)) { if (xfs_buf_is_vmapped(bp)) { flush_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); @@ -1318,6 +1302,12 @@ _xfs_buf_ioapply( int size; int i; + /* + * Make sure we capture only current IO errors rather than stale errors + * left over from previous use of the buffer (e.g. failed readahead). + */ + bp->b_error = 0; + if (bp->b_flags & XBF_WRITE) { if (bp->b_flags & XBF_SYNCIO) rw = WRITE_SYNC; @@ -1381,21 +1371,29 @@ xfs_buf_iorequest( xfs_buf_wait_unpin(bp); xfs_buf_hold(bp); - /* Set the count to 1 initially, this will stop an I/O + /* + * Set the count to 1 initially, this will stop an I/O * completion callout which happens before we have started * all the I/O from calling xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); _xfs_buf_ioapply(bp); - _xfs_buf_ioend(bp, 1); + /* + * If _xfs_buf_ioapply failed, we'll get back here with + * only the reference we took above. _xfs_buf_ioend will + * drop it to zero, so we'd better not queue it for later, + * or we'll free it before it's done. + */ + _xfs_buf_ioend(bp, bp->b_error ? 0 : 1); xfs_buf_rele(bp); } /* * Waits for I/O to complete on the buffer supplied. It returns immediately if - * no I/O is pending or there is already a pending error on the buffer. It - * returns the I/O error code, if any, or 0 if there was no error. + * no I/O is pending or there is already a pending error on the buffer, in which + * case nothing will ever complete. It returns the I/O error code, if any, or + * 0 if there was no error. */ int xfs_buf_iowait( @@ -1476,81 +1474,127 @@ xfs_buf_iomove( * returned. These buffers will have an elevated hold count, so wait on those * while freeing all the buffers only held by the LRU. */ +static enum lru_status +xfs_buftarg_wait_rele( + struct list_head *item, + spinlock_t *lru_lock, + void *arg) + +{ + struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); + struct list_head *dispose = arg; + + if (atomic_read(&bp->b_hold) > 1) { + /* need to wait, so skip it this pass */ + trace_xfs_buf_wait_buftarg(bp, _RET_IP_); + return LRU_SKIP; + } + if (!spin_trylock(&bp->b_lock)) + return LRU_SKIP; + + /* + * clear the LRU reference count so the buffer doesn't get + * ignored in xfs_buf_rele(). + */ + atomic_set(&bp->b_lru_ref, 0); + bp->b_state |= XFS_BSTATE_DISPOSE; + list_move(item, dispose); + spin_unlock(&bp->b_lock); + return LRU_REMOVED; +} + void xfs_wait_buftarg( struct xfs_buftarg *btp) { - struct xfs_buf *bp; + LIST_HEAD(dispose); + int loop = 0; -restart: - spin_lock(&btp->bt_lru_lock); - while (!list_empty(&btp->bt_lru)) { - bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); - if (atomic_read(&bp->b_hold) > 1) { - spin_unlock(&btp->bt_lru_lock); - delay(100); - goto restart; + /* loop until there is nothing left on the lru list. */ + while (list_lru_count(&btp->bt_lru)) { + list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele, + &dispose, LONG_MAX); + + while (!list_empty(&dispose)) { + struct xfs_buf *bp; + bp = list_first_entry(&dispose, struct xfs_buf, b_lru); + list_del_init(&bp->b_lru); + if (bp->b_flags & XBF_WRITE_FAIL) { + xfs_alert(btp->bt_mount, +"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n" +"Please run xfs_repair to determine the extent of the problem.", + (long long)bp->b_bn); + } + xfs_buf_rele(bp); } - /* - * clear the LRU reference count so the buffer doesn't get - * ignored in xfs_buf_rele(). - */ - atomic_set(&bp->b_lru_ref, 0); - spin_unlock(&btp->bt_lru_lock); - xfs_buf_rele(bp); - spin_lock(&btp->bt_lru_lock); + if (loop++ != 0) + delay(100); } - spin_unlock(&btp->bt_lru_lock); } -int -xfs_buftarg_shrink( +static enum lru_status +xfs_buftarg_isolate( + struct list_head *item, + spinlock_t *lru_lock, + void *arg) +{ + struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); + struct list_head *dispose = arg; + + /* + * we are inverting the lru lock/bp->b_lock here, so use a trylock. + * If we fail to get the lock, just skip it. + */ + if (!spin_trylock(&bp->b_lock)) + return LRU_SKIP; + /* + * Decrement the b_lru_ref count unless the value is already + * zero. If the value is already zero, we need to reclaim the + * buffer, otherwise it gets another trip through the LRU. + */ + if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { + spin_unlock(&bp->b_lock); + return LRU_ROTATE; + } + + bp->b_state |= XFS_BSTATE_DISPOSE; + list_move(item, dispose); + spin_unlock(&bp->b_lock); + return LRU_REMOVED; +} + +static unsigned long +xfs_buftarg_shrink_scan( struct shrinker *shrink, struct shrink_control *sc) { struct xfs_buftarg *btp = container_of(shrink, struct xfs_buftarg, bt_shrinker); - struct xfs_buf *bp; - int nr_to_scan = sc->nr_to_scan; LIST_HEAD(dispose); + unsigned long freed; + unsigned long nr_to_scan = sc->nr_to_scan; - if (!nr_to_scan) - return btp->bt_lru_nr; - - spin_lock(&btp->bt_lru_lock); - while (!list_empty(&btp->bt_lru)) { - if (nr_to_scan-- <= 0) - break; - - bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); - - /* - * Decrement the b_lru_ref count unless the value is already - * zero. If the value is already zero, we need to reclaim the - * buffer, otherwise it gets another trip through the LRU. - */ - if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { - list_move_tail(&bp->b_lru, &btp->bt_lru); - continue; - } - - /* - * remove the buffer from the LRU now to avoid needing another - * lock round trip inside xfs_buf_rele(). - */ - list_move(&bp->b_lru, &dispose); - btp->bt_lru_nr--; - bp->b_lru_flags |= _XBF_LRU_DISPOSE; - } - spin_unlock(&btp->bt_lru_lock); + freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate, + &dispose, &nr_to_scan); while (!list_empty(&dispose)) { + struct xfs_buf *bp; bp = list_first_entry(&dispose, struct xfs_buf, b_lru); list_del_init(&bp->b_lru); xfs_buf_rele(bp); } - return btp->bt_lru_nr; + return freed; +} + +static unsigned long +xfs_buftarg_shrink_count( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_buftarg *btp = container_of(shrink, + struct xfs_buftarg, bt_shrinker); + return list_lru_count_node(&btp->bt_lru, sc->nid); } void @@ -1559,6 +1603,7 @@ xfs_free_buftarg( struct xfs_buftarg *btp) { unregister_shrinker(&btp->bt_shrinker); + list_lru_destroy(&btp->bt_lru); if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_blkdev_issue_flush(btp); @@ -1566,16 +1611,14 @@ xfs_free_buftarg( kmem_free(btp); } -STATIC int -xfs_setsize_buftarg_flags( +int +xfs_setsize_buftarg( xfs_buftarg_t *btp, - unsigned int blocksize, - unsigned int sectorsize, - int verbose) + unsigned int sectorsize) { - btp->bt_bsize = blocksize; - btp->bt_sshift = ffs(sectorsize) - 1; - btp->bt_smask = sectorsize - 1; + /* Set up metadata sector size info */ + btp->bt_meta_sectorsize = sectorsize; + btp->bt_meta_sectormask = sectorsize - 1; if (set_blocksize(btp->bt_bdev, sectorsize)) { char name[BDEVNAME_SIZE]; @@ -1583,47 +1626,39 @@ xfs_setsize_buftarg_flags( bdevname(btp->bt_bdev, name); xfs_warn(btp->bt_mount, - "Cannot set_blocksize to %u on device %s\n", + "Cannot set_blocksize to %u on device %s", sectorsize, name); return EINVAL; } + /* Set up device logical sector size mask */ + btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); + btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; + return 0; } /* - * When allocating the initial buffer target we have not yet - * read in the superblock, so don't know what sized sectors - * are being used is at this early stage. Play safe. + * When allocating the initial buffer target we have not yet + * read in the superblock, so don't know what sized sectors + * are being used at this early stage. Play safe. */ STATIC int xfs_setsize_buftarg_early( xfs_buftarg_t *btp, struct block_device *bdev) { - return xfs_setsize_buftarg_flags(btp, - PAGE_SIZE, bdev_logical_block_size(bdev), 0); -} - -int -xfs_setsize_buftarg( - xfs_buftarg_t *btp, - unsigned int blocksize, - unsigned int sectorsize) -{ - return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); + return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); } xfs_buftarg_t * xfs_alloc_buftarg( struct xfs_mount *mp, - struct block_device *bdev, - int external, - const char *fsname) + struct block_device *bdev) { xfs_buftarg_t *btp; - btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); + btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; @@ -1632,12 +1667,16 @@ xfs_alloc_buftarg( if (!btp->bt_bdi) goto error; - INIT_LIST_HEAD(&btp->bt_lru); - spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; - btp->bt_shrinker.shrink = xfs_buftarg_shrink; + + if (list_lru_init(&btp->bt_lru)) + goto error; + + btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; + btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; + btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; register_shrinker(&btp->bt_shrinker); return btp; @@ -1759,7 +1798,7 @@ __xfs_buf_delwri_submit( blk_start_plug(&plug); list_for_each_entry_safe(bp, n, io_list, b_list) { - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); bp->b_flags |= XBF_WRITE; if (!wait) { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 433a12ed7b1..3a7a5523d3d 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -25,6 +25,7 @@ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/uio.h> +#include <linux/list_lru.h> /* * Base types @@ -44,6 +45,7 @@ typedef enum { #define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ #define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ +#define XBF_WRITE_FAIL (1 << 24)/* async writes have failed on this buffer */ /* I/O hints for the BIO layer */ #define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ @@ -59,7 +61,6 @@ typedef enum { #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ #define _XBF_COMPOUND (1 << 23)/* compound buffer */ -#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */ typedef unsigned int xfs_buf_flags_t; @@ -70,6 +71,7 @@ typedef unsigned int xfs_buf_flags_t; { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ + { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ { XBF_SYNCIO, "SYNCIO" }, \ { XBF_FUA, "FUA" }, \ { XBF_FLUSH, "FLUSH" }, \ @@ -78,23 +80,40 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" }, \ - { _XBF_LRU_DISPOSE, "LRU_DISPOSE" } + { _XBF_COMPOUND, "COMPOUND" } + +/* + * Internal state flags. + */ +#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ + +/* + * The xfs_buftarg contains 2 notions of "sector size" - + * + * 1) The metadata sector size, which is the minimum unit and + * alignment of IO which will be performed by metadata operations. + * 2) The device logical sector size + * + * The first is specified at mkfs time, and is stored on-disk in the + * superblock's sb_sectsize. + * + * The latter is derived from the underlying device, and controls direct IO + * alignment constraints. + */ typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; struct backing_dev_info *bt_bdi; struct xfs_mount *bt_mount; - unsigned int bt_bsize; - unsigned int bt_sshift; - size_t bt_smask; + unsigned int bt_meta_sectorsize; + size_t bt_meta_sectormask; + size_t bt_logical_sectorsize; + size_t bt_logical_sectormask; /* LRU control structures */ struct shrinker bt_shrinker; - struct list_head bt_lru; - spinlock_t bt_lru_lock; - unsigned int bt_lru_nr; + struct list_lru bt_lru; } xfs_buftarg_t; struct xfs_buf; @@ -137,7 +156,8 @@ typedef struct xfs_buf { * bt_lru_lock and not by b_sema */ struct list_head b_lru; /* lru list */ - xfs_buf_flags_t b_lru_flags; /* internal lru status flags */ + spinlock_t b_lock; /* internal state lock */ + unsigned int b_state; /* internal state flags */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ @@ -266,9 +286,6 @@ extern void xfs_buf_unlock(xfs_buf_t *); /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); - -extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); - extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); @@ -279,10 +296,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, #define xfs_buf_zero(bp, off, len) \ xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) -static inline int xfs_buf_geterror(xfs_buf_t *bp) -{ - return bp ? bp->b_error : ENOMEM; -} +extern int xfs_bioerror_relse(struct xfs_buf *); /* Buffer Utility Routines */ extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); @@ -298,7 +312,8 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_ZEROFLAGS(bp) \ ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \ - XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) + XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \ + XBF_WRITE_FAIL)) void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) @@ -349,14 +364,28 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) xfs_buf_rele(bp); } +static inline int +xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset) +{ + return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + cksum_offset); +} + +static inline void +xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) +{ + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + cksum_offset); +} + /* * Handling of buftargs. */ extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, - struct block_device *, int, const char *); + struct block_device *); extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); -extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); +extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 77b09750e92..4654338b03f 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -17,17 +17,18 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_log.h" kmem_zone_t *xfs_buf_item_zone; @@ -37,110 +38,15 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_buf_log_item, bli_item); } +STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); -#ifdef XFS_TRANS_DEBUG -/* - * This function uses an alternate strategy for tracking the bytes - * that the user requests to be logged. This can then be used - * in conjunction with the bli_orig array in the buf log item to - * catch bugs in our callers' code. - * - * We also double check the bits set in xfs_buf_item_log using a - * simple algorithm to check that every byte is accounted for. - */ -STATIC void -xfs_buf_item_log_debug( - xfs_buf_log_item_t *bip, - uint first, - uint last) -{ - uint x; - uint byte; - uint nbytes; - uint chunk_num; - uint word_num; - uint bit_num; - uint bit_set; - uint *wordp; - - ASSERT(bip->bli_logged != NULL); - byte = first; - nbytes = last - first + 1; - bfset(bip->bli_logged, first, nbytes); - for (x = 0; x < nbytes; x++) { - chunk_num = byte >> XFS_BLF_SHIFT; - word_num = chunk_num >> BIT_TO_WORD_SHIFT; - bit_num = chunk_num & (NBWORD - 1); - wordp = &(bip->__bli_format.blf_data_map[word_num]); - bit_set = *wordp & (1 << bit_num); - ASSERT(bit_set); - byte++; - } -} - -/* - * This function is called when we flush something into a buffer without - * logging it. This happens for things like inodes which are logged - * separately from the buffer. - */ -void -xfs_buf_item_flush_log_debug( - xfs_buf_t *bp, - uint first, - uint last) -{ - xfs_buf_log_item_t *bip = bp->b_fspriv; - uint nbytes; - - if (bip == NULL || (bip->bli_item.li_type != XFS_LI_BUF)) - return; - - ASSERT(bip->bli_logged != NULL); - nbytes = last - first + 1; - bfset(bip->bli_logged, first, nbytes); -} - -/* - * This function is called to verify that our callers have logged - * all the bytes that they changed. - * - * It does this by comparing the original copy of the buffer stored in - * the buf log item's bli_orig array to the current copy of the buffer - * and ensuring that all bytes which mismatch are set in the bli_logged - * array of the buf log item. - */ -STATIC void -xfs_buf_item_log_check( - xfs_buf_log_item_t *bip) +static inline int +xfs_buf_log_format_size( + struct xfs_buf_log_format *blfp) { - char *orig; - char *buffer; - int x; - xfs_buf_t *bp; - - ASSERT(bip->bli_orig != NULL); - ASSERT(bip->bli_logged != NULL); - - bp = bip->bli_buf; - ASSERT(bp->b_length > 0); - ASSERT(bp->b_addr != NULL); - orig = bip->bli_orig; - buffer = bp->b_addr; - for (x = 0; x < BBTOB(bp->b_length); x++) { - if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { - xfs_emerg(bp->b_mount, - "%s: bip %x buffer %x orig %x index %d", - __func__, bip, bp, orig, x); - ASSERT(0); - } - } + return offsetof(struct xfs_buf_log_format, blf_data_map) + + (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); } -#else -#define xfs_buf_item_log_debug(x,y,z) -#define xfs_buf_item_log_check(x) -#endif - -STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); /* * This returns the number of log iovecs needed to log the @@ -152,25 +58,27 @@ STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); * * If the XFS_BLI_STALE flag has been set, then log nothing. */ -STATIC uint +STATIC void xfs_buf_item_size_segment( struct xfs_buf_log_item *bip, - struct xfs_buf_log_format *blfp) + struct xfs_buf_log_format *blfp, + int *nvecs, + int *nbytes) { struct xfs_buf *bp = bip->bli_buf; - uint nvecs; int next_bit; int last_bit; last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); if (last_bit == -1) - return 0; + return; /* * initial count for a dirty buffer is 2 vectors - the format structure * and the first dirty region. */ - nvecs = 2; + *nvecs += 2; + *nbytes += xfs_buf_log_format_size(blfp) + XFS_BLF_CHUNK; while (last_bit != -1) { /* @@ -190,18 +98,17 @@ xfs_buf_item_size_segment( break; } else if (next_bit != last_bit + 1) { last_bit = next_bit; - nvecs++; + (*nvecs)++; } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + XFS_BLF_CHUNK)) { last_bit = next_bit; - nvecs++; + (*nvecs)++; } else { last_bit++; } + *nbytes += XFS_BLF_CHUNK; } - - return nvecs; } /* @@ -221,12 +128,13 @@ xfs_buf_item_size_segment( * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log * format structures. */ -STATIC uint +STATIC void xfs_buf_item_size( - struct xfs_log_item *lip) + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); - uint nvecs; int i; ASSERT(atomic_read(&bip->bli_refcount) > 0); @@ -238,11 +146,26 @@ xfs_buf_item_size( */ trace_xfs_buf_item_size_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); - return bip->bli_format_count; + *nvecs += bip->bli_format_count; + for (i = 0; i < bip->bli_format_count; i++) { + *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]); + } + return; } ASSERT(bip->bli_flags & XFS_BLI_LOGGED); + if (bip->bli_flags & XFS_BLI_ORDERED) { + /* + * The buffer has been logged just to order it. + * It is not being included in the transaction + * commit, so no vectors are used at all. + */ + trace_xfs_buf_item_size_ordered(bip); + *nvecs = XFS_LOG_VEC_ORDERED; + return; + } + /* * the vector count is based on the number of buffer vectors we have * dirty bits in. This will only be greater than one when we have a @@ -252,30 +175,54 @@ xfs_buf_item_size( * count for the extra buf log format structure that will need to be * written. */ - nvecs = 0; for (i = 0; i < bip->bli_format_count; i++) { - nvecs += xfs_buf_item_size_segment(bip, &bip->bli_formats[i]); + xfs_buf_item_size_segment(bip, &bip->bli_formats[i], + nvecs, nbytes); } - trace_xfs_buf_item_size(bip); - return nvecs; } -static struct xfs_log_iovec * +static inline void +xfs_buf_item_copy_iovec( + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, + struct xfs_buf *bp, + uint offset, + int first_bit, + uint nbits) +{ + offset += first_bit * XFS_BLF_CHUNK; + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK, + xfs_buf_offset(bp, offset), + nbits * XFS_BLF_CHUNK); +} + +static inline bool +xfs_buf_item_straddle( + struct xfs_buf *bp, + uint offset, + int next_bit, + int last_bit) +{ + return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) != + (xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) + + XFS_BLF_CHUNK); +} + +static void xfs_buf_item_format_segment( struct xfs_buf_log_item *bip, - struct xfs_log_iovec *vecp, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, uint offset, struct xfs_buf_log_format *blfp) { struct xfs_buf *bp = bip->bli_buf; uint base_size; - uint nvecs; int first_bit; int last_bit; int next_bit; uint nbits; - uint buffer_offset; /* copy the flags across from the base format item */ blfp->blf_flags = bip->__bli_format.blf_flags; @@ -285,24 +232,19 @@ xfs_buf_item_format_segment( * the actual size of the dirty bitmap rather than the size of the in * memory structure. */ - base_size = offsetof(struct xfs_buf_log_format, blf_data_map) + - (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); + base_size = xfs_buf_log_format_size(blfp); - nvecs = 0; first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { /* * If the map is not be dirty in the transaction, mark * the size as zero and do not advance the vector pointer. */ - goto out; + return; } - vecp->i_addr = blfp; - vecp->i_len = base_size; - vecp->i_type = XLOG_REG_TYPE_BFORMAT; - vecp++; - nvecs = 1; + blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size); + blfp->blf_size = 1; if (bip->bli_flags & XFS_BLI_STALE) { /* @@ -312,13 +254,13 @@ xfs_buf_item_format_segment( */ trace_xfs_buf_item_format_stale(bip); ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); - goto out; + return; } + /* * Fill in an iovec for each set of contiguous chunks. */ - last_bit = first_bit; nbits = 1; for (;;) { @@ -331,47 +273,22 @@ xfs_buf_item_format_segment( next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, (uint)last_bit + 1); /* - * If we run out of bits fill in the last iovec and get - * out of the loop. - * Else if we start a new set of bits then fill in the - * iovec for the series we were looking at and start - * counting the bits in the new one. - * Else we're still in the same set of bits so just - * keep counting and scanning. + * If we run out of bits fill in the last iovec and get out of + * the loop. Else if we start a new set of bits then fill in + * the iovec for the series we were looking at and start + * counting the bits in the new one. Else we're still in the + * same set of bits so just keep counting and scanning. */ if (next_bit == -1) { - buffer_offset = offset + first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; - nvecs++; + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; break; - } else if (next_bit != last_bit + 1) { - buffer_offset = offset + first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; - nvecs++; - vecp++; - first_bit = next_bit; - last_bit = next_bit; - nbits = 1; - } else if (xfs_buf_offset(bp, offset + - (next_bit << XFS_BLF_SHIFT)) != - (xfs_buf_offset(bp, offset + - (last_bit << XFS_BLF_SHIFT)) + - XFS_BLF_CHUNK)) { - buffer_offset = offset + first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; -/* - * You would think we need to bump the nvecs here too, but we do not - * this number is used by recovery, and it gets confused by the boundary - * split here - * nvecs++; - */ - vecp++; + } else if (next_bit != last_bit + 1 || + xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) { + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; first_bit = next_bit; last_bit = next_bit; nbits = 1; @@ -380,9 +297,6 @@ xfs_buf_item_format_segment( nbits++; } } -out: - blfp->blf_size = nvecs; - return vecp; } /* @@ -394,10 +308,11 @@ out: STATIC void xfs_buf_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) + struct xfs_log_vec *lv) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; + struct xfs_log_iovec *vecp = NULL; uint offset = 0; int i; @@ -407,21 +322,39 @@ xfs_buf_item_format( /* * If it is an inode buffer, transfer the in-memory state to the - * format flags and clear the in-memory state. We do not transfer + * format flags and clear the in-memory state. + * + * For buffer based inode allocation, we do not transfer * this state if the inode buffer allocation has not yet been committed * to the log as setting the XFS_BLI_INODE_BUF flag will prevent * correct replay of the inode allocation. + * + * For icreate item based inode allocation, the buffers aren't written + * to the journal during allocation, and hence we should always tag the + * buffer as an inode buffer so that the correct unlinked list replay + * occurs during recovery. */ if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) || + !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; bip->bli_flags &= ~XFS_BLI_INODE_BUF; } + if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) == + XFS_BLI_ORDERED) { + /* + * The buffer has been logged just to order it. It is not being + * included in the transaction commit, so don't format it. + */ + trace_xfs_buf_item_format_ordered(bip); + return; + } + for (i = 0; i < bip->bli_format_count; i++) { - vecp = xfs_buf_item_format_segment(bip, vecp, offset, - &bip->bli_formats[i]); + xfs_buf_item_format_segment(bip, lv, &vecp, offset, + &bip->bli_formats[i]); offset += bp->b_maps[i].bm_len; } @@ -429,7 +362,6 @@ xfs_buf_item_format( * Check to make sure everything is consistent. */ trace_xfs_buf_item_format(bip); - xfs_buf_item_log_check(bip); } /* @@ -449,6 +381,7 @@ xfs_buf_item_pin( ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_ORDERED) || (bip->bli_flags & XFS_BLI_STALE)); trace_xfs_buf_item_pin(bip); @@ -562,6 +495,14 @@ xfs_buf_item_unpin( } } +/* + * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30 + * seconds so as to not spam logs too much on repeated detection of the same + * buffer being bad.. + */ + +DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10); + STATIC uint xfs_buf_item_push( struct xfs_log_item *lip, @@ -573,13 +514,31 @@ xfs_buf_item_push( if (xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; - if (!xfs_buf_trylock(bp)) + if (!xfs_buf_trylock(bp)) { + /* + * If we have just raced with a buffer being pinned and it has + * been marked stale, we could end up stalling until someone else + * issues a log force to unpin the stale buffer. Check for the + * race condition here so xfsaild recognizes the buffer is pinned + * and queues a log force to move it along. + */ + if (xfs_buf_ispinned(bp)) + return XFS_ITEM_PINNED; return XFS_ITEM_LOCKED; + } ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); trace_xfs_buf_item_push(bip); + /* has a previous flush failed due to IO errors? */ + if ((bp->b_flags & XBF_WRITE_FAIL) && + ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) { + xfs_warn(bp->b_target->bt_mount, +"Detected failing async write on buffer block 0x%llx. Retrying async write.\n", + (long long)bp->b_bn); + } + if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_unlock(bp); @@ -611,8 +570,9 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - int aborted, clean, i; - uint hold; + bool clean; + bool aborted; + int flags; /* Clear the buffer's association with this transaction. */ bp->b_transp = NULL; @@ -623,23 +583,21 @@ xfs_buf_item_unlock( * (cancelled) buffers at unpin time, but we'll never go through the * pin/unpin cycle if we abort inside commit. */ - aborted = (lip->li_flags & XFS_LI_ABORTED) != 0; - + aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false; /* - * Before possibly freeing the buf item, determine if we should - * release the buffer at the end of this routine. + * Before possibly freeing the buf item, copy the per-transaction state + * so we can reference it safely later after clearing it from the + * buffer log item. */ - hold = bip->bli_flags & XFS_BLI_HOLD; - - /* Clear the per transaction state. */ - bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD); + flags = bip->bli_flags; + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); /* * If the buf item is marked stale, then don't do anything. We'll * unlock the buffer and free the buf item when the buffer is unpinned * for the last time. */ - if (bip->bli_flags & XFS_BLI_STALE) { + if (flags & XFS_BLI_STALE) { trace_xfs_buf_item_unlock_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); if (!aborted) { @@ -652,22 +610,49 @@ xfs_buf_item_unlock( /* * If the buf item isn't tracking any data, free it, otherwise drop the - * reference we hold to it. + * reference we hold to it. If we are aborting the transaction, this may + * be the only reference to the buf item, so we free it anyway + * regardless of whether it is dirty or not. A dirty abort implies a + * shutdown, anyway. + * + * Ordered buffers are dirty but may have no recorded changes, so ensure + * we only release clean items here. */ - clean = 1; - for (i = 0; i < bip->bli_format_count; i++) { - if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, - bip->bli_formats[i].blf_map_size)) { - clean = 0; - break; + clean = (flags & XFS_BLI_DIRTY) ? false : true; + if (clean) { + int i; + for (i = 0; i < bip->bli_format_count; i++) { + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, + bip->bli_formats[i].blf_map_size)) { + clean = false; + break; + } } } - if (clean) - xfs_buf_item_relse(bp); - else - atomic_dec(&bip->bli_refcount); - if (!hold) + /* + * Clean buffers, by definition, cannot be in the AIL. However, aborted + * buffers may be dirty and hence in the AIL. Therefore if we are + * aborting a buffer and we've just taken the last refernce away, we + * have to check if it is in the AIL before freeing it. We need to free + * it in this case, because an aborted transaction has already shut the + * filesystem down and this is the last chance we will have to do so. + */ + if (atomic_dec_and_test(&bip->bli_refcount)) { + if (clean) + xfs_buf_item_relse(bp); + else if (aborted) { + ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + if (lip->li_flags & XFS_LI_IN_AIL) { + spin_lock(&lip->li_ailp->xa_lock); + xfs_trans_ail_delete(lip->li_ailp, lip, + SHUTDOWN_LOG_IO_ERROR); + } + xfs_buf_item_relse(bp); + } + } + + if (!(flags & XFS_BLI_HOLD)) xfs_buf_relse(bp); } @@ -811,20 +796,6 @@ xfs_buf_item_init( bip->bli_formats[i].blf_map_size = map_size; } -#ifdef XFS_TRANS_DEBUG - /* - * Allocate the arrays for tracking what needs to be logged - * and what our callers request to be logged. bli_orig - * holds a copy of the original, clean buffer for comparison - * against, and bli_logged keeps a 1 bit flag per byte in - * the buffer to indicate which bytes the callers have asked - * to have logged. - */ - bip->bli_orig = kmem_alloc(BBTOB(bp->b_length), KM_SLEEP); - memcpy(bip->bli_orig, bp->b_addr, BBTOB(bp->b_length)); - bip->bli_logged = kmem_zalloc(BBTOB(bp->b_length) / NBBY, KM_SLEEP); -#endif - /* * Put the buf item into the list of items attached to the * buffer at the front. @@ -839,9 +810,8 @@ xfs_buf_item_init( * Mark bytes first through last inclusive as dirty in the buf * item's bitmap. */ -void +static void xfs_buf_item_log_segment( - struct xfs_buf_log_item *bip, uint first, uint last, uint *map) @@ -915,8 +885,6 @@ xfs_buf_item_log_segment( mask = (1 << end_bit) - 1; *wordp |= mask; } - - xfs_buf_item_log_debug(bip, first, last); } /* @@ -935,12 +903,6 @@ xfs_buf_item_log( struct xfs_buf *bp = bip->bli_buf; /* - * Mark the item as having some dirty data for - * quick reference in xfs_buf_item_dirty. - */ - bip->bli_flags |= XFS_BLI_DIRTY; - - /* * walk each buffer segment and mark them dirty appropriately. */ start = 0; @@ -957,7 +919,7 @@ xfs_buf_item_log( if (end > last) end = last; - xfs_buf_item_log_segment(bip, first, end, + xfs_buf_item_log_segment(first, end, &bip->bli_formats[i].blf_data_map[0]); start += bp->b_maps[i].bm_len; @@ -966,7 +928,7 @@ xfs_buf_item_log( /* - * Return 1 if the buffer has some data that has been logged (at any + * Return 1 if the buffer has been logged or ordered in a transaction (at any * point, not just the current transaction) and 0 if not. */ uint @@ -980,11 +942,6 @@ STATIC void xfs_buf_item_free( xfs_buf_log_item_t *bip) { -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig); - kmem_free(bip->bli_logged); -#endif /* XFS_TRANS_DEBUG */ - xfs_buf_item_free_format(bip); kmem_zone_free(xfs_buf_item_zone, bip); } @@ -1000,11 +957,11 @@ void xfs_buf_item_relse( xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; + xfs_buf_log_item_t *bip = bp->b_fspriv; trace_xfs_buf_item_relse(bp, _RET_IP_); + ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); - bip = bp->b_fspriv; bp->b_fspriv = bip->bli_item.li_bio_list; if (bp->b_fspriv == NULL) bp->b_iodone = NULL; @@ -1095,7 +1052,7 @@ xfs_buf_iodone_callbacks( static ulong lasttime; static xfs_buftarg_t *lasttarg; - if (likely(!xfs_buf_geterror(bp))) + if (likely(!bp->b_error)) goto do_callbacks; /* @@ -1134,8 +1091,9 @@ xfs_buf_iodone_callbacks( xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ - if (!XFS_BUF_ISSTALE(bp)) { - bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE; + if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) { + bp->b_flags |= XBF_WRITE | XBF_ASYNC | + XBF_DONE | XBF_WRITE_FAIL; xfs_buf_iorequest(bp); } else { xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 16def435944..3f3455a4151 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -18,51 +18,9 @@ #ifndef __XFS_BUF_ITEM_H__ #define __XFS_BUF_ITEM_H__ -extern kmem_zone_t *xfs_buf_item_zone; - -/* - * This flag indicates that the buffer contains on disk inodes - * and requires special recovery handling. - */ -#define XFS_BLF_INODE_BUF 0x1 -/* - * This flag indicates that the buffer should not be replayed - * during recovery because its blocks are being freed. - */ -#define XFS_BLF_CANCEL 0x2 -/* - * This flag indicates that the buffer contains on disk - * user or group dquots and may require special recovery handling. - */ -#define XFS_BLF_UDQUOT_BUF 0x4 -#define XFS_BLF_PDQUOT_BUF 0x8 -#define XFS_BLF_GDQUOT_BUF 0x10 - -#define XFS_BLF_CHUNK 128 -#define XFS_BLF_SHIFT 7 -#define BIT_TO_WORD_SHIFT 5 -#define NBWORD (NBBY * sizeof(unsigned int)) +/* kernel only definitions */ -/* - * This is the structure used to lay out a buf log item in the - * log. The data map describes which 128 byte chunks of the buffer - * have been logged. - */ -#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) - -typedef struct xfs_buf_log_format { - unsigned short blf_type; /* buf log item type indicator */ - unsigned short blf_size; /* size of this item */ - ushort blf_flags; /* misc state */ - ushort blf_len; /* number of blocks in this buf */ - __int64_t blf_blkno; /* starting blkno of this buf */ - unsigned int blf_map_size; /* used size of data bitmap in words */ - unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ -} xfs_buf_log_format_t; - -/* - * buf log item flags - */ +/* buf log item flags */ #define XFS_BLI_HOLD 0x01 #define XFS_BLI_DIRTY 0x02 #define XFS_BLI_STALE 0x04 @@ -70,6 +28,7 @@ typedef struct xfs_buf_log_format { #define XFS_BLI_INODE_ALLOC_BUF 0x10 #define XFS_BLI_STALE_INODE 0x20 #define XFS_BLI_INODE_BUF 0x40 +#define XFS_BLI_ORDERED 0x80 #define XFS_BLI_FLAGS \ { XFS_BLI_HOLD, "HOLD" }, \ @@ -78,10 +37,9 @@ typedef struct xfs_buf_log_format { { XFS_BLI_LOGGED, "LOGGED" }, \ { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ { XFS_BLI_STALE_INODE, "STALE_INODE" }, \ - { XFS_BLI_INODE_BUF, "INODE_BUF" } - + { XFS_BLI_INODE_BUF, "INODE_BUF" }, \ + { XFS_BLI_ORDERED, "ORDERED" } -#ifdef __KERNEL__ struct xfs_buf; struct xfs_mount; @@ -98,10 +56,6 @@ typedef struct xfs_buf_log_item { unsigned int bli_flags; /* misc flags */ unsigned int bli_recur; /* lock recursion count */ atomic_t bli_refcount; /* cnt of tp refs */ -#ifdef XFS_TRANS_DEBUG - char *bli_orig; /* original buffer copy */ - char *bli_logged; /* bytes logged (bitmap) */ -#endif int bli_format_count; /* count of headers */ struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */ struct xfs_buf_log_format __bli_format; /* embedded in-log header */ @@ -117,16 +71,6 @@ void xfs_buf_attach_iodone(struct xfs_buf *, void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); -#ifdef XFS_TRANS_DEBUG -void -xfs_buf_item_flush_log_debug( - struct xfs_buf *bp, - uint first, - uint last); -#else -#define xfs_buf_item_flush_log_debug(bp, first, last) -#endif - -#endif /* __KERNEL__ */ +extern kmem_zone_t *xfs_buf_item_zone; #endif /* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 4d7696a0241..a514ab61665 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -17,20 +18,20 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" #include "xfs_dir2.h" -#include "xfs_dir2_format.h" #include "xfs_dir2_priv.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_alloc.h" #include "xfs_bmap.h" @@ -38,6 +39,8 @@ #include "xfs_attr_leaf.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" /* * xfs_da_btree.c @@ -52,69 +55,148 @@ /* * Routines used for growing the Btree. */ -STATIC int xfs_da_root_split(xfs_da_state_t *state, +STATIC int xfs_da3_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *existing_root, xfs_da_state_blk_t *new_child); -STATIC int xfs_da_node_split(xfs_da_state_t *state, +STATIC int xfs_da3_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *existing_blk, xfs_da_state_blk_t *split_blk, xfs_da_state_blk_t *blk_to_add, int treelevel, int *result); -STATIC void xfs_da_node_rebalance(xfs_da_state_t *state, +STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *node_blk_1, xfs_da_state_blk_t *node_blk_2); -STATIC void xfs_da_node_add(xfs_da_state_t *state, +STATIC void xfs_da3_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *old_node_blk, xfs_da_state_blk_t *new_node_blk); /* * Routines used for shrinking the Btree. */ -STATIC int xfs_da_root_join(xfs_da_state_t *state, +STATIC int xfs_da3_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk); -STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval); -STATIC void xfs_da_node_remove(xfs_da_state_t *state, +STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval); +STATIC void xfs_da3_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk); -STATIC void xfs_da_node_unbalance(xfs_da_state_t *state, +STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *src_node_blk, xfs_da_state_blk_t *dst_node_blk); /* * Utility routines. */ -STATIC uint xfs_da_node_lasthash(struct xfs_buf *bp, int *count); -STATIC int xfs_da_node_order(struct xfs_buf *node1_bp, - struct xfs_buf *node2_bp); -STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, +STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, xfs_da_state_blk_t *save_blk); -STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); -static void -xfs_da_node_verify( + +kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ + +/* + * Allocate a dir-state structure. + * We don't put them on the stack since they're large. + */ +xfs_da_state_t * +xfs_da_state_alloc(void) +{ + return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); +} + +/* + * Kill the altpath contents of a da-state structure. + */ +STATIC void +xfs_da_state_kill_altpath(xfs_da_state_t *state) +{ + int i; + + for (i = 0; i < state->altpath.active; i++) + state->altpath.blk[i].bp = NULL; + state->altpath.active = 0; +} + +/* + * Free a da-state structure. + */ +void +xfs_da_state_free(xfs_da_state_t *state) +{ + xfs_da_state_kill_altpath(state); +#ifdef DEBUG + memset((char *)state, 0, sizeof(*state)); +#endif /* DEBUG */ + kmem_zone_free(xfs_da_state_zone, state); +} + +static bool +xfs_da3_node_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_da_node_hdr *hdr = bp->b_addr; - int block_ok = 0; - - block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC); - block_ok = block_ok && - be16_to_cpu(hdr->level) > 0 && - be16_to_cpu(hdr->count) > 0 ; - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + struct xfs_da_intnode *hdr = bp->b_addr; + struct xfs_da3_icnode_hdr ichdr; + const struct xfs_dir_ops *ops; + + ops = xfs_dir_get_ops(mp, NULL); + + ops->node_hdr_from_disk(&ichdr, hdr); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (ichdr.magic != XFS_DA3_NODE_MAGIC) + return false; + + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) + return false; + } else { + if (ichdr.magic != XFS_DA_NODE_MAGIC) + return false; } + if (ichdr.level == 0) + return false; + if (ichdr.level > XFS_DA_NODE_MAXDEPTH) + return false; + if (ichdr.count == 0) + return false; + + /* + * we don't know if the node is for and attribute or directory tree, + * so only fail if the count is outside both bounds + */ + if (ichdr.count > mp->m_dir_geo->node_ents && + ichdr.count > mp->m_attr_geo->node_ents) + return false; + /* XXX: hash order check? */ + + return true; } static void -xfs_da_node_write_verify( +xfs_da3_node_write_verify( struct xfs_buf *bp) { - xfs_da_node_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (!xfs_da3_node_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF); } /* @@ -124,40 +206,49 @@ xfs_da_node_write_verify( * format of the block being read. */ static void -xfs_da_node_read_verify( +xfs_da3_node_read_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_da_blkinfo *info = bp->b_addr; switch (be16_to_cpu(info->magic)) { + case XFS_DA3_NODE_MAGIC: + if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) { + xfs_buf_ioerror(bp, EFSBADCRC); + break; + } + /* fall through */ case XFS_DA_NODE_MAGIC: - xfs_da_node_verify(bp); - break; + if (!xfs_da3_node_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } + return; case XFS_ATTR_LEAF_MAGIC: - bp->b_ops = &xfs_attr_leaf_buf_ops; + case XFS_ATTR3_LEAF_MAGIC: + bp->b_ops = &xfs_attr3_leaf_buf_ops; bp->b_ops->verify_read(bp); return; case XFS_DIR2_LEAFN_MAGIC: - bp->b_ops = &xfs_dir2_leafn_buf_ops; + case XFS_DIR3_LEAFN_MAGIC: + bp->b_ops = &xfs_dir3_leafn_buf_ops; bp->b_ops->verify_read(bp); return; default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - mp, info); - xfs_buf_ioerror(bp, EFSCORRUPTED); break; } + + /* corrupt block */ + xfs_verifier_error(bp); } -const struct xfs_buf_ops xfs_da_node_buf_ops = { - .verify_read = xfs_da_node_read_verify, - .verify_write = xfs_da_node_write_verify, +const struct xfs_buf_ops xfs_da3_node_buf_ops = { + .verify_read = xfs_da3_node_read_verify, + .verify_write = xfs_da3_node_write_verify, }; - int -xfs_da_node_read( +xfs_da3_node_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, @@ -165,8 +256,35 @@ xfs_da_node_read( struct xfs_buf **bpp, int which_fork) { - return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - which_fork, &xfs_da_node_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + which_fork, &xfs_da3_node_buf_ops); + if (!err && tp) { + struct xfs_da_blkinfo *info = (*bpp)->b_addr; + int type; + + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + type = XFS_BLFT_DA_NODE_BUF; + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + type = XFS_BLFT_ATTR_LEAF_BUF; + break; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + type = XFS_BLFT_DIR_LEAFN_BUF; + break; + default: + type = 0; + ASSERT(0); + break; + } + xfs_trans_buf_set_type(tp, *bpp, type); + } + return err; } /*======================================================================== @@ -177,33 +295,47 @@ xfs_da_node_read( * Create the initial contents of an intermediate node. */ int -xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, - struct xfs_buf **bpp, int whichfork) +xfs_da3_node_create( + struct xfs_da_args *args, + xfs_dablk_t blkno, + int level, + struct xfs_buf **bpp, + int whichfork) { - xfs_da_intnode_t *node; - struct xfs_buf *bp; - int error; - xfs_trans_t *tp; + struct xfs_da_intnode *node; + struct xfs_trans *tp = args->trans; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_da3_icnode_hdr ichdr = {0}; + struct xfs_buf *bp; + int error; + struct xfs_inode *dp = args->dp; trace_xfs_da_node_create(args); + ASSERT(level <= XFS_DA_NODE_MAXDEPTH); - tp = args->trans; - error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork); + error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork); if (error) return(error); - ASSERT(bp != NULL); + bp->b_ops = &xfs_da3_node_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); node = bp->b_addr; - node->hdr.info.forw = 0; - node->hdr.info.back = 0; - node->hdr.info.magic = cpu_to_be16(XFS_DA_NODE_MAGIC); - node->hdr.info.pad = 0; - node->hdr.count = 0; - node->hdr.level = cpu_to_be16(level); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + ichdr.magic = XFS_DA3_NODE_MAGIC; + hdr3->info.blkno = cpu_to_be64(bp->b_bn); + hdr3->info.owner = cpu_to_be64(args->dp->i_ino); + uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid); + } else { + ichdr.magic = XFS_DA_NODE_MAGIC; + } + ichdr.level = level; + + dp->d_ops->node_hdr_to_disk(node, &ichdr); xfs_trans_log_buf(tp, bp, - XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); - bp->b_ops = &xfs_da_node_buf_ops; *bpp = bp; return(0); } @@ -213,12 +345,18 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, * intermediate nodes, rebalance, etc. */ int /* error */ -xfs_da_split(xfs_da_state_t *state) +xfs_da3_split( + struct xfs_da_state *state) { - xfs_da_state_blk_t *oldblk, *newblk, *addblk; - xfs_da_intnode_t *node; - struct xfs_buf *bp; - int max, action, error, i; + struct xfs_da_state_blk *oldblk; + struct xfs_da_state_blk *newblk; + struct xfs_da_state_blk *addblk; + struct xfs_da_intnode *node; + struct xfs_buf *bp; + int max; + int action = 0; + int error; + int i; trace_xfs_da_split(state->args); @@ -246,7 +384,7 @@ xfs_da_split(xfs_da_state_t *state) */ switch (oldblk->magic) { case XFS_ATTR_LEAF_MAGIC: - error = xfs_attr_leaf_split(state, oldblk, newblk); + error = xfs_attr3_leaf_split(state, oldblk, newblk); if ((error != 0) && (error != ENOSPC)) { return(error); /* GROT: attr is inconsistent */ } @@ -261,12 +399,12 @@ xfs_da_split(xfs_da_state_t *state) if (state->inleaf) { state->extraafter = 0; /* before newblk */ trace_xfs_attr_leaf_split_before(state->args); - error = xfs_attr_leaf_split(state, oldblk, + error = xfs_attr3_leaf_split(state, oldblk, &state->extrablk); } else { state->extraafter = 1; /* after newblk */ trace_xfs_attr_leaf_split_after(state->args); - error = xfs_attr_leaf_split(state, newblk, + error = xfs_attr3_leaf_split(state, newblk, &state->extrablk); } if (error) @@ -280,7 +418,7 @@ xfs_da_split(xfs_da_state_t *state) addblk = newblk; break; case XFS_DA_NODE_MAGIC: - error = xfs_da_node_split(state, oldblk, newblk, addblk, + error = xfs_da3_node_split(state, oldblk, newblk, addblk, max - i, &action); addblk->bp = NULL; if (error) @@ -298,7 +436,7 @@ xfs_da_split(xfs_da_state_t *state) /* * Update the btree to show the new hashval for this child. */ - xfs_da_fixhashpath(state, &state->path); + xfs_da3_fixhashpath(state, &state->path); } if (!addblk) return(0); @@ -308,7 +446,7 @@ xfs_da_split(xfs_da_state_t *state) */ ASSERT(state->path.active == 0); oldblk = &state->path.blk[0]; - error = xfs_da_root_split(state, oldblk, addblk); + error = xfs_da3_root_split(state, oldblk, addblk); if (error) { addblk->bp = NULL; return(error); /* GROT: dir is inconsistent */ @@ -319,8 +457,12 @@ xfs_da_split(xfs_da_state_t *state) * just got bumped because of the addition of a new root node. * There might be three blocks involved if a double split occurred, * and the original block 0 could be at any position in the list. + * + * Note: the magic numbers and sibling pointers are in the same + * physical place for both v2 and v3 headers (by design). Hence it + * doesn't matter which version of the xfs_da_intnode structure we use + * here as the result will be the same using either structure. */ - node = oldblk->bp->b_addr; if (node->hdr.info.forw) { if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { @@ -359,18 +501,25 @@ xfs_da_split(xfs_da_state_t *state) * the EOF, extending the inode in process. */ STATIC int /* error */ -xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2) +xfs_da3_root_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) { - xfs_da_intnode_t *node, *oldroot; - xfs_da_args_t *args; - xfs_dablk_t blkno; - struct xfs_buf *bp; - int error, size; - xfs_inode_t *dp; - xfs_trans_t *tp; - xfs_mount_t *mp; - xfs_dir2_leaf_t *leaf; + struct xfs_da_intnode *node; + struct xfs_da_intnode *oldroot; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_args *args; + struct xfs_buf *bp; + struct xfs_inode *dp; + struct xfs_trans *tp; + struct xfs_mount *mp; + struct xfs_dir2_leaf *leaf; + xfs_dablk_t blkno; + int level; + int error; + int size; trace_xfs_da_root_split(state->args); @@ -379,85 +528,132 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * to a free space somewhere. */ args = state->args; - ASSERT(args != NULL); error = xfs_da_grow_inode(args, &blkno); if (error) - return(error); + return error; + dp = args->dp; tp = args->trans; mp = state->mp; error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork); if (error) - return(error); - ASSERT(bp != NULL); + return error; node = bp->b_addr; oldroot = blk1->bp->b_addr; - if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { - size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] - - (char *)oldroot); + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { + struct xfs_da3_icnode_hdr nodehdr; + + dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot); + btree = dp->d_ops->node_tree_p(oldroot); + size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot); + level = nodehdr.level; + + /* + * we are about to copy oldroot to bp, so set up the type + * of bp while we know exactly what it will be. + */ + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); } else { - ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + leaf = (xfs_dir2_leaf_t *)oldroot; - size = (int)((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] - - (char *)leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); + size = (int)((char *)&ents[leafhdr.count] - (char *)leaf); + level = 0; + + /* + * we are about to copy oldroot to bp, so set up the type + * of bp while we know exactly what it will be. + */ + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); } + + /* + * we can copy most of the information in the node from one block to + * another, but for CRC enabled headers we have to make sure that the + * block specific identifiers are kept intact. We update the buffer + * directly for this. + */ memcpy(node, oldroot, size); + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node; + + node3->hdr.info.blkno = cpu_to_be64(bp->b_bn); + } xfs_trans_log_buf(tp, bp, 0, size - 1); bp->b_ops = blk1->bp->b_ops; + xfs_trans_buf_copy_type(bp, blk1->bp); blk1->bp = bp; blk1->blkno = blkno; /* * Set up the new root node. */ - error = xfs_da_node_create(args, - (args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0, - be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork); + error = xfs_da3_node_create(args, + (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0, + level + 1, &bp, args->whichfork); if (error) - return(error); + return error; + node = bp->b_addr; - node->btree[0].hashval = cpu_to_be32(blk1->hashval); - node->btree[0].before = cpu_to_be32(blk1->blkno); - node->btree[1].hashval = cpu_to_be32(blk2->hashval); - node->btree[1].before = cpu_to_be32(blk2->blkno); - node->hdr.count = cpu_to_be16(2); + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + btree[0].hashval = cpu_to_be32(blk1->hashval); + btree[0].before = cpu_to_be32(blk1->blkno); + btree[1].hashval = cpu_to_be32(blk2->hashval); + btree[1].before = cpu_to_be32(blk2->blkno); + nodehdr.count = 2; + dp->d_ops->node_hdr_to_disk(node, &nodehdr); #ifdef DEBUG - if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) { - ASSERT(blk1->blkno >= mp->m_dirleafblk && - blk1->blkno < mp->m_dirfreeblk); - ASSERT(blk2->blkno >= mp->m_dirleafblk && - blk2->blkno < mp->m_dirfreeblk); + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + ASSERT(blk1->blkno >= args->geo->leafblk && + blk1->blkno < args->geo->freeblk); + ASSERT(blk2->blkno >= args->geo->leafblk && + blk2->blkno < args->geo->freeblk); } #endif /* Header is already logged by xfs_da_node_create */ xfs_trans_log_buf(tp, bp, - XFS_DA_LOGRANGE(node, node->btree, - sizeof(xfs_da_node_entry_t) * 2)); + XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2)); - return(0); + return 0; } /* * Split the node, rebalance, then add the new entry. */ STATIC int /* error */ -xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, - xfs_da_state_blk_t *newblk, - xfs_da_state_blk_t *addblk, - int treelevel, int *result) +xfs_da3_node_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk, + struct xfs_da_state_blk *addblk, + int treelevel, + int *result) { - xfs_da_intnode_t *node; - xfs_dablk_t blkno; - int newcount, error; - int useextra; + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + xfs_dablk_t blkno; + int newcount; + int error; + int useextra; + struct xfs_inode *dp = state->args->dp; trace_xfs_da_node_split(state->args); node = oldblk->bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + dp->d_ops->node_hdr_from_disk(&nodehdr, node); /* * With V2 dirs the extra block is data or freespace. @@ -467,7 +663,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, /* * Do we have to split the node? */ - if ((be16_to_cpu(node->hdr.count) + newcount) > state->node_ents) { + if (nodehdr.count + newcount > state->args->geo->node_ents) { /* * Allocate a new node, add to the doubly linked chain of * nodes, then move some of our excess entries into it. @@ -476,14 +672,14 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, if (error) return(error); /* GROT: dir is inconsistent */ - error = xfs_da_node_create(state->args, blkno, treelevel, + error = xfs_da3_node_create(state->args, blkno, treelevel, &newblk->bp, state->args->whichfork); if (error) return(error); /* GROT: dir is inconsistent */ newblk->blkno = blkno; newblk->magic = XFS_DA_NODE_MAGIC; - xfs_da_node_rebalance(state, oldblk, newblk); - error = xfs_da_blk_link(state, oldblk, newblk); + xfs_da3_node_rebalance(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); if (error) return(error); *result = 1; @@ -495,7 +691,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * Insert the new entry(s) into the correct block * (updating last hashval in the process). * - * xfs_da_node_add() inserts BEFORE the given index, + * xfs_da3_node_add() inserts BEFORE the given index, * and as a result of using node_lookup_int() we always * point to a valid entry (not after one), but a split * operation always results in a new block whose hashvals @@ -504,22 +700,23 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * If we had double-split op below us, then add the extra block too. */ node = oldblk->bp->b_addr; - if (oldblk->index <= be16_to_cpu(node->hdr.count)) { + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + if (oldblk->index <= nodehdr.count) { oldblk->index++; - xfs_da_node_add(state, oldblk, addblk); + xfs_da3_node_add(state, oldblk, addblk); if (useextra) { if (state->extraafter) oldblk->index++; - xfs_da_node_add(state, oldblk, &state->extrablk); + xfs_da3_node_add(state, oldblk, &state->extrablk); state->extravalid = 0; } } else { newblk->index++; - xfs_da_node_add(state, newblk, addblk); + xfs_da3_node_add(state, newblk, addblk); if (useextra) { if (state->extraafter) newblk->index++; - xfs_da_node_add(state, newblk, &state->extrablk); + xfs_da3_node_add(state, newblk, &state->extrablk); state->extravalid = 0; } } @@ -534,33 +731,54 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * NOTE: if blk2 is empty, then it will get the upper half of blk1. */ STATIC void -xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2) +xfs_da3_node_rebalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) { - xfs_da_intnode_t *node1, *node2, *tmpnode; - xfs_da_node_entry_t *btree_s, *btree_d; - int count, tmp; - xfs_trans_t *tp; + struct xfs_da_intnode *node1; + struct xfs_da_intnode *node2; + struct xfs_da_intnode *tmpnode; + struct xfs_da_node_entry *btree1; + struct xfs_da_node_entry *btree2; + struct xfs_da_node_entry *btree_s; + struct xfs_da_node_entry *btree_d; + struct xfs_da3_icnode_hdr nodehdr1; + struct xfs_da3_icnode_hdr nodehdr2; + struct xfs_trans *tp; + int count; + int tmp; + int swap = 0; + struct xfs_inode *dp = state->args->dp; trace_xfs_da_node_rebalance(state->args); node1 = blk1->bp->b_addr; node2 = blk2->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); + dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); + /* * Figure out how many entries need to move, and in which direction. * Swap the nodes around if that makes it simpler. */ - if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) && - ((be32_to_cpu(node2->btree[0].hashval) < be32_to_cpu(node1->btree[0].hashval)) || - (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) < - be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) { + if (nodehdr1.count > 0 && nodehdr2.count > 0 && + ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) || + (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) < + be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) { tmpnode = node1; node1 = node2; node2 = tmpnode; + dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); + dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); + swap = 1; } - ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - ASSERT(node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - count = (be16_to_cpu(node1->hdr.count) - be16_to_cpu(node2->hdr.count)) / 2; + + count = (nodehdr1.count - nodehdr2.count) / 2; if (count == 0) return; tp = state->args->trans; @@ -571,10 +789,11 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, /* * Move elements in node2 up to make a hole. */ - if ((tmp = be16_to_cpu(node2->hdr.count)) > 0) { + tmp = nodehdr2.count; + if (tmp > 0) { tmp *= (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node2->btree[0]; - btree_d = &node2->btree[count]; + btree_s = &btree2[0]; + btree_d = &btree2[count]; memmove(btree_d, btree_s, tmp); } @@ -582,12 +801,12 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * Move the req'd B-tree elements from high in node1 to * low in node2. */ - be16_add_cpu(&node2->hdr.count, count); + nodehdr2.count += count; tmp = count * (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node1->btree[be16_to_cpu(node1->hdr.count) - count]; - btree_d = &node2->btree[0]; + btree_s = &btree1[nodehdr1.count - count]; + btree_d = &btree2[0]; memcpy(btree_d, btree_s, tmp); - be16_add_cpu(&node1->hdr.count, -count); + nodehdr1.count -= count; } else { /* * Move the req'd B-tree elements from low in node2 to @@ -595,49 +814,59 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, */ count = -count; tmp = count * (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node2->btree[0]; - btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)]; + btree_s = &btree2[0]; + btree_d = &btree1[nodehdr1.count]; memcpy(btree_d, btree_s, tmp); - be16_add_cpu(&node1->hdr.count, count); + nodehdr1.count += count; + xfs_trans_log_buf(tp, blk1->bp, XFS_DA_LOGRANGE(node1, btree_d, tmp)); /* * Move elements in node2 down to fill the hole. */ - tmp = be16_to_cpu(node2->hdr.count) - count; + tmp = nodehdr2.count - count; tmp *= (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node2->btree[count]; - btree_d = &node2->btree[0]; + btree_s = &btree2[count]; + btree_d = &btree2[0]; memmove(btree_d, btree_s, tmp); - be16_add_cpu(&node2->hdr.count, -count); + nodehdr2.count -= count; } /* * Log header of node 1 and all current bits of node 2. */ + dp->d_ops->node_hdr_to_disk(node1, &nodehdr1); xfs_trans_log_buf(tp, blk1->bp, - XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr))); + XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size)); + + dp->d_ops->node_hdr_to_disk(node2, &nodehdr2); xfs_trans_log_buf(tp, blk2->bp, XFS_DA_LOGRANGE(node2, &node2->hdr, - sizeof(node2->hdr) + - sizeof(node2->btree[0]) * be16_to_cpu(node2->hdr.count))); + dp->d_ops->node_hdr_size + + (sizeof(btree2[0]) * nodehdr2.count))); /* * Record the last hashval from each block for upward propagation. * (note: don't use the swapped node pointers) */ - node1 = blk1->bp->b_addr; - node2 = blk2->bp->b_addr; - blk1->hashval = be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval); - blk2->hashval = be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval); + if (swap) { + node1 = blk1->bp->b_addr; + node2 = blk2->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); + dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); + } + blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval); + blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval); /* * Adjust the expected index for insertion. */ - if (blk1->index >= be16_to_cpu(node1->hdr.count)) { - blk2->index = blk1->index - be16_to_cpu(node1->hdr.count); - blk1->index = be16_to_cpu(node1->hdr.count) + 1; /* make it invalid */ + if (blk1->index >= nodehdr1.count) { + blk2->index = blk1->index - nodehdr1.count; + blk1->index = nodehdr1.count + 1; /* make it invalid */ } } @@ -645,44 +874,52 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * Add a new entry to an intermediate node. */ STATIC void -xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, - xfs_da_state_blk_t *newblk) +xfs_da3_node_add( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk) { - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - int tmp; + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int tmp; + struct xfs_inode *dp = state->args->dp; trace_xfs_da_node_add(state->args); node = oldblk->bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + + ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count); ASSERT(newblk->blkno != 0); if (state->args->whichfork == XFS_DATA_FORK) - ASSERT(newblk->blkno >= state->mp->m_dirleafblk && - newblk->blkno < state->mp->m_dirfreeblk); + ASSERT(newblk->blkno >= state->args->geo->leafblk && + newblk->blkno < state->args->geo->freeblk); /* * We may need to make some room before we insert the new node. */ tmp = 0; - btree = &node->btree[ oldblk->index ]; - if (oldblk->index < be16_to_cpu(node->hdr.count)) { - tmp = (be16_to_cpu(node->hdr.count) - oldblk->index) * (uint)sizeof(*btree); - memmove(btree + 1, btree, tmp); + if (oldblk->index < nodehdr.count) { + tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree); + memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp); } - btree->hashval = cpu_to_be32(newblk->hashval); - btree->before = cpu_to_be32(newblk->blkno); + btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval); + btree[oldblk->index].before = cpu_to_be32(newblk->blkno); xfs_trans_log_buf(state->args->trans, oldblk->bp, - XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree))); - be16_add_cpu(&node->hdr.count, 1); + XFS_DA_LOGRANGE(node, &btree[oldblk->index], + tmp + sizeof(*btree))); + + nodehdr.count += 1; + dp->d_ops->node_hdr_to_disk(node, &nodehdr); xfs_trans_log_buf(state->args->trans, oldblk->bp, - XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); /* * Copy the last hash value from the oldblk to propagate upwards. */ - oldblk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1 ].hashval); + oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval); } /*======================================================================== @@ -694,14 +931,16 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * possibly deallocating that block, etc... */ int -xfs_da_join(xfs_da_state_t *state) +xfs_da3_join( + struct xfs_da_state *state) { - xfs_da_state_blk_t *drop_blk, *save_blk; - int action, error; + struct xfs_da_state_blk *drop_blk; + struct xfs_da_state_blk *save_blk; + int action = 0; + int error; trace_xfs_da_join(state->args); - action = 0; drop_blk = &state->path.blk[ state->path.active-1 ]; save_blk = &state->altpath.blk[ state->path.active-1 ]; ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC); @@ -722,12 +961,12 @@ xfs_da_join(xfs_da_state_t *state) */ switch (drop_blk->magic) { case XFS_ATTR_LEAF_MAGIC: - error = xfs_attr_leaf_toosmall(state, &action); + error = xfs_attr3_leaf_toosmall(state, &action); if (error) return(error); if (action == 0) return(0); - xfs_attr_leaf_unbalance(state, drop_blk, save_blk); + xfs_attr3_leaf_unbalance(state, drop_blk, save_blk); break; case XFS_DIR2_LEAFN_MAGIC: error = xfs_dir2_leafn_toosmall(state, &action); @@ -742,18 +981,18 @@ xfs_da_join(xfs_da_state_t *state) * Remove the offending node, fixup hashvals, * check for a toosmall neighbor. */ - xfs_da_node_remove(state, drop_blk); - xfs_da_fixhashpath(state, &state->path); - error = xfs_da_node_toosmall(state, &action); + xfs_da3_node_remove(state, drop_blk); + xfs_da3_fixhashpath(state, &state->path); + error = xfs_da3_node_toosmall(state, &action); if (error) return(error); if (action == 0) return 0; - xfs_da_node_unbalance(state, drop_blk, save_blk); + xfs_da3_node_unbalance(state, drop_blk, save_blk); break; } - xfs_da_fixhashpath(state, &state->altpath); - error = xfs_da_blk_unlink(state, drop_blk, save_blk); + xfs_da3_fixhashpath(state, &state->altpath); + error = xfs_da3_blk_unlink(state, drop_blk, save_blk); xfs_da_state_kill_altpath(state); if (error) return(error); @@ -768,9 +1007,9 @@ xfs_da_join(xfs_da_state_t *state) * we only have one entry in the root, make the child block * the new root. */ - xfs_da_node_remove(state, drop_blk); - xfs_da_fixhashpath(state, &state->path); - error = xfs_da_root_join(state, &state->path.blk[0]); + xfs_da3_node_remove(state, drop_blk); + xfs_da3_fixhashpath(state, &state->path); + error = xfs_da3_root_join(state, &state->path.blk[0]); return(error); } @@ -782,9 +1021,13 @@ xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level) if (level == 1) { ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || - magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - } else - ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) || + magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + } else { + ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)); + } ASSERT(!blkinfo->forw); ASSERT(!blkinfo->back); } @@ -797,53 +1040,64 @@ xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level) * the old root to block 0 as the new root node. */ STATIC int -xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) +xfs_da3_root_join( + struct xfs_da_state *state, + struct xfs_da_state_blk *root_blk) { - xfs_da_intnode_t *oldroot; - xfs_da_args_t *args; - xfs_dablk_t child; - struct xfs_buf *bp; - int error; + struct xfs_da_intnode *oldroot; + struct xfs_da_args *args; + xfs_dablk_t child; + struct xfs_buf *bp; + struct xfs_da3_icnode_hdr oldroothdr; + struct xfs_da_node_entry *btree; + int error; + struct xfs_inode *dp = state->args->dp; trace_xfs_da_root_join(state->args); - args = state->args; - ASSERT(args != NULL); ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); + + args = state->args; oldroot = root_blk->bp->b_addr; - ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - ASSERT(!oldroot->hdr.info.forw); - ASSERT(!oldroot->hdr.info.back); + dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot); + ASSERT(oldroothdr.forw == 0); + ASSERT(oldroothdr.back == 0); /* * If the root has more than one child, then don't do anything. */ - if (be16_to_cpu(oldroot->hdr.count) > 1) - return(0); + if (oldroothdr.count > 1) + return 0; /* * Read in the (only) child block, then copy those bytes into * the root block's buffer and free the original child block. */ - child = be32_to_cpu(oldroot->btree[0].before); + btree = dp->d_ops->node_tree_p(oldroot); + child = be32_to_cpu(btree[0].before); ASSERT(child != 0); - error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp, + error = xfs_da3_node_read(args->trans, dp, child, -1, &bp, args->whichfork); if (error) - return(error); - ASSERT(bp != NULL); - xfs_da_blkinfo_onlychild_validate(bp->b_addr, - be16_to_cpu(oldroot->hdr.level)); + return error; + xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); /* * This could be copying a leaf back into the root block in the case of * there only being a single leaf block left in the tree. Hence we have * to update the b_ops pointer as well to match the buffer type change - * that could occur. + * that could occur. For dir3 blocks we also need to update the block + * number in the buffer header. */ - memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); + memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize); root_blk->bp->b_ops = bp->b_ops; - xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); + xfs_trans_buf_copy_type(root_blk->bp, bp); + if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) { + struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr; + da3->blkno = cpu_to_be64(root_blk->bp->b_bn); + } + xfs_trans_log_buf(args->trans, root_blk->bp, 0, + args->geo->blksize - 1); error = xfs_da_shrink_inode(args, child, bp); return(error); } @@ -858,14 +1112,22 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) * If nothing can be done, return 0. */ STATIC int -xfs_da_node_toosmall(xfs_da_state_t *state, int *action) +xfs_da3_node_toosmall( + struct xfs_da_state *state, + int *action) { - xfs_da_intnode_t *node; - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *info; - int count, forward, error, retval, i; - xfs_dablk_t blkno; - struct xfs_buf *bp; + struct xfs_da_intnode *node; + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *info; + xfs_dablk_t blkno; + struct xfs_buf *bp; + struct xfs_da3_icnode_hdr nodehdr; + int count; + int forward; + int error; + int retval; + int i; + struct xfs_inode *dp = state->args->dp; trace_xfs_da_node_toosmall(state->args); @@ -876,10 +1138,9 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) */ blk = &state->path.blk[ state->path.active-1 ]; info = blk->bp->b_addr; - ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); node = (xfs_da_intnode_t *)info; - count = be16_to_cpu(node->hdr.count); - if (count > (state->node_ents >> 1)) { + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + if (nodehdr.count > (state->args->geo->node_ents >> 1)) { *action = 0; /* blk over 50%, don't try to join */ return(0); /* blk over 50%, don't try to join */ } @@ -890,14 +1151,14 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) * coalesce it with a sibling block. We choose (arbitrarily) * to merge with the forward block unless it is NULL. */ - if (count == 0) { + if (nodehdr.count == 0) { /* * Make altpath point to the block we want to keep and * path point to the block we want to drop (this one). */ forward = (info->forw != 0); memcpy(&state->altpath, &state->path, sizeof(state->path)); - error = xfs_da_path_shift(state, &state->altpath, forward, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); if (error) return(error); @@ -916,35 +1177,35 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) * We prefer coalescing with the lower numbered sibling so as * to shrink a directory over time. */ + count = state->args->geo->node_ents; + count -= state->args->geo->node_ents >> 2; + count -= nodehdr.count; + /* start with smaller blk num */ - forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back)); + forward = nodehdr.forw < nodehdr.back; for (i = 0; i < 2; forward = !forward, i++) { + struct xfs_da3_icnode_hdr thdr; if (forward) - blkno = be32_to_cpu(info->forw); + blkno = nodehdr.forw; else - blkno = be32_to_cpu(info->back); + blkno = nodehdr.back; if (blkno == 0) continue; - error = xfs_da_node_read(state->args->trans, state->args->dp, + error = xfs_da3_node_read(state->args->trans, dp, blkno, -1, &bp, state->args->whichfork); if (error) return(error); - ASSERT(bp != NULL); - node = (xfs_da_intnode_t *)info; - count = state->node_ents; - count -= state->node_ents >> 2; - count -= be16_to_cpu(node->hdr.count); node = bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - count -= be16_to_cpu(node->hdr.count); + dp->d_ops->node_hdr_from_disk(&thdr, node); xfs_trans_brelse(state->args->trans, bp); - if (count >= 0) + + if (count - thdr.count >= 0) break; /* fits with at least 25% to spare */ } if (i >= 2) { *action = 0; - return(0); + return 0; } /* @@ -953,28 +1214,43 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) */ memcpy(&state->altpath, &state->path, sizeof(state->path)); if (blkno < blk->blkno) { - error = xfs_da_path_shift(state, &state->altpath, forward, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); - if (error) { - return(error); - } - if (retval) { - *action = 0; - return(0); - } } else { - error = xfs_da_path_shift(state, &state->path, forward, + error = xfs_da3_path_shift(state, &state->path, forward, 0, &retval); - if (error) { - return(error); - } - if (retval) { - *action = 0; - return(0); - } + } + if (error) + return error; + if (retval) { + *action = 0; + return 0; } *action = 1; - return(0); + return 0; +} + +/* + * Pick up the last hashvalue from an intermediate node. + */ +STATIC uint +xfs_da3_node_lasthash( + struct xfs_inode *dp, + struct xfs_buf *bp, + int *count) +{ + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + + node = bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + if (count) + *count = nodehdr.count; + if (!nodehdr.count) + return 0; + btree = dp->d_ops->node_tree_p(node); + return be32_to_cpu(btree[nodehdr.count - 1].hashval); } /* @@ -982,13 +1258,17 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) * when we stop making changes, return. */ void -xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) +xfs_da3_fixhashpath( + struct xfs_da_state *state, + struct xfs_da_state_path *path) { - xfs_da_state_blk_t *blk; - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - xfs_dahash_t lasthash=0; - int level, count; + struct xfs_da_state_blk *blk; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + xfs_dahash_t lasthash=0; + int level; + int count; + struct xfs_inode *dp = state->args->dp; trace_xfs_da_fixhashpath(state->args); @@ -1001,28 +1281,31 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) return; break; case XFS_DIR2_LEAFN_MAGIC: - lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count); + lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count); if (count == 0) return; break; case XFS_DA_NODE_MAGIC: - lasthash = xfs_da_node_lasthash(blk->bp, &count); + lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count); if (count == 0) return; break; } for (blk--, level--; level >= 0; blk--, level--) { + struct xfs_da3_icnode_hdr nodehdr; + node = blk->bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - btree = &node->btree[ blk->index ]; - if (be32_to_cpu(btree->hashval) == lasthash) + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + if (be32_to_cpu(btree[blk->index].hashval) == lasthash) break; blk->hashval = lasthash; - btree->hashval = cpu_to_be32(lasthash); + btree[blk->index].hashval = cpu_to_be32(lasthash); xfs_trans_log_buf(state->args->trans, blk->bp, - XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); + XFS_DA_LOGRANGE(node, &btree[blk->index], + sizeof(*btree))); - lasthash = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); + lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval); } } @@ -1030,104 +1313,122 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) * Remove an entry from an intermediate node. */ STATIC void -xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk) +xfs_da3_node_remove( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk) { - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - int tmp; + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int index; + int tmp; + struct xfs_inode *dp = state->args->dp; trace_xfs_da_node_remove(state->args); node = drop_blk->bp->b_addr; - ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count)); + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + ASSERT(drop_blk->index < nodehdr.count); ASSERT(drop_blk->index >= 0); /* * Copy over the offending entry, or just zero it out. */ - btree = &node->btree[drop_blk->index]; - if (drop_blk->index < (be16_to_cpu(node->hdr.count)-1)) { - tmp = be16_to_cpu(node->hdr.count) - drop_blk->index - 1; + index = drop_blk->index; + btree = dp->d_ops->node_tree_p(node); + if (index < nodehdr.count - 1) { + tmp = nodehdr.count - index - 1; tmp *= (uint)sizeof(xfs_da_node_entry_t); - memmove(btree, btree + 1, tmp); + memmove(&btree[index], &btree[index + 1], tmp); xfs_trans_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, btree, tmp)); - btree = &node->btree[be16_to_cpu(node->hdr.count)-1]; + XFS_DA_LOGRANGE(node, &btree[index], tmp)); + index = nodehdr.count - 1; } - memset((char *)btree, 0, sizeof(xfs_da_node_entry_t)); + memset(&btree[index], 0, sizeof(xfs_da_node_entry_t)); xfs_trans_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); - be16_add_cpu(&node->hdr.count, -1); + XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index]))); + nodehdr.count -= 1; + dp->d_ops->node_hdr_to_disk(node, &nodehdr); xfs_trans_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); /* * Copy the last hash value from the block to propagate upwards. */ - btree--; - drop_blk->hashval = be32_to_cpu(btree->hashval); + drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval); } /* - * Unbalance the btree elements between two intermediate nodes, + * Unbalance the elements between two intermediate nodes, * move all Btree elements from one node into another. */ STATIC void -xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, - xfs_da_state_blk_t *save_blk) +xfs_da3_node_unbalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) { - xfs_da_intnode_t *drop_node, *save_node; - xfs_da_node_entry_t *btree; - int tmp; - xfs_trans_t *tp; + struct xfs_da_intnode *drop_node; + struct xfs_da_intnode *save_node; + struct xfs_da_node_entry *drop_btree; + struct xfs_da_node_entry *save_btree; + struct xfs_da3_icnode_hdr drop_hdr; + struct xfs_da3_icnode_hdr save_hdr; + struct xfs_trans *tp; + int sindex; + int tmp; + struct xfs_inode *dp = state->args->dp; trace_xfs_da_node_unbalance(state->args); drop_node = drop_blk->bp->b_addr; save_node = save_blk->bp->b_addr; - ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - ASSERT(save_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node); + dp->d_ops->node_hdr_from_disk(&save_hdr, save_node); + drop_btree = dp->d_ops->node_tree_p(drop_node); + save_btree = dp->d_ops->node_tree_p(save_node); tp = state->args->trans; /* * If the dying block has lower hashvals, then move all the * elements in the remaining block up to make a hole. */ - if ((be32_to_cpu(drop_node->btree[0].hashval) < be32_to_cpu(save_node->btree[ 0 ].hashval)) || - (be32_to_cpu(drop_node->btree[be16_to_cpu(drop_node->hdr.count)-1].hashval) < - be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval))) - { - btree = &save_node->btree[be16_to_cpu(drop_node->hdr.count)]; - tmp = be16_to_cpu(save_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t); - memmove(btree, &save_node->btree[0], tmp); - btree = &save_node->btree[0]; + if ((be32_to_cpu(drop_btree[0].hashval) < + be32_to_cpu(save_btree[0].hashval)) || + (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) < + be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) { + /* XXX: check this - is memmove dst correct? */ + tmp = save_hdr.count * sizeof(xfs_da_node_entry_t); + memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp); + + sindex = 0; xfs_trans_log_buf(tp, save_blk->bp, - XFS_DA_LOGRANGE(save_node, btree, - (be16_to_cpu(save_node->hdr.count) + be16_to_cpu(drop_node->hdr.count)) * - sizeof(xfs_da_node_entry_t))); + XFS_DA_LOGRANGE(save_node, &save_btree[0], + (save_hdr.count + drop_hdr.count) * + sizeof(xfs_da_node_entry_t))); } else { - btree = &save_node->btree[be16_to_cpu(save_node->hdr.count)]; + sindex = save_hdr.count; xfs_trans_log_buf(tp, save_blk->bp, - XFS_DA_LOGRANGE(save_node, btree, - be16_to_cpu(drop_node->hdr.count) * - sizeof(xfs_da_node_entry_t))); + XFS_DA_LOGRANGE(save_node, &save_btree[sindex], + drop_hdr.count * sizeof(xfs_da_node_entry_t))); } /* * Move all the B-tree elements from drop_blk to save_blk. */ - tmp = be16_to_cpu(drop_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t); - memcpy(btree, &drop_node->btree[0], tmp); - be16_add_cpu(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count)); + tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t); + memcpy(&save_btree[sindex], &drop_btree[0], tmp); + save_hdr.count += drop_hdr.count; + dp->d_ops->node_hdr_to_disk(save_node, &save_hdr); xfs_trans_log_buf(tp, save_blk->bp, XFS_DA_LOGRANGE(save_node, &save_node->hdr, - sizeof(save_node->hdr))); + dp->d_ops->node_hdr_size)); /* * Save the last hashval in the remaining block for upward propagation. */ - save_blk->hashval = be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval); + save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval); } /*======================================================================== @@ -1146,16 +1447,25 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * pruned depth-first tree search. */ int /* error */ -xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) +xfs_da3_node_lookup_int( + struct xfs_da_state *state, + int *result) { - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *curr; - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - xfs_dablk_t blkno; - int probe, span, max, error, retval; - xfs_dahash_t hashval, btreehashval; - xfs_da_args_t *args; + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *curr; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_args *args; + xfs_dablk_t blkno; + xfs_dahash_t hashval; + xfs_dahash_t btreehashval; + int probe; + int span; + int max; + int error; + int retval; + struct xfs_inode *dp = state->args->dp; args = state->args; @@ -1163,7 +1473,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) * Descend thru the B-tree searching each level for the right * node to use, until the right hashval is found. */ - blkno = (args->whichfork == XFS_DATA_FORK)? state->mp->m_dirleafblk : 0; + blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0; for (blk = &state->path.blk[0], state->path.active = 1; state->path.active <= XFS_DA_NODE_MAXDEPTH; blk++, state->path.active++) { @@ -1171,7 +1481,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) * Read the next node down in the tree. */ blk->blkno = blkno; - error = xfs_da_node_read(args->trans, args->dp, blkno, + error = xfs_da3_node_read(args->trans, args->dp, blkno, -1, &blk->bp, args->whichfork); if (error) { blk->blkno = 0; @@ -1180,66 +1490,76 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) } curr = blk->bp->b_addr; blk->magic = be16_to_cpu(curr->magic); - ASSERT(blk->magic == XFS_DA_NODE_MAGIC || - blk->magic == XFS_DIR2_LEAFN_MAGIC || - blk->magic == XFS_ATTR_LEAF_MAGIC); + + if (blk->magic == XFS_ATTR_LEAF_MAGIC || + blk->magic == XFS_ATTR3_LEAF_MAGIC) { + blk->magic = XFS_ATTR_LEAF_MAGIC; + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); + break; + } + + if (blk->magic == XFS_DIR2_LEAFN_MAGIC || + blk->magic == XFS_DIR3_LEAFN_MAGIC) { + blk->magic = XFS_DIR2_LEAFN_MAGIC; + blk->hashval = xfs_dir2_leafn_lasthash(args->dp, + blk->bp, NULL); + break; + } + + blk->magic = XFS_DA_NODE_MAGIC; + /* * Search an intermediate node for a match. */ - if (blk->magic == XFS_DA_NODE_MAGIC) { - node = blk->bp->b_addr; - max = be16_to_cpu(node->hdr.count); - blk->hashval = be32_to_cpu(node->btree[max-1].hashval); + node = blk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); - /* - * Binary search. (note: small blocks will skip loop) - */ - probe = span = max / 2; - hashval = args->hashval; - for (btree = &node->btree[probe]; span > 4; - btree = &node->btree[probe]) { - span /= 2; - btreehashval = be32_to_cpu(btree->hashval); - if (btreehashval < hashval) - probe += span; - else if (btreehashval > hashval) - probe -= span; - else - break; - } - ASSERT((probe >= 0) && (probe < max)); - ASSERT((span <= 4) || (be32_to_cpu(btree->hashval) == hashval)); + max = nodehdr.count; + blk->hashval = be32_to_cpu(btree[max - 1].hashval); - /* - * Since we may have duplicate hashval's, find the first - * matching hashval in the node. - */ - while ((probe > 0) && (be32_to_cpu(btree->hashval) >= hashval)) { - btree--; - probe--; - } - while ((probe < max) && (be32_to_cpu(btree->hashval) < hashval)) { - btree++; - probe++; - } + /* + * Binary search. (note: small blocks will skip loop) + */ + probe = span = max / 2; + hashval = args->hashval; + while (span > 4) { + span /= 2; + btreehashval = be32_to_cpu(btree[probe].hashval); + if (btreehashval < hashval) + probe += span; + else if (btreehashval > hashval) + probe -= span; + else + break; + } + ASSERT((probe >= 0) && (probe < max)); + ASSERT((span <= 4) || + (be32_to_cpu(btree[probe].hashval) == hashval)); - /* - * Pick the right block to descend on. - */ - if (probe == max) { - blk->index = max-1; - blkno = be32_to_cpu(node->btree[max-1].before); - } else { - blk->index = probe; - blkno = be32_to_cpu(btree->before); - } - } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { - blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); - break; - } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) { - blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL); - break; + /* + * Since we may have duplicate hashval's, find the first + * matching hashval in the node. + */ + while (probe > 0 && + be32_to_cpu(btree[probe].hashval) >= hashval) { + probe--; + } + while (probe < max && + be32_to_cpu(btree[probe].hashval) < hashval) { + probe++; + } + + /* + * Pick the right block to descend on. + */ + if (probe == max) { + blk->index = max - 1; + blkno = be32_to_cpu(btree[max - 1].before); + } else { + blk->index = probe; + blkno = be32_to_cpu(btree[probe].before); } } @@ -1254,7 +1574,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) retval = xfs_dir2_leafn_lookup_int(blk->bp, args, &blk->index, state); } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { - retval = xfs_attr_leaf_lookup_int(blk->bp, args); + retval = xfs_attr3_leaf_lookup_int(blk->bp, args); blk->index = args->index; args->blkno = blk->blkno; } else { @@ -1263,7 +1583,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) } if (((retval == ENOENT) || (retval == ENOATTR)) && (blk->hashval == args->hashval)) { - error = xfs_da_path_shift(state, &state->path, 1, 1, + error = xfs_da3_path_shift(state, &state->path, 1, 1, &retval); if (error) return(error); @@ -1285,16 +1605,54 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) *========================================================================*/ /* + * Compare two intermediate nodes for "order". + */ +STATIC int +xfs_da3_node_order( + struct xfs_inode *dp, + struct xfs_buf *node1_bp, + struct xfs_buf *node2_bp) +{ + struct xfs_da_intnode *node1; + struct xfs_da_intnode *node2; + struct xfs_da_node_entry *btree1; + struct xfs_da_node_entry *btree2; + struct xfs_da3_icnode_hdr node1hdr; + struct xfs_da3_icnode_hdr node2hdr; + + node1 = node1_bp->b_addr; + node2 = node2_bp->b_addr; + dp->d_ops->node_hdr_from_disk(&node1hdr, node1); + dp->d_ops->node_hdr_from_disk(&node2hdr, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); + + if (node1hdr.count > 0 && node2hdr.count > 0 && + ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) || + (be32_to_cpu(btree2[node2hdr.count - 1].hashval) < + be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) { + return 1; + } + return 0; +} + +/* * Link a new block into a doubly linked list of blocks (of whatever type). */ int /* error */ -xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, - xfs_da_state_blk_t *new_blk) +xfs_da3_blk_link( + struct xfs_da_state *state, + struct xfs_da_state_blk *old_blk, + struct xfs_da_state_blk *new_blk) { - xfs_da_blkinfo_t *old_info, *new_info, *tmp_info; - xfs_da_args_t *args; - int before=0, error; - struct xfs_buf *bp; + struct xfs_da_blkinfo *old_info; + struct xfs_da_blkinfo *new_info; + struct xfs_da_blkinfo *tmp_info; + struct xfs_da_args *args; + struct xfs_buf *bp; + int before = 0; + int error; + struct xfs_inode *dp = state->args->dp; /* * Set up environment. @@ -1306,19 +1664,16 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC || old_blk->magic == XFS_DIR2_LEAFN_MAGIC || old_blk->magic == XFS_ATTR_LEAF_MAGIC); - ASSERT(old_blk->magic == be16_to_cpu(old_info->magic)); - ASSERT(new_blk->magic == be16_to_cpu(new_info->magic)); - ASSERT(old_blk->magic == new_blk->magic); switch (old_blk->magic) { case XFS_ATTR_LEAF_MAGIC: before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp); break; case XFS_DIR2_LEAFN_MAGIC: - before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp); + before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp); break; case XFS_DA_NODE_MAGIC: - before = xfs_da_node_order(old_blk->bp, new_blk->bp); + before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp); break; } @@ -1333,14 +1688,14 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, new_info->forw = cpu_to_be32(old_blk->blkno); new_info->back = old_info->back; if (old_info->back) { - error = xfs_da_node_read(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, dp, be32_to_cpu(old_info->back), -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); tmp_info = bp->b_addr; - ASSERT(be16_to_cpu(tmp_info->magic) == be16_to_cpu(old_info->magic)); + ASSERT(tmp_info->magic == old_info->magic); ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno); tmp_info->forw = cpu_to_be32(new_blk->blkno); xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); @@ -1354,7 +1709,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, new_info->forw = old_info->forw; new_info->back = cpu_to_be32(old_blk->blkno); if (old_info->forw) { - error = xfs_da_node_read(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, dp, be32_to_cpu(old_info->forw), -1, &bp, args->whichfork); if (error) @@ -1375,59 +1730,20 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, } /* - * Compare two intermediate nodes for "order". - */ -STATIC int -xfs_da_node_order( - struct xfs_buf *node1_bp, - struct xfs_buf *node2_bp) -{ - xfs_da_intnode_t *node1, *node2; - - node1 = node1_bp->b_addr; - node2 = node2_bp->b_addr; - ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) && - node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) && - ((be32_to_cpu(node2->btree[0].hashval) < - be32_to_cpu(node1->btree[0].hashval)) || - (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) < - be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) { - return(1); - } - return(0); -} - -/* - * Pick up the last hashvalue from an intermediate node. - */ -STATIC uint -xfs_da_node_lasthash( - struct xfs_buf *bp, - int *count) -{ - xfs_da_intnode_t *node; - - node = bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - if (count) - *count = be16_to_cpu(node->hdr.count); - if (!node->hdr.count) - return(0); - return be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); -} - -/* * Unlink a block from a doubly linked list of blocks. */ STATIC int /* error */ -xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, - xfs_da_state_blk_t *save_blk) +xfs_da3_blk_unlink( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) { - xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info; - xfs_da_args_t *args; - struct xfs_buf *bp; - int error; + struct xfs_da_blkinfo *drop_info; + struct xfs_da_blkinfo *save_info; + struct xfs_da_blkinfo *tmp_info; + struct xfs_da_args *args; + struct xfs_buf *bp; + int error; /* * Set up environment. @@ -1439,8 +1755,6 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC || save_blk->magic == XFS_DIR2_LEAFN_MAGIC || save_blk->magic == XFS_ATTR_LEAF_MAGIC); - ASSERT(save_blk->magic == be16_to_cpu(save_info->magic)); - ASSERT(drop_blk->magic == be16_to_cpu(drop_info->magic)); ASSERT(save_blk->magic == drop_blk->magic); ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) || (be32_to_cpu(save_info->back) == drop_blk->blkno)); @@ -1454,7 +1768,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_unlink_back(args); save_info->back = drop_info->back; if (drop_info->back) { - error = xfs_da_node_read(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(drop_info->back), -1, &bp, args->whichfork); if (error) @@ -1471,7 +1785,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_unlink_forward(args); save_info->forw = drop_info->forw; if (drop_info->forw) { - error = xfs_da_node_read(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(drop_info->forw), -1, &bp, args->whichfork); if (error) @@ -1499,15 +1813,23 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * the new bottom and the root. */ int /* error */ -xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, - int forward, int release, int *result) +xfs_da3_path_shift( + struct xfs_da_state *state, + struct xfs_da_state_path *path, + int forward, + int release, + int *result) { - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *info; - xfs_da_intnode_t *node; - xfs_da_args_t *args; - xfs_dablk_t blkno=0; - int level, error; + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *info; + struct xfs_da_intnode *node; + struct xfs_da_args *args; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + xfs_dablk_t blkno = 0; + int level; + int error; + struct xfs_inode *dp = state->args->dp; trace_xfs_da_path_shift(state->args); @@ -1522,16 +1844,17 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); level = (path->active-1) - 1; /* skip bottom layer in path */ for (blk = &path->blk[level]; level >= 0; blk--, level--) { - ASSERT(blk->bp != NULL); node = blk->bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - if (forward && (blk->index < be16_to_cpu(node->hdr.count)-1)) { + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + + if (forward && (blk->index < nodehdr.count - 1)) { blk->index++; - blkno = be32_to_cpu(node->btree[blk->index].before); + blkno = be32_to_cpu(btree[blk->index].before); break; } else if (!forward && (blk->index > 0)) { blk->index--; - blkno = be32_to_cpu(node->btree[blk->index].before); + blkno = be32_to_cpu(btree[blk->index].before); break; } } @@ -1557,45 +1880,59 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, * Read the next child block. */ blk->blkno = blkno; - error = xfs_da_node_read(args->trans, args->dp, blkno, -1, + error = xfs_da3_node_read(args->trans, dp, blkno, -1, &blk->bp, args->whichfork); if (error) return(error); - ASSERT(blk->bp != NULL); info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || - info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - blk->magic = be16_to_cpu(info->magic); - if (blk->magic == XFS_DA_NODE_MAGIC) { + info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) || + info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + + + /* + * Note: we flatten the magic number to a single type so we + * don't have to compare against crc/non-crc types elsewhere. + */ + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + blk->magic = XFS_DA_NODE_MAGIC; node = (xfs_da_intnode_t *)info; - blk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval); if (forward) blk->index = 0; else - blk->index = be16_to_cpu(node->hdr.count)-1; - blkno = be32_to_cpu(node->btree[blk->index].before); - } else { + blk->index = nodehdr.count - 1; + blkno = be32_to_cpu(btree[blk->index].before); + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + blk->magic = XFS_ATTR_LEAF_MAGIC; ASSERT(level == path->active-1); blk->index = 0; - switch(blk->magic) { - case XFS_ATTR_LEAF_MAGIC: - blk->hashval = xfs_attr_leaf_lasthash(blk->bp, - NULL); - break; - case XFS_DIR2_LEAFN_MAGIC: - blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, - NULL); - break; - default: - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC || - blk->magic == XFS_DIR2_LEAFN_MAGIC); - break; - } + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); + break; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + blk->magic = XFS_DIR2_LEAFN_MAGIC; + ASSERT(level == path->active-1); + blk->index = 0; + blk->hashval = xfs_dir2_leafn_lasthash(args->dp, + blk->bp, NULL); + break; + default: + ASSERT(0); + break; } } *result = 0; - return(0); + return 0; } @@ -1754,20 +2091,12 @@ xfs_da_grow_inode( xfs_dablk_t *new_blkno) { xfs_fileoff_t bno; - int count; int error; trace_xfs_da_grow_inode(args); - if (args->whichfork == XFS_DATA_FORK) { - bno = args->dp->i_mount->m_dirleafblk; - count = args->dp->i_mount->m_dirblkfsbs; - } else { - bno = 0; - count = 1; - } - - error = xfs_da_grow_inode_int(args, &bno, count); + bno = args->geo->leafblk; + error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount); if (!error) *new_blkno = (xfs_dablk_t)bno; return error; @@ -1782,34 +2111,48 @@ xfs_da_grow_inode( * a bmap btree split to do that. */ STATIC int -xfs_da_swap_lastblock( - xfs_da_args_t *args, - xfs_dablk_t *dead_blknop, - struct xfs_buf **dead_bufp) +xfs_da3_swap_lastblock( + struct xfs_da_args *args, + xfs_dablk_t *dead_blknop, + struct xfs_buf **dead_bufp) { - xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno; - struct xfs_buf *dead_buf, *last_buf, *sib_buf, *par_buf; - xfs_fileoff_t lastoff; - xfs_inode_t *ip; - xfs_trans_t *tp; - xfs_mount_t *mp; - int error, w, entno, level, dead_level; - xfs_da_blkinfo_t *dead_info, *sib_info; - xfs_da_intnode_t *par_node, *dead_node; - xfs_dir2_leaf_t *dead_leaf2; - xfs_dahash_t dead_hash; + struct xfs_da_blkinfo *dead_info; + struct xfs_da_blkinfo *sib_info; + struct xfs_da_intnode *par_node; + struct xfs_da_intnode *dead_node; + struct xfs_dir2_leaf *dead_leaf2; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr par_hdr; + struct xfs_inode *dp; + struct xfs_trans *tp; + struct xfs_mount *mp; + struct xfs_buf *dead_buf; + struct xfs_buf *last_buf; + struct xfs_buf *sib_buf; + struct xfs_buf *par_buf; + xfs_dahash_t dead_hash; + xfs_fileoff_t lastoff; + xfs_dablk_t dead_blkno; + xfs_dablk_t last_blkno; + xfs_dablk_t sib_blkno; + xfs_dablk_t par_blkno; + int error; + int w; + int entno; + int level; + int dead_level; trace_xfs_da_swap_lastblock(args); dead_buf = *dead_bufp; dead_blkno = *dead_blknop; tp = args->trans; - ip = args->dp; + dp = args->dp; w = args->whichfork; ASSERT(w == XFS_DATA_FORK); - mp = ip->i_mount; - lastoff = mp->m_dirfreeblk; - error = xfs_bmap_last_before(tp, ip, &lastoff, w); + mp = dp->i_mount; + lastoff = args->geo->freeblk; + error = xfs_bmap_last_before(tp, dp, &lastoff, w); if (error) return error; if (unlikely(lastoff == 0)) { @@ -1820,35 +2163,44 @@ xfs_da_swap_lastblock( /* * Read the last block in the btree space. */ - last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; - error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w); + last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount; + error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w); if (error) return error; /* * Copy the last block into the dead buffer and log it. */ - memcpy(dead_buf->b_addr, last_buf->b_addr, mp->m_dirblksize); - xfs_trans_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1); + memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize); + xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1); dead_info = dead_buf->b_addr; /* * Get values from the moved block. */ - if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) { + if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + dead_leaf2 = (xfs_dir2_leaf_t *)dead_info; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2); + ents = dp->d_ops->leaf_ents_p(dead_leaf2); dead_level = 0; - dead_hash = be32_to_cpu(dead_leaf2->ents[be16_to_cpu(dead_leaf2->hdr.count) - 1].hashval); + dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval); } else { - ASSERT(dead_info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + struct xfs_da3_icnode_hdr deadhdr; + dead_node = (xfs_da_intnode_t *)dead_info; - dead_level = be16_to_cpu(dead_node->hdr.level); - dead_hash = be32_to_cpu(dead_node->btree[be16_to_cpu(dead_node->hdr.count) - 1].hashval); + dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node); + btree = dp->d_ops->node_tree_p(dead_node); + dead_level = deadhdr.level; + dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval); } sib_buf = par_buf = NULL; /* * If the moved block has a left sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->back))) { - error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); + error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; @@ -1870,7 +2222,7 @@ xfs_da_swap_lastblock( * If the moved block has a right sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->forw))) { - error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); + error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; @@ -1888,37 +2240,37 @@ xfs_da_swap_lastblock( sizeof(sib_info->back))); sib_buf = NULL; } - par_blkno = mp->m_dirleafblk; + par_blkno = args->geo->leafblk; level = -1; /* * Walk down the tree looking for the parent of the moved block. */ for (;;) { - error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); + error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; - if (unlikely(par_node->hdr.info.magic != - cpu_to_be16(XFS_DA_NODE_MAGIC) || - (level >= 0 && level != be16_to_cpu(par_node->hdr.level) + 1))) { + dp->d_ops->node_hdr_from_disk(&par_hdr, par_node); + if (level >= 0 && level != par_hdr.level + 1) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); goto done; } - level = be16_to_cpu(par_node->hdr.level); + level = par_hdr.level; + btree = dp->d_ops->node_tree_p(par_node); for (entno = 0; - entno < be16_to_cpu(par_node->hdr.count) && - be32_to_cpu(par_node->btree[entno].hashval) < dead_hash; + entno < par_hdr.count && + be32_to_cpu(btree[entno].hashval) < dead_hash; entno++) continue; - if (unlikely(entno == be16_to_cpu(par_node->hdr.count))) { + if (entno == par_hdr.count) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); goto done; } - par_blkno = be32_to_cpu(par_node->btree[entno].before); + par_blkno = be32_to_cpu(btree[entno].before); if (level == dead_level + 1) break; xfs_trans_brelse(tp, par_buf); @@ -1930,13 +2282,13 @@ xfs_da_swap_lastblock( */ for (;;) { for (; - entno < be16_to_cpu(par_node->hdr.count) && - be32_to_cpu(par_node->btree[entno].before) != last_blkno; + entno < par_hdr.count && + be32_to_cpu(btree[entno].before) != last_blkno; entno++) continue; - if (entno < be16_to_cpu(par_node->hdr.count)) + if (entno < par_hdr.count) break; - par_blkno = be32_to_cpu(par_node->hdr.info.forw); + par_blkno = par_hdr.forw; xfs_trans_brelse(tp, par_buf); par_buf = NULL; if (unlikely(par_blkno == 0)) { @@ -1945,27 +2297,27 @@ xfs_da_swap_lastblock( error = XFS_ERROR(EFSCORRUPTED); goto done; } - error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); + error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; - if (unlikely( - be16_to_cpu(par_node->hdr.level) != level || - par_node->hdr.info.magic != cpu_to_be16(XFS_DA_NODE_MAGIC))) { + dp->d_ops->node_hdr_from_disk(&par_hdr, par_node); + if (par_hdr.level != level) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); goto done; } + btree = dp->d_ops->node_tree_p(par_node); entno = 0; } /* * Update the parent entry pointing to the moved block. */ - par_node->btree[entno].before = cpu_to_be32(dead_blkno); + btree[entno].before = cpu_to_be32(dead_blkno); xfs_trans_log_buf(tp, par_buf, - XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before, - sizeof(par_node->btree[entno].before))); + XFS_DA_LOGRANGE(par_node, &btree[entno].before, + sizeof(btree[entno].before))); *dead_blknop = last_blkno; *dead_bufp = last_buf; return 0; @@ -1998,23 +2350,21 @@ xfs_da_shrink_inode( w = args->whichfork; tp = args->trans; mp = dp->i_mount; - if (w == XFS_DATA_FORK) - count = mp->m_dirblkfsbs; - else - count = 1; + count = args->geo->fsbcount; for (;;) { /* * Remove extents. If we get ENOSPC for a dir we have to move * the last block to the place we want to kill. */ - if ((error = xfs_bunmapi(tp, dp, dead_blkno, count, - xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, - 0, args->firstblock, args->flist, - &done)) == ENOSPC) { + error = xfs_bunmapi(tp, dp, dead_blkno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, + 0, args->firstblock, args->flist, &done); + if (error == ENOSPC) { if (w != XFS_DATA_FORK) break; - if ((error = xfs_da_swap_lastblock(args, &dead_blkno, - &dead_buf))) + error = xfs_da3_swap_lastblock(args, &dead_blkno, + &dead_buf); + if (error) break; } else { break; @@ -2063,9 +2413,9 @@ static int xfs_buf_map_from_irec( struct xfs_mount *mp, struct xfs_buf_map **mapp, - unsigned int *nmaps, + int *nmaps, struct xfs_bmbt_irec *irecs, - unsigned int nirecs) + int nirecs) { struct xfs_buf_map *map; int i; @@ -2074,7 +2424,8 @@ xfs_buf_map_from_irec( ASSERT(nirecs >= 1); if (nirecs > 1) { - map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_SLEEP); + map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), + KM_SLEEP | KM_NOFS); if (!map) return ENOMEM; *mapp = map; @@ -2101,7 +2452,6 @@ xfs_buf_map_from_irec( */ static int xfs_dabuf_map( - struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, @@ -2119,7 +2469,10 @@ xfs_dabuf_map( ASSERT(map && *map); ASSERT(*nmaps == 1); - nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1; + if (whichfork == XFS_DATA_FORK) + nfsb = mp->m_dir_geo->fsbcount; + else + nfsb = mp->m_attr_geo->fsbcount; /* * Caller doesn't have a mapping. -2 means don't complain @@ -2130,7 +2483,8 @@ xfs_dabuf_map( * Optimize the one-block case. */ if (nfsb != 1) - irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP); + irecs = kmem_zalloc(sizeof(irec) * nfsb, + KM_SLEEP | KM_NOFS); nirecs = nfsb; error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, @@ -2196,7 +2550,7 @@ xfs_da_get_buf( *bpp = NULL; mapp = ↦ nmap = 1; - error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, + error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, &mapp, &nmap); if (error) { /* mapping a hole is not an error, but we don't continue */ @@ -2244,7 +2598,7 @@ xfs_da_read_buf( *bpp = NULL; mapp = ↦ nmap = 1; - error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, + error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, &mapp, &nmap); if (error) { /* mapping a hole is not an error, but we don't continue */ @@ -2263,38 +2617,6 @@ xfs_da_read_buf( xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); else xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); - - /* - * This verification code will be moved to a CRC verification callback - * function so just leave it here unchanged until then. - */ - { - xfs_dir2_data_hdr_t *hdr = bp->b_addr; - xfs_dir2_free_t *free = bp->b_addr; - xfs_da_blkinfo_t *info = bp->b_addr; - uint magic, magic1; - struct xfs_mount *mp = dp->i_mount; - - magic = be16_to_cpu(info->magic); - magic1 = be32_to_cpu(hdr->magic); - if (unlikely( - XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) && - (magic != XFS_ATTR_LEAF_MAGIC) && - (magic != XFS_DIR2_LEAF1_MAGIC) && - (magic != XFS_DIR2_LEAFN_MAGIC) && - (magic1 != XFS_DIR2_BLOCK_MAGIC) && - (magic1 != XFS_DIR2_DATA_MAGIC) && - (free->hdr.magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)), - mp, XFS_ERRTAG_DA_READ_BUF, - XFS_RANDOM_DA_READ_BUF))) { - trace_xfs_da_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)", - XFS_ERRLEVEL_LOW, mp, info); - error = XFS_ERROR(EFSCORRUPTED); - xfs_trans_brelse(trans, bp); - goto out_free; - } - } *bpp = bp; out_free: if (mapp != &map) @@ -2308,7 +2630,6 @@ out_free: */ xfs_daddr_t xfs_da_reada_buf( - struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, @@ -2322,7 +2643,7 @@ xfs_da_reada_buf( mapp = ↦ nmap = 1; - error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, + error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, &mapp, &nmap); if (error) { /* mapping a hole is not an error, but we don't continue */ @@ -2342,41 +2663,3 @@ out_free: return -1; return mappedbno; } - -kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ - -/* - * Allocate a dir-state structure. - * We don't put them on the stack since they're large. - */ -xfs_da_state_t * -xfs_da_state_alloc(void) -{ - return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); -} - -/* - * Kill the altpath contents of a da-state structure. - */ -STATIC void -xfs_da_state_kill_altpath(xfs_da_state_t *state) -{ - int i; - - for (i = 0; i < state->altpath.active; i++) - state->altpath.blk[i].bp = NULL; - state->altpath.active = 0; -} - -/* - * Free a da-state structure. - */ -void -xfs_da_state_free(xfs_da_state_t *state) -{ - xfs_da_state_kill_altpath(state); -#ifdef DEBUG - memset((char *)state, 0, sizeof(*state)); -#endif /* DEBUG */ - kmem_zone_free(xfs_da_state_zone, state); -} diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index ee5170c46ae..6e153e399a7 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -20,58 +21,26 @@ struct xfs_bmap_free; struct xfs_inode; -struct xfs_mount; struct xfs_trans; struct zone; - -/*======================================================================== - * Directory Structure when greater than XFS_LBSIZE(mp) bytes. - *========================================================================*/ - -/* - * This structure is common to both leaf nodes and non-leaf nodes in the Btree. - * - * It is used to manage a doubly linked list of all blocks at the same - * level in the Btree, and to identify which type of block this is. - */ -#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */ -#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */ -#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ -#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ - -typedef struct xfs_da_blkinfo { - __be32 forw; /* previous block in list */ - __be32 back; /* following block in list */ - __be16 magic; /* validity check on block */ - __be16 pad; /* unused */ -} xfs_da_blkinfo_t; +struct xfs_dir_ops; /* - * This is the structure of the root and intermediate nodes in the Btree. - * The leaf nodes are defined above. - * - * Entries are not packed. - * - * Since we have duplicate keys, use a binary search but always follow - * all match in the block, not just the first match found. + * Directory/attribute geometry information. There will be one of these for each + * data fork type, and it will be passed around via the xfs_da_args. Global + * structures will be attached to the xfs_mount. */ -#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ - -typedef struct xfs_da_intnode { - struct xfs_da_node_hdr { /* constant-structure header block */ - xfs_da_blkinfo_t info; /* block type, links, etc. */ - __be16 count; /* count of active entries */ - __be16 level; /* level above leaves (leaf == 0) */ - } hdr; - struct xfs_da_node_entry { - __be32 hashval; /* hash value for this descendant */ - __be32 before; /* Btree block before this key */ - } btree[1]; /* variable sized array of keys */ -} xfs_da_intnode_t; -typedef struct xfs_da_node_hdr xfs_da_node_hdr_t; -typedef struct xfs_da_node_entry xfs_da_node_entry_t; - -#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize +struct xfs_da_geometry { + int blksize; /* da block size in bytes */ + int fsbcount; /* da block size in filesystem blocks */ + uint8_t fsblog; /* log2 of _filesystem_ block size */ + uint8_t blklog; /* log2 of da block size */ + uint node_ents; /* # of entries in a danode */ + int magicpct; /* 37% of block size in bytes */ + xfs_dablk_t datablk; /* blockno of dir data v2 */ + xfs_dablk_t leafblk; /* blockno of leaf data v2 */ + xfs_dablk_t freeblk; /* blockno of free data v2 */ +}; /*======================================================================== * Btree searching and modification structure definitions. @@ -90,8 +59,10 @@ enum xfs_dacmp { * Structure to ease passing around component names. */ typedef struct xfs_da_args { + struct xfs_da_geometry *geo; /* da block geometry */ const __uint8_t *name; /* string (maybe not NULL terminated) */ int namelen; /* length of string (maybe no NULL) */ + __uint8_t filetype; /* filetype of inode for directories */ __uint8_t *value; /* set of bytes (maybe contain NULLs) */ int valuelen; /* length of value */ int flags; /* argument flags (eg: ATTR_NOCREATE) */ @@ -107,10 +78,12 @@ typedef struct xfs_da_args { int index; /* index of attr of interest in blk */ xfs_dablk_t rmtblkno; /* remote attr value starting blkno */ int rmtblkcnt; /* remote attr value block count */ + int rmtvaluelen; /* remote attr value length in bytes */ xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */ int index2; /* index of 2nd attr in blk */ xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ + int rmtvaluelen2; /* remote attr value length in bytes */ int op_flags; /* operation flags */ enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; @@ -155,8 +128,6 @@ typedef struct xfs_da_state_path { typedef struct xfs_da_state { xfs_da_args_t *args; /* filename arguments */ struct xfs_mount *mp; /* filesystem mount point */ - unsigned int blocksize; /* logical block size */ - unsigned int node_ents; /* how many entries in danode */ xfs_da_state_path_t path; /* search/split paths */ xfs_da_state_path_t altpath; /* alternate path for join */ unsigned char inleaf; /* insert into 1->lf, 0->splf */ @@ -191,29 +162,29 @@ struct xfs_nameops { /* * Routines used for growing the Btree. */ -int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, - struct xfs_buf **bpp, int whichfork); -int xfs_da_split(xfs_da_state_t *state); +int xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno, + int level, struct xfs_buf **bpp, int whichfork); +int xfs_da3_split(xfs_da_state_t *state); /* * Routines used for shrinking the Btree. */ -int xfs_da_join(xfs_da_state_t *state); -void xfs_da_fixhashpath(xfs_da_state_t *state, - xfs_da_state_path_t *path_to_to_fix); +int xfs_da3_join(xfs_da_state_t *state); +void xfs_da3_fixhashpath(struct xfs_da_state *state, + struct xfs_da_state_path *path_to_to_fix); /* * Routines used for finding things in the Btree. */ -int xfs_da_node_lookup_int(xfs_da_state_t *state, int *result); -int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, +int xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result); +int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, int forward, int release, int *result); /* * Utility routines. */ -int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, +int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, xfs_da_state_blk_t *new_blk); -int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp, +int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, int which_fork); @@ -230,9 +201,9 @@ int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, int whichfork, const struct xfs_buf_ops *ops); -xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mapped_bno, - int whichfork, const struct xfs_buf_ops *ops); +xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, + xfs_daddr_t mapped_bno, int whichfork, + const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c new file mode 100644 index 00000000000..c9aee52a37e --- /dev/null +++ b/fs/xfs/xfs_da_format.c @@ -0,0 +1,911 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" + +/* + * Shortform directory ops + */ +static int +xfs_dir2_sf_entsize( + struct xfs_dir2_sf_hdr *hdr, + int len) +{ + int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ + + count += len; /* name */ + count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) : + sizeof(xfs_dir2_ino4_t); /* ino # */ + return count; +} + +static int +xfs_dir3_sf_entsize( + struct xfs_dir2_sf_hdr *hdr, + int len) +{ + return xfs_dir2_sf_entsize(hdr, len) + sizeof(__uint8_t); +} + +static struct xfs_dir2_sf_entry * +xfs_dir2_sf_nextentry( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return (struct xfs_dir2_sf_entry *) + ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen)); +} + +static struct xfs_dir2_sf_entry * +xfs_dir3_sf_nextentry( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return (struct xfs_dir2_sf_entry *) + ((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen)); +} + + +/* + * For filetype enabled shortform directories, the file type field is stored at + * the end of the name. Because it's only a single byte, endian conversion is + * not necessary. For non-filetype enable directories, the type is always + * unknown and we never store the value. + */ +static __uint8_t +xfs_dir2_sfe_get_ftype( + struct xfs_dir2_sf_entry *sfep) +{ + return XFS_DIR3_FT_UNKNOWN; +} + +static void +xfs_dir2_sfe_put_ftype( + struct xfs_dir2_sf_entry *sfep, + __uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); +} + +static __uint8_t +xfs_dir3_sfe_get_ftype( + struct xfs_dir2_sf_entry *sfep) +{ + __uint8_t ftype; + + ftype = sfep->name[sfep->namelen]; + if (ftype >= XFS_DIR3_FT_MAX) + return XFS_DIR3_FT_UNKNOWN; + return ftype; +} + +static void +xfs_dir3_sfe_put_ftype( + struct xfs_dir2_sf_entry *sfep, + __uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); + + sfep->name[sfep->namelen] = ftype; +} + +/* + * Inode numbers in short-form directories can come in two versions, + * either 4 bytes or 8 bytes wide. These helpers deal with the + * two forms transparently by looking at the headers i8count field. + * + * For 64-bit inode number the most significant byte must be zero. + */ +static xfs_ino_t +xfs_dir2_sf_get_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_dir2_inou_t *from) +{ + if (hdr->i8count) + return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL; + else + return get_unaligned_be32(&from->i4.i); +} + +static void +xfs_dir2_sf_put_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_dir2_inou_t *to, + xfs_ino_t ino) +{ + ASSERT((ino & 0xff00000000000000ULL) == 0); + + if (hdr->i8count) + put_unaligned_be64(ino, &to->i8.i); + else + put_unaligned_be32(ino, &to->i4.i); +} + +static xfs_ino_t +xfs_dir2_sf_get_parent_ino( + struct xfs_dir2_sf_hdr *hdr) +{ + return xfs_dir2_sf_get_ino(hdr, &hdr->parent); +} + +static void +xfs_dir2_sf_put_parent_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_ino_t ino) +{ + xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino); +} + +/* + * In short-form directory entries the inode numbers are stored at variable + * offset behind the entry name. If the entry stores a filetype value, then it + * sits between the name and the inode number. Hence the inode numbers may only + * be accessed through the helpers below. + */ +static xfs_ino_t +xfs_dir2_sfe_get_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return xfs_dir2_sf_get_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]); +} + +static void +xfs_dir2_sfe_put_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, + xfs_ino_t ino) +{ + xfs_dir2_sf_put_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino); +} + +static xfs_ino_t +xfs_dir3_sfe_get_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return xfs_dir2_sf_get_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]); +} + +static void +xfs_dir3_sfe_put_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, + xfs_ino_t ino) +{ + xfs_dir2_sf_put_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino); +} + + +/* + * Directory data block operations + */ + +/* + * For special situations, the dirent size ends up fixed because we always know + * what the size of the entry is. That's true for the "." and "..", and + * therefore we know that they are a fixed size and hence their offsets are + * constant, as is the first entry. + * + * Hence, this calculation is written as a macro to be able to be calculated at + * compile time and so certain offsets can be calculated directly in the + * structure initaliser via the macro. There are two macros - one for dirents + * with ftype and without so there are no unresolvable conditionals in the + * calculations. We also use round_up() as XFS_DIR2_DATA_ALIGN is always a power + * of 2 and the compiler doesn't reject it (unlike roundup()). + */ +#define XFS_DIR2_DATA_ENTSIZE(n) \ + round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \ + sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN) + +#define XFS_DIR3_DATA_ENTSIZE(n) \ + round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \ + sizeof(xfs_dir2_data_off_t) + sizeof(__uint8_t)), \ + XFS_DIR2_DATA_ALIGN) + +static int +xfs_dir2_data_entsize( + int n) +{ + return XFS_DIR2_DATA_ENTSIZE(n); +} + +static int +xfs_dir3_data_entsize( + int n) +{ + return XFS_DIR3_DATA_ENTSIZE(n); +} + +static __uint8_t +xfs_dir2_data_get_ftype( + struct xfs_dir2_data_entry *dep) +{ + return XFS_DIR3_FT_UNKNOWN; +} + +static void +xfs_dir2_data_put_ftype( + struct xfs_dir2_data_entry *dep, + __uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); +} + +static __uint8_t +xfs_dir3_data_get_ftype( + struct xfs_dir2_data_entry *dep) +{ + __uint8_t ftype = dep->name[dep->namelen]; + + ASSERT(ftype < XFS_DIR3_FT_MAX); + if (ftype >= XFS_DIR3_FT_MAX) + return XFS_DIR3_FT_UNKNOWN; + return ftype; +} + +static void +xfs_dir3_data_put_ftype( + struct xfs_dir2_data_entry *dep, + __uint8_t type) +{ + ASSERT(type < XFS_DIR3_FT_MAX); + ASSERT(dep->namelen != 0); + + dep->name[dep->namelen] = type; +} + +/* + * Pointer to an entry's tag word. + */ +static __be16 * +xfs_dir2_data_entry_tag_p( + struct xfs_dir2_data_entry *dep) +{ + return (__be16 *)((char *)dep + + xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16)); +} + +static __be16 * +xfs_dir3_data_entry_tag_p( + struct xfs_dir2_data_entry *dep) +{ + return (__be16 *)((char *)dep + + xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16)); +} + +/* + * location of . and .. in data space (always block 0) + */ +static struct xfs_dir2_data_entry * +xfs_dir2_data_dot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_data_dotdot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_data_first_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1) + + XFS_DIR2_DATA_ENTSIZE(2)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_ftype_data_dotdot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_ftype_data_first_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_dot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_dotdot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_first_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2)); +} + +static struct xfs_dir2_data_free * +xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr *hdr) +{ + return hdr->bestfree; +} + +static struct xfs_dir2_data_free * +xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr) +{ + return ((struct xfs_dir3_data_hdr *)hdr)->best_free; +} + +static struct xfs_dir2_data_entry * +xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); +} + +static struct xfs_dir2_data_unused * +xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_unused *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); +} + +static struct xfs_dir2_data_unused * +xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_unused *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); +} + + +/* + * Directory Leaf block operations + */ +static int +xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) / + (uint)sizeof(struct xfs_dir2_leaf_entry); +} + +static struct xfs_dir2_leaf_entry * +xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp) +{ + return lp->__ents; +} + +static int +xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) / + (uint)sizeof(struct xfs_dir2_leaf_entry); +} + +static struct xfs_dir2_leaf_entry * +xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp) +{ + return ((struct xfs_dir3_leaf *)lp)->__ents; +} + +static void +xfs_dir2_leaf_hdr_from_disk( + struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from) +{ + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.count); + to->stale = be16_to_cpu(from->hdr.stale); + + ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC || + to->magic == XFS_DIR2_LEAFN_MAGIC); +} + +static void +xfs_dir2_leaf_hdr_to_disk( + struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from) +{ + ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC || + from->magic == XFS_DIR2_LEAFN_MAGIC); + + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.count = cpu_to_be16(from->count); + to->hdr.stale = cpu_to_be16(from->stale); +} + +static void +xfs_dir3_leaf_hdr_from_disk( + struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from) +{ + struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from; + + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->count); + to->stale = be16_to_cpu(hdr3->stale); + + ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC || + to->magic == XFS_DIR3_LEAFN_MAGIC); +} + +static void +xfs_dir3_leaf_hdr_to_disk( + struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from) +{ + struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to; + + ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC || + from->magic == XFS_DIR3_LEAFN_MAGIC); + + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->count = cpu_to_be16(from->count); + hdr3->stale = cpu_to_be16(from->stale); +} + + +/* + * Directory/Attribute Node block operations + */ +static struct xfs_da_node_entry * +xfs_da2_node_tree_p(struct xfs_da_intnode *dap) +{ + return dap->__btree; +} + +static struct xfs_da_node_entry * +xfs_da3_node_tree_p(struct xfs_da_intnode *dap) +{ + return ((struct xfs_da3_intnode *)dap)->__btree; +} + +static void +xfs_da2_node_hdr_from_disk( + struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from) +{ + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.__count); + to->level = be16_to_cpu(from->hdr.__level); +} + +static void +xfs_da2_node_hdr_to_disk( + struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from) +{ + ASSERT(from->magic == XFS_DA_NODE_MAGIC); + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.__count = cpu_to_be16(from->count); + to->hdr.__level = cpu_to_be16(from->level); +} + +static void +xfs_da3_node_hdr_from_disk( + struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from) +{ + struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from; + + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)); + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->__count); + to->level = be16_to_cpu(hdr3->__level); +} + +static void +xfs_da3_node_hdr_to_disk( + struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from) +{ + struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to; + + ASSERT(from->magic == XFS_DA3_NODE_MAGIC); + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->__count = cpu_to_be16(from->count); + hdr3->__level = cpu_to_be16(from->level); +} + + +/* + * Directory free space block operations + */ +static int +xfs_dir2_free_max_bests(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) / + sizeof(xfs_dir2_data_off_t); +} + +static __be16 * +xfs_dir2_free_bests_p(struct xfs_dir2_free *free) +{ + return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr)); +} + +/* + * Convert data space db to the corresponding free db. + */ +static xfs_dir2_db_t +xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + + (db / xfs_dir2_free_max_bests(geo)); +} + +/* + * Convert data space db to the corresponding index in a free db. + */ +static int +xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return db % xfs_dir2_free_max_bests(geo); +} + +static int +xfs_dir3_free_max_bests(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) / + sizeof(xfs_dir2_data_off_t); +} + +static __be16 * +xfs_dir3_free_bests_p(struct xfs_dir2_free *free) +{ + return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr)); +} + +/* + * Convert data space db to the corresponding free db. + */ +static xfs_dir2_db_t +xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + + (db / xfs_dir3_free_max_bests(geo)); +} + +/* + * Convert data space db to the corresponding index in a free db. + */ +static int +xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return db % xfs_dir3_free_max_bests(geo); +} + +static void +xfs_dir2_free_hdr_from_disk( + struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from) +{ + to->magic = be32_to_cpu(from->hdr.magic); + to->firstdb = be32_to_cpu(from->hdr.firstdb); + to->nvalid = be32_to_cpu(from->hdr.nvalid); + to->nused = be32_to_cpu(from->hdr.nused); + ASSERT(to->magic == XFS_DIR2_FREE_MAGIC); +} + +static void +xfs_dir2_free_hdr_to_disk( + struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from) +{ + ASSERT(from->magic == XFS_DIR2_FREE_MAGIC); + + to->hdr.magic = cpu_to_be32(from->magic); + to->hdr.firstdb = cpu_to_be32(from->firstdb); + to->hdr.nvalid = cpu_to_be32(from->nvalid); + to->hdr.nused = cpu_to_be32(from->nused); +} + +static void +xfs_dir3_free_hdr_from_disk( + struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from) +{ + struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from; + + to->magic = be32_to_cpu(hdr3->hdr.magic); + to->firstdb = be32_to_cpu(hdr3->firstdb); + to->nvalid = be32_to_cpu(hdr3->nvalid); + to->nused = be32_to_cpu(hdr3->nused); + + ASSERT(to->magic == XFS_DIR3_FREE_MAGIC); +} + +static void +xfs_dir3_free_hdr_to_disk( + struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from) +{ + struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to; + + ASSERT(from->magic == XFS_DIR3_FREE_MAGIC); + + hdr3->hdr.magic = cpu_to_be32(from->magic); + hdr3->firstdb = cpu_to_be32(from->firstdb); + hdr3->nvalid = cpu_to_be32(from->nvalid); + hdr3->nused = cpu_to_be32(from->nused); +} + +static const struct xfs_dir_ops xfs_dir2_ops = { + .sf_entsize = xfs_dir2_sf_entsize, + .sf_nextentry = xfs_dir2_sf_nextentry, + .sf_get_ftype = xfs_dir2_sfe_get_ftype, + .sf_put_ftype = xfs_dir2_sfe_put_ftype, + .sf_get_ino = xfs_dir2_sfe_get_ino, + .sf_put_ino = xfs_dir2_sfe_put_ino, + .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, + .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, + + .data_entsize = xfs_dir2_data_entsize, + .data_get_ftype = xfs_dir2_data_get_ftype, + .data_put_ftype = xfs_dir2_data_put_ftype, + .data_entry_tag_p = xfs_dir2_data_entry_tag_p, + .data_bestfree_p = xfs_dir2_data_bestfree_p, + + .data_dot_offset = sizeof(struct xfs_dir2_data_hdr), + .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1), + .data_first_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1) + + XFS_DIR2_DATA_ENTSIZE(2), + .data_entry_offset = sizeof(struct xfs_dir2_data_hdr), + + .data_dot_entry_p = xfs_dir2_data_dot_entry_p, + .data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p, + .data_first_entry_p = xfs_dir2_data_first_entry_p, + .data_entry_p = xfs_dir2_data_entry_p, + .data_unused_p = xfs_dir2_data_unused_p, + + .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr), + .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk, + .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk, + .leaf_max_ents = xfs_dir2_max_leaf_ents, + .leaf_ents_p = xfs_dir2_leaf_ents_p, + + .node_hdr_size = sizeof(struct xfs_da_node_hdr), + .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, + .node_tree_p = xfs_da2_node_tree_p, + + .free_hdr_size = sizeof(struct xfs_dir2_free_hdr), + .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk, + .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk, + .free_max_bests = xfs_dir2_free_max_bests, + .free_bests_p = xfs_dir2_free_bests_p, + .db_to_fdb = xfs_dir2_db_to_fdb, + .db_to_fdindex = xfs_dir2_db_to_fdindex, +}; + +static const struct xfs_dir_ops xfs_dir2_ftype_ops = { + .sf_entsize = xfs_dir3_sf_entsize, + .sf_nextentry = xfs_dir3_sf_nextentry, + .sf_get_ftype = xfs_dir3_sfe_get_ftype, + .sf_put_ftype = xfs_dir3_sfe_put_ftype, + .sf_get_ino = xfs_dir3_sfe_get_ino, + .sf_put_ino = xfs_dir3_sfe_put_ino, + .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, + .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, + + .data_entsize = xfs_dir3_data_entsize, + .data_get_ftype = xfs_dir3_data_get_ftype, + .data_put_ftype = xfs_dir3_data_put_ftype, + .data_entry_tag_p = xfs_dir3_data_entry_tag_p, + .data_bestfree_p = xfs_dir2_data_bestfree_p, + + .data_dot_offset = sizeof(struct xfs_dir2_data_hdr), + .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1), + .data_first_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2), + .data_entry_offset = sizeof(struct xfs_dir2_data_hdr), + + .data_dot_entry_p = xfs_dir2_data_dot_entry_p, + .data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p, + .data_first_entry_p = xfs_dir2_ftype_data_first_entry_p, + .data_entry_p = xfs_dir2_data_entry_p, + .data_unused_p = xfs_dir2_data_unused_p, + + .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr), + .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk, + .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk, + .leaf_max_ents = xfs_dir2_max_leaf_ents, + .leaf_ents_p = xfs_dir2_leaf_ents_p, + + .node_hdr_size = sizeof(struct xfs_da_node_hdr), + .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, + .node_tree_p = xfs_da2_node_tree_p, + + .free_hdr_size = sizeof(struct xfs_dir2_free_hdr), + .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk, + .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk, + .free_max_bests = xfs_dir2_free_max_bests, + .free_bests_p = xfs_dir2_free_bests_p, + .db_to_fdb = xfs_dir2_db_to_fdb, + .db_to_fdindex = xfs_dir2_db_to_fdindex, +}; + +static const struct xfs_dir_ops xfs_dir3_ops = { + .sf_entsize = xfs_dir3_sf_entsize, + .sf_nextentry = xfs_dir3_sf_nextentry, + .sf_get_ftype = xfs_dir3_sfe_get_ftype, + .sf_put_ftype = xfs_dir3_sfe_put_ftype, + .sf_get_ino = xfs_dir3_sfe_get_ino, + .sf_put_ino = xfs_dir3_sfe_put_ino, + .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, + .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, + + .data_entsize = xfs_dir3_data_entsize, + .data_get_ftype = xfs_dir3_data_get_ftype, + .data_put_ftype = xfs_dir3_data_put_ftype, + .data_entry_tag_p = xfs_dir3_data_entry_tag_p, + .data_bestfree_p = xfs_dir3_data_bestfree_p, + + .data_dot_offset = sizeof(struct xfs_dir3_data_hdr), + .data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1), + .data_first_offset = sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2), + .data_entry_offset = sizeof(struct xfs_dir3_data_hdr), + + .data_dot_entry_p = xfs_dir3_data_dot_entry_p, + .data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p, + .data_first_entry_p = xfs_dir3_data_first_entry_p, + .data_entry_p = xfs_dir3_data_entry_p, + .data_unused_p = xfs_dir3_data_unused_p, + + .leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr), + .leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk, + .leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk, + .leaf_max_ents = xfs_dir3_max_leaf_ents, + .leaf_ents_p = xfs_dir3_leaf_ents_p, + + .node_hdr_size = sizeof(struct xfs_da3_node_hdr), + .node_hdr_to_disk = xfs_da3_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da3_node_hdr_from_disk, + .node_tree_p = xfs_da3_node_tree_p, + + .free_hdr_size = sizeof(struct xfs_dir3_free_hdr), + .free_hdr_to_disk = xfs_dir3_free_hdr_to_disk, + .free_hdr_from_disk = xfs_dir3_free_hdr_from_disk, + .free_max_bests = xfs_dir3_free_max_bests, + .free_bests_p = xfs_dir3_free_bests_p, + .db_to_fdb = xfs_dir3_db_to_fdb, + .db_to_fdindex = xfs_dir3_db_to_fdindex, +}; + +static const struct xfs_dir_ops xfs_dir2_nondir_ops = { + .node_hdr_size = sizeof(struct xfs_da_node_hdr), + .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, + .node_tree_p = xfs_da2_node_tree_p, +}; + +static const struct xfs_dir_ops xfs_dir3_nondir_ops = { + .node_hdr_size = sizeof(struct xfs_da3_node_hdr), + .node_hdr_to_disk = xfs_da3_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da3_node_hdr_from_disk, + .node_tree_p = xfs_da3_node_tree_p, +}; + +/* + * Return the ops structure according to the current config. If we are passed + * an inode, then that overrides the default config we use which is based on + * feature bits. + */ +const struct xfs_dir_ops * +xfs_dir_get_ops( + struct xfs_mount *mp, + struct xfs_inode *dp) +{ + if (dp) + return dp->d_ops; + if (mp->m_dir_inode_ops) + return mp->m_dir_inode_ops; + if (xfs_sb_version_hascrc(&mp->m_sb)) + return &xfs_dir3_ops; + if (xfs_sb_version_hasftype(&mp->m_sb)) + return &xfs_dir2_ftype_ops; + return &xfs_dir2_ops; +} + +const struct xfs_dir_ops * +xfs_nondir_get_ops( + struct xfs_mount *mp, + struct xfs_inode *dp) +{ + if (dp) + return dp->d_ops; + if (mp->m_nondir_inode_ops) + return mp->m_nondir_inode_ops; + if (xfs_sb_version_hascrc(&mp->m_sb)) + return &xfs_dir3_nondir_ops; + return &xfs_dir2_nondir_ops; +} diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/xfs_da_format.h new file mode 100644 index 00000000000..0a49b028637 --- /dev/null +++ b/fs/xfs/xfs_da_format.h @@ -0,0 +1,861 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DA_FORMAT_H__ +#define __XFS_DA_FORMAT_H__ + +/* + * This structure is common to both leaf nodes and non-leaf nodes in the Btree. + * + * It is used to manage a doubly linked list of all blocks at the same + * level in the Btree, and to identify which type of block this is. + */ +#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */ +#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */ +#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ + +typedef struct xfs_da_blkinfo { + __be32 forw; /* previous block in list */ + __be32 back; /* following block in list */ + __be16 magic; /* validity check on block */ + __be16 pad; /* unused */ +} xfs_da_blkinfo_t; + +/* + * CRC enabled directory structure types + * + * The headers change size for the additional verification information, but + * otherwise the tree layouts and contents are unchanged. Hence the da btree + * code can use the struct xfs_da_blkinfo for manipulating the tree links and + * magic numbers without modification for both v2 and v3 nodes. + */ +#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */ +#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */ +#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */ + +struct xfs_da3_blkinfo { + /* + * the node link manipulation code relies on the fact that the first + * element of this structure is the struct xfs_da_blkinfo so it can + * ignore the differences in the rest of the structures. + */ + struct xfs_da_blkinfo hdr; + __be32 crc; /* CRC of block */ + __be64 blkno; /* first block of the buffer */ + __be64 lsn; /* sequence number of last write */ + uuid_t uuid; /* filesystem we belong to */ + __be64 owner; /* inode that owns the block */ +}; + +/* + * This is the structure of the root and intermediate nodes in the Btree. + * The leaf nodes are defined above. + * + * Entries are not packed. + * + * Since we have duplicate keys, use a binary search but always follow + * all match in the block, not just the first match found. + */ +#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ + +typedef struct xfs_da_node_hdr { + struct xfs_da_blkinfo info; /* block type, links, etc. */ + __be16 __count; /* count of active entries */ + __be16 __level; /* level above leaves (leaf == 0) */ +} xfs_da_node_hdr_t; + +struct xfs_da3_node_hdr { + struct xfs_da3_blkinfo info; /* block type, links, etc. */ + __be16 __count; /* count of active entries */ + __be16 __level; /* level above leaves (leaf == 0) */ + __be32 __pad32; +}; + +#define XFS_DA3_NODE_CRC_OFF (offsetof(struct xfs_da3_node_hdr, info.crc)) + +typedef struct xfs_da_node_entry { + __be32 hashval; /* hash value for this descendant */ + __be32 before; /* Btree block before this key */ +} xfs_da_node_entry_t; + +typedef struct xfs_da_intnode { + struct xfs_da_node_hdr hdr; + struct xfs_da_node_entry __btree[]; +} xfs_da_intnode_t; + +struct xfs_da3_intnode { + struct xfs_da3_node_hdr hdr; + struct xfs_da_node_entry __btree[]; +}; + +/* + * In-core version of the node header to abstract the differences in the v2 and + * v3 disk format of the headers. Callers need to convert to/from disk format as + * appropriate. + */ +struct xfs_da3_icnode_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t level; +}; + +/* + * Directory version 2. + * + * There are 4 possible formats: + * - shortform - embedded into the inode + * - single block - data with embedded leaf at the end + * - multiple data blocks, single leaf+freeindex block + * - data blocks, node and leaf blocks (btree), freeindex blocks + * + * Note: many node blocks structures and constants are shared with the attr + * code and defined in xfs_da_btree.h. + */ + +#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: single block dirs */ +#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: multiblock dirs */ +#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F: free index blocks */ + +/* + * Directory Version 3 With CRCs. + * + * The tree formats are the same as for version 2 directories. The difference + * is in the block header and dirent formats. In many cases the v3 structures + * use v2 definitions as they are no different and this makes code sharing much + * easier. + * + * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the + * format is v2 then they switch to the existing v2 code, or the format is v3 + * they implement the v3 functionality. This means the existing dir2 is a mix of + * xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3 functions are called + * where there is a difference in the formats, otherwise the code is unchanged. + * + * Where it is possible, the code decides what to do based on the magic numbers + * in the blocks rather than feature bits in the superblock. This means the code + * is as independent of the external XFS code as possible as doesn't require + * passing struct xfs_mount pointers into places where it isn't really + * necessary. + * + * Version 3 includes: + * + * - a larger block header for CRC and identification purposes and so the + * offsets of all the structures inside the blocks are different. + * + * - new magic numbers to be able to detect the v2/v3 types on the fly. + */ + +#define XFS_DIR3_BLOCK_MAGIC 0x58444233 /* XDB3: single block dirs */ +#define XFS_DIR3_DATA_MAGIC 0x58444433 /* XDD3: multiblock dirs */ +#define XFS_DIR3_FREE_MAGIC 0x58444633 /* XDF3: free index blocks */ + +/* + * Dirents in version 3 directories have a file type field. Additions to this + * list are an on-disk format change, requiring feature bits. Valid values + * are as follows: + */ +#define XFS_DIR3_FT_UNKNOWN 0 +#define XFS_DIR3_FT_REG_FILE 1 +#define XFS_DIR3_FT_DIR 2 +#define XFS_DIR3_FT_CHRDEV 3 +#define XFS_DIR3_FT_BLKDEV 4 +#define XFS_DIR3_FT_FIFO 5 +#define XFS_DIR3_FT_SOCK 6 +#define XFS_DIR3_FT_SYMLINK 7 +#define XFS_DIR3_FT_WHT 8 + +#define XFS_DIR3_FT_MAX 9 + +/* + * Byte offset in data block and shortform entry. + */ +typedef __uint16_t xfs_dir2_data_off_t; +#define NULLDATAOFF 0xffffU +typedef uint xfs_dir2_data_aoff_t; /* argument form */ + +/* + * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. + * Only need 16 bits, this is the byte offset into the single block form. + */ +typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t; + +/* + * Offset in data space of a data entry. + */ +typedef __uint32_t xfs_dir2_dataptr_t; +#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff) +#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0) + +/* + * Byte offset in a directory. + */ +typedef xfs_off_t xfs_dir2_off_t; + +/* + * Directory block number (logical dirblk in file) + */ +typedef __uint32_t xfs_dir2_db_t; + +/* + * Inode number stored as 8 8-bit values. + */ +typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; + +/* + * Inode number stored as 4 8-bit values. + * Works a lot of the time, when all the inode numbers in a directory + * fit in 32 bits. + */ +typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t; + +typedef union { + xfs_dir2_ino8_t i8; + xfs_dir2_ino4_t i4; +} xfs_dir2_inou_t; +#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) + +/* + * Directory layout when stored internal to an inode. + * + * Small directories are packed as tightly as possible so as to fit into the + * literal area of the inode. These "shortform" directories consist of a + * single xfs_dir2_sf_hdr header followed by zero or more xfs_dir2_sf_entry + * structures. Due the different inode number storage size and the variable + * length name field in the xfs_dir2_sf_entry all these structure are + * variable length, and the accessors in this file should be used to iterate + * over them. + */ +typedef struct xfs_dir2_sf_hdr { + __uint8_t count; /* count of entries */ + __uint8_t i8count; /* count of 8-byte inode #s */ + xfs_dir2_inou_t parent; /* parent dir inode number */ +} __arch_pack xfs_dir2_sf_hdr_t; + +typedef struct xfs_dir2_sf_entry { + __u8 namelen; /* actual name length */ + xfs_dir2_sf_off_t offset; /* saved offset */ + __u8 name[]; /* name, variable size */ + /* + * A single byte containing the file type field follows the inode + * number for version 3 directory entries. + * + * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a + * variable offset after the name. + */ +} __arch_pack xfs_dir2_sf_entry_t; + +static inline int xfs_dir2_sf_hdr_size(int i8count) +{ + return sizeof(struct xfs_dir2_sf_hdr) - + (i8count == 0) * + (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t)); +} + +static inline xfs_dir2_data_aoff_t +xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) +{ + return get_unaligned_be16(&sfep->offset.i); +} + +static inline void +xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) +{ + put_unaligned_be16(off, &sfep->offset.i); +} + +static inline struct xfs_dir2_sf_entry * +xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr) +{ + return (struct xfs_dir2_sf_entry *) + ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count)); +} + +/* + * Data block structures. + * + * A pure data block looks like the following drawing on disk: + * + * +-------------------------------------------------+ + * | xfs_dir2_data_hdr_t | + * +-------------------------------------------------+ + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | ... | + * +-------------------------------------------------+ + * | unused space | + * +-------------------------------------------------+ + * + * As all the entries are variable size structures the accessors below should + * be used to iterate over them. + * + * In addition to the pure data blocks for the data and node formats, + * most structures are also used for the combined data/freespace "block" + * format below. + */ + +#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */ +#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG) +#define XFS_DIR2_DATA_FREE_TAG 0xffff +#define XFS_DIR2_DATA_FD_COUNT 3 + +/* + * Directory address space divided into sections, + * spaces separated by 32GB. + */ +#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) +#define XFS_DIR2_DATA_SPACE 0 +#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) + +/* + * Describe a free area in the data block. + * + * The freespace will be formatted as a xfs_dir2_data_unused_t. + */ +typedef struct xfs_dir2_data_free { + __be16 offset; /* start of freespace */ + __be16 length; /* length of freespace */ +} xfs_dir2_data_free_t; + +/* + * Header for the data blocks. + * + * The code knows that XFS_DIR2_DATA_FD_COUNT is 3. + */ +typedef struct xfs_dir2_data_hdr { + __be32 magic; /* XFS_DIR2_DATA_MAGIC or */ + /* XFS_DIR2_BLOCK_MAGIC */ + xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT]; +} xfs_dir2_data_hdr_t; + +/* + * define a structure for all the verification fields we are adding to the + * directory block structures. This will be used in several structures. + * The magic number must be the first entry to align with all the dir2 + * structures so we determine how to decode them just by the magic number. + */ +struct xfs_dir3_blk_hdr { + __be32 magic; /* magic number */ + __be32 crc; /* CRC of block */ + __be64 blkno; /* first block of the buffer */ + __be64 lsn; /* sequence number of last write */ + uuid_t uuid; /* filesystem we belong to */ + __be64 owner; /* inode that owns the block */ +}; + +struct xfs_dir3_data_hdr { + struct xfs_dir3_blk_hdr hdr; + xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT]; + __be32 pad; /* 64 bit alignment */ +}; + +#define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc) + +/* + * Active entry in a data block. + * + * Aligned to 8 bytes. After the variable length name field there is a + * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p. + * + * For dir3 structures, there is file type field between the name and the tag. + * This can only be manipulated by helper functions. It is packed hard against + * the end of the name so any padding for rounding is between the file type and + * the tag. + */ +typedef struct xfs_dir2_data_entry { + __be64 inumber; /* inode number */ + __u8 namelen; /* name length */ + __u8 name[]; /* name bytes, no null */ + /* __u8 filetype; */ /* type of inode we point to */ + /* __be16 tag; */ /* starting offset of us */ +} xfs_dir2_data_entry_t; + +/* + * Unused entry in a data block. + * + * Aligned to 8 bytes. Tag appears as the last 2 bytes and must be accessed + * using xfs_dir2_data_unused_tag_p. + */ +typedef struct xfs_dir2_data_unused { + __be16 freetag; /* XFS_DIR2_DATA_FREE_TAG */ + __be16 length; /* total free length */ + /* variable offset */ + __be16 tag; /* starting offset of us */ +} xfs_dir2_data_unused_t; + +/* + * Pointer to a freespace's tag word. + */ +static inline __be16 * +xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup) +{ + return (__be16 *)((char *)dup + + be16_to_cpu(dup->length) - sizeof(__be16)); +} + +/* + * Leaf block structures. + * + * A pure leaf block looks like the following drawing on disk: + * + * +---------------------------+ + * | xfs_dir2_leaf_hdr_t | + * +---------------------------+ + * | xfs_dir2_leaf_entry_t | + * | xfs_dir2_leaf_entry_t | + * | xfs_dir2_leaf_entry_t | + * | xfs_dir2_leaf_entry_t | + * | ... | + * +---------------------------+ + * | xfs_dir2_data_off_t | + * | xfs_dir2_data_off_t | + * | xfs_dir2_data_off_t | + * | ... | + * +---------------------------+ + * | xfs_dir2_leaf_tail_t | + * +---------------------------+ + * + * The xfs_dir2_data_off_t members (bests) and tail are at the end of the block + * for single-leaf (magic = XFS_DIR2_LEAF1_MAGIC) blocks only, but not present + * for directories with separate leaf nodes and free space blocks + * (magic = XFS_DIR2_LEAFN_MAGIC). + * + * As all the entries are variable size structures the accessors below should + * be used to iterate over them. + */ + +/* + * Offset of the leaf/node space. First block in this space + * is the btree root. + */ +#define XFS_DIR2_LEAF_SPACE 1 +#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE) + +/* + * Leaf block header. + */ +typedef struct xfs_dir2_leaf_hdr { + xfs_da_blkinfo_t info; /* header for da routines */ + __be16 count; /* count of entries */ + __be16 stale; /* count of stale entries */ +} xfs_dir2_leaf_hdr_t; + +struct xfs_dir3_leaf_hdr { + struct xfs_da3_blkinfo info; /* header for da routines */ + __be16 count; /* count of entries */ + __be16 stale; /* count of stale entries */ + __be32 pad; /* 64 bit alignment */ +}; + +struct xfs_dir3_icleaf_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t stale; +}; + +/* + * Leaf block entry. + */ +typedef struct xfs_dir2_leaf_entry { + __be32 hashval; /* hash value of name */ + __be32 address; /* address of data entry */ +} xfs_dir2_leaf_entry_t; + +/* + * Leaf block tail. + */ +typedef struct xfs_dir2_leaf_tail { + __be32 bestcount; +} xfs_dir2_leaf_tail_t; + +/* + * Leaf block. + */ +typedef struct xfs_dir2_leaf { + xfs_dir2_leaf_hdr_t hdr; /* leaf header */ + xfs_dir2_leaf_entry_t __ents[]; /* entries */ +} xfs_dir2_leaf_t; + +struct xfs_dir3_leaf { + struct xfs_dir3_leaf_hdr hdr; /* leaf header */ + struct xfs_dir2_leaf_entry __ents[]; /* entries */ +}; + +#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc) + +/* + * Get address of the bests array in the single-leaf block. + */ +static inline __be16 * +xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp) +{ + return (__be16 *)ltp - be32_to_cpu(ltp->bestcount); +} + +/* + * Free space block defintions for the node format. + */ + +/* + * Offset of the freespace index. + */ +#define XFS_DIR2_FREE_SPACE 2 +#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE) + +typedef struct xfs_dir2_free_hdr { + __be32 magic; /* XFS_DIR2_FREE_MAGIC */ + __be32 firstdb; /* db of first entry */ + __be32 nvalid; /* count of valid entries */ + __be32 nused; /* count of used entries */ +} xfs_dir2_free_hdr_t; + +typedef struct xfs_dir2_free { + xfs_dir2_free_hdr_t hdr; /* block header */ + __be16 bests[]; /* best free counts */ + /* unused entries are -1 */ +} xfs_dir2_free_t; + +struct xfs_dir3_free_hdr { + struct xfs_dir3_blk_hdr hdr; + __be32 firstdb; /* db of first entry */ + __be32 nvalid; /* count of valid entries */ + __be32 nused; /* count of used entries */ + __be32 pad; /* 64 bit alignment */ +}; + +struct xfs_dir3_free { + struct xfs_dir3_free_hdr hdr; + __be16 bests[]; /* best free counts */ + /* unused entries are -1 */ +}; + +#define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc) + +/* + * In core version of the free block header, abstracted away from on-disk format + * differences. Use this in the code, and convert to/from the disk version using + * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk. + */ +struct xfs_dir3_icfree_hdr { + __uint32_t magic; + __uint32_t firstdb; + __uint32_t nvalid; + __uint32_t nused; + +}; + +/* + * Single block format. + * + * The single block format looks like the following drawing on disk: + * + * +-------------------------------------------------+ + * | xfs_dir2_data_hdr_t | + * +-------------------------------------------------+ + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t : + * | ... | + * +-------------------------------------------------+ + * | unused space | + * +-------------------------------------------------+ + * | ... | + * | xfs_dir2_leaf_entry_t | + * | xfs_dir2_leaf_entry_t | + * +-------------------------------------------------+ + * | xfs_dir2_block_tail_t | + * +-------------------------------------------------+ + * + * As all the entries are variable size structures the accessors below should + * be used to iterate over them. + */ + +typedef struct xfs_dir2_block_tail { + __be32 count; /* count of leaf entries */ + __be32 stale; /* count of stale lf entries */ +} xfs_dir2_block_tail_t; + +/* + * Pointer to the leaf entries embedded in a data block (1-block format) + */ +static inline struct xfs_dir2_leaf_entry * +xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) +{ + return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count); +} + + +/* + * Attribute storage layout + * + * Attribute lists are structured around Btrees where all the data + * elements are in the leaf nodes. Attribute names are hashed into an int, + * then that int is used as the index into the Btree. Since the hashval + * of an attribute name may not be unique, we may have duplicate keys. The + * internal links in the Btree are logical block offsets into the file. + * + * Struct leaf_entry's are packed from the top. Name/values grow from the + * bottom but are not packed. The freemap contains run-length-encoded entries + * for the free bytes after the leaf_entry's, but only the N largest such, + * smaller runs are dropped. When the freemap doesn't show enough space + * for an allocation, we compact the name/value area and try again. If we + * still don't have enough space, then we have to split the block. The + * name/value structs (both local and remote versions) must be 32bit aligned. + * + * Since we have duplicate hash keys, for each key that matches, compare + * the actual name string. The root and intermediate node search always + * takes the first-in-the-block key match found, so we should only have + * to work "forw"ard. If none matches, continue with the "forw"ard leaf + * nodes until the hash key changes or the attribute name is found. + * + * We store the fact that an attribute is a ROOT/USER/SECURE attribute in + * the leaf_entry. The namespaces are independent only because we also look + * at the namespace bit when we are looking for a matching attribute name. + * + * We also store an "incomplete" bit in the leaf_entry. It shows that an + * attribute is in the middle of being created and should not be shown to + * the user if we crash during the time that the bit is set. We clear the + * bit when we have finished setting up the attribute. We do this because + * we cannot create some large attributes inside a single transaction, and we + * need some indication that we weren't finished if we crash in the middle. + */ +#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ + +typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ + __be16 base; /* base of free region */ + __be16 size; /* length of free region */ +} xfs_attr_leaf_map_t; + +typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */ + xfs_da_blkinfo_t info; /* block type, links, etc. */ + __be16 count; /* count of active leaf_entry's */ + __be16 usedbytes; /* num bytes of names/values stored */ + __be16 firstused; /* first used byte in name area */ + __u8 holes; /* != 0 if blk needs compaction */ + __u8 pad1; + xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE]; + /* N largest free regions */ +} xfs_attr_leaf_hdr_t; + +typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */ + __be32 hashval; /* hash value of name */ + __be16 nameidx; /* index into buffer of name/value */ + __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */ + __u8 pad2; /* unused pad byte */ +} xfs_attr_leaf_entry_t; + +typedef struct xfs_attr_leaf_name_local { + __be16 valuelen; /* number of bytes in value */ + __u8 namelen; /* length of name bytes */ + __u8 nameval[1]; /* name/value bytes */ +} xfs_attr_leaf_name_local_t; + +typedef struct xfs_attr_leaf_name_remote { + __be32 valueblk; /* block number of value bytes */ + __be32 valuelen; /* number of bytes in value */ + __u8 namelen; /* length of name bytes */ + __u8 name[1]; /* name bytes */ +} xfs_attr_leaf_name_remote_t; + +typedef struct xfs_attr_leafblock { + xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */ + xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */ + xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */ + xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */ +} xfs_attr_leafblock_t; + +/* + * CRC enabled leaf structures. Called "version 3" structures to match the + * version number of the directory and dablk structures for this feature, and + * attr2 is already taken by the variable inode attribute fork size feature. + */ +struct xfs_attr3_leaf_hdr { + struct xfs_da3_blkinfo info; + __be16 count; + __be16 usedbytes; + __be16 firstused; + __u8 holes; + __u8 pad1; + struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE]; + __be32 pad2; /* 64 bit alignment */ +}; + +#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc)) + +struct xfs_attr3_leafblock { + struct xfs_attr3_leaf_hdr hdr; + struct xfs_attr_leaf_entry entries[1]; + + /* + * The rest of the block contains the following structures after the + * leaf entries, growing from the bottom up. The variables are never + * referenced, the locations accessed purely from helper functions. + * + * struct xfs_attr_leaf_name_local + * struct xfs_attr_leaf_name_remote + */ +}; + +/* + * incore, neutral version of the attribute leaf header + */ +struct xfs_attr3_icleaf_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t usedbytes; + __uint16_t firstused; + __u8 holes; + struct { + __uint16_t base; + __uint16_t size; + } freemap[XFS_ATTR_LEAF_MAPSIZE]; +}; + +/* + * Flags used in the leaf_entry[i].flags field. + * NOTE: the INCOMPLETE bit must not collide with the flags bits specified + * on the system call, they are "or"ed together for various operations. + */ +#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ +#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ +#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ +#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ +#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT) +#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) +#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) +#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) + +/* + * Conversion macros for converting namespace bits from argument flags + * to ondisk flags. + */ +#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE) +#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) +#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK) +#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK) +#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\ + ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0)) +#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\ + ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0)) + +/* + * Alignment for namelist and valuelist entries (since they are mixed + * there can be only one alignment value) + */ +#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t)) + +static inline int +xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp) +{ + if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return sizeof(struct xfs_attr3_leaf_hdr); + return sizeof(struct xfs_attr_leaf_hdr); +} + +static inline struct xfs_attr_leaf_entry * +xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp) +{ + if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return &((struct xfs_attr3_leafblock *)leafp)->entries[0]; + return &leafp->entries[0]; +} + +/* + * Cast typed pointers for "local" and "remote" name/value structs. + */ +static inline char * +xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx) +{ + struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp); + + return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)]; +} + +static inline xfs_attr_leaf_name_remote_t * +xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx) +{ + return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx); +} + +static inline xfs_attr_leaf_name_local_t * +xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) +{ + return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx); +} + +/* + * Calculate total bytes used (including trailing pad for alignment) for + * a "local" name/value structure, a "remote" name/value structure, and + * a pointer which might be either. + */ +static inline int xfs_attr_leaf_entsize_remote(int nlen) +{ + return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \ + XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); +} + +static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen) +{ + return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + + XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); +} + +static inline int xfs_attr_leaf_entsize_local_max(int bsize) +{ + return (((bsize) >> 1) + ((bsize) >> 2)); +} + + + +/* + * Remote attribute block format definition + * + * There is one of these headers per filesystem block in a remote attribute. + * This is done to ensure there is a 1:1 mapping between the attribute value + * length and the number of blocks needed to store the attribute. This makes the + * verification of a buffer a little more complex, but greatly simplifies the + * allocation, reading and writing of these attributes as we don't have to guess + * the number of blocks needed to store the attribute data. + */ +#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */ + +struct xfs_attr3_rmt_hdr { + __be32 rm_magic; + __be32 rm_offset; + __be32 rm_bytes; + __be32 rm_crc; + uuid_t rm_uuid; + __be64 rm_owner; + __be64 rm_blkno; + __be64 rm_lsn; +}; + +#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc) + +#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \ + ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + sizeof(struct xfs_attr3_rmt_hdr) : 0)) + +#endif /* __XFS_DA_FORMAT_H__ */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c deleted file mode 100644 index d0e9c74d3d9..00000000000 --- a/fs/xfs/xfs_dfrag.c +++ /dev/null @@ -1,449 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_bmap.h" -#include "xfs_itable.h" -#include "xfs_dfrag.h" -#include "xfs_error.h" -#include "xfs_vnodeops.h" -#include "xfs_trace.h" - - -static int xfs_swap_extents( - xfs_inode_t *ip, /* target inode */ - xfs_inode_t *tip, /* tmp inode */ - xfs_swapext_t *sxp); - -/* - * ioctl interface for swapext - */ -int -xfs_swapext( - xfs_swapext_t *sxp) -{ - xfs_inode_t *ip, *tip; - struct fd f, tmp; - int error = 0; - - /* Pull information for the target fd */ - f = fdget((int)sxp->sx_fdtarget); - if (!f.file) { - error = XFS_ERROR(EINVAL); - goto out; - } - - if (!(f.file->f_mode & FMODE_WRITE) || - !(f.file->f_mode & FMODE_READ) || - (f.file->f_flags & O_APPEND)) { - error = XFS_ERROR(EBADF); - goto out_put_file; - } - - tmp = fdget((int)sxp->sx_fdtmp); - if (!tmp.file) { - error = XFS_ERROR(EINVAL); - goto out_put_file; - } - - if (!(tmp.file->f_mode & FMODE_WRITE) || - !(tmp.file->f_mode & FMODE_READ) || - (tmp.file->f_flags & O_APPEND)) { - error = XFS_ERROR(EBADF); - goto out_put_tmp_file; - } - - if (IS_SWAPFILE(f.file->f_path.dentry->d_inode) || - IS_SWAPFILE(tmp.file->f_path.dentry->d_inode)) { - error = XFS_ERROR(EINVAL); - goto out_put_tmp_file; - } - - ip = XFS_I(f.file->f_path.dentry->d_inode); - tip = XFS_I(tmp.file->f_path.dentry->d_inode); - - if (ip->i_mount != tip->i_mount) { - error = XFS_ERROR(EINVAL); - goto out_put_tmp_file; - } - - if (ip->i_ino == tip->i_ino) { - error = XFS_ERROR(EINVAL); - goto out_put_tmp_file; - } - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { - error = XFS_ERROR(EIO); - goto out_put_tmp_file; - } - - error = xfs_swap_extents(ip, tip, sxp); - - out_put_tmp_file: - fdput(tmp); - out_put_file: - fdput(f); - out: - return error; -} - -/* - * We need to check that the format of the data fork in the temporary inode is - * valid for the target inode before doing the swap. This is not a problem with - * attr1 because of the fixed fork offset, but attr2 has a dynamically sized - * data fork depending on the space the attribute fork is taking so we can get - * invalid formats on the target inode. - * - * E.g. target has space for 7 extents in extent format, temp inode only has - * space for 6. If we defragment down to 7 extents, then the tmp format is a - * btree, but when swapped it needs to be in extent format. Hence we can't just - * blindly swap data forks on attr2 filesystems. - * - * Note that we check the swap in both directions so that we don't end up with - * a corrupt temporary inode, either. - * - * Note that fixing the way xfs_fsr sets up the attribute fork in the source - * inode will prevent this situation from occurring, so all we do here is - * reject and log the attempt. basically we are putting the responsibility on - * userspace to get this right. - */ -static int -xfs_swap_extents_check_format( - xfs_inode_t *ip, /* target inode */ - xfs_inode_t *tip) /* tmp inode */ -{ - - /* Should never get a local format */ - if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || - tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) - return EINVAL; - - /* - * if the target inode has less extents that then temporary inode then - * why did userspace call us? - */ - if (ip->i_d.di_nextents < tip->i_d.di_nextents) - return EINVAL; - - /* - * if the target inode is in extent form and the temp inode is in btree - * form then we will end up with the target inode in the wrong format - * as we already know there are less extents in the temp inode. - */ - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) - return EINVAL; - - /* Check temp in extent form to max in target */ - if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > - XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) - return EINVAL; - - /* Check target in extent form to max in temp */ - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > - XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) - return EINVAL; - - /* - * If we are in a btree format, check that the temp root block will fit - * in the target and that it has enough extents to be in btree format - * in the target. - * - * Note that we have to be careful to allow btree->extent conversions - * (a common defrag case) which will occur when the temp inode is in - * extent format... - */ - if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - if (XFS_IFORK_BOFF(ip) && - tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) - return EINVAL; - if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= - XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) - return EINVAL; - } - - /* Reciprocal target->temp btree format checks */ - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - if (XFS_IFORK_BOFF(tip) && - ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) - return EINVAL; - - if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= - XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) - return EINVAL; - } - - return 0; -} - -static int -xfs_swap_extents( - xfs_inode_t *ip, /* target inode */ - xfs_inode_t *tip, /* tmp inode */ - xfs_swapext_t *sxp) -{ - xfs_mount_t *mp = ip->i_mount; - xfs_trans_t *tp; - xfs_bstat_t *sbp = &sxp->sx_stat; - xfs_ifork_t *tempifp, *ifp, *tifp; - int src_log_flags, target_log_flags; - int error = 0; - int aforkblks = 0; - int taforkblks = 0; - __uint64_t tmp; - - tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); - if (!tempifp) { - error = XFS_ERROR(ENOMEM); - goto out; - } - - /* - * we have to do two separate lock calls here to keep lockdep - * happy. If we try to get all the locks in one call, lock will - * report false positives when we drop the ILOCK and regain them - * below. - */ - xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); - xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); - - /* Verify that both files have the same format */ - if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { - error = XFS_ERROR(EINVAL); - goto out_unlock; - } - - /* Verify both files are either real-time or non-realtime */ - if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { - error = XFS_ERROR(EINVAL); - goto out_unlock; - } - - error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); - if (error) - goto out_unlock; - truncate_pagecache_range(VFS_I(ip), 0, -1); - - /* Verify O_DIRECT for ftmp */ - if (VN_CACHED(VFS_I(tip)) != 0) { - error = XFS_ERROR(EINVAL); - goto out_unlock; - } - - /* Verify all data are being swapped */ - if (sxp->sx_offset != 0 || - sxp->sx_length != ip->i_d.di_size || - sxp->sx_length != tip->i_d.di_size) { - error = XFS_ERROR(EFAULT); - goto out_unlock; - } - - trace_xfs_swap_extent_before(ip, 0); - trace_xfs_swap_extent_before(tip, 1); - - /* check inode formats now that data is flushed */ - error = xfs_swap_extents_check_format(ip, tip); - if (error) { - xfs_notice(mp, - "%s: inode 0x%llx format is incompatible for exchanging.", - __func__, ip->i_ino); - goto out_unlock; - } - - /* - * Compare the current change & modify times with that - * passed in. If they differ, we abort this swap. - * This is the mechanism used to ensure the calling - * process that the file was not changed out from - * under it. - */ - if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || - (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || - (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || - (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { - error = XFS_ERROR(EBUSY); - goto out_unlock; - } - - /* We need to fail if the file is memory mapped. Once we have tossed - * all existing pages, the page fault will have no option - * but to go to the filesystem for pages. By making the page fault call - * vop_read (or write in the case of autogrow) they block on the iolock - * until we have switched the extents. - */ - if (VN_MAPPED(VFS_I(ip))) { - error = XFS_ERROR(EBUSY); - goto out_unlock; - } - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_iunlock(tip, XFS_ILOCK_EXCL); - - /* - * There is a race condition here since we gave up the - * ilock. However, the data fork will not change since - * we have the iolock (locked for truncation too) so we - * are safe. We don't really care if non-io related - * fields change. - */ - truncate_pagecache_range(VFS_I(ip), 0, -1); - - tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); - if ((error = xfs_trans_reserve(tp, 0, - XFS_ICHANGE_LOG_RES(mp), 0, - 0, 0))) { - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - xfs_iunlock(tip, XFS_IOLOCK_EXCL); - xfs_trans_cancel(tp, 0); - goto out; - } - xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); - - /* - * Count the number of extended attribute blocks - */ - if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && - (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { - error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); - if (error) - goto out_trans_cancel; - } - if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && - (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { - error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, - &taforkblks); - if (error) - goto out_trans_cancel; - } - - /* - * Swap the data forks of the inodes - */ - ifp = &ip->i_df; - tifp = &tip->i_df; - *tempifp = *ifp; /* struct copy */ - *ifp = *tifp; /* struct copy */ - *tifp = *tempifp; /* struct copy */ - - /* - * Fix the on-disk inode values - */ - tmp = (__uint64_t)ip->i_d.di_nblocks; - ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; - tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; - - tmp = (__uint64_t) ip->i_d.di_nextents; - ip->i_d.di_nextents = tip->i_d.di_nextents; - tip->i_d.di_nextents = tmp; - - tmp = (__uint64_t) ip->i_d.di_format; - ip->i_d.di_format = tip->i_d.di_format; - tip->i_d.di_format = tmp; - - /* - * The extents in the source inode could still contain speculative - * preallocation beyond EOF (e.g. the file is open but not modified - * while defrag is in progress). In that case, we need to copy over the - * number of delalloc blocks the data fork in the source inode is - * tracking beyond EOF so that when the fork is truncated away when the - * temporary inode is unlinked we don't underrun the i_delayed_blks - * counter on that inode. - */ - ASSERT(tip->i_delayed_blks == 0); - tip->i_delayed_blks = ip->i_delayed_blks; - ip->i_delayed_blks = 0; - - src_log_flags = XFS_ILOG_CORE; - switch (ip->i_d.di_format) { - case XFS_DINODE_FMT_EXTENTS: - /* If the extents fit in the inode, fix the - * pointer. Otherwise it's already NULL or - * pointing to the extent. - */ - if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { - ifp->if_u1.if_extents = - ifp->if_u2.if_inline_ext; - } - src_log_flags |= XFS_ILOG_DEXT; - break; - case XFS_DINODE_FMT_BTREE: - src_log_flags |= XFS_ILOG_DBROOT; - break; - } - - target_log_flags = XFS_ILOG_CORE; - switch (tip->i_d.di_format) { - case XFS_DINODE_FMT_EXTENTS: - /* If the extents fit in the inode, fix the - * pointer. Otherwise it's already NULL or - * pointing to the extent. - */ - if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { - tifp->if_u1.if_extents = - tifp->if_u2.if_inline_ext; - } - target_log_flags |= XFS_ILOG_DEXT; - break; - case XFS_DINODE_FMT_BTREE: - target_log_flags |= XFS_ILOG_DBROOT; - break; - } - - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - - xfs_trans_log_inode(tp, ip, src_log_flags); - xfs_trans_log_inode(tp, tip, target_log_flags); - - /* - * If this is a synchronous mount, make sure that the - * transaction goes to disk before returning to the user. - */ - if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(tp); - - error = xfs_trans_commit(tp, 0); - - trace_xfs_swap_extent_after(ip, 0); - trace_xfs_swap_extent_after(tip, 1); -out: - kmem_free(tempifp); - return error; - -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - goto out; - -out_trans_cancel: - xfs_trans_cancel(tp, 0); - goto out_unlock; -} diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h deleted file mode 100644 index 20bdd935c12..00000000000 --- a/fs/xfs/xfs_dfrag.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2000,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DFRAG_H__ -#define __XFS_DFRAG_H__ - -/* - * Structure passed to xfs_swapext - */ - -typedef struct xfs_swapext -{ - __int64_t sx_version; /* version */ - __int64_t sx_fdtarget; /* fd of target file */ - __int64_t sx_fdtmp; /* fd of tmp file */ - xfs_off_t sx_offset; /* offset into file */ - xfs_off_t sx_length; /* leng from offset */ - char sx_pad[16]; /* pad space, unused */ - xfs_bstat_t sx_stat; /* stat of target b4 copy */ -} xfs_swapext_t; - -/* - * Version flag - */ -#define XFS_SX_VERSION 0 - -#ifdef __KERNEL__ -/* - * Prototypes for visible xfs_dfrag.c routines. - */ - -/* - * Syscall interface for xfs_swapext - */ -int xfs_swapext(struct xfs_swapext *sx); - -#endif /* __KERNEL__ */ - -#endif /* __XFS_DFRAG_H__ */ diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index 1d9643b3dce..623bbe8fd92 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -19,7 +19,7 @@ #define __XFS_DINODE_H__ #define XFS_DINODE_MAGIC 0x494e /* 'IN' */ -#define XFS_DINODE_GOOD_VERSION(v) (((v) == 1 || (v) == 2)) +#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3) typedef struct xfs_timestamp { __be32 t_sec; /* timestamp seconds */ @@ -39,6 +39,9 @@ typedef struct xfs_timestamp { * There is a very similar struct icdinode in xfs_inode which matches the * layout of the first 96 bytes of this structure, but is kept in native * format instead of big endian. + * + * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed + * padding field for v3 inodes. */ typedef struct xfs_dinode { __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ @@ -70,11 +73,38 @@ typedef struct xfs_dinode { /* di_next_unlinked is the only non-core field in the old dinode */ __be32 di_next_unlinked;/* agi unlinked list ptr */ -} __attribute__((packed)) xfs_dinode_t; + + /* start of the extended dinode, writable fields */ + __le32 di_crc; /* CRC of the inode */ + __be64 di_changecount; /* number of attribute changes */ + __be64 di_lsn; /* flush sequence */ + __be64 di_flags2; /* more random flags */ + __u8 di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_timestamp_t di_crtime; /* time created */ + __be64 di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ +} xfs_dinode_t; + +#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc) #define DI_MAX_FLUSH 0xffff /* + * Size of the core inode on disk. Version 1 and 2 inodes have + * the same size, but version 3 has grown a few additional fields. + */ +static inline uint xfs_dinode_size(int version) +{ + if (version == 3) + return sizeof(struct xfs_dinode); + return offsetof(struct xfs_dinode, di_crc); +} + +/* * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. * Since the pathconf interface is signed, we use 2^31 - 1 instead. * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX. @@ -104,11 +134,8 @@ typedef enum xfs_dinode_fmt { /* * Inode size for given fs. */ -#define XFS_LITINO(mp) \ - ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode))) - -#define XFS_BROOT_SIZE_ADJ \ - (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t)) +#define XFS_LITINO(mp, version) \ + ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) /* * Inode data & attribute fork sizes, per inode. @@ -119,10 +146,10 @@ typedef enum xfs_dinode_fmt { #define XFS_DFORK_DSIZE(dip,mp) \ (XFS_DFORK_Q(dip) ? \ XFS_DFORK_BOFF(dip) : \ - XFS_LITINO(mp)) + XFS_LITINO(mp, (dip)->di_version)) #define XFS_DFORK_ASIZE(dip,mp) \ (XFS_DFORK_Q(dip) ? \ - XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : \ + XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \ 0) #define XFS_DFORK_SIZE(dip,mp,w) \ ((w) == XFS_DATA_FORK ? \ @@ -133,7 +160,7 @@ typedef enum xfs_dinode_fmt { * Return pointers to the data or attribute forks. */ #define XFS_DFORK_DPTR(dip) \ - ((char *)(dip) + sizeof(struct xfs_dinode)) + ((char *)dip + xfs_dinode_size(dip->di_version)) #define XFS_DFORK_APTR(dip) \ (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) #define XFS_DFORK_PTR(dip,w) \ diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index b26a50f9921..79670cda48a 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -17,28 +17,27 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_dir2.h" -#include "xfs_dir2_format.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" -#include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_dinode.h" + +struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; -struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2}; /* * ASCII case-insensitive (ie. A-Z) support for directories that was @@ -86,29 +85,74 @@ static struct xfs_nameops xfs_ascii_ci_nameops = { .compname = xfs_ascii_ci_compname, }; -void -xfs_dir_mount( - xfs_mount_t *mp) +int +xfs_da_mount( + struct xfs_mount *mp) { - ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb)); + struct xfs_da_geometry *dageo; + int nodehdr_size; + + + ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT); ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <= XFS_MAX_BLOCKSIZE); - mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog); - mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog; - mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp)); - mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp)); - mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp)); - mp->m_attr_node_ents = - (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) / - (uint)sizeof(xfs_da_node_entry_t); - mp->m_dir_node_ents = - (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) / - (uint)sizeof(xfs_da_node_entry_t); - mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100; + + mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL); + mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL); + + nodehdr_size = mp->m_dir_inode_ops->node_hdr_size; + mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), + KM_SLEEP | KM_MAYFAIL); + mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), + KM_SLEEP | KM_MAYFAIL); + if (!mp->m_dir_geo || !mp->m_attr_geo) { + kmem_free(mp->m_dir_geo); + kmem_free(mp->m_attr_geo); + return ENOMEM; + } + + /* set up directory geometry */ + dageo = mp->m_dir_geo; + dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog; + dageo->fsblog = mp->m_sb.sb_blocklog; + dageo->blksize = 1 << dageo->blklog; + dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog; + + /* + * Now we've set up the block conversion variables, we can calculate the + * segment block constants using the geometry structure. + */ + dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET); + dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET); + dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET); + dageo->node_ents = (dageo->blksize - nodehdr_size) / + (uint)sizeof(xfs_da_node_entry_t); + dageo->magicpct = (dageo->blksize * 37) / 100; + + /* set up attribute geometry - single fsb only */ + dageo = mp->m_attr_geo; + dageo->blklog = mp->m_sb.sb_blocklog; + dageo->fsblog = mp->m_sb.sb_blocklog; + dageo->blksize = 1 << dageo->blklog; + dageo->fsbcount = 1; + dageo->node_ents = (dageo->blksize - nodehdr_size) / + (uint)sizeof(xfs_da_node_entry_t); + dageo->magicpct = (dageo->blksize * 37) / 100; + if (xfs_sb_version_hasasciici(&mp->m_sb)) mp->m_dirnameops = &xfs_ascii_ci_nameops; else mp->m_dirnameops = &xfs_default_nameops; + + return 0; +} + +void +xfs_da_unmount( + struct xfs_mount *mp) +{ + kmem_free(mp->m_dir_geo); + kmem_free(mp->m_attr_geo); } /* @@ -172,16 +216,24 @@ xfs_dir_init( xfs_inode_t *dp, xfs_inode_t *pdp) { - xfs_da_args_t args; + struct xfs_da_args *args; int error; - memset((char *)&args, 0, sizeof(args)); - args.dp = dp; - args.trans = tp; ASSERT(S_ISDIR(dp->i_d.di_mode)); - if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) + error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino); + if (error) return error; - return xfs_dir2_sf_create(&args, pdp->i_ino); + + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->dp = dp; + args->trans = tp; + error = xfs_dir2_sf_create(args, pdp->i_ino); + kmem_free(args); + return error; } /* @@ -197,40 +249,57 @@ xfs_dir_createname( xfs_bmap_free_t *flist, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); - if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) + rval = xfs_dir_ino_validate(tp->t_mountp, inum); + if (rval) return rval; XFS_STATS_INC(xs_dir_create); - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.inumber = inum; - args.dp = dp; - args.firstblock = first; - args.flist = flist; - args.total = total; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; - - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_addname(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_addname(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_addname(&args); + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = inum; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_addname(args); + goto out_free; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_addname(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_addname(args); else - rval = xfs_dir2_node_addname(&args); + rval = xfs_dir2_node_addname(args); + +out_free: + kmem_free(args); return rval; } @@ -273,45 +342,67 @@ xfs_dir_lookup( xfs_ino_t *inum, /* out: inode number */ struct xfs_name *ci_name) /* out: actual name if CI match */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_lookup); - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.dp = dp; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - args.op_flags = XFS_DA_OP_OKNOENT; + /* + * We need to use KM_NOFS here so that lockdep will not throw false + * positive deadlock warnings on a non-transactional lookup path. It is + * safe to recurse into inode recalim in that case, but lockdep can't + * easily be taught about it. Hence KM_NOFS avoids having to add more + * lockdep Doing this avoids having to add a bunch of lockdep class + * annotations into the reclaim path for the ilock. + */ + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->dp = dp; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_OKNOENT; if (ci_name) - args.op_flags |= XFS_DA_OP_CILOOKUP; + args->op_flags |= XFS_DA_OP_CILOOKUP; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_lookup(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_lookup(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_lookup(&args); + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_lookup(args); + goto out_check_rval; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_lookup(args); + goto out_check_rval; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_lookup(args); else - rval = xfs_dir2_node_lookup(&args); + rval = xfs_dir2_node_lookup(args); + +out_check_rval: if (rval == EEXIST) rval = 0; if (!rval) { - *inum = args.inumber; + *inum = args->inumber; if (ci_name) { - ci_name->name = args.value; - ci_name->len = args.valuelen; + ci_name->name = args->value; + ci_name->len = args->valuelen; } } +out_free: + kmem_free(args); return rval; } @@ -328,71 +419,52 @@ xfs_dir_removename( xfs_bmap_free_t *flist, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_remove); - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.inumber = ino; - args.dp = dp; - args.firstblock = first; - args.flist = flist; - args.total = total; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_removename(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_removename(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_removename(&args); - else - rval = xfs_dir2_node_removename(&args); - return rval; -} - -/* - * Read a directory. - */ -int -xfs_readdir( - xfs_inode_t *dp, - void *dirent, - size_t bufsize, - xfs_off_t *offset, - filldir_t filldir) -{ - int rval; /* return value */ - int v; /* type-checking value */ + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; - trace_xfs_readdir(dp); + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = ino; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_removename(args); + goto out_free; + } - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return XFS_ERROR(EIO); + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_removename(args); + goto out_free; + } - ASSERT(S_ISDIR(dp->i_d.di_mode)); - XFS_STATS_INC(xs_dir_getdents); - - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_getdents(dp, dirent, offset, filldir); - else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) - ; - else if (v) - rval = xfs_dir2_block_getdents(dp, dirent, offset, filldir); + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_removename(args); else - rval = xfs_dir2_leaf_getdents(dp, dirent, bufsize, offset, - filldir); + rval = xfs_dir2_node_removename(args); +out_free: + kmem_free(args); return rval; } @@ -409,39 +481,55 @@ xfs_dir_replace( xfs_bmap_free_t *flist, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); - if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) + rval = xfs_dir_ino_validate(tp->t_mountp, inum); + if (rval) return rval; - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.inumber = inum; - args.dp = dp; - args.firstblock = first; - args.flist = flist; - args.total = total; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_replace(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_replace(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_replace(&args); + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = inum; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_replace(args); + goto out_free; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_replace(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_replace(args); else - rval = xfs_dir2_node_replace(&args); + rval = xfs_dir2_node_replace(args); +out_free: + kmem_free(args); return rval; } @@ -456,7 +544,7 @@ xfs_dir_canenter( struct xfs_name *name, /* name of entry to add */ uint resblks) { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ @@ -465,28 +553,43 @@ xfs_dir_canenter( ASSERT(S_ISDIR(dp->i_d.di_mode)); - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.dp = dp; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - args.op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->dp = dp; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_addname(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_addname(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_addname(&args); + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_addname(args); + goto out_free; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_addname(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_addname(args); else - rval = xfs_dir2_node_addname(&args); + rval = xfs_dir2_node_addname(args); +out_free: + kmem_free(args); return rval; } @@ -518,13 +621,13 @@ xfs_dir2_grow_inode( * Set lowest possible block in the space requested. */ bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE); - count = mp->m_dirblkfsbs; + count = args->geo->fsbcount; error = xfs_da_grow_inode_int(args, &bno, count); if (error) return error; - *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno); + *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno); /* * Update file's size if this is the data space and it grew. @@ -546,19 +649,16 @@ xfs_dir2_grow_inode( */ int xfs_dir2_isblock( - xfs_trans_t *tp, - xfs_inode_t *dp, - int *vp) /* out: 1 is block, 0 is not block */ + struct xfs_da_args *args, + int *vp) /* out: 1 is block, 0 is not block */ { - xfs_fileoff_t last; /* last file offset */ - xfs_mount_t *mp; - int rval; + xfs_fileoff_t last; /* last file offset */ + int rval; - mp = dp->i_mount; - if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) + if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) return rval; - rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize; - ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize); + rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; + ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize); *vp = rval; return 0; } @@ -568,18 +668,15 @@ xfs_dir2_isblock( */ int xfs_dir2_isleaf( - xfs_trans_t *tp, - xfs_inode_t *dp, - int *vp) /* out: 1 is leaf, 0 is not leaf */ + struct xfs_da_args *args, + int *vp) /* out: 1 is block, 0 is not block */ { - xfs_fileoff_t last; /* last file offset */ - xfs_mount_t *mp; - int rval; + xfs_fileoff_t last; /* last file offset */ + int rval; - mp = dp->i_mount; - if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) + if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) return rval; - *vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog); + *vp = last == args->geo->leafblk + args->geo->fsbcount; return 0; } @@ -607,11 +704,11 @@ xfs_dir2_shrink_inode( dp = args->dp; mp = dp->i_mount; tp = args->trans; - da = xfs_dir2_db_to_da(mp, db); + da = xfs_dir2_db_to_da(args->geo, db); /* * Unmap the fsblock(s). */ - if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs, + if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, XFS_BMAPI_METADATA, 0, args->firstblock, args->flist, &done))) { /* @@ -638,12 +735,12 @@ xfs_dir2_shrink_inode( /* * If it's not a data block, we're done. */ - if (db >= XFS_DIR2_LEAF_FIRSTDB(mp)) + if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET)) return 0; /* * If the block isn't the last one in the directory, we're done. */ - if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(mp, db + 1, 0)) + if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0)) return 0; bno = da; if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) { @@ -652,7 +749,7 @@ xfs_dir2_shrink_inode( */ return error; } - if (db == mp->m_dirdatablk) + if (db == args->geo->datablk) ASSERT(bno == 0); else ASSERT(bno > 0); diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h index e937d9991c1..c8e86b0b5e9 100644 --- a/fs/xfs/xfs_dir2.h +++ b/fs/xfs/xfs_dir2.h @@ -23,14 +23,100 @@ struct xfs_da_args; struct xfs_inode; struct xfs_mount; struct xfs_trans; +struct xfs_dir2_sf_hdr; +struct xfs_dir2_sf_entry; +struct xfs_dir2_data_hdr; +struct xfs_dir2_data_entry; +struct xfs_dir2_data_unused; extern struct xfs_name xfs_name_dotdot; /* + * directory operations vector for encode/decode routines + */ +struct xfs_dir_ops { + int (*sf_entsize)(struct xfs_dir2_sf_hdr *hdr, int len); + struct xfs_dir2_sf_entry * + (*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep); + __uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep); + void (*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep, + __uint8_t ftype); + xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep); + void (*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, + xfs_ino_t ino); + xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr); + void (*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr, + xfs_ino_t ino); + + int (*data_entsize)(int len); + __uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep); + void (*data_put_ftype)(struct xfs_dir2_data_entry *dep, + __uint8_t ftype); + __be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep); + struct xfs_dir2_data_free * + (*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr); + + xfs_dir2_data_aoff_t data_dot_offset; + xfs_dir2_data_aoff_t data_dotdot_offset; + xfs_dir2_data_aoff_t data_first_offset; + size_t data_entry_offset; + + struct xfs_dir2_data_entry * + (*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr); + struct xfs_dir2_data_entry * + (*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr); + struct xfs_dir2_data_entry * + (*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr); + struct xfs_dir2_data_entry * + (*data_entry_p)(struct xfs_dir2_data_hdr *hdr); + struct xfs_dir2_data_unused * + (*data_unused_p)(struct xfs_dir2_data_hdr *hdr); + + int leaf_hdr_size; + void (*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from); + void (*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from); + int (*leaf_max_ents)(struct xfs_da_geometry *geo); + struct xfs_dir2_leaf_entry * + (*leaf_ents_p)(struct xfs_dir2_leaf *lp); + + int node_hdr_size; + void (*node_hdr_to_disk)(struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from); + void (*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from); + struct xfs_da_node_entry * + (*node_tree_p)(struct xfs_da_intnode *dap); + + int free_hdr_size; + void (*free_hdr_to_disk)(struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from); + void (*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from); + int (*free_max_bests)(struct xfs_da_geometry *geo); + __be16 * (*free_bests_p)(struct xfs_dir2_free *free); + xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo, + xfs_dir2_db_t db); + int (*db_to_fdindex)(struct xfs_da_geometry *geo, + xfs_dir2_db_t db); +}; + +extern const struct xfs_dir_ops * + xfs_dir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp); +extern const struct xfs_dir_ops * + xfs_nondir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp); + +/* * Generic directory interface routines */ extern void xfs_dir_startup(void); -extern void xfs_dir_mount(struct xfs_mount *mp); +extern int xfs_da_mount(struct xfs_mount *mp); +extern void xfs_da_unmount(struct xfs_mount *mp); + extern int xfs_dir_isempty(struct xfs_inode *dp); extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_inode *pdp); @@ -57,4 +143,38 @@ extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, */ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); +/* + * Interface routines used by userspace utilities + */ +extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r); +extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); +extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, + struct xfs_buf *bp); + +extern void xfs_dir2_data_freescan(struct xfs_inode *dp, + struct xfs_dir2_data_hdr *hdr, int *loghead); +extern void xfs_dir2_data_log_entry(struct xfs_da_args *args, + struct xfs_buf *bp, struct xfs_dir2_data_entry *dep); +extern void xfs_dir2_data_log_header(struct xfs_da_args *args, + struct xfs_buf *bp); +extern void xfs_dir2_data_log_unused(struct xfs_da_args *args, + struct xfs_buf *bp, struct xfs_dir2_data_unused *dup); +extern void xfs_dir2_data_make_free(struct xfs_da_args *args, + struct xfs_buf *bp, xfs_dir2_data_aoff_t offset, + xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); +extern void xfs_dir2_data_use_free(struct xfs_da_args *args, + struct xfs_buf *bp, struct xfs_dir2_data_unused *dup, + xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, + int *needlogp, int *needscanp); + +extern struct xfs_dir2_data_free *xfs_dir2_data_freefind( + struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf, + struct xfs_dir2_data_unused *dup); + +extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_free_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_data_buf_ops; + #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 12afe07a91d..c7cd3154026 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -17,22 +18,25 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_buf_item.h" #include "xfs_dir2.h" -#include "xfs_dir2_format.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_dinode.h" /* * Local function prototypes. @@ -56,56 +60,116 @@ xfs_dir_startup(void) xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); } -static void -xfs_dir2_block_verify( +static bool +xfs_dir3_block_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_dir2_data_hdr *hdr = bp->b_addr; - int block_ok = 0; - - block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); - block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; - - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) + return false; + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) + return false; } + if (__xfs_dir3_data_check(NULL, bp)) + return false; + return true; } static void -xfs_dir2_block_read_verify( +xfs_dir3_block_read_verify( struct xfs_buf *bp) { - xfs_dir2_block_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dir3_block_verify(bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); } static void -xfs_dir2_block_write_verify( +xfs_dir3_block_write_verify( struct xfs_buf *bp) { - xfs_dir2_block_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_block_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF); } -const struct xfs_buf_ops xfs_dir2_block_buf_ops = { - .verify_read = xfs_dir2_block_read_verify, - .verify_write = xfs_dir2_block_write_verify, +const struct xfs_buf_ops xfs_dir3_block_buf_ops = { + .verify_read = xfs_dir3_block_read_verify, + .verify_write = xfs_dir3_block_write_verify, }; -static int -xfs_dir2_block_read( +int +xfs_dir3_block_read( struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_buf **bpp) { struct xfs_mount *mp = dp->i_mount; + int err; + + err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp, + XFS_DATA_FORK, &xfs_dir3_block_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); + return err; +} + +static void +xfs_dir3_block_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *dp) +{ + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; - return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, - XFS_DATA_FORK, &xfs_dir2_block_buf_ops); + bp->b_ops = &xfs_dir3_block_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + memset(hdr3, 0, sizeof(*hdr3)); + hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); + hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + return; + + } + hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); } static void xfs_dir2_block_need_space( + struct xfs_inode *dp, struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_block_tail *btp, struct xfs_dir2_leaf_entry *blp, @@ -121,7 +185,7 @@ xfs_dir2_block_need_space( struct xfs_dir2_data_unused *enddup = NULL; *compact = 0; - bf = hdr->bestfree; + bf = dp->d_ops->data_bestfree_p(hdr); /* * If there are stale entries we'll use one for the leaf. @@ -217,7 +281,7 @@ out: */ static void xfs_dir2_block_compact( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp, struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_block_tail *btp, @@ -250,18 +314,17 @@ xfs_dir2_block_compact( *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); *lfloghigh -= be32_to_cpu(btp->stale) - 1; be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); - xfs_dir2_data_make_free(tp, bp, + xfs_dir2_data_make_free(args, bp, (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), needlog, &needscan); - blp += be32_to_cpu(btp->stale) - 1; btp->stale = cpu_to_be32(1); /* * If we now need to rebuild the bestfree map, do so. * This needs to happen before the next call to use_free. */ if (needscan) - xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog); + xfs_dir2_data_freescan(args->dp, hdr, needlog); } /* @@ -303,24 +366,24 @@ xfs_dir2_block_addname( mp = dp->i_mount; /* Read the (one and only) directory block into bp. */ - error = xfs_dir2_block_read(tp, dp, &bp); + error = xfs_dir3_block_read(tp, dp, &bp); if (error) return error; - len = xfs_dir2_data_entsize(args->namelen); + len = dp->d_ops->data_entsize(args->namelen); /* * Set up pointers to parts of the block. */ hdr = bp->b_addr; - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Find out if we can reuse stale entries or whether we need extra * space for entry and new leaf. */ - xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup, + xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup, &enddup, &compact, len); /* @@ -356,7 +419,7 @@ xfs_dir2_block_addname( * If need to compact the leaf entries, do it now. */ if (compact) { - xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog, + xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog, &lfloghigh, &lfloglow); /* recalculate blp post-compaction */ blp = xfs_dir2_block_leaf_p(btp); @@ -391,7 +454,7 @@ xfs_dir2_block_addname( /* * Mark the space needed for the new leaf entry, now in use. */ - xfs_dir2_data_use_free(tp, bp, enddup, + xfs_dir2_data_use_free(args, bp, enddup, (xfs_dir2_data_aoff_t) ((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) - sizeof(*blp)), @@ -406,7 +469,7 @@ xfs_dir2_block_addname( * This needs to happen before the next call to use_free. */ if (needscan) { - xfs_dir2_data_freescan(mp, hdr, &needlog); + xfs_dir2_data_freescan(dp, hdr, &needlog); needscan = 0; } /* @@ -472,13 +535,13 @@ xfs_dir2_block_addname( * Fill in the leaf entry. */ blp[mid].hashval = cpu_to_be32(args->hashval); - blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( (char *)dep - (char *)hdr)); xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh); /* * Mark space for the data entry used. */ - xfs_dir2_data_use_free(tp, bp, dup, + xfs_dir2_data_use_free(args, bp, dup, (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), (xfs_dir2_data_aoff_t)len, &needlog, &needscan); /* @@ -487,116 +550,19 @@ xfs_dir2_block_addname( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, args->namelen); - tagp = xfs_dir2_data_entry_tag_p(dep); + dp->d_ops->data_put_ftype(dep, args->filetype); + tagp = dp->d_ops->data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); /* * Clean up the bestfree array and log the header, tail, and entry. */ if (needscan) - xfs_dir2_data_freescan(mp, hdr, &needlog); + xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, bp); + xfs_dir2_data_log_header(args, bp); xfs_dir2_block_log_tail(tp, bp); - xfs_dir2_data_log_entry(tp, bp, dep); - xfs_dir2_data_check(dp, bp); - return 0; -} - -/* - * Readdir for block directories. - */ -int /* error */ -xfs_dir2_block_getdents( - xfs_inode_t *dp, /* incore inode */ - void *dirent, - xfs_off_t *offset, - filldir_t filldir) -{ - xfs_dir2_data_hdr_t *hdr; /* block header */ - struct xfs_buf *bp; /* buffer for block */ - xfs_dir2_block_tail_t *btp; /* block tail */ - xfs_dir2_data_entry_t *dep; /* block data entry */ - xfs_dir2_data_unused_t *dup; /* block unused entry */ - char *endptr; /* end of the data entries */ - int error; /* error return value */ - xfs_mount_t *mp; /* filesystem mount point */ - char *ptr; /* current data entry */ - int wantoff; /* starting block offset */ - xfs_off_t cook; - - mp = dp->i_mount; - /* - * If the block number in the offset is out of range, we're done. - */ - if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) - return 0; - - error = xfs_dir2_block_read(NULL, dp, &bp); - if (error) - return error; - - /* - * Extract the byte offset we start at from the seek pointer. - * We'll skip entries before this. - */ - wantoff = xfs_dir2_dataptr_to_off(mp, *offset); - hdr = bp->b_addr; - xfs_dir2_data_check(dp, bp); - /* - * Set up values for the loop. - */ - btp = xfs_dir2_block_tail_p(mp, hdr); - ptr = (char *)(hdr + 1); - endptr = (char *)xfs_dir2_block_leaf_p(btp); - - /* - * Loop over the data portion of the block. - * Each object is a real entry (dep) or an unused one (dup). - */ - while (ptr < endptr) { - dup = (xfs_dir2_data_unused_t *)ptr; - /* - * Unused, skip it. - */ - if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - ptr += be16_to_cpu(dup->length); - continue; - } - - dep = (xfs_dir2_data_entry_t *)ptr; - - /* - * Bump pointer for the next iteration. - */ - ptr += xfs_dir2_data_entsize(dep->namelen); - /* - * The entry is before the desired starting point, skip it. - */ - if ((char *)dep - (char *)hdr < wantoff) - continue; - - cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - (char *)dep - (char *)hdr); - - /* - * If it didn't fit, set the final offset to here & return. - */ - if (filldir(dirent, (char *)dep->name, dep->namelen, - cook & 0x7fffffff, be64_to_cpu(dep->inumber), - DT_UNKNOWN)) { - *offset = cook & 0x7fffffff; - xfs_trans_brelse(NULL, bp); - return 0; - } - } - - /* - * Reached the end of the block. - * Set the offset to a non-existent block 1 and return. - */ - *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & - 0x7fffffff; - xfs_trans_brelse(NULL, bp); + xfs_dir2_data_log_entry(args, bp, dep); + xfs_dir3_data_check(dp, bp); return 0; } @@ -614,7 +580,7 @@ xfs_dir2_block_log_leaf( xfs_dir2_leaf_entry_t *blp; xfs_dir2_block_tail_t *btp; - btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr); + btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr); blp = xfs_dir2_block_leaf_p(btp); xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr), (uint)((char *)&blp[last + 1] - (char *)hdr - 1)); @@ -631,7 +597,7 @@ xfs_dir2_block_log_tail( xfs_dir2_data_hdr_t *hdr = bp->b_addr; xfs_dir2_block_tail_t *btp; - btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr); + btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr); xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr), (uint)((char *)(btp + 1) - (char *)hdr - 1)); } @@ -665,18 +631,20 @@ xfs_dir2_block_lookup( dp = args->dp; mp = dp->i_mount; hdr = bp->b_addr; - xfs_dir2_data_check(dp, bp); - btp = xfs_dir2_block_tail_p(mp, hdr); + xfs_dir3_data_check(dp, bp); + btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Get the offset from the leaf entry, to point to the data. */ dep = (xfs_dir2_data_entry_t *)((char *)hdr + - xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(blp[ent].address))); /* * Fill in inode number, CI name if appropriate, release the block. */ args->inumber = be64_to_cpu(dep->inumber); + args->filetype = dp->d_ops->data_get_ftype(dep); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); xfs_trans_brelse(args->trans, bp); return XFS_ERROR(error); @@ -711,13 +679,13 @@ xfs_dir2_block_lookup_int( tp = args->trans; mp = dp->i_mount; - error = xfs_dir2_block_read(tp, dp, &bp); + error = xfs_dir3_block_read(tp, dp, &bp); if (error) return error; hdr = bp->b_addr; - xfs_dir2_data_check(dp, bp); - btp = xfs_dir2_block_tail_p(mp, hdr); + xfs_dir3_data_check(dp, bp); + btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Loop doing a binary search for our hash value. @@ -755,7 +723,7 @@ xfs_dir2_block_lookup_int( * Get pointer to the entry from the leaf. */ dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + xfs_dir2_dataptr_to_off(mp, addr)); + ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr)); /* * Compare name and if it's an exact match, return the index * and buffer. If it's the first case-insensitive match, store @@ -822,20 +790,21 @@ xfs_dir2_block_removename( tp = args->trans; mp = dp->i_mount; hdr = bp->b_addr; - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Point to the data entry using the leaf entry. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(blp[ent].address))); /* * Mark the data entry's space free. */ needlog = needscan = 0; - xfs_dir2_data_make_free(tp, bp, + xfs_dir2_data_make_free(args, bp, (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), - xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); + dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); /* * Fix up the block tail. */ @@ -850,10 +819,10 @@ xfs_dir2_block_removename( * Fix up bestfree, log the header if necessary. */ if (needscan) - xfs_dir2_data_freescan(mp, hdr, &needlog); + xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, bp); - xfs_dir2_data_check(dp, bp); + xfs_dir2_data_log_header(args, bp); + xfs_dir3_data_check(dp, bp); /* * See if the size as a shortform is good enough. */ @@ -897,20 +866,22 @@ xfs_dir2_block_replace( dp = args->dp; mp = dp->i_mount; hdr = bp->b_addr; - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Point to the data entry we need to change. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(blp[ent].address))); ASSERT(be64_to_cpu(dep->inumber) != args->inumber); /* * Change the inode number to the new value. */ dep->inumber = cpu_to_be64(args->inumber); - xfs_dir2_data_log_entry(args->trans, bp, dep); - xfs_dir2_data_check(dp, bp); + dp->d_ops->data_put_ftype(dep, args->filetype); + xfs_dir2_data_log_entry(args, bp, dep); + xfs_dir3_data_check(dp, bp); return 0; } @@ -958,6 +929,8 @@ xfs_dir2_leaf_to_block( __be16 *tagp; /* end of entry (tag) */ int to; /* block/leaf to index */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_to_block(args); @@ -965,18 +938,25 @@ xfs_dir2_leaf_to_block( tp = args->trans; mp = dp->i_mount; leaf = lbp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC || + leafhdr.magic == XFS_DIR3_LEAF1_MAGIC); /* * If there are data blocks other than the first one, take this * opportunity to remove trailing empty data blocks that may have * been left behind during no-space-reservation operations. * These will show up in the leaf bests table. */ - while (dp->i_d.di_size > mp->m_dirblksize) { + while (dp->i_d.di_size > args->geo->blksize) { + int hdrsz; + + hdrsz = dp->d_ops->data_entry_offset; bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) == - mp->m_dirblksize - (uint)sizeof(*hdr)) { + args->geo->blksize - hdrsz) { if ((error = xfs_dir2_leaf_trim_data(args, lbp, (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1)))) @@ -988,21 +968,23 @@ xfs_dir2_leaf_to_block( * Read the data block if we don't already have it, give up if it fails. */ if (!dbp) { - error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp); + error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp); if (error) return error; } hdr = dbp->b_addr; - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); + /* * Size of the "leaf" area in the block. */ size = (uint)sizeof(xfs_dir2_block_tail_t) + - (uint)sizeof(*lep) * (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); + (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale); /* * Look at the last data entry. */ - tagp = (__be16 *)((char *)hdr + mp->m_dirblksize) - 1; + tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1; dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); /* * If it's not free or is too short we can't do it. @@ -1014,31 +996,30 @@ xfs_dir2_leaf_to_block( /* * Start converting it to block form. */ - dbp->b_ops = &xfs_dir2_block_buf_ops; - hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + xfs_dir3_block_init(mp, tp, dbp, dp); + needlog = 1; needscan = 0; /* * Use up the space at the end of the block (blp/btp). */ - xfs_dir2_data_use_free(tp, dbp, dup, mp->m_dirblksize - size, size, + xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size, &needlog, &needscan); /* * Initialize the block tail. */ - btp = xfs_dir2_block_tail_p(mp, hdr); - btp->count = cpu_to_be32(be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); + btp = xfs_dir2_block_tail_p(args->geo, hdr); + btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale); btp->stale = 0; xfs_dir2_block_log_tail(tp, dbp); /* * Initialize the block leaf area. We compact out stale entries. */ lep = xfs_dir2_block_leaf_p(btp); - for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) { - if (leaf->ents[from].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + for (from = to = 0; from < leafhdr.count; from++) { + if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) continue; - lep[to++] = leaf->ents[from]; + lep[to++] = ents[from]; } ASSERT(to == be32_to_cpu(btp->count)); xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1); @@ -1046,13 +1027,13 @@ xfs_dir2_leaf_to_block( * Scan the bestfree if we need it and log the data block header. */ if (needscan) - xfs_dir2_data_freescan(mp, hdr, &needlog); + xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, dbp); + xfs_dir2_data_log_header(args, dbp); /* * Pitch the old leaf block. */ - error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp); + error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp); if (error) return error; @@ -1096,13 +1077,15 @@ xfs_dir2_sf_to_block( __be16 *tagp; /* end of data entry */ xfs_trans_t *tp; /* transaction pointer */ struct xfs_name name; + struct xfs_ifork *ifp; trace_xfs_dir2_sf_to_block(args); dp = args->dp; tp = args->trans; mp = dp->i_mount; - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK); + ASSERT(ifp->if_flags & XFS_IFINLINE); /* * Bomb out if the shortform directory is way too short. */ @@ -1111,22 +1094,23 @@ xfs_dir2_sf_to_block( return XFS_ERROR(EIO); } - oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data; - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); + ASSERT(ifp->if_bytes == dp->i_d.di_size); + ASSERT(ifp->if_u1.if_data != NULL); ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); + ASSERT(dp->i_d.di_nextents == 0); /* * Copy the directory into a temporary buffer. * Then pitch the incore inode data so we can make extents. */ - sfp = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP); - memcpy(sfp, oldsfp, dp->i_df.if_bytes); + sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP); + memcpy(sfp, oldsfp, ifp->if_bytes); - xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK); + xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK); + xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK); dp->i_d.di_size = 0; - xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); /* * Add block 0 to the inode. @@ -1137,16 +1121,16 @@ xfs_dir2_sf_to_block( return error; } /* - * Initialize the data block. + * Initialize the data block, then convert it to block format. */ - error = xfs_dir2_data_init(args, blkno, &bp); + error = xfs_dir3_data_init(args, blkno, &bp); if (error) { kmem_free(sfp); return error; } - bp->b_ops = &xfs_dir2_block_buf_ops; + xfs_dir3_block_init(mp, tp, bp, dp); hdr = bp->b_addr; - hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + /* * Compute size of block "tail" area. */ @@ -1156,15 +1140,15 @@ xfs_dir2_sf_to_block( * The whole thing is initialized to free by the init routine. * Say we're using the leaf and tail area. */ - dup = (xfs_dir2_data_unused_t *)(hdr + 1); + dup = dp->d_ops->data_unused_p(hdr); needlog = needscan = 0; - xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog, - &needscan); + xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i, + i, &needlog, &needscan); ASSERT(needscan == 0); /* * Fill in the tail. */ - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); btp->count = cpu_to_be32(sfp->count + 2); /* ., .. */ btp->stale = 0; blp = xfs_dir2_block_leaf_p(btp); @@ -1172,38 +1156,38 @@ xfs_dir2_sf_to_block( /* * Remove the freespace, we'll manage it. */ - xfs_dir2_data_use_free(tp, bp, dup, + xfs_dir2_data_use_free(args, bp, dup, (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), be16_to_cpu(dup->length), &needlog, &needscan); /* * Create entry for . */ - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + XFS_DIR2_DATA_DOT_OFFSET); + dep = dp->d_ops->data_dot_entry_p(hdr); dep->inumber = cpu_to_be64(dp->i_ino); dep->namelen = 1; dep->name[0] = '.'; - tagp = xfs_dir2_data_entry_tag_p(dep); + dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR); + tagp = dp->d_ops->data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); - xfs_dir2_data_log_entry(tp, bp, dep); + xfs_dir2_data_log_entry(args, bp, dep); blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); - blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( (char *)dep - (char *)hdr)); /* * Create entry for .. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + XFS_DIR2_DATA_DOTDOT_OFFSET); - dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp)); + dep = dp->d_ops->data_dotdot_entry_p(hdr); + dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp)); dep->namelen = 2; dep->name[0] = dep->name[1] = '.'; - tagp = xfs_dir2_data_entry_tag_p(dep); + dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR); + tagp = dp->d_ops->data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); - xfs_dir2_data_log_entry(tp, bp, dep); + xfs_dir2_data_log_entry(args, bp, dep); blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); - blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( (char *)dep - (char *)hdr)); - offset = XFS_DIR2_DATA_FIRST_OFFSET; + offset = dp->d_ops->data_first_offset; /* * Loop over existing entries, stuff them in. */ @@ -1233,8 +1217,10 @@ xfs_dir2_sf_to_block( dup->length = cpu_to_be16(newoffset - offset); *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16( ((char *)dup - (char *)hdr)); - xfs_dir2_data_log_unused(tp, bp, dup); - xfs_dir2_data_freeinsert(hdr, dup, &dummy); + xfs_dir2_data_log_unused(args, bp, dup); + xfs_dir2_data_freeinsert(hdr, + dp->d_ops->data_bestfree_p(hdr), + dup, &dummy); offset += be16_to_cpu(dup->length); continue; } @@ -1242,23 +1228,24 @@ xfs_dir2_sf_to_block( * Copy a real entry. */ dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset); - dep->inumber = cpu_to_be64(xfs_dir2_sfe_get_ino(sfp, sfep)); + dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep)); dep->namelen = sfep->namelen; + dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep)); memcpy(dep->name, sfep->name, dep->namelen); - tagp = xfs_dir2_data_entry_tag_p(dep); + tagp = dp->d_ops->data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); - xfs_dir2_data_log_entry(tp, bp, dep); + xfs_dir2_data_log_entry(args, bp, dep); name.name = sfep->name; name.len = sfep->namelen; blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops-> hashname(&name)); - blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( (char *)dep - (char *)hdr)); offset = (int)((char *)(tagp + 1) - (char *)hdr); if (++i == sfp->count) sfep = NULL; else - sfep = xfs_dir2_sf_nextentry(sfp, sfep); + sfep = dp->d_ops->sf_nextentry(sfp, sfep); } /* Done with the temporary buffer */ kmem_free(sfp); @@ -1273,6 +1260,6 @@ xfs_dir2_sf_to_block( ASSERT(needscan == 0); xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1); xfs_dir2_block_log_tail(tp, bp); - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); return 0; } diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index ffcf1774152..8c2f6422648 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -17,22 +18,21 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_dir2_format.h" +#include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" - -STATIC xfs_dir2_data_free_t * -xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" /* * Check the consistency of the data block. @@ -40,7 +40,7 @@ xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); * Return 0 is the buffer is good, otherwise an error. */ int -__xfs_dir2_data_check( +__xfs_dir3_data_check( struct xfs_inode *dp, /* incore inode pointer */ struct xfs_buf *bp) /* data block's buffer */ { @@ -62,30 +62,52 @@ __xfs_dir2_data_check( char *p; /* current data position */ int stale; /* count of stale leaves */ struct xfs_name name; + const struct xfs_dir_ops *ops; + struct xfs_da_geometry *geo; mp = bp->b_target->bt_mount; + geo = mp->m_dir_geo; + + /* + * We can be passed a null dp here from a verifier, so we need to go the + * hard way to get them. + */ + ops = xfs_dir_get_ops(mp, dp); + hdr = bp->b_addr; - bf = hdr->bestfree; - p = (char *)(hdr + 1); + p = (char *)ops->data_entry_p(hdr); switch (hdr->magic) { + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(geo, hdr); lep = xfs_dir2_block_leaf_p(btp); endp = (char *)lep; + + /* + * The number of leaf entries is limited by the size of the + * block and the amount of space used by the data entries. + * We don't know how much space is used by the data entries yet, + * so just ensure that the count falls somewhere inside the + * block right now. + */ + XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) < + ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry)); break; + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): case cpu_to_be32(XFS_DIR2_DATA_MAGIC): - endp = (char *)hdr + mp->m_dirblksize; + endp = (char *)hdr + geo->blksize; break; default: XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp); return EFSCORRUPTED; } - count = lastfree = freeseen = 0; /* * Account for zero bestfree entries. */ + bf = ops->data_bestfree_p(hdr); + count = lastfree = freeseen = 0; if (!bf[0].length) { XFS_WANT_CORRUPTED_RETURN(!bf[0].offset); freeseen |= 1 << 0; @@ -118,7 +140,7 @@ __xfs_dir2_data_check( XFS_WANT_CORRUPTED_RETURN( be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == (char *)dup - (char *)hdr); - dfp = xfs_dir2_data_freefind(hdr, dup); + dfp = xfs_dir2_data_freefind(hdr, bf, dup); if (dfp) { i = (int)(dfp - bf); XFS_WANT_CORRUPTED_RETURN( @@ -144,14 +166,17 @@ __xfs_dir2_data_check( XFS_WANT_CORRUPTED_RETURN( !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); XFS_WANT_CORRUPTED_RETURN( - be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == + be16_to_cpu(*ops->data_entry_tag_p(dep)) == (char *)dep - (char *)hdr); + XFS_WANT_CORRUPTED_RETURN( + ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX); count++; lastfree = 0; - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { - addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - (xfs_dir2_data_aoff_t) - ((char *)dep - (char *)hdr)); + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { + addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + (xfs_dir2_data_aoff_t) + ((char *)dep - (char *)hdr)); name.name = dep->name; name.len = dep->namelen; hash = mp->m_dirnameops->hashname(&name); @@ -162,13 +187,14 @@ __xfs_dir2_data_check( } XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); } - p += xfs_dir2_data_entsize(dep->namelen); + p += ops->data_entsize(dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. */ XFS_WANT_CORRUPTED_RETURN(freeseen == 7); - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { if (lep[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) @@ -185,21 +211,27 @@ __xfs_dir2_data_check( return 0; } -static void -xfs_dir2_data_verify( +static bool +xfs_dir3_data_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_dir2_data_hdr *hdr = bp->b_addr; - int block_ok = 0; - - block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC); - block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) + return false; + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC)) + return false; } + if (__xfs_dir3_data_check(NULL, bp)) + return false; + return true; } /* @@ -208,102 +240,136 @@ xfs_dir2_data_verify( * format buffer or a data format buffer on readahead. */ static void -xfs_dir2_data_reada_verify( +xfs_dir3_data_reada_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_data_hdr *hdr = bp->b_addr; switch (hdr->magic) { case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): - bp->b_ops = &xfs_dir2_block_buf_ops; + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): + bp->b_ops = &xfs_dir3_block_buf_ops; bp->b_ops->verify_read(bp); return; case cpu_to_be32(XFS_DIR2_DATA_MAGIC): - xfs_dir2_data_verify(bp); + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): + xfs_dir3_data_verify(bp); return; default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); break; } } static void -xfs_dir2_data_read_verify( +xfs_dir3_data_read_verify( struct xfs_buf *bp) { - xfs_dir2_data_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dir3_data_verify(bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); } static void -xfs_dir2_data_write_verify( +xfs_dir3_data_write_verify( struct xfs_buf *bp) { - xfs_dir2_data_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_data_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF); } -const struct xfs_buf_ops xfs_dir2_data_buf_ops = { - .verify_read = xfs_dir2_data_read_verify, - .verify_write = xfs_dir2_data_write_verify, +const struct xfs_buf_ops xfs_dir3_data_buf_ops = { + .verify_read = xfs_dir3_data_read_verify, + .verify_write = xfs_dir3_data_write_verify, }; -static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = { - .verify_read = xfs_dir2_data_reada_verify, - .verify_write = xfs_dir2_data_write_verify, +static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { + .verify_read = xfs_dir3_data_reada_verify, + .verify_write = xfs_dir3_data_write_verify, }; int -xfs_dir2_data_read( +xfs_dir3_data_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, - XFS_DATA_FORK, &xfs_dir2_data_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, + XFS_DATA_FORK, &xfs_dir3_data_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); + return err; } int -xfs_dir2_data_readahead( - struct xfs_trans *tp, +xfs_dir3_data_readahead( struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno) { - return xfs_da_reada_buf(tp, dp, bno, mapped_bno, - XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops); + return xfs_da_reada_buf(dp, bno, mapped_bno, + XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops); } /* * Given a data block and an unused entry from that block, * return the bestfree entry if any that corresponds to it. */ -STATIC xfs_dir2_data_free_t * +xfs_dir2_data_free_t * xfs_dir2_data_freefind( - xfs_dir2_data_hdr_t *hdr, /* data block */ - xfs_dir2_data_unused_t *dup) /* data unused entry */ + struct xfs_dir2_data_hdr *hdr, /* data block header */ + struct xfs_dir2_data_free *bf, /* bestfree table pointer */ + struct xfs_dir2_data_unused *dup) /* unused space */ { xfs_dir2_data_free_t *dfp; /* bestfree entry */ xfs_dir2_data_aoff_t off; /* offset value needed */ -#if defined(DEBUG) && defined(__KERNEL__) +#ifdef DEBUG int matched; /* matched the value */ int seenzero; /* saw a 0 bestfree entry */ #endif off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); -#if defined(DEBUG) && defined(__KERNEL__) + +#ifdef DEBUG /* * Validate some consistency in the bestfree table. * Check order, non-overlapping entries, and if we find the * one we're looking for it has to be exact. */ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); - for (dfp = &hdr->bestfree[0], seenzero = matched = 0; - dfp < &hdr->bestfree[XFS_DIR2_DATA_FD_COUNT]; + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + for (dfp = &bf[0], seenzero = matched = 0; + dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) { if (!dfp->offset) { ASSERT(!dfp->length); @@ -319,7 +385,7 @@ xfs_dir2_data_freefind( else ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off); ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length)); - if (dfp > &hdr->bestfree[0]) + if (dfp > &bf[0]) ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length)); } #endif @@ -328,14 +394,12 @@ xfs_dir2_data_freefind( * it can't be there since they're sorted. */ if (be16_to_cpu(dup->length) < - be16_to_cpu(hdr->bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length)) + be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length)) return NULL; /* * Look at the three bestfree entries for our guy. */ - for (dfp = &hdr->bestfree[0]; - dfp < &hdr->bestfree[XFS_DIR2_DATA_FD_COUNT]; - dfp++) { + for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) { if (!dfp->offset) return NULL; if (be16_to_cpu(dfp->offset) == off) @@ -352,18 +416,18 @@ xfs_dir2_data_freefind( */ xfs_dir2_data_free_t * /* entry inserted */ xfs_dir2_data_freeinsert( - xfs_dir2_data_hdr_t *hdr, /* data block pointer */ - xfs_dir2_data_unused_t *dup, /* unused space */ + struct xfs_dir2_data_hdr *hdr, /* data block pointer */ + struct xfs_dir2_data_free *dfp, /* bestfree table pointer */ + struct xfs_dir2_data_unused *dup, /* unused space */ int *loghead) /* log the data header (out) */ { - xfs_dir2_data_free_t *dfp; /* bestfree table pointer */ xfs_dir2_data_free_t new; /* new bestfree entry */ -#ifdef __KERNEL__ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); -#endif - dfp = hdr->bestfree; + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + new.length = dup->length; new.offset = cpu_to_be16((char *)dup - (char *)hdr); @@ -396,36 +460,39 @@ xfs_dir2_data_freeinsert( */ STATIC void xfs_dir2_data_freeremove( - xfs_dir2_data_hdr_t *hdr, /* data block header */ - xfs_dir2_data_free_t *dfp, /* bestfree entry pointer */ + struct xfs_dir2_data_hdr *hdr, /* data block header */ + struct xfs_dir2_data_free *bf, /* bestfree table pointer */ + struct xfs_dir2_data_free *dfp, /* bestfree entry pointer */ int *loghead) /* out: log data header */ { -#ifdef __KERNEL__ + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); -#endif + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + /* * It's the first entry, slide the next 2 up. */ - if (dfp == &hdr->bestfree[0]) { - hdr->bestfree[0] = hdr->bestfree[1]; - hdr->bestfree[1] = hdr->bestfree[2]; + if (dfp == &bf[0]) { + bf[0] = bf[1]; + bf[1] = bf[2]; } /* * It's the second entry, slide the 3rd entry up. */ - else if (dfp == &hdr->bestfree[1]) - hdr->bestfree[1] = hdr->bestfree[2]; + else if (dfp == &bf[1]) + bf[1] = bf[2]; /* * Must be the last entry. */ else - ASSERT(dfp == &hdr->bestfree[2]); + ASSERT(dfp == &bf[2]); /* * Clear the 3rd entry, must be zero now. */ - hdr->bestfree[2].length = 0; - hdr->bestfree[2].offset = 0; + bf[2].length = 0; + bf[2].offset = 0; *loghead = 1; } @@ -434,34 +501,39 @@ xfs_dir2_data_freeremove( */ void xfs_dir2_data_freescan( - xfs_mount_t *mp, /* filesystem mount point */ - xfs_dir2_data_hdr_t *hdr, /* data block header */ - int *loghead) /* out: log data header */ + struct xfs_inode *dp, + struct xfs_dir2_data_hdr *hdr, + int *loghead) { xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* active data entry */ xfs_dir2_data_unused_t *dup; /* unused data entry */ + struct xfs_dir2_data_free *bf; char *endp; /* end of block's data */ char *p; /* current entry pointer */ + struct xfs_da_geometry *geo = dp->i_mount->m_dir_geo; -#ifdef __KERNEL__ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); -#endif + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + /* * Start by clearing the table. */ - memset(hdr->bestfree, 0, sizeof(hdr->bestfree)); + bf = dp->d_ops->data_bestfree_p(hdr); + memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT); *loghead = 1; /* * Set up pointers. */ - p = (char *)(hdr + 1); - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { - btp = xfs_dir2_block_tail_p(mp, hdr); + p = (char *)dp->d_ops->data_entry_p(hdr); + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { + btp = xfs_dir2_block_tail_p(geo, hdr); endp = (char *)xfs_dir2_block_leaf_p(btp); } else - endp = (char *)hdr + mp->m_dirblksize; + endp = (char *)hdr + geo->blksize; /* * Loop over the block's entries. */ @@ -473,7 +545,7 @@ xfs_dir2_data_freescan( if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { ASSERT((char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); - xfs_dir2_data_freeinsert(hdr, dup, loghead); + xfs_dir2_data_freeinsert(hdr, bf, dup, loghead); p += be16_to_cpu(dup->length); } /* @@ -482,8 +554,8 @@ xfs_dir2_data_freescan( else { dep = (xfs_dir2_data_entry_t *)p; ASSERT((char *)dep - (char *)hdr == - be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep))); - p += xfs_dir2_data_entsize(dep->namelen); + be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep))); + p += dp->d_ops->data_entsize(dep->namelen); } } } @@ -493,7 +565,7 @@ xfs_dir2_data_freescan( * Give back the buffer for the created block. */ int /* error */ -xfs_dir2_data_init( +xfs_dir3_data_init( xfs_da_args_t *args, /* directory operation args */ xfs_dir2_db_t blkno, /* logical dir block number */ struct xfs_buf **bpp) /* output block buffer */ @@ -502,6 +574,7 @@ xfs_dir2_data_init( xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* unused entry pointer */ + struct xfs_dir2_data_free *bf; int error; /* error return value */ int i; /* bestfree index */ xfs_mount_t *mp; /* filesystem mount point */ @@ -514,38 +587,51 @@ xfs_dir2_data_init( /* * Get the buffer set up for the block. */ - error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, - XFS_DATA_FORK); + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno), + -1, &bp, XFS_DATA_FORK); if (error) return error; - bp->b_ops = &xfs_dir2_data_buf_ops; + bp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF); /* * Initialize the header. */ hdr = bp->b_addr; - hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); - hdr->bestfree[0].offset = cpu_to_be16(sizeof(*hdr)); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + memset(hdr3, 0, sizeof(*hdr3)); + hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); + hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + + } else + hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + + bf = dp->d_ops->data_bestfree_p(hdr); + bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset); for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { - hdr->bestfree[i].length = 0; - hdr->bestfree[i].offset = 0; + bf[i].length = 0; + bf[i].offset = 0; } /* * Set up an unused entry for the block's body. */ - dup = (xfs_dir2_data_unused_t *)(hdr + 1); + dup = dp->d_ops->data_unused_p(hdr); dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); - t = mp->m_dirblksize - (uint)sizeof(*hdr); - hdr->bestfree[0].length = cpu_to_be16(t); + t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset; + bf[0].length = cpu_to_be16(t); dup->length = cpu_to_be16(t); *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr); /* * Log it and return it. */ - xfs_dir2_data_log_header(tp, bp); - xfs_dir2_data_log_unused(tp, bp, dup); + xfs_dir2_data_log_header(args, bp); + xfs_dir2_data_log_unused(args, bp, dup); *bpp = bp; return 0; } @@ -555,17 +641,19 @@ xfs_dir2_data_init( */ void xfs_dir2_data_log_entry( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp, xfs_dir2_data_entry_t *dep) /* data entry pointer */ { - xfs_dir2_data_hdr_t *hdr = bp->b_addr; + struct xfs_dir2_data_hdr *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); - xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr), - (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) - + xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr), + (uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) - (char *)hdr - 1)); } @@ -574,15 +662,20 @@ xfs_dir2_data_log_entry( */ void xfs_dir2_data_log_header( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp) { - xfs_dir2_data_hdr_t *hdr = bp->b_addr; +#ifdef DEBUG + struct xfs_dir2_data_hdr *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); +#endif - xfs_trans_log_buf(tp, bp, 0, sizeof(*hdr) - 1); + xfs_trans_log_buf(args->trans, bp, 0, + args->dp->d_ops->data_entry_offset - 1); } /* @@ -590,25 +683,27 @@ xfs_dir2_data_log_header( */ void xfs_dir2_data_log_unused( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp, xfs_dir2_data_unused_t *dup) /* data unused pointer */ { xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); /* * Log the first part of the unused entry. */ - xfs_trans_log_buf(tp, bp, (uint)((char *)dup - (char *)hdr), + xfs_trans_log_buf(args->trans, bp, (uint)((char *)dup - (char *)hdr), (uint)((char *)&dup->length + sizeof(dup->length) - 1 - (char *)hdr)); /* * Log the end (tag) of the unused entry. */ - xfs_trans_log_buf(tp, bp, + xfs_trans_log_buf(args->trans, bp, (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr), (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr + sizeof(xfs_dir2_data_off_t) - 1)); @@ -620,7 +715,7 @@ xfs_dir2_data_log_unused( */ void xfs_dir2_data_make_free( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp, xfs_dir2_data_aoff_t offset, /* starting byte offset */ xfs_dir2_data_aoff_t len, /* length in bytes */ @@ -630,32 +725,33 @@ xfs_dir2_data_make_free( xfs_dir2_data_hdr_t *hdr; /* data block pointer */ xfs_dir2_data_free_t *dfp; /* bestfree pointer */ char *endptr; /* end of data area */ - xfs_mount_t *mp; /* filesystem mount point */ int needscan; /* need to regen bestfree */ xfs_dir2_data_unused_t *newdup; /* new unused entry */ xfs_dir2_data_unused_t *postdup; /* unused entry after us */ xfs_dir2_data_unused_t *prevdup; /* unused entry before us */ + struct xfs_dir2_data_free *bf; - mp = tp->t_mountp; hdr = bp->b_addr; /* * Figure out where the end of the data area is. */ - if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)) - endptr = (char *)hdr + mp->m_dirblksize; + if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)) + endptr = (char *)hdr + args->geo->blksize; else { xfs_dir2_block_tail_t *btp; /* block tail */ - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); - btp = xfs_dir2_block_tail_p(mp, hdr); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + btp = xfs_dir2_block_tail_p(args->geo, hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); } /* * If this isn't the start of the block, then back up to * the previous entry and see if it's free. */ - if (offset > sizeof(*hdr)) { + if (offset > args->dp->d_ops->data_entry_offset) { __be16 *tagp; /* tag just before us */ tagp = (__be16 *)((char *)hdr + offset) - 1; @@ -681,28 +777,29 @@ xfs_dir2_data_make_free( * Previous and following entries are both free, * merge everything into a single free entry. */ + bf = args->dp->d_ops->data_bestfree_p(hdr); if (prevdup && postdup) { xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */ /* * See if prevdup and/or postdup are in bestfree table. */ - dfp = xfs_dir2_data_freefind(hdr, prevdup); - dfp2 = xfs_dir2_data_freefind(hdr, postdup); + dfp = xfs_dir2_data_freefind(hdr, bf, prevdup); + dfp2 = xfs_dir2_data_freefind(hdr, bf, postdup); /* * We need a rescan unless there are exactly 2 free entries * namely our two. Then we know what's happening, otherwise * since the third bestfree is there, there might be more * entries. */ - needscan = (hdr->bestfree[2].length != 0); + needscan = (bf[2].length != 0); /* * Fix up the new big freespace. */ be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length)); *xfs_dir2_data_unused_tag_p(prevdup) = cpu_to_be16((char *)prevdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, prevdup); + xfs_dir2_data_log_unused(args, bp, prevdup); if (!needscan) { /* * Has to be the case that entries 0 and 1 are @@ -711,18 +808,19 @@ xfs_dir2_data_make_free( * Remove entry 1 first then entry 0. */ ASSERT(dfp && dfp2); - if (dfp == &hdr->bestfree[1]) { - dfp = &hdr->bestfree[0]; + if (dfp == &bf[1]) { + dfp = &bf[0]; ASSERT(dfp2 == dfp); - dfp2 = &hdr->bestfree[1]; + dfp2 = &bf[1]; } - xfs_dir2_data_freeremove(hdr, dfp2, needlogp); - xfs_dir2_data_freeremove(hdr, dfp, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp2, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); /* * Now insert the new entry. */ - dfp = xfs_dir2_data_freeinsert(hdr, prevdup, needlogp); - ASSERT(dfp == &hdr->bestfree[0]); + dfp = xfs_dir2_data_freeinsert(hdr, bf, prevdup, + needlogp); + ASSERT(dfp == &bf[0]); ASSERT(dfp->length == prevdup->length); ASSERT(!dfp[1].length); ASSERT(!dfp[2].length); @@ -732,54 +830,54 @@ xfs_dir2_data_make_free( * The entry before us is free, merge with it. */ else if (prevdup) { - dfp = xfs_dir2_data_freefind(hdr, prevdup); + dfp = xfs_dir2_data_freefind(hdr, bf, prevdup); be16_add_cpu(&prevdup->length, len); *xfs_dir2_data_unused_tag_p(prevdup) = cpu_to_be16((char *)prevdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, prevdup); + xfs_dir2_data_log_unused(args, bp, prevdup); /* * If the previous entry was in the table, the new entry * is longer, so it will be in the table too. Remove * the old one and add the new one. */ if (dfp) { - xfs_dir2_data_freeremove(hdr, dfp, needlogp); - xfs_dir2_data_freeinsert(hdr, prevdup, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); + xfs_dir2_data_freeinsert(hdr, bf, prevdup, needlogp); } /* * Otherwise we need a scan if the new entry is big enough. */ else { needscan = be16_to_cpu(prevdup->length) > - be16_to_cpu(hdr->bestfree[2].length); + be16_to_cpu(bf[2].length); } } /* * The following entry is free, merge with it. */ else if (postdup) { - dfp = xfs_dir2_data_freefind(hdr, postdup); + dfp = xfs_dir2_data_freefind(hdr, bf, postdup); newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset); newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length)); *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, newdup); + xfs_dir2_data_log_unused(args, bp, newdup); /* * If the following entry was in the table, the new entry * is longer, so it will be in the table too. Remove * the old one and add the new one. */ if (dfp) { - xfs_dir2_data_freeremove(hdr, dfp, needlogp); - xfs_dir2_data_freeinsert(hdr, newdup, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); + xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp); } /* * Otherwise we need a scan if the new entry is big enough. */ else { needscan = be16_to_cpu(newdup->length) > - be16_to_cpu(hdr->bestfree[2].length); + be16_to_cpu(bf[2].length); } } /* @@ -791,8 +889,8 @@ xfs_dir2_data_make_free( newdup->length = cpu_to_be16(len); *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, newdup); - xfs_dir2_data_freeinsert(hdr, newdup, needlogp); + xfs_dir2_data_log_unused(args, bp, newdup); + xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp); } *needscanp = needscan; } @@ -802,7 +900,7 @@ xfs_dir2_data_make_free( */ void xfs_dir2_data_use_free( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp, xfs_dir2_data_unused_t *dup, /* unused entry */ xfs_dir2_data_aoff_t offset, /* starting offset to use */ @@ -818,10 +916,13 @@ xfs_dir2_data_use_free( xfs_dir2_data_unused_t *newdup; /* new unused entry */ xfs_dir2_data_unused_t *newdup2; /* another new unused entry */ int oldlen; /* old unused entry's length */ + struct xfs_dir2_data_free *bf; hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG); ASSERT(offset >= (char *)dup - (char *)hdr); ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr); @@ -829,9 +930,10 @@ xfs_dir2_data_use_free( /* * Look up the entry in the bestfree table. */ - dfp = xfs_dir2_data_freefind(hdr, dup); oldlen = be16_to_cpu(dup->length); - ASSERT(dfp || oldlen <= be16_to_cpu(hdr->bestfree[2].length)); + bf = args->dp->d_ops->data_bestfree_p(hdr); + dfp = xfs_dir2_data_freefind(hdr, bf, dup); + ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length)); /* * Check for alignment with front and back of the entry. */ @@ -845,9 +947,10 @@ xfs_dir2_data_use_free( */ if (matchfront && matchback) { if (dfp) { - needscan = (hdr->bestfree[2].offset != 0); + needscan = (bf[2].offset != 0); if (!needscan) - xfs_dir2_data_freeremove(hdr, dfp, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, + needlogp); } } /* @@ -860,13 +963,14 @@ xfs_dir2_data_use_free( newdup->length = cpu_to_be16(oldlen - len); *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, newdup); + xfs_dir2_data_log_unused(args, bp, newdup); /* * If it was in the table, remove it and add the new one. */ if (dfp) { - xfs_dir2_data_freeremove(hdr, dfp, needlogp); - dfp = xfs_dir2_data_freeinsert(hdr, newdup, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); + dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup, + needlogp); ASSERT(dfp != NULL); ASSERT(dfp->length == newdup->length); ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr); @@ -875,7 +979,7 @@ xfs_dir2_data_use_free( * that means we don't know if there was a better * choice for the last slot, or not. Rescan. */ - needscan = dfp == &hdr->bestfree[2]; + needscan = dfp == &bf[2]; } } /* @@ -887,13 +991,14 @@ xfs_dir2_data_use_free( newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup); *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, newdup); + xfs_dir2_data_log_unused(args, bp, newdup); /* * If it was in the table, remove it and add the new one. */ if (dfp) { - xfs_dir2_data_freeremove(hdr, dfp, needlogp); - dfp = xfs_dir2_data_freeinsert(hdr, newdup, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); + dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup, + needlogp); ASSERT(dfp != NULL); ASSERT(dfp->length == newdup->length); ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr); @@ -902,7 +1007,7 @@ xfs_dir2_data_use_free( * that means we don't know if there was a better * choice for the last slot, or not. Rescan. */ - needscan = dfp == &hdr->bestfree[2]; + needscan = dfp == &bf[2]; } } /* @@ -914,13 +1019,13 @@ xfs_dir2_data_use_free( newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup); *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, newdup); + xfs_dir2_data_log_unused(args, bp, newdup); newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len); newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length)); *xfs_dir2_data_unused_tag_p(newdup2) = cpu_to_be16((char *)newdup2 - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, newdup2); + xfs_dir2_data_log_unused(args, bp, newdup2); /* * If the old entry was in the table, we need to scan * if the 3rd entry was valid, since these entries @@ -930,11 +1035,13 @@ xfs_dir2_data_use_free( * the 2 new will work. */ if (dfp) { - needscan = (hdr->bestfree[2].length != 0); + needscan = (bf[2].length != 0); if (!needscan) { - xfs_dir2_data_freeremove(hdr, dfp, needlogp); - xfs_dir2_data_freeinsert(hdr, newdup, needlogp); - xfs_dir2_data_freeinsert(hdr, newdup2, + xfs_dir2_data_freeremove(hdr, bf, dfp, + needlogp); + xfs_dir2_data_freeinsert(hdr, bf, newdup, + needlogp); + xfs_dir2_data_freeinsert(hdr, bf, newdup2, needlogp); } } diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h deleted file mode 100644 index 07270981f48..00000000000 --- a/fs/xfs/xfs_dir2_format.h +++ /dev/null @@ -1,597 +0,0 @@ -/* - * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DIR2_FORMAT_H__ -#define __XFS_DIR2_FORMAT_H__ - -/* - * Directory version 2. - * - * There are 4 possible formats: - * - shortform - embedded into the inode - * - single block - data with embedded leaf at the end - * - multiple data blocks, single leaf+freeindex block - * - data blocks, node and leaf blocks (btree), freeindex blocks - * - * Note: many node blocks structures and constants are shared with the attr - * code and defined in xfs_da_btree.h. - */ - -#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: single block dirs */ -#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: multiblock dirs */ -#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F: free index blocks */ - -/* - * Byte offset in data block and shortform entry. - */ -typedef __uint16_t xfs_dir2_data_off_t; -#define NULLDATAOFF 0xffffU -typedef uint xfs_dir2_data_aoff_t; /* argument form */ - -/* - * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. - * Only need 16 bits, this is the byte offset into the single block form. - */ -typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t; - -/* - * Offset in data space of a data entry. - */ -typedef __uint32_t xfs_dir2_dataptr_t; -#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff) -#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0) - -/* - * Byte offset in a directory. - */ -typedef xfs_off_t xfs_dir2_off_t; - -/* - * Directory block number (logical dirblk in file) - */ -typedef __uint32_t xfs_dir2_db_t; - -/* - * Inode number stored as 8 8-bit values. - */ -typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; - -/* - * Inode number stored as 4 8-bit values. - * Works a lot of the time, when all the inode numbers in a directory - * fit in 32 bits. - */ -typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t; - -typedef union { - xfs_dir2_ino8_t i8; - xfs_dir2_ino4_t i4; -} xfs_dir2_inou_t; -#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) - -/* - * Directory layout when stored internal to an inode. - * - * Small directories are packed as tightly as possible so as to fit into the - * literal area of the inode. These "shortform" directories consist of a - * single xfs_dir2_sf_hdr header followed by zero or more xfs_dir2_sf_entry - * structures. Due the different inode number storage size and the variable - * length name field in the xfs_dir2_sf_entry all these structure are - * variable length, and the accessors in this file should be used to iterate - * over them. - */ -typedef struct xfs_dir2_sf_hdr { - __uint8_t count; /* count of entries */ - __uint8_t i8count; /* count of 8-byte inode #s */ - xfs_dir2_inou_t parent; /* parent dir inode number */ -} __arch_pack xfs_dir2_sf_hdr_t; - -typedef struct xfs_dir2_sf_entry { - __u8 namelen; /* actual name length */ - xfs_dir2_sf_off_t offset; /* saved offset */ - __u8 name[]; /* name, variable size */ - /* - * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a - * variable offset after the name. - */ -} __arch_pack xfs_dir2_sf_entry_t; - -static inline int xfs_dir2_sf_hdr_size(int i8count) -{ - return sizeof(struct xfs_dir2_sf_hdr) - - (i8count == 0) * - (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t)); -} - -static inline xfs_dir2_data_aoff_t -xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) -{ - return get_unaligned_be16(&sfep->offset.i); -} - -static inline void -xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) -{ - put_unaligned_be16(off, &sfep->offset.i); -} - -static inline int -xfs_dir2_sf_entsize(struct xfs_dir2_sf_hdr *hdr, int len) -{ - return sizeof(struct xfs_dir2_sf_entry) + /* namelen + offset */ - len + /* name */ - (hdr->i8count ? /* ino */ - sizeof(xfs_dir2_ino8_t) : - sizeof(xfs_dir2_ino4_t)); -} - -static inline struct xfs_dir2_sf_entry * -xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr) -{ - return (struct xfs_dir2_sf_entry *) - ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count)); -} - -static inline struct xfs_dir2_sf_entry * -xfs_dir2_sf_nextentry(struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep) -{ - return (struct xfs_dir2_sf_entry *) - ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen)); -} - - -/* - * Data block structures. - * - * A pure data block looks like the following drawing on disk: - * - * +-------------------------------------------------+ - * | xfs_dir2_data_hdr_t | - * +-------------------------------------------------+ - * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | - * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | - * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | - * | ... | - * +-------------------------------------------------+ - * | unused space | - * +-------------------------------------------------+ - * - * As all the entries are variable size structures the accessors below should - * be used to iterate over them. - * - * In addition to the pure data blocks for the data and node formats, - * most structures are also used for the combined data/freespace "block" - * format below. - */ - -#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */ -#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG) -#define XFS_DIR2_DATA_FREE_TAG 0xffff -#define XFS_DIR2_DATA_FD_COUNT 3 - -/* - * Directory address space divided into sections, - * spaces separated by 32GB. - */ -#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) -#define XFS_DIR2_DATA_SPACE 0 -#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) -#define XFS_DIR2_DATA_FIRSTDB(mp) \ - xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET) - -/* - * Offsets of . and .. in data space (always block 0) - */ -#define XFS_DIR2_DATA_DOT_OFFSET \ - ((xfs_dir2_data_aoff_t)sizeof(struct xfs_dir2_data_hdr)) -#define XFS_DIR2_DATA_DOTDOT_OFFSET \ - (XFS_DIR2_DATA_DOT_OFFSET + xfs_dir2_data_entsize(1)) -#define XFS_DIR2_DATA_FIRST_OFFSET \ - (XFS_DIR2_DATA_DOTDOT_OFFSET + xfs_dir2_data_entsize(2)) - -/* - * Describe a free area in the data block. - * - * The freespace will be formatted as a xfs_dir2_data_unused_t. - */ -typedef struct xfs_dir2_data_free { - __be16 offset; /* start of freespace */ - __be16 length; /* length of freespace */ -} xfs_dir2_data_free_t; - -/* - * Header for the data blocks. - * - * The code knows that XFS_DIR2_DATA_FD_COUNT is 3. - */ -typedef struct xfs_dir2_data_hdr { - __be32 magic; /* XFS_DIR2_DATA_MAGIC or */ - /* XFS_DIR2_BLOCK_MAGIC */ - xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT]; -} xfs_dir2_data_hdr_t; - -/* - * Active entry in a data block. - * - * Aligned to 8 bytes. After the variable length name field there is a - * 2 byte tag field, which can be accessed using xfs_dir2_data_entry_tag_p. - */ -typedef struct xfs_dir2_data_entry { - __be64 inumber; /* inode number */ - __u8 namelen; /* name length */ - __u8 name[]; /* name bytes, no null */ - /* __be16 tag; */ /* starting offset of us */ -} xfs_dir2_data_entry_t; - -/* - * Unused entry in a data block. - * - * Aligned to 8 bytes. Tag appears as the last 2 bytes and must be accessed - * using xfs_dir2_data_unused_tag_p. - */ -typedef struct xfs_dir2_data_unused { - __be16 freetag; /* XFS_DIR2_DATA_FREE_TAG */ - __be16 length; /* total free length */ - /* variable offset */ - __be16 tag; /* starting offset of us */ -} xfs_dir2_data_unused_t; - -/* - * Size of a data entry. - */ -static inline int xfs_dir2_data_entsize(int n) -{ - return (int)roundup(offsetof(struct xfs_dir2_data_entry, name[0]) + n + - (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN); -} - -/* - * Pointer to an entry's tag word. - */ -static inline __be16 * -xfs_dir2_data_entry_tag_p(struct xfs_dir2_data_entry *dep) -{ - return (__be16 *)((char *)dep + - xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16)); -} - -/* - * Pointer to a freespace's tag word. - */ -static inline __be16 * -xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup) -{ - return (__be16 *)((char *)dup + - be16_to_cpu(dup->length) - sizeof(__be16)); -} - -/* - * Leaf block structures. - * - * A pure leaf block looks like the following drawing on disk: - * - * +---------------------------+ - * | xfs_dir2_leaf_hdr_t | - * +---------------------------+ - * | xfs_dir2_leaf_entry_t | - * | xfs_dir2_leaf_entry_t | - * | xfs_dir2_leaf_entry_t | - * | xfs_dir2_leaf_entry_t | - * | ... | - * +---------------------------+ - * | xfs_dir2_data_off_t | - * | xfs_dir2_data_off_t | - * | xfs_dir2_data_off_t | - * | ... | - * +---------------------------+ - * | xfs_dir2_leaf_tail_t | - * +---------------------------+ - * - * The xfs_dir2_data_off_t members (bests) and tail are at the end of the block - * for single-leaf (magic = XFS_DIR2_LEAF1_MAGIC) blocks only, but not present - * for directories with separate leaf nodes and free space blocks - * (magic = XFS_DIR2_LEAFN_MAGIC). - * - * As all the entries are variable size structures the accessors below should - * be used to iterate over them. - */ - -/* - * Offset of the leaf/node space. First block in this space - * is the btree root. - */ -#define XFS_DIR2_LEAF_SPACE 1 -#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE) -#define XFS_DIR2_LEAF_FIRSTDB(mp) \ - xfs_dir2_byte_to_db(mp, XFS_DIR2_LEAF_OFFSET) - -/* - * Leaf block header. - */ -typedef struct xfs_dir2_leaf_hdr { - xfs_da_blkinfo_t info; /* header for da routines */ - __be16 count; /* count of entries */ - __be16 stale; /* count of stale entries */ -} xfs_dir2_leaf_hdr_t; - -/* - * Leaf block entry. - */ -typedef struct xfs_dir2_leaf_entry { - __be32 hashval; /* hash value of name */ - __be32 address; /* address of data entry */ -} xfs_dir2_leaf_entry_t; - -/* - * Leaf block tail. - */ -typedef struct xfs_dir2_leaf_tail { - __be32 bestcount; -} xfs_dir2_leaf_tail_t; - -/* - * Leaf block. - */ -typedef struct xfs_dir2_leaf { - xfs_dir2_leaf_hdr_t hdr; /* leaf header */ - xfs_dir2_leaf_entry_t ents[]; /* entries */ -} xfs_dir2_leaf_t; - -/* - * DB blocks here are logical directory block numbers, not filesystem blocks. - */ - -static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp) -{ - return (mp->m_dirblksize - (uint)sizeof(struct xfs_dir2_leaf_hdr)) / - (uint)sizeof(struct xfs_dir2_leaf_entry); -} - -/* - * Get address of the bestcount field in the single-leaf block. - */ -static inline struct xfs_dir2_leaf_tail * -xfs_dir2_leaf_tail_p(struct xfs_mount *mp, struct xfs_dir2_leaf *lp) -{ - return (struct xfs_dir2_leaf_tail *) - ((char *)lp + mp->m_dirblksize - - sizeof(struct xfs_dir2_leaf_tail)); -} - -/* - * Get address of the bests array in the single-leaf block. - */ -static inline __be16 * -xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp) -{ - return (__be16 *)ltp - be32_to_cpu(ltp->bestcount); -} - -/* - * Convert dataptr to byte in file space - */ -static inline xfs_dir2_off_t -xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) -{ - return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG; -} - -/* - * Convert byte in file space to dataptr. It had better be aligned. - */ -static inline xfs_dir2_dataptr_t -xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG); -} - -/* - * Convert byte in space to (DB) block - */ -static inline xfs_dir2_db_t -xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return (xfs_dir2_db_t) - (by >> (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)); -} - -/* - * Convert dataptr to a block number - */ -static inline xfs_dir2_db_t -xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) -{ - return xfs_dir2_byte_to_db(mp, xfs_dir2_dataptr_to_byte(mp, dp)); -} - -/* - * Convert byte in space to offset in a block - */ -static inline xfs_dir2_data_aoff_t -xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return (xfs_dir2_data_aoff_t)(by & - ((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) - 1)); -} - -/* - * Convert dataptr to a byte offset in a block - */ -static inline xfs_dir2_data_aoff_t -xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) -{ - return xfs_dir2_byte_to_off(mp, xfs_dir2_dataptr_to_byte(mp, dp)); -} - -/* - * Convert block and offset to byte in space - */ -static inline xfs_dir2_off_t -xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db, - xfs_dir2_data_aoff_t o) -{ - return ((xfs_dir2_off_t)db << - (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) + o; -} - -/* - * Convert block (DB) to block (dablk) - */ -static inline xfs_dablk_t -xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db) -{ - return (xfs_dablk_t)(db << mp->m_sb.sb_dirblklog); -} - -/* - * Convert byte in space to (DA) block - */ -static inline xfs_dablk_t -xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, by)); -} - -/* - * Convert block and offset to dataptr - */ -static inline xfs_dir2_dataptr_t -xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db, - xfs_dir2_data_aoff_t o) -{ - return xfs_dir2_byte_to_dataptr(mp, xfs_dir2_db_off_to_byte(mp, db, o)); -} - -/* - * Convert block (dablk) to block (DB) - */ -static inline xfs_dir2_db_t -xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da) -{ - return (xfs_dir2_db_t)(da >> mp->m_sb.sb_dirblklog); -} - -/* - * Convert block (dablk) to byte offset in space - */ -static inline xfs_dir2_off_t -xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da) -{ - return xfs_dir2_db_off_to_byte(mp, xfs_dir2_da_to_db(mp, da), 0); -} - -/* - * Free space block defintions for the node format. - */ - -/* - * Offset of the freespace index. - */ -#define XFS_DIR2_FREE_SPACE 2 -#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE) -#define XFS_DIR2_FREE_FIRSTDB(mp) \ - xfs_dir2_byte_to_db(mp, XFS_DIR2_FREE_OFFSET) - -typedef struct xfs_dir2_free_hdr { - __be32 magic; /* XFS_DIR2_FREE_MAGIC */ - __be32 firstdb; /* db of first entry */ - __be32 nvalid; /* count of valid entries */ - __be32 nused; /* count of used entries */ -} xfs_dir2_free_hdr_t; - -typedef struct xfs_dir2_free { - xfs_dir2_free_hdr_t hdr; /* block header */ - __be16 bests[]; /* best free counts */ - /* unused entries are -1 */ -} xfs_dir2_free_t; - -static inline int xfs_dir2_free_max_bests(struct xfs_mount *mp) -{ - return (mp->m_dirblksize - sizeof(struct xfs_dir2_free_hdr)) / - sizeof(xfs_dir2_data_off_t); -} - -/* - * Convert data space db to the corresponding free db. - */ -static inline xfs_dir2_db_t -xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) -{ - return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir2_free_max_bests(mp); -} - -/* - * Convert data space db to the corresponding index in a free db. - */ -static inline int -xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db) -{ - return db % xfs_dir2_free_max_bests(mp); -} - -/* - * Single block format. - * - * The single block format looks like the following drawing on disk: - * - * +-------------------------------------------------+ - * | xfs_dir2_data_hdr_t | - * +-------------------------------------------------+ - * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | - * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | - * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t : - * | ... | - * +-------------------------------------------------+ - * | unused space | - * +-------------------------------------------------+ - * | ... | - * | xfs_dir2_leaf_entry_t | - * | xfs_dir2_leaf_entry_t | - * +-------------------------------------------------+ - * | xfs_dir2_block_tail_t | - * +-------------------------------------------------+ - * - * As all the entries are variable size structures the accessors below should - * be used to iterate over them. - */ - -typedef struct xfs_dir2_block_tail { - __be32 count; /* count of leaf entries */ - __be32 stale; /* count of stale lf entries */ -} xfs_dir2_block_tail_t; - -/* - * Pointer to the leaf header embedded in a data block (1-block format) - */ -static inline struct xfs_dir2_block_tail * -xfs_dir2_block_tail_p(struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr) -{ - return ((struct xfs_dir2_block_tail *) - ((char *)hdr + mp->m_dirblksize)) - 1; -} - -/* - * Pointer to the leaf entries embedded in a data block (1-block format) - */ -static inline struct xfs_dir2_leaf_entry * -xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) -{ - return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count); -} - -#endif /* __XFS_DIR2_FORMAT_H__ */ diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 60cd2fa4e04..fb0aad4440c 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -17,113 +18,351 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_bmap.h" -#include "xfs_dir2_format.h" +#include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" /* * Local function declarations. */ +static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp, + int *indexp, struct xfs_buf **dbpp); +static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args, + struct xfs_buf *bp, int first, int last); +static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args, + struct xfs_buf *bp); + +/* + * Check the internal consistency of a leaf1 block. + * Pop an assert if something is wrong. + */ #ifdef DEBUG -static void xfs_dir2_leaf_check(struct xfs_inode *dp, struct xfs_buf *bp); +#define xfs_dir3_leaf_check(dp, bp) \ +do { \ + if (!xfs_dir3_leaf1_check((dp), (bp))) \ + ASSERT(0); \ +} while (0); + +STATIC bool +xfs_dir3_leaf1_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir3_icleaf_hdr leafhdr; + + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC) + return false; + + return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf); +} #else -#define xfs_dir2_leaf_check(dp, bp) +#define xfs_dir3_leaf_check(dp, bp) #endif -static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp, - int *indexp, struct xfs_buf **dbpp); -static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp, - int first, int last); -static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); -static void -xfs_dir2_leaf_verify( +bool +xfs_dir3_leaf_check_int( + struct xfs_mount *mp, + struct xfs_inode *dp, + struct xfs_dir3_icleaf_hdr *hdr, + struct xfs_dir2_leaf *leaf) +{ + struct xfs_dir2_leaf_entry *ents; + xfs_dir2_leaf_tail_t *ltp; + int stale; + int i; + const struct xfs_dir_ops *ops; + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_da_geometry *geo = mp->m_dir_geo; + + /* + * we can be passed a null dp here from a verifier, so we need to go the + * hard way to get them. + */ + ops = xfs_dir_get_ops(mp, dp); + + if (!hdr) { + ops->leaf_hdr_from_disk(&leafhdr, leaf); + hdr = &leafhdr; + } + + ents = ops->leaf_ents_p(leaf); + ltp = xfs_dir2_leaf_tail_p(geo, leaf); + + /* + * XXX (dgc): This value is not restrictive enough. + * Should factor in the size of the bests table as well. + * We can deduce a value for that from di_size. + */ + if (hdr->count > ops->leaf_max_ents(geo)) + return false; + + /* Leaves and bests don't overlap in leaf format. */ + if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR3_LEAF1_MAGIC) && + (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) + return false; + + /* Check hash value order, count stale entries. */ + for (i = stale = 0; i < hdr->count; i++) { + if (i + 1 < hdr->count) { + if (be32_to_cpu(ents[i].hashval) > + be32_to_cpu(ents[i + 1].hashval)) + return false; + } + if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + stale++; + } + if (hdr->stale != stale) + return false; + return true; +} + +/* + * We verify the magic numbers before decoding the leaf header so that on debug + * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due + * to incorrect magic numbers. + */ +static bool +xfs_dir3_leaf_verify( struct xfs_buf *bp, - __be16 magic) + __uint16_t magic) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_dir2_leaf_hdr *hdr = bp->b_addr; - int block_ok = 0; + struct xfs_dir2_leaf *leaf = bp->b_addr; + + ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + __uint16_t magic3; + + magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC + : XFS_DIR3_LEAFN_MAGIC; + + if (leaf3->info.hdr.magic != cpu_to_be16(magic3)) + return false; + if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else { + if (leaf->hdr.info.magic != cpu_to_be16(magic)) + return false; + } - block_ok = hdr->info.magic == magic; - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf); +} + +static void +__read_verify( + struct xfs_buf *bp, + __uint16_t magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dir3_leaf_verify(bp, magic)) xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +__write_verify( + struct xfs_buf *bp, + __uint16_t magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_leaf_verify(bp, magic)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF); } static void -xfs_dir2_leaf1_read_verify( +xfs_dir3_leaf1_read_verify( struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + __read_verify(bp, XFS_DIR2_LEAF1_MAGIC); } static void -xfs_dir2_leaf1_write_verify( +xfs_dir3_leaf1_write_verify( struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + __write_verify(bp, XFS_DIR2_LEAF1_MAGIC); } -void -xfs_dir2_leafn_read_verify( +static void +xfs_dir3_leafn_read_verify( struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + __read_verify(bp, XFS_DIR2_LEAFN_MAGIC); } -void -xfs_dir2_leafn_write_verify( +static void +xfs_dir3_leafn_write_verify( struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + __write_verify(bp, XFS_DIR2_LEAFN_MAGIC); } -static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = { - .verify_read = xfs_dir2_leaf1_read_verify, - .verify_write = xfs_dir2_leaf1_write_verify, +const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { + .verify_read = xfs_dir3_leaf1_read_verify, + .verify_write = xfs_dir3_leaf1_write_verify, }; -const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = { - .verify_read = xfs_dir2_leafn_read_verify, - .verify_write = xfs_dir2_leafn_write_verify, +const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { + .verify_read = xfs_dir3_leafn_read_verify, + .verify_write = xfs_dir3_leafn_write_verify, }; static int -xfs_dir2_leaf_read( +xfs_dir3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); + return err; } int -xfs_dir2_leafn_read( +xfs_dir3_leafn_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); + return err; +} + +/* + * Initialize a new leaf block, leaf1 or leafn magic accepted. + */ +static void +xfs_dir3_leaf_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *bp, + xfs_ino_t owner, + __uint16_t type) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + + ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + + memset(leaf3, 0, sizeof(*leaf3)); + + leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC) + ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) + : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); + leaf3->info.blkno = cpu_to_be64(bp->b_bn); + leaf3->info.owner = cpu_to_be64(owner); + uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid); + } else { + memset(leaf, 0, sizeof(*leaf)); + leaf->hdr.info.magic = cpu_to_be16(type); + } + + /* + * If it's a leaf-format directory initialize the tail. + * Caller is responsible for initialising the bests table. + */ + if (type == XFS_DIR2_LEAF1_MAGIC) { + struct xfs_dir2_leaf_tail *ltp; + + ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf); + ltp->bestcount = 0; + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF); + } else { + bp->b_ops = &xfs_dir3_leafn_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); + } +} + +int +xfs_dir3_leaf_get_buf( + xfs_da_args_t *args, + xfs_dir2_db_t bno, + struct xfs_buf **bpp, + __uint16_t magic) +{ + struct xfs_inode *dp = args->dp; + struct xfs_trans *tp = args->trans; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; + + ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); + ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) && + bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET)); + + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno), + -1, &bp, XFS_DATA_FORK); + if (error) + return error; + + xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic); + xfs_dir3_leaf_log_header(args, bp); + if (magic == XFS_DIR2_LEAF1_MAGIC) + xfs_dir3_leaf_log_tail(args, bp); + *bpp = bp; + return 0; } /* @@ -149,6 +388,9 @@ xfs_dir2_block_to_leaf( int needlog; /* need to log block header */ int needscan; /* need to rescan bestfree */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_data_free *bf; + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_block_to_leaf(args); @@ -163,70 +405,83 @@ xfs_dir2_block_to_leaf( if ((error = xfs_da_grow_inode(args, &blkno))) { return error; } - ldb = xfs_dir2_da_to_db(mp, blkno); - ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp)); + ldb = xfs_dir2_da_to_db(args->geo, blkno); + ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET)); /* * Initialize the leaf block, get a buffer for it. */ - if ((error = xfs_dir2_leaf_init(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC))) { + error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC); + if (error) return error; - } - ASSERT(lbp != NULL); + leaf = lbp->b_addr; hdr = dbp->b_addr; - xfs_dir2_data_check(dp, dbp); - btp = xfs_dir2_block_tail_p(mp, hdr); + xfs_dir3_data_check(dp, dbp); + btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); + bf = dp->d_ops->data_bestfree_p(hdr); + ents = dp->d_ops->leaf_ents_p(leaf); + /* * Set the counts in the leaf header. */ - leaf->hdr.count = cpu_to_be16(be32_to_cpu(btp->count)); - leaf->hdr.stale = cpu_to_be16(be32_to_cpu(btp->stale)); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + leafhdr.count = be32_to_cpu(btp->count); + leafhdr.stale = be32_to_cpu(btp->stale); + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); + /* * Could compact these but I think we always do the conversion * after squeezing out stale entries. */ - memcpy(leaf->ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, lbp, 0, be16_to_cpu(leaf->hdr.count) - 1); + memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1); needscan = 0; needlog = 1; /* * Make the space formerly occupied by the leaf entries and block * tail be free. */ - xfs_dir2_data_make_free(tp, dbp, + xfs_dir2_data_make_free(args, dbp, (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), - (xfs_dir2_data_aoff_t)((char *)hdr + mp->m_dirblksize - + (xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize - (char *)blp), &needlog, &needscan); /* * Fix up the block header, make it a data block. */ - dbp->b_ops = &xfs_dir2_data_buf_ops; - hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + dbp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF); + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) + hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + else + hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); + if (needscan) - xfs_dir2_data_freescan(mp, hdr, &needlog); + xfs_dir2_data_freescan(dp, hdr, &needlog); /* * Set up leaf tail and bests table. */ - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); ltp->bestcount = cpu_to_be32(1); bestsp = xfs_dir2_leaf_bests_p(ltp); - bestsp[0] = hdr->bestfree[0].length; + bestsp[0] = bf[0].length; /* * Log the data header and leaf bests table. */ if (needlog) - xfs_dir2_data_log_header(tp, dbp); - xfs_dir2_leaf_check(dp, lbp); - xfs_dir2_data_check(dp, dbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, 0); + xfs_dir2_data_log_header(args, dbp); + xfs_dir3_leaf_check(dp, lbp); + xfs_dir3_data_check(dp, dbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, 0); return 0; } STATIC void -xfs_dir2_leaf_find_stale( - struct xfs_dir2_leaf *leaf, +xfs_dir3_leaf_find_stale( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int index, int *lowstale, int *highstale) @@ -235,7 +490,7 @@ xfs_dir2_leaf_find_stale( * Find the first stale entry before our index, if any. */ for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) { - if (leaf->ents[*lowstale].address == + if (ents[*lowstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) break; } @@ -245,10 +500,8 @@ xfs_dir2_leaf_find_stale( * Stop if the result would require moving more entries than using * lowstale. */ - for (*highstale = index; - *highstale < be16_to_cpu(leaf->hdr.count); - ++*highstale) { - if (leaf->ents[*highstale].address == + for (*highstale = index; *highstale < leafhdr->count; ++*highstale) { + if (ents[*highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) break; if (*lowstale >= 0 && index - *lowstale <= *highstale - index) @@ -257,8 +510,9 @@ xfs_dir2_leaf_find_stale( } struct xfs_dir2_leaf_entry * -xfs_dir2_leaf_find_entry( - xfs_dir2_leaf_t *leaf, /* leaf structure */ +xfs_dir3_leaf_find_entry( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int index, /* leaf table position */ int compact, /* need to compact leaves */ int lowstale, /* index of prev stale leaf */ @@ -266,7 +520,7 @@ xfs_dir2_leaf_find_entry( int *lfloglow, /* low leaf logging index */ int *lfloghigh) /* high leaf logging index */ { - if (!leaf->hdr.stale) { + if (!leafhdr->stale) { xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ /* @@ -274,18 +528,16 @@ xfs_dir2_leaf_find_entry( * * If there are no stale entries, just insert a hole at index. */ - lep = &leaf->ents[index]; - if (index < be16_to_cpu(leaf->hdr.count)) + lep = &ents[index]; + if (index < leafhdr->count) memmove(lep + 1, lep, - (be16_to_cpu(leaf->hdr.count) - index) * - sizeof(*lep)); + (leafhdr->count - index) * sizeof(*lep)); /* * Record low and high logging indices for the leaf. */ *lfloglow = index; - *lfloghigh = be16_to_cpu(leaf->hdr.count); - be16_add_cpu(&leaf->hdr.count, 1); + *lfloghigh = leafhdr->count++; return lep; } @@ -299,16 +551,17 @@ xfs_dir2_leaf_find_entry( * entries before and after our insertion point. */ if (compact == 0) - xfs_dir2_leaf_find_stale(leaf, index, &lowstale, &highstale); + xfs_dir3_leaf_find_stale(leafhdr, ents, index, + &lowstale, &highstale); /* * If the low one is better, use it. */ if (lowstale >= 0 && - (highstale == be16_to_cpu(leaf->hdr.count) || + (highstale == leafhdr->count || index - lowstale - 1 < highstale - index)) { ASSERT(index - lowstale - 1 >= 0); - ASSERT(leaf->ents[lowstale].address == + ASSERT(ents[lowstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)); /* @@ -316,37 +569,34 @@ xfs_dir2_leaf_find_entry( * for the new entry. */ if (index - lowstale - 1 > 0) { - memmove(&leaf->ents[lowstale], - &leaf->ents[lowstale + 1], + memmove(&ents[lowstale], &ents[lowstale + 1], (index - lowstale - 1) * - sizeof(xfs_dir2_leaf_entry_t)); + sizeof(xfs_dir2_leaf_entry_t)); } *lfloglow = MIN(lowstale, *lfloglow); *lfloghigh = MAX(index - 1, *lfloghigh); - be16_add_cpu(&leaf->hdr.stale, -1); - return &leaf->ents[index - 1]; + leafhdr->stale--; + return &ents[index - 1]; } /* * The high one is better, so use that one. */ ASSERT(highstale - index >= 0); - ASSERT(leaf->ents[highstale].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)); + ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)); /* * Copy entries down to cover the stale entry and make room for the * new entry. */ if (highstale - index > 0) { - memmove(&leaf->ents[index + 1], - &leaf->ents[index], + memmove(&ents[index + 1], &ents[index], (highstale - index) * sizeof(xfs_dir2_leaf_entry_t)); } *lfloglow = MIN(index, *lfloglow); *lfloghigh = MAX(highstale, *lfloghigh); - be16_add_cpu(&leaf->hdr.stale, -1); - return &leaf->ents[index]; + leafhdr->stale--; + return &ents[index]; } /* @@ -383,6 +633,9 @@ xfs_dir2_leaf_addname( __be16 *tagp; /* end of data entry */ xfs_trans_t *tp; /* transaction pointer */ xfs_dir2_db_t use_block; /* data block number */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_addname(args); @@ -390,7 +643,7 @@ xfs_dir2_leaf_addname( tp = args->trans; mp = dp->i_mount; - error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); + error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); if (error) return error; @@ -402,21 +655,24 @@ xfs_dir2_leaf_addname( */ index = xfs_dir2_leaf_search_hash(args, lbp); leaf = lbp->b_addr; - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); - length = xfs_dir2_data_entsize(args->namelen); + length = dp->d_ops->data_entsize(args->namelen); + /* * See if there are any entries with the same hash value * and space in their block for the new entry. * This is good because it puts multiple same-hash value entries * in a data block, improving the lookup of those entries. */ - for (use_block = -1, lep = &leaf->ents[index]; - index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval; + for (use_block = -1, lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; index++, lep++) { if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) continue; - i = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); ASSERT(i < be32_to_cpu(ltp->bestcount)); ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF)); if (be16_to_cpu(bestsp[i]) >= length) { @@ -445,7 +701,7 @@ xfs_dir2_leaf_addname( * How many bytes do we need in the leaf block? */ needbytes = 0; - if (!leaf->hdr.stale) + if (!leafhdr.stale) needbytes += sizeof(xfs_dir2_leaf_entry_t); if (use_block == -1) needbytes += sizeof(xfs_dir2_data_off_t); @@ -460,16 +716,15 @@ xfs_dir2_leaf_addname( * If we don't have enough free bytes but we can make enough * by compacting out stale entries, we'll do that. */ - if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < - needbytes && be16_to_cpu(leaf->hdr.stale) > 1) { + if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes && + leafhdr.stale > 1) compact = 1; - } + /* * Otherwise if we don't have enough free bytes we need to * convert to node form. */ - else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu( - leaf->hdr.count)] < needbytes) { + else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) { /* * Just checking or no space reservation, give up. */ @@ -517,15 +772,15 @@ xfs_dir2_leaf_addname( * point later. */ if (compact) { - xfs_dir2_leaf_compact_x1(lbp, &index, &lowstale, &highstale, - &lfloglow, &lfloghigh); + xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale, + &highstale, &lfloglow, &lfloghigh); } /* * There are stale entries, so we'll need log-low and log-high * impossibly bad values later. */ - else if (be16_to_cpu(leaf->hdr.stale)) { - lfloglow = be16_to_cpu(leaf->hdr.count); + else if (leafhdr.stale) { + lfloglow = leafhdr.count; lfloghigh = -1; } /* @@ -544,7 +799,7 @@ xfs_dir2_leaf_addname( /* * Initialize the block. */ - if ((error = xfs_dir2_data_init(args, use_block, &dbp))) { + if ((error = xfs_dir3_data_init(args, use_block, &dbp))) { xfs_trans_brelse(tp, lbp); return error; } @@ -557,43 +812,46 @@ xfs_dir2_leaf_addname( memmove(&bestsp[0], &bestsp[1], be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0])); be32_add_cpu(<p->bestcount, 1); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, + be32_to_cpu(ltp->bestcount) - 1); } /* * If we're filling in a previously empty block just log it. */ else - xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block); + xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block); hdr = dbp->b_addr; - bestsp[use_block] = hdr->bestfree[0].length; + bf = dp->d_ops->data_bestfree_p(hdr); + bestsp[use_block] = bf[0].length; grown = 1; } else { /* * Already had space in some data block. * Just read that one in. */ - error = xfs_dir2_data_read(tp, dp, - xfs_dir2_db_to_da(mp, use_block), - -1, &dbp); + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, use_block), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; } hdr = dbp->b_addr; + bf = dp->d_ops->data_bestfree_p(hdr); grown = 0; } /* * Point to the biggest freespace in our data block. */ dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + be16_to_cpu(hdr->bestfree[0].offset)); + ((char *)hdr + be16_to_cpu(bf[0].offset)); ASSERT(be16_to_cpu(dup->length) >= length); needscan = needlog = 0; /* * Mark the initial part of our freespace in use for the new entry. */ - xfs_dir2_data_use_free(tp, dbp, dup, + xfs_dir2_data_use_free(args, dbp, dup, (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length, &needlog, &needscan); /* @@ -603,117 +861,78 @@ xfs_dir2_leaf_addname( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, dep->namelen); - tagp = xfs_dir2_data_entry_tag_p(dep); + dp->d_ops->data_put_ftype(dep, args->filetype); + tagp = dp->d_ops->data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); /* * Need to scan fix up the bestfree table. */ if (needscan) - xfs_dir2_data_freescan(mp, hdr, &needlog); + xfs_dir2_data_freescan(dp, hdr, &needlog); /* * Need to log the data block's header. */ if (needlog) - xfs_dir2_data_log_header(tp, dbp); - xfs_dir2_data_log_entry(tp, dbp, dep); + xfs_dir2_data_log_header(args, dbp); + xfs_dir2_data_log_entry(args, dbp, dep); /* * If the bests table needs to be changed, do it. * Log the change unless we've already done that. */ - if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(hdr->bestfree[0].length)) { - bestsp[use_block] = hdr->bestfree[0].length; + if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) { + bestsp[use_block] = bf[0].length; if (!grown) - xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block); + xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block); } - lep = xfs_dir2_leaf_find_entry(leaf, index, compact, lowstale, + lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale, highstale, &lfloglow, &lfloghigh); /* * Fill in the new leaf entry. */ lep->hashval = cpu_to_be32(args->hashval); - lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, use_block, + lep->address = cpu_to_be32( + xfs_dir2_db_off_to_dataptr(args->geo, use_block, be16_to_cpu(*tagp))); /* * Log the leaf fields and give up the buffers. */ - xfs_dir2_leaf_log_header(tp, lbp); - xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh); - xfs_dir2_leaf_check(dp, lbp); - xfs_dir2_data_check(dp, dbp); + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); + xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh); + xfs_dir3_leaf_check(dp, lbp); + xfs_dir3_data_check(dp, dbp); return 0; } -#ifdef DEBUG -/* - * Check the internal consistency of a leaf1 block. - * Pop an assert if something is wrong. - */ -STATIC void -xfs_dir2_leaf_check( - struct xfs_inode *dp, /* incore directory inode */ - struct xfs_buf *bp) /* leaf's buffer */ -{ - int i; /* leaf index */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ - xfs_mount_t *mp; /* filesystem mount point */ - int stale; /* count of stale leaves */ - - leaf = bp->b_addr; - mp = dp->i_mount; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); - /* - * This value is not restrictive enough. - * Should factor in the size of the bests table as well. - * We can deduce a value for that from di_size. - */ - ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); - ltp = xfs_dir2_leaf_tail_p(mp, leaf); - /* - * Leaves and bests don't overlap. - */ - ASSERT((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <= - (char *)xfs_dir2_leaf_bests_p(ltp)); - /* - * Check hash value order, count stale entries. - */ - for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) { - if (i + 1 < be16_to_cpu(leaf->hdr.count)) - ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= - be32_to_cpu(leaf->ents[i + 1].hashval)); - if (leaf->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) - stale++; - } - ASSERT(be16_to_cpu(leaf->hdr.stale) == stale); -} -#endif /* DEBUG */ - /* * Compact out any stale entries in the leaf. * Log the header and changed leaf entries, if any. */ void -xfs_dir2_leaf_compact( +xfs_dir3_leaf_compact( xfs_da_args_t *args, /* operation arguments */ + struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp) /* leaf buffer */ { int from; /* source leaf index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ int loglow; /* first leaf entry to log */ int to; /* target leaf index */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_inode *dp = args->dp; leaf = bp->b_addr; - if (!leaf->hdr.stale) { + if (!leafhdr->stale) return; - } + /* * Compress out the stale entries in place. */ - for (from = to = 0, loglow = -1; from < be16_to_cpu(leaf->hdr.count); from++) { - if (leaf->ents[from].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + ents = dp->d_ops->leaf_ents_p(leaf); + for (from = to = 0, loglow = -1; from < leafhdr->count; from++) { + if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) continue; /* * Only actually copy the entries that are different. @@ -721,19 +940,21 @@ xfs_dir2_leaf_compact( if (from > to) { if (loglow == -1) loglow = to; - leaf->ents[to] = leaf->ents[from]; + ents[to] = ents[from]; } to++; } /* * Update and log the header, log the leaf entries. */ - ASSERT(be16_to_cpu(leaf->hdr.stale) == from - to); - be16_add_cpu(&leaf->hdr.count, -(be16_to_cpu(leaf->hdr.stale))); - leaf->hdr.stale = 0; - xfs_dir2_leaf_log_header(args->trans, bp); + ASSERT(leafhdr->stale == from - to); + leafhdr->count -= leafhdr->stale; + leafhdr->stale = 0; + + dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr); + xfs_dir3_leaf_log_header(args, bp); if (loglow != -1) - xfs_dir2_leaf_log_ents(args->trans, bp, loglow, to - 1); + xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1); } /* @@ -745,8 +966,9 @@ xfs_dir2_leaf_compact( * and leaf logging indices. */ void -xfs_dir2_leaf_compact_x1( - struct xfs_buf *bp, /* leaf buffer */ +xfs_dir3_leaf_compact_x1( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int *indexp, /* insertion index */ int *lowstalep, /* out: stale entry before us */ int *highstalep, /* out: stale entry after us */ @@ -757,22 +979,20 @@ xfs_dir2_leaf_compact_x1( int highstale; /* stale entry at/after index */ int index; /* insertion index */ int keepstale; /* source index of kept stale */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ int lowstale; /* stale entry before index */ int newindex=0; /* new insertion index */ int to; /* destination copy index */ - leaf = bp->b_addr; - ASSERT(be16_to_cpu(leaf->hdr.stale) > 1); + ASSERT(leafhdr->stale > 1); index = *indexp; - xfs_dir2_leaf_find_stale(leaf, index, &lowstale, &highstale); + xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale); /* * Pick the better of lowstale and highstale. */ if (lowstale >= 0 && - (highstale == be16_to_cpu(leaf->hdr.count) || + (highstale == leafhdr->count || index - lowstale <= highstale - index)) keepstale = lowstale; else @@ -781,15 +1001,14 @@ xfs_dir2_leaf_compact_x1( * Copy the entries in place, removing all the stale entries * except keepstale. */ - for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) { + for (from = to = 0; from < leafhdr->count; from++) { /* * Notice the new value of index. */ if (index == from) newindex = to; if (from != keepstale && - leaf->ents[from].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { + ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { if (from == to) *lowlogp = to; continue; @@ -803,7 +1022,7 @@ xfs_dir2_leaf_compact_x1( * Copy only the entries that have moved. */ if (from > to) - leaf->ents[to] = leaf->ents[from]; + ents[to] = ents[from]; to++; } ASSERT(from > to); @@ -817,8 +1036,8 @@ xfs_dir2_leaf_compact_x1( /* * Adjust the leaf header values. */ - be16_add_cpu(&leaf->hdr.count, -(from - to)); - leaf->hdr.stale = cpu_to_be16(1); + leafhdr->count -= from - to; + leafhdr->stale = 1; /* * Remember the low/high stale value only in the "right" * direction. @@ -826,479 +1045,35 @@ xfs_dir2_leaf_compact_x1( if (lowstale >= newindex) lowstale = -1; else - highstale = be16_to_cpu(leaf->hdr.count); - *highlogp = be16_to_cpu(leaf->hdr.count) - 1; + highstale = leafhdr->count; + *highlogp = leafhdr->count - 1; *lowstalep = lowstale; *highstalep = highstale; } -struct xfs_dir2_leaf_map_info { - xfs_extlen_t map_blocks; /* number of fsbs in map */ - xfs_dablk_t map_off; /* last mapped file offset */ - int map_size; /* total entries in *map */ - int map_valid; /* valid entries in *map */ - int nmap; /* mappings to ask xfs_bmapi */ - xfs_dir2_db_t curdb; /* db for current block */ - int ra_current; /* number of read-ahead blks */ - int ra_index; /* *map index for read-ahead */ - int ra_offset; /* map entry offset for ra */ - int ra_want; /* readahead count wanted */ - struct xfs_bmbt_irec map[]; /* map vector for blocks */ -}; - -STATIC int -xfs_dir2_leaf_readbuf( - struct xfs_inode *dp, - size_t bufsize, - struct xfs_dir2_leaf_map_info *mip, - xfs_dir2_off_t *curoff, - struct xfs_buf **bpp) -{ - struct xfs_mount *mp = dp->i_mount; - struct xfs_buf *bp = *bpp; - struct xfs_bmbt_irec *map = mip->map; - int error = 0; - int length; - int i; - int j; - - /* - * If we have a buffer, we need to release it and - * take it out of the mapping. - */ - - if (bp) { - xfs_trans_brelse(NULL, bp); - bp = NULL; - mip->map_blocks -= mp->m_dirblkfsbs; - /* - * Loop to get rid of the extents for the - * directory block. - */ - for (i = mp->m_dirblkfsbs; i > 0; ) { - j = min_t(int, map->br_blockcount, i); - map->br_blockcount -= j; - map->br_startblock += j; - map->br_startoff += j; - /* - * If mapping is done, pitch it from - * the table. - */ - if (!map->br_blockcount && --mip->map_valid) - memmove(&map[0], &map[1], - sizeof(map[0]) * mip->map_valid); - i -= j; - } - } - - /* - * Recalculate the readahead blocks wanted. - */ - mip->ra_want = howmany(bufsize + mp->m_dirblksize, - mp->m_sb.sb_blocksize) - 1; - ASSERT(mip->ra_want >= 0); - - /* - * If we don't have as many as we want, and we haven't - * run out of data blocks, get some more mappings. - */ - if (1 + mip->ra_want > mip->map_blocks && - mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) { - /* - * Get more bmaps, fill in after the ones - * we already have in the table. - */ - mip->nmap = mip->map_size - mip->map_valid; - error = xfs_bmapi_read(dp, mip->map_off, - xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) - - mip->map_off, - &map[mip->map_valid], &mip->nmap, 0); - - /* - * Don't know if we should ignore this or try to return an - * error. The trouble with returning errors is that readdir - * will just stop without actually passing the error through. - */ - if (error) - goto out; /* XXX */ - - /* - * If we got all the mappings we asked for, set the final map - * offset based on the last bmap value received. Otherwise, - * we've reached the end. - */ - if (mip->nmap == mip->map_size - mip->map_valid) { - i = mip->map_valid + mip->nmap - 1; - mip->map_off = map[i].br_startoff + map[i].br_blockcount; - } else - mip->map_off = xfs_dir2_byte_to_da(mp, - XFS_DIR2_LEAF_OFFSET); - - /* - * Look for holes in the mapping, and eliminate them. Count up - * the valid blocks. - */ - for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) { - if (map[i].br_startblock == HOLESTARTBLOCK) { - mip->nmap--; - length = mip->map_valid + mip->nmap - i; - if (length) - memmove(&map[i], &map[i + 1], - sizeof(map[i]) * length); - } else { - mip->map_blocks += map[i].br_blockcount; - i++; - } - } - mip->map_valid += mip->nmap; - } - - /* - * No valid mappings, so no more data blocks. - */ - if (!mip->map_valid) { - *curoff = xfs_dir2_da_to_byte(mp, mip->map_off); - goto out; - } - - /* - * Read the directory block starting at the first mapping. - */ - mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); - error = xfs_dir2_data_read(NULL, dp, map->br_startoff, - map->br_blockcount >= mp->m_dirblkfsbs ? - XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp); - - /* - * Should just skip over the data block instead of giving up. - */ - if (error) - goto out; /* XXX */ - - /* - * Adjust the current amount of read-ahead: we just read a block that - * was previously ra. - */ - if (mip->ra_current) - mip->ra_current -= mp->m_dirblkfsbs; - - /* - * Do we need more readahead? - */ - for (mip->ra_index = mip->ra_offset = i = 0; - mip->ra_want > mip->ra_current && i < mip->map_blocks; - i += mp->m_dirblkfsbs) { - ASSERT(mip->ra_index < mip->map_valid); - /* - * Read-ahead a contiguous directory block. - */ - if (i > mip->ra_current && - map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { - xfs_dir2_data_readahead(NULL, dp, - map[mip->ra_index].br_startoff + mip->ra_offset, - XFS_FSB_TO_DADDR(mp, - map[mip->ra_index].br_startblock + - mip->ra_offset)); - mip->ra_current = i; - } - - /* - * Read-ahead a non-contiguous directory block. This doesn't - * use our mapping, but this is a very rare case. - */ - else if (i > mip->ra_current) { - xfs_dir2_data_readahead(NULL, dp, - map[mip->ra_index].br_startoff + - mip->ra_offset, -1); - mip->ra_current = i; - } - - /* - * Advance offset through the mapping table. - */ - for (j = 0; j < mp->m_dirblkfsbs; j++) { - /* - * The rest of this extent but not more than a dir - * block. - */ - length = min_t(int, mp->m_dirblkfsbs, - map[mip->ra_index].br_blockcount - - mip->ra_offset); - j += length; - mip->ra_offset += length; - - /* - * Advance to the next mapping if this one is used up. - */ - if (mip->ra_offset == map[mip->ra_index].br_blockcount) { - mip->ra_offset = 0; - mip->ra_index++; - } - } - } - -out: - *bpp = bp; - return error; -} - -/* - * Getdents (readdir) for leaf and node directories. - * This reads the data blocks only, so is the same for both forms. - */ -int /* error */ -xfs_dir2_leaf_getdents( - xfs_inode_t *dp, /* incore directory inode */ - void *dirent, - size_t bufsize, - xfs_off_t *offset, - filldir_t filldir) -{ - struct xfs_buf *bp = NULL; /* data block buffer */ - xfs_dir2_data_hdr_t *hdr; /* data block header */ - xfs_dir2_data_entry_t *dep; /* data entry */ - xfs_dir2_data_unused_t *dup; /* unused entry */ - int error = 0; /* error return value */ - int length; /* temporary length value */ - xfs_mount_t *mp; /* filesystem mount point */ - int byteoff; /* offset in current block */ - xfs_dir2_off_t curoff; /* current overall offset */ - xfs_dir2_off_t newoff; /* new curoff after new blk */ - char *ptr = NULL; /* pointer to current data */ - struct xfs_dir2_leaf_map_info *map_info; - - /* - * If the offset is at or past the largest allowed value, - * give up right away. - */ - if (*offset >= XFS_DIR2_MAX_DATAPTR) - return 0; - - mp = dp->i_mount; - - /* - * Set up to bmap a number of blocks based on the caller's - * buffer size, the directory block size, and the filesystem - * block size. - */ - length = howmany(bufsize + mp->m_dirblksize, - mp->m_sb.sb_blocksize); - map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + - (length * sizeof(struct xfs_bmbt_irec)), - KM_SLEEP); - map_info->map_size = length; - - /* - * Inside the loop we keep the main offset value as a byte offset - * in the directory file. - */ - curoff = xfs_dir2_dataptr_to_byte(mp, *offset); - - /* - * Force this conversion through db so we truncate the offset - * down to get the start of the data block. - */ - map_info->map_off = xfs_dir2_db_to_da(mp, - xfs_dir2_byte_to_db(mp, curoff)); - - /* - * Loop over directory entries until we reach the end offset. - * Get more blocks and readahead as necessary. - */ - while (curoff < XFS_DIR2_LEAF_OFFSET) { - /* - * If we have no buffer, or we're off the end of the - * current buffer, need to get another one. - */ - if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) { - - error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info, - &curoff, &bp); - if (error || !map_info->map_valid) - break; - - /* - * Having done a read, we need to set a new offset. - */ - newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0); - /* - * Start of the current block. - */ - if (curoff < newoff) - curoff = newoff; - /* - * Make sure we're in the right block. - */ - else if (curoff > newoff) - ASSERT(xfs_dir2_byte_to_db(mp, curoff) == - map_info->curdb); - hdr = bp->b_addr; - xfs_dir2_data_check(dp, bp); - /* - * Find our position in the block. - */ - ptr = (char *)(hdr + 1); - byteoff = xfs_dir2_byte_to_off(mp, curoff); - /* - * Skip past the header. - */ - if (byteoff == 0) - curoff += (uint)sizeof(*hdr); - /* - * Skip past entries until we reach our offset. - */ - else { - while ((char *)ptr - (char *)hdr < byteoff) { - dup = (xfs_dir2_data_unused_t *)ptr; - - if (be16_to_cpu(dup->freetag) - == XFS_DIR2_DATA_FREE_TAG) { - - length = be16_to_cpu(dup->length); - ptr += length; - continue; - } - dep = (xfs_dir2_data_entry_t *)ptr; - length = - xfs_dir2_data_entsize(dep->namelen); - ptr += length; - } - /* - * Now set our real offset. - */ - curoff = - xfs_dir2_db_off_to_byte(mp, - xfs_dir2_byte_to_db(mp, curoff), - (char *)ptr - (char *)hdr); - if (ptr >= (char *)hdr + mp->m_dirblksize) { - continue; - } - } - } - /* - * We have a pointer to an entry. - * Is it a live one? - */ - dup = (xfs_dir2_data_unused_t *)ptr; - /* - * No, it's unused, skip over it. - */ - if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - length = be16_to_cpu(dup->length); - ptr += length; - curoff += length; - continue; - } - - dep = (xfs_dir2_data_entry_t *)ptr; - length = xfs_dir2_data_entsize(dep->namelen); - - if (filldir(dirent, (char *)dep->name, dep->namelen, - xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, - be64_to_cpu(dep->inumber), DT_UNKNOWN)) - break; - - /* - * Advance to next entry in the block. - */ - ptr += length; - curoff += length; - /* bufsize may have just been a guess; don't go negative */ - bufsize = bufsize > length ? bufsize - length : 0; - } - - /* - * All done. Set output offset value to current offset. - */ - if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR)) - *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; - else - *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; - kmem_free(map_info); - if (bp) - xfs_trans_brelse(NULL, bp); - return error; -} - -/* - * Initialize a new leaf block, leaf1 or leafn magic accepted. - */ -int -xfs_dir2_leaf_init( - xfs_da_args_t *args, /* operation arguments */ - xfs_dir2_db_t bno, /* directory block number */ - struct xfs_buf **bpp, /* out: leaf buffer */ - int magic) /* magic number for block */ -{ - struct xfs_buf *bp; /* leaf buffer */ - xfs_inode_t *dp; /* incore directory inode */ - int error; /* error return code */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - xfs_mount_t *mp; /* filesystem mount point */ - xfs_trans_t *tp; /* transaction pointer */ - - dp = args->dp; - ASSERT(dp != NULL); - tp = args->trans; - mp = dp->i_mount; - ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) && - bno < XFS_DIR2_FREE_FIRSTDB(mp)); - /* - * Get the buffer for the block. - */ - error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, - XFS_DATA_FORK); - if (error) - return error; - - /* - * Initialize the header. - */ - leaf = bp->b_addr; - leaf->hdr.info.magic = cpu_to_be16(magic); - leaf->hdr.info.forw = 0; - leaf->hdr.info.back = 0; - leaf->hdr.count = 0; - leaf->hdr.stale = 0; - xfs_dir2_leaf_log_header(tp, bp); - /* - * If it's a leaf-format directory initialize the tail. - * In this case our caller has the real bests table to copy into - * the block. - */ - if (magic == XFS_DIR2_LEAF1_MAGIC) { - bp->b_ops = &xfs_dir2_leaf1_buf_ops; - ltp = xfs_dir2_leaf_tail_p(mp, leaf); - ltp->bestcount = 0; - xfs_dir2_leaf_log_tail(tp, bp); - } else - bp->b_ops = &xfs_dir2_leafn_buf_ops; - *bpp = bp; - return 0; -} - /* * Log the bests entries indicated from a leaf1 block. */ static void -xfs_dir2_leaf_log_bests( - xfs_trans_t *tp, /* transaction pointer */ +xfs_dir3_leaf_log_bests( + struct xfs_da_args *args, struct xfs_buf *bp, /* leaf buffer */ int first, /* first entry to log */ int last) /* last entry to log */ { __be16 *firstb; /* pointer to first entry */ __be16 *lastb; /* pointer to last entry */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); - ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)); + + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); firstb = xfs_dir2_leaf_bests_p(ltp) + first; lastb = xfs_dir2_leaf_bests_p(ltp) + last; - xfs_trans_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf), + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)firstb - (char *)leaf), (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1)); } @@ -1306,22 +1081,27 @@ xfs_dir2_leaf_log_bests( * Log the leaf entries indicated from a leaf1 or leafn block. */ void -xfs_dir2_leaf_log_ents( - xfs_trans_t *tp, /* transaction pointer */ - struct xfs_buf *bp, /* leaf buffer */ - int first, /* first entry to log */ - int last) /* last entry to log */ +xfs_dir3_leaf_log_ents( + struct xfs_da_args *args, + struct xfs_buf *bp, + int first, + int last) { xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */ xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir2_leaf_entry *ents; - leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || - leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - firstlep = &leaf->ents[first]; - lastlep = &leaf->ents[last]; - xfs_trans_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf), + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); + + ents = args->dp->d_ops->leaf_ents_p(leaf); + firstlep = &ents[first]; + lastlep = &ents[last]; + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)firstlep - (char *)leaf), (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1)); } @@ -1329,37 +1109,41 @@ xfs_dir2_leaf_log_ents( * Log the header of the leaf1 or leafn block. */ void -xfs_dir2_leaf_log_header( - struct xfs_trans *tp, +xfs_dir3_leaf_log_header( + struct xfs_da_args *args, struct xfs_buf *bp) { - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; - leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || - leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - xfs_trans_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf), - (uint)(sizeof(leaf->hdr) - 1)); + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); + + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)&leaf->hdr - (char *)leaf), + args->dp->d_ops->leaf_hdr_size - 1); } /* * Log the tail of the leaf1 block. */ STATIC void -xfs_dir2_leaf_log_tail( - struct xfs_trans *tp, +xfs_dir3_leaf_log_tail( + struct xfs_da_args *args, struct xfs_buf *bp) { - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - xfs_mount_t *mp; /* filesystem mount point */ - mp = tp->t_mountp; - leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); - ltp = xfs_dir2_leaf_tail_p(mp, leaf); - xfs_trans_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), - (uint)(mp->m_dirblksize - 1)); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); + + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + xfs_trans_log_buf(args->trans, bp, (uint)((char *)ltp - (char *)leaf), + (uint)(args->geo->blksize - 1)); } /* @@ -1380,6 +1164,7 @@ xfs_dir2_leaf_lookup( xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leaf_lookup(args); @@ -1391,22 +1176,25 @@ xfs_dir2_leaf_lookup( } tp = args->trans; dp = args->dp; - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(dp, lbp); leaf = lbp->b_addr; + ents = dp->d_ops->leaf_ents_p(leaf); /* * Get to the leaf entry and contained data entry address. */ - lep = &leaf->ents[index]; + lep = &ents[index]; + /* * Point to the data entry. */ dep = (xfs_dir2_data_entry_t *) ((char *)dbp->b_addr + - xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); /* * Return the found inode number & CI name if appropriate */ args->inumber = be64_to_cpu(dep->inumber); + args->filetype = dp->d_ops->data_get_ftype(dep); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); xfs_trans_brelse(tp, dbp); xfs_trans_brelse(tp, lbp); @@ -1440,18 +1228,23 @@ xfs_dir2_leaf_lookup_int( xfs_trans_t *tp; /* transaction pointer */ xfs_dir2_db_t cidb = -1; /* case match data block no. */ enum xfs_dacmp cmp; /* name compare result */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; - error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); + error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); if (error) return error; *lbpp = lbp; leaf = lbp->b_addr; - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(dp, lbp); + ents = dp->d_ops->leaf_ents_p(leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + /* * Look for the first leaf entry with our hash value. */ @@ -1460,9 +1253,9 @@ xfs_dir2_leaf_lookup_int( * Loop over all the entries with the right hash value * looking to match the name. */ - for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && - be32_to_cpu(lep->hashval) == args->hashval; - lep++, index++) { + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { /* * Skip over stale leaf entries. */ @@ -1471,7 +1264,8 @@ xfs_dir2_leaf_lookup_int( /* * Get the new data block number. */ - newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + newdb = xfs_dir2_dataptr_to_db(args->geo, + be32_to_cpu(lep->address)); /* * If it's not the same as the old data block number, * need to pitch the old one and read the new one. @@ -1479,9 +1273,9 @@ xfs_dir2_leaf_lookup_int( if (newdb != curdb) { if (dbp) xfs_trans_brelse(tp, dbp); - error = xfs_dir2_data_read(tp, dp, - xfs_dir2_db_to_da(mp, newdb), - -1, &dbp); + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, newdb), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1492,7 +1286,8 @@ xfs_dir2_leaf_lookup_int( * Point to the data entry. */ dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr + - xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(lep->address))); /* * Compare name and if it's an exact match, return the index * and buffer. If it's the first case-insensitive match, store @@ -1520,9 +1315,9 @@ xfs_dir2_leaf_lookup_int( ASSERT(cidb != -1); if (cidb != curdb) { xfs_trans_brelse(tp, dbp); - error = xfs_dir2_data_read(tp, dp, - xfs_dir2_db_to_da(mp, cidb), - -1, &dbp); + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, cidb), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1566,6 +1361,9 @@ xfs_dir2_leaf_removename( int needscan; /* need to rescan data frees */ xfs_dir2_data_off_t oldbest; /* old value of best free */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_removename(args); @@ -1580,55 +1378,61 @@ xfs_dir2_leaf_removename( mp = dp->i_mount; leaf = lbp->b_addr; hdr = dbp->b_addr; - xfs_dir2_data_check(dp, dbp); + xfs_dir3_data_check(dp, dbp); + bf = dp->d_ops->data_bestfree_p(hdr); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); /* * Point to the leaf entry, use that to point to the data entry. */ - lep = &leaf->ents[index]; - db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + lep = &ents[index]; + db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); needscan = needlog = 0; - oldbest = be16_to_cpu(hdr->bestfree[0].length); - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + oldbest = be16_to_cpu(bf[0].length); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); ASSERT(be16_to_cpu(bestsp[db]) == oldbest); /* * Mark the former data entry unused. */ - xfs_dir2_data_make_free(tp, dbp, + xfs_dir2_data_make_free(args, dbp, (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), - xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); + dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); /* * We just mark the leaf entry stale by putting a null in it. */ - be16_add_cpu(&leaf->hdr.stale, 1); - xfs_dir2_leaf_log_header(tp, lbp); + leafhdr.stale++; + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); + lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); - xfs_dir2_leaf_log_ents(tp, lbp, index, index); + xfs_dir3_leaf_log_ents(args, lbp, index, index); + /* * Scan the freespace in the data block again if necessary, * log the data block header if necessary. */ if (needscan) - xfs_dir2_data_freescan(mp, hdr, &needlog); + xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, dbp); + xfs_dir2_data_log_header(args, dbp); /* * If the longest freespace in the data block has changed, * put the new value in the bests table and log that. */ - if (be16_to_cpu(hdr->bestfree[0].length) != oldbest) { - bestsp[db] = hdr->bestfree[0].length; - xfs_dir2_leaf_log_bests(tp, lbp, db, db); + if (be16_to_cpu(bf[0].length) != oldbest) { + bestsp[db] = bf[0].length; + xfs_dir3_leaf_log_bests(args, lbp, db, db); } - xfs_dir2_data_check(dp, dbp); + xfs_dir3_data_check(dp, dbp); /* * If the data block is now empty then get rid of the data block. */ - if (be16_to_cpu(hdr->bestfree[0].length) == - mp->m_dirblksize - (uint)sizeof(*hdr)) { - ASSERT(db != mp->m_dirdatablk); + if (be16_to_cpu(bf[0].length) == + args->geo->blksize - dp->d_ops->data_entry_offset) { + ASSERT(db != args->geo->datablk); if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { /* * Nope, can't get rid of it because it caused @@ -1638,7 +1442,7 @@ xfs_dir2_leaf_removename( */ if (error == ENOSPC && args->total == 0) error = 0; - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(dp, lbp); return error; } dbp = NULL; @@ -1661,18 +1465,19 @@ xfs_dir2_leaf_removename( memmove(&bestsp[db - i], bestsp, (be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp)); be32_add_cpu(<p->bestcount, -(db - i)); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, + be32_to_cpu(ltp->bestcount) - 1); } else bestsp[db] = cpu_to_be16(NULLDATAOFF); } /* * If the data block was not the first one, drop it. */ - else if (db != mp->m_dirdatablk) + else if (db != args->geo->datablk) dbp = NULL; - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(dp, lbp); /* * See if we can convert to block form. */ @@ -1695,6 +1500,7 @@ xfs_dir2_leaf_replace( xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leaf_replace(args); @@ -1706,24 +1512,26 @@ xfs_dir2_leaf_replace( } dp = args->dp; leaf = lbp->b_addr; + ents = dp->d_ops->leaf_ents_p(leaf); /* * Point to the leaf entry, get data address from it. */ - lep = &leaf->ents[index]; + lep = &ents[index]; /* * Point to the data entry. */ dep = (xfs_dir2_data_entry_t *) ((char *)dbp->b_addr + - xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); ASSERT(args->inumber != be64_to_cpu(dep->inumber)); /* * Put the new inode number in, log it. */ dep->inumber = cpu_to_be64(args->inumber); + dp->d_ops->data_put_ftype(dep, args->filetype); tp = args->trans; - xfs_dir2_data_log_entry(tp, dbp, dep); - xfs_dir2_leaf_check(dp, lbp); + xfs_dir2_data_log_entry(args, dbp, dep); + xfs_dir3_leaf_check(dp, lbp); xfs_trans_brelse(tp, lbp); return 0; } @@ -1745,17 +1553,18 @@ xfs_dir2_leaf_search_hash( xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ int mid=0; /* current leaf index */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; leaf = lbp->b_addr; -#ifndef __KERNEL__ - if (!leaf->hdr.count) - return 0; -#endif + ents = args->dp->d_ops->leaf_ents_p(leaf); + args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + /* * Note, the table cannot be empty, so we have to go through the loop. * Binary search the leaf entries looking for our hash value. */ - for (lep = leaf->ents, low = 0, high = be16_to_cpu(leaf->hdr.count) - 1, + for (lep = ents, low = 0, high = leafhdr.count - 1, hashwant = args->hashval; low <= high; ) { mid = (low + high) >> 1; @@ -1807,20 +1616,23 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. */ - error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp); + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db), + -1, &dbp); if (error) return error; leaf = lbp->b_addr; - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); #ifdef DEBUG { struct xfs_dir2_data_hdr *hdr = dbp->b_addr; + struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr); - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); - ASSERT(be16_to_cpu(hdr->bestfree[0].length) == - mp->m_dirblksize - (uint)sizeof(*hdr)); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); + ASSERT(be16_to_cpu(bf[0].length) == + args->geo->blksize - dp->d_ops->data_entry_offset); ASSERT(db == be32_to_cpu(ltp->bestcount) - 1); } #endif @@ -1839,23 +1651,29 @@ xfs_dir2_leaf_trim_data( bestsp = xfs_dir2_leaf_bests_p(ltp); be32_add_cpu(<p->bestcount, -1); memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp)); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); return 0; } static inline size_t -xfs_dir2_leaf_size( - struct xfs_dir2_leaf_hdr *hdr, +xfs_dir3_leaf_size( + struct xfs_dir3_icleaf_hdr *hdr, int counts) { - int entries; + int entries; + int hdrsize; + + entries = hdr->count - hdr->stale; + if (hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR2_LEAFN_MAGIC) + hdrsize = sizeof(struct xfs_dir2_leaf_hdr); + else + hdrsize = sizeof(struct xfs_dir3_leaf_hdr); - entries = be16_to_cpu(hdr->count) - be16_to_cpu(hdr->stale); - return sizeof(xfs_dir2_leaf_hdr_t) + - entries * sizeof(xfs_dir2_leaf_entry_t) + - counts * sizeof(xfs_dir2_data_off_t) + - sizeof(xfs_dir2_leaf_tail_t); + return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t) + + counts * sizeof(xfs_dir2_data_off_t) + + sizeof(xfs_dir2_leaf_tail_t); } /* @@ -1879,6 +1697,8 @@ xfs_dir2_node_to_leaf( xfs_mount_t *mp; /* filesystem mount point */ int rval; /* successful free trim? */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir3_icfree_hdr freehdr; /* * There's more than a leaf level in the btree, so there must @@ -1896,22 +1716,22 @@ xfs_dir2_node_to_leaf( /* * Get the last offset in the file. */ - if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) { + if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) { return error; } - fo -= mp->m_dirblkfsbs; + fo -= args->geo->fsbcount; /* * If there are freespace blocks other than the first one, * take this opportunity to remove trailing empty freespace blocks * that may have been left behind during no-space-reservation * operations. */ - while (fo > mp->m_dirfreeblk) { + while (fo > args->geo->freeblk) { if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) { return error; } if (rval) - fo -= mp->m_dirblkfsbs; + fo -= args->geo->fsbcount; else return 0; } @@ -1924,60 +1744,71 @@ xfs_dir2_node_to_leaf( /* * If it's not the single leaf block, give up. */ - if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize) + if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize) return 0; lbp = state->path.blk[0].bp; leaf = lbp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); + /* * Read the freespace block. */ - error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp); + error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp); if (error) return error; free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - ASSERT(!free->hdr.firstdb); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + + ASSERT(!freehdr.firstdb); /* * Now see if the leafn and free data will fit in a leaf1. * If not, release the buffer and give up. */ - if (xfs_dir2_leaf_size(&leaf->hdr, be32_to_cpu(free->hdr.nvalid)) > - mp->m_dirblksize) { + if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) { xfs_trans_brelse(tp, fbp); return 0; } /* * If the leaf has any stale entries in it, compress them out. - * The compact routine will log the header. */ - if (be16_to_cpu(leaf->hdr.stale)) - xfs_dir2_leaf_compact(args, lbp); - else - xfs_dir2_leaf_log_header(tp, lbp); + if (leafhdr.stale) + xfs_dir3_leaf_compact(args, &leafhdr, lbp); - lbp->b_ops = &xfs_dir2_leaf1_buf_ops; - leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); + lbp->b_ops = &xfs_dir3_leaf1_buf_ops; + xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF); + leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC) + ? XFS_DIR2_LEAF1_MAGIC + : XFS_DIR3_LEAF1_MAGIC; /* * Set up the leaf tail from the freespace block. */ - ltp = xfs_dir2_leaf_tail_p(mp, leaf); - ltp->bestcount = free->hdr.nvalid; + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + ltp->bestcount = cpu_to_be32(freehdr.nvalid); + /* * Set up the leaf bests table. */ - memcpy(xfs_dir2_leaf_bests_p(ltp), free->bests, - be32_to_cpu(ltp->bestcount) * sizeof(xfs_dir2_data_off_t)); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_check(dp, lbp); + memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free), + freehdr.nvalid * sizeof(xfs_dir2_data_off_t)); + + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_check(dp, lbp); + /* * Get rid of the freespace block. */ - error = xfs_dir2_shrink_inode(args, XFS_DIR2_FREE_FIRSTDB(mp), fbp); + error = xfs_dir2_shrink_inode(args, + xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET), + fbp); if (error) { /* * This can't fail here because it can only happen when diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 5980f9b7fa9..da43d304fca 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -17,35 +18,29 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_bmap.h" -#include "xfs_dir2_format.h" +#include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" /* * Function declarations. */ static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args, int index); -#ifdef DEBUG -static void xfs_dir2_leafn_check(struct xfs_inode *dp, struct xfs_buf *bp); -#else -#define xfs_dir2_leafn_check(dp, bp) -#endif -static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, struct xfs_buf *bp_s, - int start_s, struct xfs_buf *bp_d, - int start_d, int count); static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); @@ -55,52 +50,127 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, static int xfs_dir2_node_addname_int(xfs_da_args_t *args, xfs_da_state_blk_t *fblk); -static void -xfs_dir2_free_verify( +/* + * Check internal consistency of a leafn block. + */ +#ifdef DEBUG +#define xfs_dir3_leaf_check(dp, bp) \ +do { \ + if (!xfs_dir3_leafn_check((dp), (bp))) \ + ASSERT(0); \ +} while (0); + +static bool +xfs_dir3_leafn_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir3_icleaf_hdr leafhdr; + + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC) + return false; + + return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf); +} +#else +#define xfs_dir3_leaf_check(dp, bp) +#endif + +static bool +xfs_dir3_free_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_free_hdr *hdr = bp->b_addr; - int block_ok = 0; - block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC); - if (!block_ok) { - XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic", - XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC)) + return false; + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)) + return false; } + + /* XXX: should bounds check the xfs_dir3_icfree_hdr here */ + + return true; } static void -xfs_dir2_free_read_verify( +xfs_dir3_free_read_verify( struct xfs_buf *bp) { - xfs_dir2_free_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dir3_free_verify(bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); } static void -xfs_dir2_free_write_verify( +xfs_dir3_free_write_verify( struct xfs_buf *bp) { - xfs_dir2_free_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_free_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF); } -static const struct xfs_buf_ops xfs_dir2_free_buf_ops = { - .verify_read = xfs_dir2_free_read_verify, - .verify_write = xfs_dir2_free_write_verify, +const struct xfs_buf_ops xfs_dir3_free_buf_ops = { + .verify_read = xfs_dir3_free_read_verify, + .verify_write = xfs_dir3_free_write_verify, }; static int -__xfs_dir2_free_read( +__xfs_dir3_free_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, &xfs_dir2_free_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_free_buf_ops); + + /* try read returns without an error or *bpp if it lands in a hole */ + if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF); + return err; } int @@ -110,7 +180,7 @@ xfs_dir2_free_read( xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp); + return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp); } static int @@ -120,7 +190,50 @@ xfs_dir2_free_try_read( xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp); + return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp); +} + +static int +xfs_dir3_free_get_buf( + xfs_da_args_t *args, + xfs_dir2_db_t fbno, + struct xfs_buf **bpp) +{ + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; + struct xfs_dir3_icfree_hdr hdr; + + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno), + -1, &bp, XFS_DATA_FORK); + if (error) + return error; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF); + bp->b_ops = &xfs_dir3_free_buf_ops; + + /* + * Initialize the new block to be empty, and remember + * its first slot as our empty slot. + */ + memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr)); + memset(&hdr, 0, sizeof(hdr)); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; + + hdr.magic = XFS_DIR3_FREE_MAGIC; + + hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); + hdr3->hdr.owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid); + } else + hdr.magic = XFS_DIR2_FREE_MAGIC; + dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr); + *bpp = bp; + return 0; } /* @@ -128,19 +241,22 @@ xfs_dir2_free_try_read( */ STATIC void xfs_dir2_free_log_bests( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp, int first, /* first entry to log */ int last) /* last entry to log */ { xfs_dir2_free_t *free; /* freespace structure */ + __be16 *bests; free = bp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - xfs_trans_log_buf(tp, bp, - (uint)((char *)&free->bests[first] - (char *)free), - (uint)((char *)&free->bests[last] - (char *)free + - sizeof(free->bests[0]) - 1)); + bests = args->dp->d_ops->free_bests_p(free); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)&bests[first] - (char *)free), + (uint)((char *)&bests[last] - (char *)free + + sizeof(bests[0]) - 1)); } /* @@ -148,15 +264,18 @@ xfs_dir2_free_log_bests( */ static void xfs_dir2_free_log_header( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp) { +#ifdef DEBUG xfs_dir2_free_t *free; /* freespace structure */ free = bp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - xfs_trans_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free), - (uint)(sizeof(xfs_dir2_free_hdr_t) - 1)); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); +#endif + xfs_trans_log_buf(args->trans, bp, 0, + args->dp->d_ops->free_hdr_size - 1); } /* @@ -183,6 +302,7 @@ xfs_dir2_leaf_to_node( xfs_dir2_data_off_t off; /* freespace entry value */ __be16 *to; /* pointer to freespace entry */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icfree_hdr freehdr; trace_xfs_dir2_leaf_to_node(args); @@ -195,48 +315,57 @@ xfs_dir2_leaf_to_node( if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) { return error; } - ASSERT(fdb == XFS_DIR2_FREE_FIRSTDB(mp)); + ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET)); /* * Get the buffer for the new freespace block. */ - error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, - XFS_DATA_FORK); + error = xfs_dir3_free_get_buf(args, fdb, &fbp); if (error) return error; - fbp->b_ops = &xfs_dir2_free_buf_ops; free = fbp->b_addr; + dp->d_ops->free_hdr_from_disk(&freehdr, free); leaf = lbp->b_addr; - ltp = xfs_dir2_leaf_tail_p(mp, leaf); - /* - * Initialize the freespace block header. - */ - free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC); - free->hdr.firstdb = 0; - ASSERT(be32_to_cpu(ltp->bestcount) <= (uint)dp->i_d.di_size / mp->m_dirblksize); - free->hdr.nvalid = ltp->bestcount; + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + ASSERT(be32_to_cpu(ltp->bestcount) <= + (uint)dp->i_d.di_size / args->geo->blksize); + /* * Copy freespace entries from the leaf block to the new block. * Count active entries. */ - for (i = n = 0, from = xfs_dir2_leaf_bests_p(ltp), to = free->bests; - i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { + from = xfs_dir2_leaf_bests_p(ltp); + to = dp->d_ops->free_bests_p(free); + for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { if ((off = be16_to_cpu(*from)) != NULLDATAOFF) n++; *to = cpu_to_be16(off); } - free->hdr.nused = cpu_to_be32(n); - lbp->b_ops = &xfs_dir2_leafn_buf_ops; - leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); + /* + * Now initialize the freespace block header. + */ + freehdr.nused = n; + freehdr.nvalid = be32_to_cpu(ltp->bestcount); + + dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); + xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1); + xfs_dir2_free_log_header(args, fbp); /* - * Log everything. + * Converting the leaf to a leafnode is just a matter of changing the + * magic number and the ops. Do the change directly to the buffer as + * it's less work (and less code) than decoding the header to host + * format and back again. */ - xfs_dir2_leaf_log_header(tp, lbp); - xfs_dir2_free_log_header(tp, fbp); - xfs_dir2_free_log_bests(tp, fbp, 0, be32_to_cpu(free->hdr.nvalid) - 1); - xfs_dir2_leafn_check(dp, lbp); + if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)) + leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); + else + leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); + lbp->b_ops = &xfs_dir3_leafn_buf_ops; + xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF); + xfs_dir3_leaf_log_header(args, lbp); + xfs_dir3_leaf_check(dp, lbp); return 0; } @@ -260,6 +389,8 @@ xfs_dir2_leafn_add( int lowstale; /* previous stale entry */ xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leafn_add(args, index); @@ -267,6 +398,8 @@ xfs_dir2_leafn_add( mp = dp->i_mount; tp = args->trans; leaf = bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); /* * Quick check just to make sure we are not going to index @@ -282,15 +415,15 @@ xfs_dir2_leafn_add( * a compact. */ - if (be16_to_cpu(leaf->hdr.count) == xfs_dir2_max_leaf_ents(mp)) { - if (!leaf->hdr.stale) + if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) { + if (!leafhdr.stale) return XFS_ERROR(ENOSPC); - compact = be16_to_cpu(leaf->hdr.stale) > 1; + compact = leafhdr.stale > 1; } else compact = 0; - ASSERT(index == 0 || be32_to_cpu(leaf->ents[index - 1].hashval) <= args->hashval); - ASSERT(index == be16_to_cpu(leaf->hdr.count) || - be32_to_cpu(leaf->ents[index].hashval) >= args->hashval); + ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval); + ASSERT(index == leafhdr.count || + be32_to_cpu(ents[index].hashval) >= args->hashval); if (args->op_flags & XFS_DA_OP_JUSTCHECK) return 0; @@ -299,61 +432,52 @@ xfs_dir2_leafn_add( * Compact out all but one stale leaf entry. Leaves behind * the entry closest to index. */ - if (compact) { - xfs_dir2_leaf_compact_x1(bp, &index, &lowstale, &highstale, - &lfloglow, &lfloghigh); - } - /* - * Set impossible logging indices for this case. - */ - else if (leaf->hdr.stale) { - lfloglow = be16_to_cpu(leaf->hdr.count); + if (compact) + xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale, + &highstale, &lfloglow, &lfloghigh); + else if (leafhdr.stale) { + /* + * Set impossible logging indices for this case. + */ + lfloglow = leafhdr.count; lfloghigh = -1; } /* * Insert the new entry, log everything. */ - lep = xfs_dir2_leaf_find_entry(leaf, index, compact, lowstale, + lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale, highstale, &lfloglow, &lfloghigh); lep->hashval = cpu_to_be32(args->hashval); - lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, + lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo, args->blkno, args->index)); - xfs_dir2_leaf_log_header(tp, bp); - xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh); - xfs_dir2_leafn_check(dp, bp); + + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, bp); + xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh); + xfs_dir3_leaf_check(dp, bp); return 0; } #ifdef DEBUG -/* - * Check internal consistency of a leafn block. - */ -void -xfs_dir2_leafn_check( +static void +xfs_dir2_free_hdr_check( struct xfs_inode *dp, - struct xfs_buf *bp) + struct xfs_buf *bp, + xfs_dir2_db_t db) { - int i; /* leaf index */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_mount_t *mp; /* filesystem mount point */ - int stale; /* count of stale leaves */ + struct xfs_dir3_icfree_hdr hdr; - leaf = bp->b_addr; - mp = dp->i_mount; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); - for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) { - if (i + 1 < be16_to_cpu(leaf->hdr.count)) { - ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= - be32_to_cpu(leaf->ents[i + 1].hashval)); - } - if (leaf->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) - stale++; - } - ASSERT(be16_to_cpu(leaf->hdr.stale) == stale); + dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr); + + ASSERT((hdr.firstdb % + dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0); + ASSERT(hdr.firstdb <= db); + ASSERT(db < hdr.firstdb + hdr.nvalid); } +#else +#define xfs_dir2_free_hdr_check(dp, bp, db) #endif /* DEBUG */ /* @@ -362,18 +486,26 @@ xfs_dir2_leafn_check( */ xfs_dahash_t /* hash value */ xfs_dir2_leafn_lasthash( + struct xfs_inode *dp, struct xfs_buf *bp, /* leaf buffer */ int *count) /* count of entries in leaf */ { - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); - leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); if (count) - *count = be16_to_cpu(leaf->hdr.count); - if (!leaf->hdr.count) + *count = leafhdr.count; + if (!leafhdr.count) return 0; - return be32_to_cpu(leaf->ents[be16_to_cpu(leaf->hdr.count) - 1].hashval); + + ents = dp->d_ops->leaf_ents_p(leaf); + return be32_to_cpu(ents[leafhdr.count - 1].hashval); } /* @@ -402,16 +534,19 @@ xfs_dir2_leafn_lookup_for_addname( xfs_dir2_db_t newdb; /* new data block number */ xfs_dir2_db_t newfdb; /* new free block number */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); -#ifdef __KERNEL__ - ASSERT(be16_to_cpu(leaf->hdr.count) > 0); -#endif - xfs_dir2_leafn_check(dp, bp); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + + xfs_dir3_leaf_check(dp, bp); + ASSERT(leafhdr.count > 0); + /* * Look up the hash value in the leaf entries. */ @@ -424,15 +559,16 @@ xfs_dir2_leafn_lookup_for_addname( curbp = state->extrablk.bp; curfdb = state->extrablk.blkno; free = curbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); } - length = xfs_dir2_data_entsize(args->namelen); + length = dp->d_ops->data_entsize(args->namelen); /* * Loop over leaf entries with the right hash value. */ - for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && - be32_to_cpu(lep->hashval) == args->hashval; - lep++, index++) { + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { /* * Skip stale leaf entries. */ @@ -441,7 +577,8 @@ xfs_dir2_leafn_lookup_for_addname( /* * Pull the data block number from the entry. */ - newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + newdb = xfs_dir2_dataptr_to_db(args->geo, + be32_to_cpu(lep->address)); /* * For addname, we're looking for a place to put the new entry. * We want to use a data block with an entry of equal @@ -451,12 +588,14 @@ xfs_dir2_leafn_lookup_for_addname( * in hand, take a look at it. */ if (newdb != curdb) { + __be16 *bests; + curdb = newdb; /* * Convert the data block to the free block * holding its freespace information. */ - newfdb = xfs_dir2_db_to_fdb(mp, newdb); + newfdb = dp->d_ops->db_to_fdb(args->geo, newdb); /* * If it's not the one we have in hand, read it in. */ @@ -468,28 +607,24 @@ xfs_dir2_leafn_lookup_for_addname( xfs_trans_brelse(tp, curbp); error = xfs_dir2_free_read(tp, dp, - xfs_dir2_db_to_da(mp, newfdb), + xfs_dir2_db_to_da(args->geo, + newfdb), &curbp); if (error) return error; free = curbp->b_addr; - ASSERT(be32_to_cpu(free->hdr.magic) == - XFS_DIR2_FREE_MAGIC); - ASSERT((be32_to_cpu(free->hdr.firstdb) % - xfs_dir2_free_max_bests(mp)) == 0); - ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb); - ASSERT(curdb < be32_to_cpu(free->hdr.firstdb) + - be32_to_cpu(free->hdr.nvalid)); + + xfs_dir2_free_hdr_check(dp, curbp, curdb); } /* * Get the index for our entry. */ - fi = xfs_dir2_db_to_fdindex(mp, curdb); + fi = dp->d_ops->db_to_fdindex(args->geo, curdb); /* * If it has room, return it. */ - if (unlikely(free->bests[fi] == - cpu_to_be16(NULLDATAOFF))) { + bests = dp->d_ops->free_bests_p(free); + if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) { XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", XFS_ERRLEVEL_LOW, mp); if (curfdb != newfdb) @@ -497,7 +632,7 @@ xfs_dir2_leafn_lookup_for_addname( return XFS_ERROR(EFSCORRUPTED); } curfdb = newfdb; - if (be16_to_cpu(free->bests[fi]) >= length) + if (be16_to_cpu(bests[fi]) >= length) goto out; } } @@ -511,6 +646,12 @@ out: state->extrablk.bp = curbp; state->extrablk.index = fi; state->extrablk.blkno = curfdb; + + /* + * Important: this magic number is not in the buffer - it's for + * buffer type information and therefore only the free/data type + * matters here, not whether CRCs are enabled or not. + */ state->extrablk.magic = XFS_DIR2_FREE_MAGIC; } else { state->extravalid = 0; @@ -545,16 +686,19 @@ xfs_dir2_leafn_lookup_for_entry( xfs_dir2_db_t newdb; /* new data block number */ xfs_trans_t *tp; /* transaction pointer */ enum xfs_dacmp cmp; /* comparison result */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); -#ifdef __KERNEL__ - ASSERT(be16_to_cpu(leaf->hdr.count) > 0); -#endif - xfs_dir2_leafn_check(dp, bp); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + + xfs_dir3_leaf_check(dp, bp); + ASSERT(leafhdr.count > 0); + /* * Look up the hash value in the leaf entries. */ @@ -569,9 +713,9 @@ xfs_dir2_leafn_lookup_for_entry( /* * Loop over leaf entries with the right hash value. */ - for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && - be32_to_cpu(lep->hashval) == args->hashval; - lep++, index++) { + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { /* * Skip stale leaf entries. */ @@ -580,7 +724,8 @@ xfs_dir2_leafn_lookup_for_entry( /* * Pull the data block number from the entry. */ - newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + newdb = xfs_dir2_dataptr_to_db(args->geo, + be32_to_cpu(lep->address)); /* * Not adding a new entry, so we really want to find * the name given to us. @@ -604,20 +749,22 @@ xfs_dir2_leafn_lookup_for_entry( ASSERT(state->extravalid); curbp = state->extrablk.bp; } else { - error = xfs_dir2_data_read(tp, dp, - xfs_dir2_db_to_da(mp, newdb), + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, + newdb), -1, &curbp); if (error) return error; } - xfs_dir2_data_check(dp, curbp); + xfs_dir3_data_check(dp, curbp); curdb = newdb; } /* * Point to the data entry. */ dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr + - xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(lep->address))); /* * Compare the entry and if it's an exact match, return * EEXIST immediately. If it's the first case-insensitive @@ -631,6 +778,7 @@ xfs_dir2_leafn_lookup_for_entry( xfs_trans_brelse(tp, state->extrablk.bp); args->cmpresult = cmp; args->inumber = be64_to_cpu(dep->inumber); + args->filetype = dp->d_ops->data_get_ftype(dep); *indexp = index; state->extravalid = 1; state->extrablk.bp = curbp; @@ -638,13 +786,13 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = (int)((char *)dep - (char *)curbp->b_addr); state->extrablk.magic = XFS_DIR2_DATA_MAGIC; - curbp->b_ops = &xfs_dir2_data_buf_ops; + curbp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF); if (cmp == XFS_CMP_EXACT) return XFS_ERROR(EEXIST); } } - ASSERT(index == be16_to_cpu(leaf->hdr.count) || - (args->op_flags & XFS_DA_OP_OKNOENT)); + ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT)); if (curbp) { if (args->cmpresult == XFS_CMP_DIFFERENT) { /* Giving back last used data block. */ @@ -653,7 +801,8 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = -1; state->extrablk.blkno = curdb; state->extrablk.magic = XFS_DIR2_DATA_MAGIC; - curbp->b_ops = &xfs_dir2_data_buf_ops; + curbp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF); } else { /* If the curbp is not the CI match block, drop it */ if (state->extrablk.bp != curbp) @@ -689,52 +838,49 @@ xfs_dir2_leafn_lookup_int( * Log entries and headers. Stale entries are preserved. */ static void -xfs_dir2_leafn_moveents( - xfs_da_args_t *args, /* operation arguments */ - struct xfs_buf *bp_s, /* source leaf buffer */ - int start_s, /* source leaf index */ - struct xfs_buf *bp_d, /* destination leaf buffer */ - int start_d, /* destination leaf index */ - int count) /* count of leaves to copy */ +xfs_dir3_leafn_moveents( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *bp_s, /* source */ + struct xfs_dir3_icleaf_hdr *shdr, + struct xfs_dir2_leaf_entry *sents, + int start_s,/* source leaf index */ + struct xfs_buf *bp_d, /* destination */ + struct xfs_dir3_icleaf_hdr *dhdr, + struct xfs_dir2_leaf_entry *dents, + int start_d,/* destination leaf index */ + int count) /* count of leaves to copy */ { - xfs_dir2_leaf_t *leaf_d; /* destination leaf structure */ - xfs_dir2_leaf_t *leaf_s; /* source leaf structure */ - int stale; /* count stale leaves copied */ - xfs_trans_t *tp; /* transaction pointer */ + int stale; /* count stale leaves copied */ trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count); /* * Silently return if nothing to do. */ - if (count == 0) { + if (count == 0) return; - } - tp = args->trans; - leaf_s = bp_s->b_addr; - leaf_d = bp_d->b_addr; + /* * If the destination index is not the end of the current * destination leaf entries, open up a hole in the destination * to hold the new entries. */ - if (start_d < be16_to_cpu(leaf_d->hdr.count)) { - memmove(&leaf_d->ents[start_d + count], &leaf_d->ents[start_d], - (be16_to_cpu(leaf_d->hdr.count) - start_d) * - sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, bp_d, start_d + count, - count + be16_to_cpu(leaf_d->hdr.count) - 1); + if (start_d < dhdr->count) { + memmove(&dents[start_d + count], &dents[start_d], + (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir3_leaf_log_ents(args, bp_d, start_d + count, + count + dhdr->count - 1); } /* * If the source has stale leaves, count the ones in the copy range * so we can update the header correctly. */ - if (leaf_s->hdr.stale) { + if (shdr->stale) { int i; /* temp leaf index */ for (i = start_s, stale = 0; i < start_s + count; i++) { - if (leaf_s->ents[i].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + if (sents[i].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; } } else @@ -742,29 +888,27 @@ xfs_dir2_leafn_moveents( /* * Copy the leaf entries from source to destination. */ - memcpy(&leaf_d->ents[start_d], &leaf_s->ents[start_s], + memcpy(&dents[start_d], &sents[start_s], count * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1); + xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1); + /* * If there are source entries after the ones we copied, * delete the ones we copied by sliding the next ones down. */ - if (start_s + count < be16_to_cpu(leaf_s->hdr.count)) { - memmove(&leaf_s->ents[start_s], &leaf_s->ents[start_s + count], + if (start_s + count < shdr->count) { + memmove(&sents[start_s], &sents[start_s + count], count * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1); + xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1); } + /* * Update the headers and log them. */ - be16_add_cpu(&leaf_s->hdr.count, -(count)); - be16_add_cpu(&leaf_s->hdr.stale, -(stale)); - be16_add_cpu(&leaf_d->hdr.count, count); - be16_add_cpu(&leaf_d->hdr.stale, stale); - xfs_dir2_leaf_log_header(tp, bp_s); - xfs_dir2_leaf_log_header(tp, bp_d); - xfs_dir2_leafn_check(args->dp, bp_s); - xfs_dir2_leafn_check(args->dp, bp_d); + shdr->count -= count; + shdr->stale -= stale; + dhdr->count += count; + dhdr->stale += stale; } /* @@ -773,21 +917,26 @@ xfs_dir2_leafn_moveents( */ int /* sort order */ xfs_dir2_leafn_order( - struct xfs_buf *leaf1_bp, /* leaf1 buffer */ - struct xfs_buf *leaf2_bp) /* leaf2 buffer */ + struct xfs_inode *dp, + struct xfs_buf *leaf1_bp, /* leaf1 buffer */ + struct xfs_buf *leaf2_bp) /* leaf2 buffer */ { - xfs_dir2_leaf_t *leaf1; /* leaf1 structure */ - xfs_dir2_leaf_t *leaf2; /* leaf2 structure */ - - leaf1 = leaf1_bp->b_addr; - leaf2 = leaf2_bp->b_addr; - ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - if (be16_to_cpu(leaf1->hdr.count) > 0 && - be16_to_cpu(leaf2->hdr.count) > 0 && - (be32_to_cpu(leaf2->ents[0].hashval) < be32_to_cpu(leaf1->ents[0].hashval) || - be32_to_cpu(leaf2->ents[be16_to_cpu(leaf2->hdr.count) - 1].hashval) < - be32_to_cpu(leaf1->ents[be16_to_cpu(leaf1->hdr.count) - 1].hashval))) + struct xfs_dir2_leaf *leaf1 = leaf1_bp->b_addr; + struct xfs_dir2_leaf *leaf2 = leaf2_bp->b_addr; + struct xfs_dir2_leaf_entry *ents1; + struct xfs_dir2_leaf_entry *ents2; + struct xfs_dir3_icleaf_hdr hdr1; + struct xfs_dir3_icleaf_hdr hdr2; + + dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1); + dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2); + ents1 = dp->d_ops->leaf_ents_p(leaf1); + ents2 = dp->d_ops->leaf_ents_p(leaf2); + + if (hdr1.count > 0 && hdr2.count > 0 && + (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) || + be32_to_cpu(ents2[hdr2.count - 1].hashval) < + be32_to_cpu(ents1[hdr1.count - 1].hashval))) return 1; return 0; } @@ -811,17 +960,22 @@ xfs_dir2_leafn_rebalance( xfs_dir2_leaf_t *leaf1; /* first leaf structure */ xfs_dir2_leaf_t *leaf2; /* second leaf structure */ int mid; /* midpoint leaf index */ -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) int oldstale; /* old count of stale leaves */ #endif int oldsum; /* old total leaf count */ int swap; /* swapped leaf blocks */ + struct xfs_dir2_leaf_entry *ents1; + struct xfs_dir2_leaf_entry *ents2; + struct xfs_dir3_icleaf_hdr hdr1; + struct xfs_dir3_icleaf_hdr hdr2; + struct xfs_inode *dp = state->args->dp; args = state->args; /* * If the block order is wrong, swap the arguments. */ - if ((swap = xfs_dir2_leafn_order(blk1->bp, blk2->bp))) { + if ((swap = xfs_dir2_leafn_order(dp, blk1->bp, blk2->bp))) { xfs_da_state_blk_t *tmp; /* temp for block swap */ tmp = blk1; @@ -830,11 +984,17 @@ xfs_dir2_leafn_rebalance( } leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; - oldsum = be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count); -#ifdef DEBUG - oldstale = be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale); + dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1); + dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2); + ents1 = dp->d_ops->leaf_ents_p(leaf1); + ents2 = dp->d_ops->leaf_ents_p(leaf2); + + oldsum = hdr1.count + hdr2.count; +#if defined(DEBUG) || defined(XFS_WARN) + oldstale = hdr1.stale + hdr2.stale; #endif mid = oldsum >> 1; + /* * If the old leaf count was odd then the new one will be even, * so we need to divide the new count evenly. @@ -842,10 +1002,10 @@ xfs_dir2_leafn_rebalance( if (oldsum & 1) { xfs_dahash_t midhash; /* middle entry hash value */ - if (mid >= be16_to_cpu(leaf1->hdr.count)) - midhash = be32_to_cpu(leaf2->ents[mid - be16_to_cpu(leaf1->hdr.count)].hashval); + if (mid >= hdr1.count) + midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval); else - midhash = be32_to_cpu(leaf1->ents[mid].hashval); + midhash = be32_to_cpu(ents1[mid].hashval); isleft = args->hashval <= midhash; } /* @@ -859,46 +1019,58 @@ xfs_dir2_leafn_rebalance( * Calculate moved entry count. Positive means left-to-right, * negative means right-to-left. Then move the entries. */ - count = be16_to_cpu(leaf1->hdr.count) - mid + (isleft == 0); + count = hdr1.count - mid + (isleft == 0); if (count > 0) - xfs_dir2_leafn_moveents(args, blk1->bp, - be16_to_cpu(leaf1->hdr.count) - count, blk2->bp, 0, count); + xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1, + hdr1.count - count, blk2->bp, + &hdr2, ents2, 0, count); else if (count < 0) - xfs_dir2_leafn_moveents(args, blk2->bp, 0, blk1->bp, - be16_to_cpu(leaf1->hdr.count), count); - ASSERT(be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count) == oldsum); - ASSERT(be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale) == oldstale); + xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0, + blk1->bp, &hdr1, ents1, + hdr1.count, count); + + ASSERT(hdr1.count + hdr2.count == oldsum); + ASSERT(hdr1.stale + hdr2.stale == oldstale); + + /* log the changes made when moving the entries */ + dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1); + dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2); + xfs_dir3_leaf_log_header(args, blk1->bp); + xfs_dir3_leaf_log_header(args, blk2->bp); + + xfs_dir3_leaf_check(dp, blk1->bp); + xfs_dir3_leaf_check(dp, blk2->bp); + /* * Mark whether we're inserting into the old or new leaf. */ - if (be16_to_cpu(leaf1->hdr.count) < be16_to_cpu(leaf2->hdr.count)) + if (hdr1.count < hdr2.count) state->inleaf = swap; - else if (be16_to_cpu(leaf1->hdr.count) > be16_to_cpu(leaf2->hdr.count)) + else if (hdr1.count > hdr2.count) state->inleaf = !swap; else - state->inleaf = - swap ^ (blk1->index <= be16_to_cpu(leaf1->hdr.count)); + state->inleaf = swap ^ (blk1->index <= hdr1.count); /* * Adjust the expected index for insertion. */ if (!state->inleaf) - blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); + blk2->index = blk1->index - hdr1.count; /* * Finally sanity check just to make sure we are not returning a * negative index */ - if(blk2->index < 0) { + if (blk2->index < 0) { state->inleaf = 1; blk2->index = 0; - xfs_alert(args->dp->i_mount, - "%s: picked the wrong leaf? reverting original leaf: blk1->index %d\n", + xfs_alert(dp->i_mount, + "%s: picked the wrong leaf? reverting original leaf: blk1->index %d", __func__, blk1->index); } } static int -xfs_dir2_data_block_free( +xfs_dir3_data_block_free( xfs_da_args_t *args, struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_free *free, @@ -907,64 +1079,72 @@ xfs_dir2_data_block_free( struct xfs_buf *fbp, int longest) { - struct xfs_trans *tp = args->trans; int logfree = 0; + __be16 *bests; + struct xfs_dir3_icfree_hdr freehdr; + struct xfs_inode *dp = args->dp; - if (!hdr) { - /* One less used entry in the free table. */ - be32_add_cpu(&free->hdr.nused, -1); - xfs_dir2_free_log_header(tp, fbp); - + dp->d_ops->free_hdr_from_disk(&freehdr, free); + bests = dp->d_ops->free_bests_p(free); + if (hdr) { /* - * If this was the last entry in the table, we can trim the - * table size back. There might be other entries at the end - * referring to non-existent data blocks, get those too. + * Data block is not empty, just set the free entry to the new + * value. */ - if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { - int i; /* free entry index */ + bests[findex] = cpu_to_be16(longest); + xfs_dir2_free_log_bests(args, fbp, findex, findex); + return 0; + } - for (i = findex - 1; i >= 0; i--) { - if (free->bests[i] != cpu_to_be16(NULLDATAOFF)) - break; - } - free->hdr.nvalid = cpu_to_be32(i + 1); - logfree = 0; - } else { - /* Not the last entry, just punch it out. */ - free->bests[findex] = cpu_to_be16(NULLDATAOFF); - logfree = 1; - } - /* - * If there are no useful entries left in the block, - * get rid of the block if we can. - */ - if (!free->hdr.nused) { - int error; + /* One less used entry in the free table. */ + freehdr.nused--; - error = xfs_dir2_shrink_inode(args, fdb, fbp); - if (error == 0) { - fbp = NULL; - logfree = 0; - } else if (error != ENOSPC || args->total != 0) - return error; - /* - * It's possible to get ENOSPC if there is no - * space reservation. In this case some one - * else will eventually get rid of this block. - */ + /* + * If this was the last entry in the table, we can trim the table size + * back. There might be other entries at the end referring to + * non-existent data blocks, get those too. + */ + if (findex == freehdr.nvalid - 1) { + int i; /* free entry index */ + + for (i = findex - 1; i >= 0; i--) { + if (bests[i] != cpu_to_be16(NULLDATAOFF)) + break; } + freehdr.nvalid = i + 1; + logfree = 0; } else { + /* Not the last entry, just punch it out. */ + bests[findex] = cpu_to_be16(NULLDATAOFF); + logfree = 1; + } + + dp->d_ops->free_hdr_to_disk(free, &freehdr); + xfs_dir2_free_log_header(args, fbp); + + /* + * If there are no useful entries left in the block, get rid of the + * block if we can. + */ + if (!freehdr.nused) { + int error; + + error = xfs_dir2_shrink_inode(args, fdb, fbp); + if (error == 0) { + fbp = NULL; + logfree = 0; + } else if (error != ENOSPC || args->total != 0) + return error; /* - * Data block is not empty, just set the free entry to the new - * value. + * It's possible to get ENOSPC if there is no + * space reservation. In this case some one + * else will eventually get rid of this block. */ - free->bests[findex] = cpu_to_be16(longest); - logfree = 1; } /* Log the free entry that changed, unless we got rid of it. */ if (logfree) - xfs_dir2_free_log_bests(tp, fbp, findex, findex); + xfs_dir2_free_log_bests(args, fbp, findex, findex); return 0; } @@ -994,6 +1174,9 @@ xfs_dir2_leafn_remove( int needlog; /* need to log data header */ int needscan; /* need to rescan data frees */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leafn_remove(args, index); @@ -1001,26 +1184,33 @@ xfs_dir2_leafn_remove( tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + /* * Point to the entry we're removing. */ - lep = &leaf->ents[index]; + lep = &ents[index]; + /* * Extract the data block and offset from the entry. */ - db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); ASSERT(dblk->blkno == db); - off = xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)); + off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)); ASSERT(dblk->index == off); + /* * Kill the leaf entry by marking it stale. * Log the leaf block changes. */ - be16_add_cpu(&leaf->hdr.stale, 1); - xfs_dir2_leaf_log_header(tp, bp); + leafhdr.stale++; + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, bp); + lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); - xfs_dir2_leaf_log_ents(tp, bp, index, index); + xfs_dir3_leaf_log_ents(args, bp, index, index); + /* * Make the data entry free. Keep track of the longest freespace * in the data block in case it changes. @@ -1028,24 +1218,25 @@ xfs_dir2_leafn_remove( dbp = dblk->bp; hdr = dbp->b_addr; dep = (xfs_dir2_data_entry_t *)((char *)hdr + off); - longest = be16_to_cpu(hdr->bestfree[0].length); + bf = dp->d_ops->data_bestfree_p(hdr); + longest = be16_to_cpu(bf[0].length); needlog = needscan = 0; - xfs_dir2_data_make_free(tp, dbp, off, - xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); + xfs_dir2_data_make_free(args, dbp, off, + dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); /* * Rescan the data block freespaces for bestfree. * Log the data block header if needed. */ if (needscan) - xfs_dir2_data_freescan(mp, hdr, &needlog); + xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, dbp); - xfs_dir2_data_check(dp, dbp); + xfs_dir2_data_log_header(args, dbp); + xfs_dir3_data_check(dp, dbp); /* * If the longest data block freespace changes, need to update * the corresponding freeblock entry. */ - if (longest < be16_to_cpu(hdr->bestfree[0].length)) { + if (longest < be16_to_cpu(bf[0].length)) { int error; /* error return value */ struct xfs_buf *fbp; /* freeblock buffer */ xfs_dir2_db_t fdb; /* freeblock block number */ @@ -1056,26 +1247,33 @@ xfs_dir2_leafn_remove( * Convert the data block number to a free block, * read in the free block. */ - fdb = xfs_dir2_db_to_fdb(mp, db); - error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb), + fdb = dp->d_ops->db_to_fdb(args->geo, db); + error = xfs_dir2_free_read(tp, dp, + xfs_dir2_db_to_da(args->geo, fdb), &fbp); if (error) return error; free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - ASSERT(be32_to_cpu(free->hdr.firstdb) == - xfs_dir2_free_max_bests(mp) * - (fdb - XFS_DIR2_FREE_FIRSTDB(mp))); +#ifdef DEBUG + { + struct xfs_dir3_icfree_hdr freehdr; + dp->d_ops->free_hdr_from_disk(&freehdr, free); + ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) * + (fdb - xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET))); + } +#endif /* * Calculate which entry we need to fix. */ - findex = xfs_dir2_db_to_fdindex(mp, db); - longest = be16_to_cpu(hdr->bestfree[0].length); + findex = dp->d_ops->db_to_fdindex(args->geo, db); + longest = be16_to_cpu(bf[0].length); /* * If the data block is now empty we can get rid of it * (usually). */ - if (longest == mp->m_dirblksize - (uint)sizeof(*hdr)) { + if (longest == args->geo->blksize - + dp->d_ops->data_entry_offset) { /* * Try to punch out the data block. */ @@ -1096,22 +1294,20 @@ xfs_dir2_leafn_remove( * If we got rid of the data block, we can eliminate that entry * in the free block. */ - error = xfs_dir2_data_block_free(args, hdr, free, + error = xfs_dir3_data_block_free(args, hdr, free, fdb, findex, fbp, longest); if (error) return error; } - xfs_dir2_leafn_check(dp, bp); + xfs_dir3_leaf_check(dp, bp); /* * Return indication of whether this leaf block is empty enough * to justify trying to join it with a neighbor. */ - *rval = - ((uint)sizeof(leaf->hdr) + - (uint)sizeof(leaf->ents[0]) * - (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale))) < - mp->m_dir_magicpct; + *rval = (dp->d_ops->leaf_hdr_size + + (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) < + args->geo->magicpct; return 0; } @@ -1128,13 +1324,14 @@ xfs_dir2_leafn_split( xfs_dablk_t blkno; /* new leaf block number */ int error; /* error return value */ xfs_mount_t *mp; /* filesystem mount point */ + struct xfs_inode *dp; /* * Allocate space for a new leaf node. */ args = state->args; - mp = args->dp->i_mount; - ASSERT(args != NULL); + dp = args->dp; + mp = dp->i_mount; ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC); error = xfs_da_grow_inode(args, &blkno); if (error) { @@ -1143,11 +1340,11 @@ xfs_dir2_leafn_split( /* * Initialize the new leaf block. */ - error = xfs_dir2_leaf_init(args, xfs_dir2_da_to_db(mp, blkno), - &newblk->bp, XFS_DIR2_LEAFN_MAGIC); - if (error) { + error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno), + &newblk->bp, XFS_DIR2_LEAFN_MAGIC); + if (error) return error; - } + newblk->blkno = blkno; newblk->magic = XFS_DIR2_LEAFN_MAGIC; /* @@ -1155,7 +1352,7 @@ xfs_dir2_leafn_split( * block into the leaves. */ xfs_dir2_leafn_rebalance(state, oldblk, newblk); - error = xfs_da_blk_link(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); if (error) { return error; } @@ -1169,10 +1366,10 @@ xfs_dir2_leafn_split( /* * Update last hashval in each block since we added the name. */ - oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL); - newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL); - xfs_dir2_leafn_check(args->dp, oldblk->bp); - xfs_dir2_leafn_check(args->dp, newblk->bp); + oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL); + newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL); + xfs_dir3_leaf_check(dp, oldblk->bp); + xfs_dir3_leaf_check(dp, newblk->bp); return error; } @@ -1198,9 +1395,11 @@ xfs_dir2_leafn_toosmall( int error; /* error return value */ int forward; /* sibling block direction */ int i; /* sibling counter */ - xfs_da_blkinfo_t *info; /* leaf block header */ xfs_dir2_leaf_t *leaf; /* leaf structure */ int rval; /* result from path_shift */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + struct xfs_inode *dp = state->args->dp; /* * Check for the degenerate case of the block being over 50% full. @@ -1208,12 +1407,14 @@ xfs_dir2_leafn_toosmall( * to coalesce with a sibling. */ blk = &state->path.blk[state->path.active - 1]; - info = blk->bp->b_addr; - ASSERT(info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - leaf = (xfs_dir2_leaf_t *)info; - count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); - bytes = (uint)sizeof(leaf->hdr) + count * (uint)sizeof(leaf->ents[0]); - if (bytes > (state->blocksize >> 1)) { + leaf = blk->bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir3_leaf_check(dp, blk->bp); + + count = leafhdr.count - leafhdr.stale; + bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]); + if (bytes > (state->args->geo->blksize >> 1)) { /* * Blk over 50%, don't try to join. */ @@ -1231,9 +1432,9 @@ xfs_dir2_leafn_toosmall( * Make altpath point to the block we want to keep and * path point to the block we want to drop (this one). */ - forward = (info->forw != 0); + forward = (leafhdr.forw != 0); memcpy(&state->altpath, &state->path, sizeof(state->path)); - error = xfs_da_path_shift(state, &state->altpath, forward, 0, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &rval); if (error) return error; @@ -1247,15 +1448,17 @@ xfs_dir2_leafn_toosmall( * We prefer coalescing with the lower numbered sibling so as * to shrink a directory over time. */ - forward = be32_to_cpu(info->forw) < be32_to_cpu(info->back); + forward = leafhdr.forw < leafhdr.back; for (i = 0, bp = NULL; i < 2; forward = !forward, i++) { - blkno = forward ? be32_to_cpu(info->forw) : be32_to_cpu(info->back); + struct xfs_dir3_icleaf_hdr hdr2; + + blkno = forward ? leafhdr.forw : leafhdr.back; if (blkno == 0) continue; /* * Read the sibling leaf block. */ - error = xfs_dir2_leafn_read(state->args->trans, state->args->dp, + error = xfs_dir3_leafn_read(state->args->trans, dp, blkno, -1, &bp); if (error) return error; @@ -1263,13 +1466,16 @@ xfs_dir2_leafn_toosmall( /* * Count bytes in the two blocks combined. */ - leaf = (xfs_dir2_leaf_t *)info; - count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); - bytes = state->blocksize - (state->blocksize >> 2); + count = leafhdr.count - leafhdr.stale; + bytes = state->args->geo->blksize - + (state->args->geo->blksize >> 2); + leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - count += be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); - bytes -= count * (uint)sizeof(leaf->ents[0]); + dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + count += hdr2.count - hdr2.stale; + bytes -= count * sizeof(ents[0]); + /* * Fits with at least 25% to spare. */ @@ -1291,10 +1497,10 @@ xfs_dir2_leafn_toosmall( */ memcpy(&state->altpath, &state->path, sizeof(state->path)); if (blkno < blk->blkno) - error = xfs_da_path_shift(state, &state->altpath, forward, 0, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &rval); else - error = xfs_da_path_shift(state, &state->path, forward, 0, + error = xfs_da3_path_shift(state, &state->path, forward, 0, &rval); if (error) { return error; @@ -1316,34 +1522,54 @@ xfs_dir2_leafn_unbalance( xfs_da_args_t *args; /* operation arguments */ xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */ xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */ + struct xfs_dir3_icleaf_hdr savehdr; + struct xfs_dir3_icleaf_hdr drophdr; + struct xfs_dir2_leaf_entry *sents; + struct xfs_dir2_leaf_entry *dents; + struct xfs_inode *dp = state->args->dp; args = state->args; ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC); ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC); drop_leaf = drop_blk->bp->b_addr; save_leaf = save_blk->bp->b_addr; - ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + + dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf); + dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf); + sents = dp->d_ops->leaf_ents_p(save_leaf); + dents = dp->d_ops->leaf_ents_p(drop_leaf); + /* * If there are any stale leaf entries, take this opportunity * to purge them. */ - if (drop_leaf->hdr.stale) - xfs_dir2_leaf_compact(args, drop_blk->bp); - if (save_leaf->hdr.stale) - xfs_dir2_leaf_compact(args, save_blk->bp); + if (drophdr.stale) + xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp); + if (savehdr.stale) + xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp); + /* * Move the entries from drop to the appropriate end of save. */ - drop_blk->hashval = be32_to_cpu(drop_leaf->ents[be16_to_cpu(drop_leaf->hdr.count) - 1].hashval); - if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp)) - xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, 0, - be16_to_cpu(drop_leaf->hdr.count)); + drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval); + if (xfs_dir2_leafn_order(dp, save_blk->bp, drop_blk->bp)) + xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0, + save_blk->bp, &savehdr, sents, 0, + drophdr.count); else - xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, - be16_to_cpu(save_leaf->hdr.count), be16_to_cpu(drop_leaf->hdr.count)); - save_blk->hashval = be32_to_cpu(save_leaf->ents[be16_to_cpu(save_leaf->hdr.count) - 1].hashval); - xfs_dir2_leafn_check(args->dp, save_blk->bp); + xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0, + save_blk->bp, &savehdr, sents, + savehdr.count, drophdr.count); + save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval); + + /* log the changes made when moving the entries */ + dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr); + dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr); + xfs_dir3_leaf_log_header(args, save_blk->bp); + xfs_dir3_leaf_log_header(args, drop_blk->bp); + + xfs_dir3_leaf_check(dp, save_blk->bp); + xfs_dir3_leaf_check(dp, drop_blk->bp); } /* @@ -1366,13 +1592,11 @@ xfs_dir2_node_addname( state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_dirblksize; - state->node_ents = state->mp->m_dir_node_ents; /* * Look up the name. We're not supposed to find it, but * this gives us the insertion point. */ - error = xfs_da_node_lookup_int(state, &rval); + error = xfs_da3_node_lookup_int(state, &rval); if (error) rval = error; if (rval != ENOENT) { @@ -1398,7 +1622,7 @@ xfs_dir2_node_addname( * It worked, fix the hash values up the btree. */ if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) - xfs_da_fixhashpath(state, &state->path); + xfs_da3_fixhashpath(state, &state->path); } else { /* * It didn't work, we need to split the leaf block. @@ -1410,7 +1634,7 @@ xfs_dir2_node_addname( /* * Split the leaf block and insert the new entry. */ - rval = xfs_da_split(state); + rval = xfs_da3_split(state); } done: xfs_da_state_free(state); @@ -1447,11 +1671,14 @@ xfs_dir2_node_addname_int( int needscan; /* need to rescan data frees */ __be16 *tagp; /* data entry tag pointer */ xfs_trans_t *tp; /* transaction pointer */ + __be16 *bests; + struct xfs_dir3_icfree_hdr freehdr; + struct xfs_dir2_data_free *bf; dp = args->dp; mp = dp->i_mount; tp = args->trans; - length = xfs_dir2_data_entsize(args->namelen); + length = dp->d_ops->data_entsize(args->namelen); /* * If we came in with a freespace block that means that lookup * found an entry with our hash value. This is the freespace @@ -1464,36 +1691,37 @@ xfs_dir2_node_addname_int( */ ifbno = fblk->blkno; free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = fblk->index; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + /* * This means the free entry showed that the data block had * space for our entry, so we remembered it. * Use that data block. */ if (findex >= 0) { - ASSERT(findex < be32_to_cpu(free->hdr.nvalid)); - ASSERT(be16_to_cpu(free->bests[findex]) != NULLDATAOFF); - ASSERT(be16_to_cpu(free->bests[findex]) >= length); - dbno = be32_to_cpu(free->hdr.firstdb) + findex; - } - /* - * The data block looked at didn't have enough room. - * We'll start at the beginning of the freespace entries. - */ - else { + ASSERT(findex < freehdr.nvalid); + ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); + ASSERT(be16_to_cpu(bests[findex]) >= length); + dbno = freehdr.firstdb + findex; + } else { + /* + * The data block looked at didn't have enough room. + * We'll start at the beginning of the freespace entries. + */ dbno = -1; findex = 0; } - } - /* - * Didn't come in with a freespace block, so don't have a data block. - */ - else { + } else { + /* + * Didn't come in with a freespace block, so no data block. + */ ifbno = dbno = -1; fbp = NULL; findex = 0; } + /* * If we don't have a data block yet, we're going to scan the * freespace blocks looking for one. Figure out what the @@ -1502,9 +1730,9 @@ xfs_dir2_node_addname_int( if (dbno == -1) { xfs_fileoff_t fo; /* freespace block number */ - if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) + if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) return error; - lastfbno = xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo); + lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo); fbno = ifbno; } /* @@ -1522,7 +1750,8 @@ xfs_dir2_node_addname_int( * us a freespace block to start with. */ if (++fbno == 0) - fbno = XFS_DIR2_FREE_FIRSTDB(mp); + fbno = xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET); /* * If it's ifbno we already looked at it. */ @@ -1540,27 +1769,33 @@ xfs_dir2_node_addname_int( * to avoid it. */ error = xfs_dir2_free_try_read(tp, dp, - xfs_dir2_db_to_da(mp, fbno), - &fbp); + xfs_dir2_db_to_da(args->geo, fbno), + &fbp); if (error) return error; if (!fbp) continue; free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = 0; } /* * Look at the current free entry. Is it good enough? + * + * The bests initialisation should be where the bufer is read in + * the above branch. But gcc is too stupid to realise that bests + * and the freehdr are actually initialised if they are placed + * there, so we have to do it here to avoid warnings. Blech. */ - if (be16_to_cpu(free->bests[findex]) != NULLDATAOFF && - be16_to_cpu(free->bests[findex]) >= length) - dbno = be32_to_cpu(free->hdr.firstdb) + findex; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + if (be16_to_cpu(bests[findex]) != NULLDATAOFF && + be16_to_cpu(bests[findex]) >= length) + dbno = freehdr.firstdb + findex; else { /* * Are we done with the freeblock? */ - if (++findex == be32_to_cpu(free->hdr.nvalid)) { + if (++findex == freehdr.nvalid) { /* * Drop the block. */ @@ -1588,7 +1823,7 @@ xfs_dir2_node_addname_int( if (unlikely((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &dbno)) || - (error = xfs_dir2_data_init(args, dbno, &dbp)))) + (error = xfs_dir3_data_init(args, dbno, &dbp)))) return error; /* @@ -1603,10 +1838,10 @@ xfs_dir2_node_addname_int( * Get the freespace block corresponding to the data block * that was just allocated. */ - fbno = xfs_dir2_db_to_fdb(mp, dbno); + fbno = dp->d_ops->db_to_fdb(args->geo, dbno); error = xfs_dir2_free_try_read(tp, dp, - xfs_dir2_db_to_da(mp, fbno), - &fbp); + xfs_dir2_db_to_da(args->geo, fbno), + &fbp); if (error) return error; @@ -1614,18 +1849,19 @@ xfs_dir2_node_addname_int( * If there wasn't a freespace block, the read will * return a NULL fbp. Allocate and initialize a new one. */ - if( fbp == NULL ) { - if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, - &fbno))) { + if (!fbp) { + error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, + &fbno); + if (error) return error; - } - if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { + if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) { xfs_alert(mp, "%s: dir ino %llu needed freesp block %lld for\n" " data block %lld, got %lld ifbno %llu lastfbno %d", __func__, (unsigned long long)dp->i_ino, - (long long)xfs_dir2_db_to_fdb(mp, dbno), + (long long)dp->d_ops->db_to_fdb( + args->geo, dbno), (long long)dbno, (long long)fbno, (unsigned long long)ifbno, lastfbno); if (fblk) { @@ -1646,52 +1882,50 @@ xfs_dir2_node_addname_int( /* * Get a buffer for the new block. */ - error = xfs_da_get_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), - -1, &fbp, XFS_DATA_FORK); + error = xfs_dir3_free_get_buf(args, fbno, &fbp); if (error) return error; - fbp->b_ops = &xfs_dir2_free_buf_ops; + free = fbp->b_addr; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); /* - * Initialize the new block to be empty, and remember - * its first slot as our empty slot. + * Remember the first slot as our empty slot. */ - free = fbp->b_addr; - free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC); - free->hdr.firstdb = cpu_to_be32( - (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * - xfs_dir2_free_max_bests(mp)); - free->hdr.nvalid = 0; - free->hdr.nused = 0; + freehdr.firstdb = + (fbno - xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET)) * + dp->d_ops->free_max_bests(args->geo); } else { free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); } /* * Set the freespace block index from the data block number. */ - findex = xfs_dir2_db_to_fdindex(mp, dbno); + findex = dp->d_ops->db_to_fdindex(args->geo, dbno); /* * If it's after the end of the current entries in the * freespace block, extend that table. */ - if (findex >= be32_to_cpu(free->hdr.nvalid)) { - ASSERT(findex < xfs_dir2_free_max_bests(mp)); - free->hdr.nvalid = cpu_to_be32(findex + 1); + if (findex >= freehdr.nvalid) { + ASSERT(findex < dp->d_ops->free_max_bests(args->geo)); + freehdr.nvalid = findex + 1; /* * Tag new entry so nused will go up. */ - free->bests[findex] = cpu_to_be16(NULLDATAOFF); + bests[findex] = cpu_to_be16(NULLDATAOFF); } /* * If this entry was for an empty data block * (this should always be true) then update the header. */ - if (free->bests[findex] == cpu_to_be16(NULLDATAOFF)) { - be32_add_cpu(&free->hdr.nused, 1); - xfs_dir2_free_log_header(tp, fbp); + if (bests[findex] == cpu_to_be16(NULLDATAOFF)) { + freehdr.nused++; + dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); + xfs_dir2_free_log_header(args, fbp); } /* * Update the real value in the table. @@ -1699,7 +1933,8 @@ xfs_dir2_node_addname_int( * change again. */ hdr = dbp->b_addr; - free->bests[findex] = hdr->bestfree[0].length; + bf = dp->d_ops->data_bestfree_p(hdr); + bests[findex] = bf[0].length; logfree = 1; } /* @@ -1715,24 +1950,26 @@ xfs_dir2_node_addname_int( /* * Read the data block in. */ - error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno), + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, dbno), -1, &dbp); if (error) return error; hdr = dbp->b_addr; + bf = dp->d_ops->data_bestfree_p(hdr); logfree = 0; } - ASSERT(be16_to_cpu(hdr->bestfree[0].length) >= length); + ASSERT(be16_to_cpu(bf[0].length) >= length); /* * Point to the existing unused space. */ dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + be16_to_cpu(hdr->bestfree[0].offset)); + ((char *)hdr + be16_to_cpu(bf[0].offset)); needscan = needlog = 0; /* * Mark the first part of the unused space, inuse for us. */ - xfs_dir2_data_use_free(tp, dbp, dup, + xfs_dir2_data_use_free(args, dbp, dup, (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length, &needlog, &needscan); /* @@ -1742,31 +1979,33 @@ xfs_dir2_node_addname_int( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, dep->namelen); - tagp = xfs_dir2_data_entry_tag_p(dep); + dp->d_ops->data_put_ftype(dep, args->filetype); + tagp = dp->d_ops->data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); - xfs_dir2_data_log_entry(tp, dbp, dep); + xfs_dir2_data_log_entry(args, dbp, dep); /* * Rescan the block for bestfree if needed. */ if (needscan) - xfs_dir2_data_freescan(mp, hdr, &needlog); + xfs_dir2_data_freescan(dp, hdr, &needlog); /* * Log the data block header if needed. */ if (needlog) - xfs_dir2_data_log_header(tp, dbp); + xfs_dir2_data_log_header(args, dbp); /* * If the freespace entry is now wrong, update it. */ - if (be16_to_cpu(free->bests[findex]) != be16_to_cpu(hdr->bestfree[0].length)) { - free->bests[findex] = hdr->bestfree[0].length; + bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */ + if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) { + bests[findex] = bf[0].length; logfree = 1; } /* * Log the freespace entry if needed. */ if (logfree) - xfs_dir2_free_log_bests(tp, fbp, findex, findex); + xfs_dir2_free_log_bests(args, fbp, findex, findex); /* * Return the data block and offset in args, then drop the data block. */ @@ -1777,7 +2016,7 @@ xfs_dir2_node_addname_int( /* * Lookup an entry in a node-format directory. - * All the real work happens in xfs_da_node_lookup_int. + * All the real work happens in xfs_da3_node_lookup_int. * The only real output is the inode number of the entry. */ int /* error */ @@ -1797,12 +2036,10 @@ xfs_dir2_node_lookup( state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_dirblksize; - state->node_ents = state->mp->m_dir_node_ents; /* * Fill in the path to the entry in the cursor. */ - error = xfs_da_node_lookup_int(state, &rval); + error = xfs_da3_node_lookup_int(state, &rval); if (error) rval = error; else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) { @@ -1837,12 +2074,12 @@ xfs_dir2_node_lookup( */ int /* error */ xfs_dir2_node_removename( - xfs_da_args_t *args) /* operation arguments */ + struct xfs_da_args *args) /* operation arguments */ { - xfs_da_state_blk_t *blk; /* leaf block */ + struct xfs_da_state_blk *blk; /* leaf block */ int error; /* error return value */ int rval; /* operation return value */ - xfs_da_state_t *state; /* btree cursor */ + struct xfs_da_state *state; /* btree cursor */ trace_xfs_dir2_node_removename(args); @@ -1852,21 +2089,18 @@ xfs_dir2_node_removename( state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_dirblksize; - state->node_ents = state->mp->m_dir_node_ents; - /* - * Look up the entry we're deleting, set up the cursor. - */ - error = xfs_da_node_lookup_int(state, &rval); + + /* Look up the entry we're deleting, set up the cursor. */ + error = xfs_da3_node_lookup_int(state, &rval); if (error) - rval = error; - /* - * Didn't find it, upper layer screwed up. - */ + goto out_free; + + /* Didn't find it, upper layer screwed up. */ if (rval != EEXIST) { - xfs_da_state_free(state); - return rval; + error = rval; + goto out_free; } + blk = &state->path.blk[state->path.active - 1]; ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); ASSERT(state->extravalid); @@ -1877,21 +2111,22 @@ xfs_dir2_node_removename( error = xfs_dir2_leafn_remove(args, blk->bp, blk->index, &state->extrablk, &rval); if (error) - return error; + goto out_free; /* * Fix the hash values up the btree. */ - xfs_da_fixhashpath(state, &state->path); + xfs_da3_fixhashpath(state, &state->path); /* * If we need to join leaf blocks, do it. */ if (rval && state->path.active > 1) - error = xfs_da_join(state); + error = xfs_da3_join(state); /* * If no errors so far, try conversion to leaf format. */ if (!error) error = xfs_dir2_node_to_leaf(state); +out_free: xfs_da_state_free(state); return error; } @@ -1922,13 +2157,11 @@ xfs_dir2_node_replace( state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_dirblksize; - state->node_ents = state->mp->m_dir_node_ents; inum = args->inumber; /* * Lookup the entry to change in the btree. */ - error = xfs_da_node_lookup_int(state, &rval); + error = xfs_da3_node_lookup_int(state, &rval); if (error) { rval = error; } @@ -1937,28 +2170,33 @@ xfs_dir2_node_replace( * and locked it. But paranoia is good. */ if (rval == EEXIST) { + struct xfs_dir2_leaf_entry *ents; /* * Find the leaf entry. */ blk = &state->path.blk[state->path.active - 1]; ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); leaf = blk->bp->b_addr; - lep = &leaf->ents[blk->index]; + ents = args->dp->d_ops->leaf_ents_p(leaf); + lep = &ents[blk->index]; ASSERT(state->extravalid); /* * Point to the data entry. */ hdr = state->extrablk.bp->b_addr; - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); dep = (xfs_dir2_data_entry_t *) ((char *)hdr + - xfs_dir2_dataptr_to_off(state->mp, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(lep->address))); ASSERT(inum != be64_to_cpu(dep->inumber)); /* * Fill in the new inode number and log the entry. */ dep->inumber = cpu_to_be64(inum); - xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep); + args->dp->d_ops->data_put_ftype(dep, args->filetype); + xfs_dir2_data_log_entry(args, state->extrablk.bp, dep); rval = 0; } /* @@ -1995,6 +2233,7 @@ xfs_dir2_node_trim_free( xfs_dir2_free_t *free; /* freespace structure */ xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icfree_hdr freehdr; dp = args->dp; mp = dp->i_mount; @@ -2012,11 +2251,12 @@ xfs_dir2_node_trim_free( if (!bp) return 0; free = bp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + /* * If there are used entries, there's nothing to do. */ - if (be32_to_cpu(free->hdr.nused) > 0) { + if (freehdr.nused > 0) { xfs_trans_brelse(tp, bp); *rvalp = 0; return 0; @@ -2024,9 +2264,9 @@ xfs_dir2_node_trim_free( /* * Blow the block away. */ - if ((error = - xfs_dir2_shrink_inode(args, xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo), - bp))) { + error = xfs_dir2_shrink_inode(args, + xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp); + if (error) { /* * Can't fail with ENOSPC since that only happens with no * space reservation, when breaking up an extent into two diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 7da79f6515f..27ce0794d19 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -18,23 +18,160 @@ #ifndef __XFS_DIR2_PRIV_H__ #define __XFS_DIR2_PRIV_H__ +struct dir_context; + +/* + * Directory offset/block conversion functions. + * + * DB blocks here are logical directory block numbers, not filesystem blocks. + */ + +/* + * Convert dataptr to byte in file space + */ +static inline xfs_dir2_off_t +xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp) +{ + return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG; +} + +/* + * Convert byte in file space to dataptr. It had better be aligned. + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by) +{ + return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG); +} + +/* + * Convert byte in space to (DB) block + */ +static inline xfs_dir2_db_t +xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return (xfs_dir2_db_t)(by >> geo->blklog); +} + +/* + * Convert dataptr to a block number + */ +static inline xfs_dir2_db_t +xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp)); +} + +/* + * Convert byte in space to offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1)); +} + +/* + * Convert dataptr to a byte offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp)); +} + +/* + * Convert block and offset to byte in space + */ +static inline xfs_dir2_off_t +xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return ((xfs_dir2_off_t)db << geo->blklog) + o; +} + +/* + * Convert block (DB) to block (dablk) + */ +static inline xfs_dablk_t +xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog)); +} + +/* + * Convert byte in space to (DA) block + */ +static inline xfs_dablk_t +xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by)); +} + +/* + * Convert block and offset to dataptr + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o)); +} + +/* + * Convert block (dablk) to block (DB) + */ +static inline xfs_dir2_db_t +xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da) +{ + return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog)); +} + +/* + * Convert block (dablk) to byte offset in space + */ +static inline xfs_dir2_off_t +xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da) +{ + return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0); +} + +/* + * Directory tail pointer accessor functions. Based on block geometry. + */ +static inline struct xfs_dir2_block_tail * +xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr) +{ + return ((struct xfs_dir2_block_tail *) + ((char *)hdr + geo->blksize)) - 1; +} + +static inline struct xfs_dir2_leaf_tail * +xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) +{ + return (struct xfs_dir2_leaf_tail *) + ((char *)lp + geo->blksize - + sizeof(struct xfs_dir2_leaf_tail)); +} + /* xfs_dir2.c */ extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); -extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r); -extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, xfs_dir2_db_t *dbp); -extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, - struct xfs_buf *bp); extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const unsigned char *name, int len); -/* xfs_dir2_block.c */ -extern const struct xfs_buf_ops xfs_dir2_block_buf_ops; +#define S_SHIFT 12 +extern const unsigned char xfs_mode_to_ftype[]; +extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, + __uint8_t filetype); + + +/* xfs_dir2_block.c */ +extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_buf **bpp); extern int xfs_dir2_block_addname(struct xfs_da_args *args); -extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, - xfs_off_t *offset, filldir_t filldir); extern int xfs_dir2_block_lookup(struct xfs_da_args *args); extern int xfs_dir2_block_removename(struct xfs_da_args *args); extern int xfs_dir2_block_replace(struct xfs_da_args *args); @@ -43,58 +180,40 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, /* xfs_dir2_data.c */ #ifdef DEBUG -#define xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp); +#define xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp); #else -#define xfs_dir2_data_check(dp,bp) +#define xfs_dir3_data_check(dp,bp) #endif -extern const struct xfs_buf_ops xfs_dir2_data_buf_ops; - -extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); -extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, +extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); +extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); -extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mapped_bno); +extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno, + xfs_daddr_t mapped_bno); extern struct xfs_dir2_data_free * xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, - struct xfs_dir2_data_unused *dup, int *loghead); -extern void xfs_dir2_data_freescan(struct xfs_mount *mp, - struct xfs_dir2_data_hdr *hdr, int *loghead); -extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, + struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup, + int *loghead); +extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, struct xfs_buf **bpp); -extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_dir2_data_entry *dep); -extern void xfs_dir2_data_log_header(struct xfs_trans *tp, - struct xfs_buf *bp); -extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_dir2_data_unused *dup); -extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp, - xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, - int *needlogp, int *needscanp); -extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset, - xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ -extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops; - -extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, +extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_buf *dbp); extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); -extern void xfs_dir2_leaf_compact(struct xfs_da_args *args, - struct xfs_buf *bp); -extern void xfs_dir2_leaf_compact_x1(struct xfs_buf *bp, int *indexp, +extern void xfs_dir3_leaf_compact(struct xfs_da_args *args, + struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp); +extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int *indexp, int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); -extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent, - size_t bufsize, xfs_off_t *offset, filldir_t filldir); -extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno, - struct xfs_buf **bpp, int magic); -extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, - int first, int last); -extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp, +extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, + struct xfs_buf **bpp, __uint16_t magic); +extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args, + struct xfs_buf *bp, int first, int last); +extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args, struct xfs_buf *bp); extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); extern int xfs_dir2_leaf_removename(struct xfs_da_args *args); @@ -104,19 +223,23 @@ extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args, extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args, struct xfs_buf *lbp, xfs_dir2_db_t db); extern struct xfs_dir2_leaf_entry * -xfs_dir2_leaf_find_entry(struct xfs_dir2_leaf *leaf, int index, int compact, - int lowstale, int highstale, - int *lfloglow, int *lfloghigh); +xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int index, int compact, + int lowstale, int highstale, int *lfloglow, int *lfloghigh); extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); +extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp, + struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf); + /* xfs_dir2_node.c */ extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, struct xfs_buf *lbp); -extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_buf *bp, int *count); +extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp, + struct xfs_buf *bp, int *count); extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp, struct xfs_da_args *args, int *indexp, struct xfs_da_state *state); -extern int xfs_dir2_leafn_order(struct xfs_buf *leaf1_bp, +extern int xfs_dir2_leafn_order(struct xfs_inode *dp, struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp); extern int xfs_dir2_leafn_split(struct xfs_da_state *state, struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); @@ -134,19 +257,18 @@ extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, struct xfs_buf **bpp); /* xfs_dir2_sf.c */ -extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp); -extern xfs_ino_t xfs_dir2_sfe_get_ino(struct xfs_dir2_sf_hdr *sfp, - struct xfs_dir2_sf_entry *sfep); extern int xfs_dir2_block_sfsize(struct xfs_inode *dp, struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp); extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp, int size, xfs_dir2_sf_hdr_t *sfhp); extern int xfs_dir2_sf_addname(struct xfs_da_args *args); extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); -extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, void *dirent, - xfs_off_t *offset, filldir_t filldir); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); +/* xfs_dir2_readdir.c */ +extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, + size_t bufsize); + #endif /* __XFS_DIR2_PRIV_H__ */ diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c new file mode 100644 index 00000000000..48e99afb9cb --- /dev/null +++ b/fs/xfs/xfs_dir2_readdir.c @@ -0,0 +1,700 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_bmap.h" +#include "xfs_trans.h" +#include "xfs_dinode.h" + +/* + * Directory file type support functions + */ +static unsigned char xfs_dir3_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, + DT_FIFO, DT_SOCK, DT_LNK, DT_WHT, +}; + +unsigned char +xfs_dir3_get_dtype( + struct xfs_mount *mp, + __uint8_t filetype) +{ + if (!xfs_sb_version_hasftype(&mp->m_sb)) + return DT_UNKNOWN; + + if (filetype >= XFS_DIR3_FT_MAX) + return DT_UNKNOWN; + + return xfs_dir3_filetype_table[filetype]; +} +/* + * @mode, if set, indicates that the type field needs to be set up. + * This uses the transformation from file mode to DT_* as defined in linux/fs.h + * for file type specification. This will be propagated into the directory + * structure if appropriate for the given operation and filesystem config. + */ +const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = { + [0] = XFS_DIR3_FT_UNKNOWN, + [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR, + [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK, + [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK, +}; + +STATIC int +xfs_dir2_sf_getdents( + struct xfs_da_args *args, + struct dir_context *ctx) +{ + int i; /* shortform entry number */ + struct xfs_inode *dp = args->dp; /* incore directory inode */ + xfs_dir2_dataptr_t off; /* current entry's offset */ + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + xfs_dir2_dataptr_t dot_offset; + xfs_dir2_dataptr_t dotdot_offset; + xfs_ino_t ino; + struct xfs_da_geometry *geo = args->geo; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Give up if the directory is way too short. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + + /* + * If the block number in the offset is out of range, we're done. + */ + if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) + return 0; + + /* + * Precalculate offsets for . and .. as we will always need them. + * + * XXX(hch): the second argument is sometimes 0 and sometimes + * geo->datablk + */ + dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + dp->d_ops->data_dot_offset); + dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + dp->d_ops->data_dotdot_offset); + + /* + * Put . entry unless we're starting past it. + */ + if (ctx->pos <= dot_offset) { + ctx->pos = dot_offset & 0x7fffffff; + if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR)) + return 0; + } + + /* + * Put .. entry unless we're starting past it. + */ + if (ctx->pos <= dotdot_offset) { + ino = dp->d_ops->sf_get_parent_ino(sfp); + ctx->pos = dotdot_offset & 0x7fffffff; + if (!dir_emit(ctx, "..", 2, ino, DT_DIR)) + return 0; + } + + /* + * Loop while there are more entries and put'ing works. + */ + sfep = xfs_dir2_sf_firstentry(sfp); + for (i = 0; i < sfp->count; i++) { + __uint8_t filetype; + + off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + xfs_dir2_sf_get_offset(sfep)); + + if (ctx->pos > off) { + sfep = dp->d_ops->sf_nextentry(sfp, sfep); + continue; + } + + ino = dp->d_ops->sf_get_ino(sfp, sfep); + filetype = dp->d_ops->sf_get_ftype(sfep); + ctx->pos = off & 0x7fffffff; + if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino, + xfs_dir3_get_dtype(dp->i_mount, filetype))) + return 0; + sfep = dp->d_ops->sf_nextentry(sfp, sfep); + } + + ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & + 0x7fffffff; + return 0; +} + +/* + * Readdir for block directories. + */ +STATIC int +xfs_dir2_block_getdents( + struct xfs_da_args *args, + struct dir_context *ctx) +{ + struct xfs_inode *dp = args->dp; /* incore directory inode */ + xfs_dir2_data_hdr_t *hdr; /* block header */ + struct xfs_buf *bp; /* buffer for block */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_dir2_data_unused_t *dup; /* block unused entry */ + char *endptr; /* end of the data entries */ + int error; /* error return value */ + char *ptr; /* current data entry */ + int wantoff; /* starting block offset */ + xfs_off_t cook; + struct xfs_da_geometry *geo = args->geo; + + /* + * If the block number in the offset is out of range, we're done. + */ + if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) + return 0; + + error = xfs_dir3_block_read(NULL, dp, &bp); + if (error) + return error; + + /* + * Extract the byte offset we start at from the seek pointer. + * We'll skip entries before this. + */ + wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos); + hdr = bp->b_addr; + xfs_dir3_data_check(dp, bp); + /* + * Set up values for the loop. + */ + btp = xfs_dir2_block_tail_p(geo, hdr); + ptr = (char *)dp->d_ops->data_entry_p(hdr); + endptr = (char *)xfs_dir2_block_leaf_p(btp); + + /* + * Loop over the data portion of the block. + * Each object is a real entry (dep) or an unused one (dup). + */ + while (ptr < endptr) { + __uint8_t filetype; + + dup = (xfs_dir2_data_unused_t *)ptr; + /* + * Unused, skip it. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + ptr += be16_to_cpu(dup->length); + continue; + } + + dep = (xfs_dir2_data_entry_t *)ptr; + + /* + * Bump pointer for the next iteration. + */ + ptr += dp->d_ops->data_entsize(dep->namelen); + /* + * The entry is before the desired starting point, skip it. + */ + if ((char *)dep - (char *)hdr < wantoff) + continue; + + cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + (char *)dep - (char *)hdr); + + ctx->pos = cook & 0x7fffffff; + filetype = dp->d_ops->data_get_ftype(dep); + /* + * If it didn't fit, set the final offset to here & return. + */ + if (!dir_emit(ctx, (char *)dep->name, dep->namelen, + be64_to_cpu(dep->inumber), + xfs_dir3_get_dtype(dp->i_mount, filetype))) { + xfs_trans_brelse(NULL, bp); + return 0; + } + } + + /* + * Reached the end of the block. + * Set the offset to a non-existent block 1 and return. + */ + ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & + 0x7fffffff; + xfs_trans_brelse(NULL, bp); + return 0; +} + +struct xfs_dir2_leaf_map_info { + xfs_extlen_t map_blocks; /* number of fsbs in map */ + xfs_dablk_t map_off; /* last mapped file offset */ + int map_size; /* total entries in *map */ + int map_valid; /* valid entries in *map */ + int nmap; /* mappings to ask xfs_bmapi */ + xfs_dir2_db_t curdb; /* db for current block */ + int ra_current; /* number of read-ahead blks */ + int ra_index; /* *map index for read-ahead */ + int ra_offset; /* map entry offset for ra */ + int ra_want; /* readahead count wanted */ + struct xfs_bmbt_irec map[]; /* map vector for blocks */ +}; + +STATIC int +xfs_dir2_leaf_readbuf( + struct xfs_da_args *args, + size_t bufsize, + struct xfs_dir2_leaf_map_info *mip, + xfs_dir2_off_t *curoff, + struct xfs_buf **bpp) +{ + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = *bpp; + struct xfs_bmbt_irec *map = mip->map; + struct blk_plug plug; + int error = 0; + int length; + int i; + int j; + struct xfs_da_geometry *geo = args->geo; + + /* + * If we have a buffer, we need to release it and + * take it out of the mapping. + */ + + if (bp) { + xfs_trans_brelse(NULL, bp); + bp = NULL; + mip->map_blocks -= geo->fsbcount; + /* + * Loop to get rid of the extents for the + * directory block. + */ + for (i = geo->fsbcount; i > 0; ) { + j = min_t(int, map->br_blockcount, i); + map->br_blockcount -= j; + map->br_startblock += j; + map->br_startoff += j; + /* + * If mapping is done, pitch it from + * the table. + */ + if (!map->br_blockcount && --mip->map_valid) + memmove(&map[0], &map[1], + sizeof(map[0]) * mip->map_valid); + i -= j; + } + } + + /* + * Recalculate the readahead blocks wanted. + */ + mip->ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)) - 1; + ASSERT(mip->ra_want >= 0); + + /* + * If we don't have as many as we want, and we haven't + * run out of data blocks, get some more mappings. + */ + if (1 + mip->ra_want > mip->map_blocks && + mip->map_off < xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET)) { + /* + * Get more bmaps, fill in after the ones + * we already have in the table. + */ + mip->nmap = mip->map_size - mip->map_valid; + error = xfs_bmapi_read(dp, mip->map_off, + xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET) - + mip->map_off, + &map[mip->map_valid], &mip->nmap, 0); + + /* + * Don't know if we should ignore this or try to return an + * error. The trouble with returning errors is that readdir + * will just stop without actually passing the error through. + */ + if (error) + goto out; /* XXX */ + + /* + * If we got all the mappings we asked for, set the final map + * offset based on the last bmap value received. Otherwise, + * we've reached the end. + */ + if (mip->nmap == mip->map_size - mip->map_valid) { + i = mip->map_valid + mip->nmap - 1; + mip->map_off = map[i].br_startoff + map[i].br_blockcount; + } else + mip->map_off = xfs_dir2_byte_to_da(geo, + XFS_DIR2_LEAF_OFFSET); + + /* + * Look for holes in the mapping, and eliminate them. Count up + * the valid blocks. + */ + for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) { + if (map[i].br_startblock == HOLESTARTBLOCK) { + mip->nmap--; + length = mip->map_valid + mip->nmap - i; + if (length) + memmove(&map[i], &map[i + 1], + sizeof(map[i]) * length); + } else { + mip->map_blocks += map[i].br_blockcount; + i++; + } + } + mip->map_valid += mip->nmap; + } + + /* + * No valid mappings, so no more data blocks. + */ + if (!mip->map_valid) { + *curoff = xfs_dir2_da_to_byte(geo, mip->map_off); + goto out; + } + + /* + * Read the directory block starting at the first mapping. + */ + mip->curdb = xfs_dir2_da_to_db(geo, map->br_startoff); + error = xfs_dir3_data_read(NULL, dp, map->br_startoff, + map->br_blockcount >= geo->fsbcount ? + XFS_FSB_TO_DADDR(dp->i_mount, map->br_startblock) : + -1, &bp); + /* + * Should just skip over the data block instead of giving up. + */ + if (error) + goto out; /* XXX */ + + /* + * Adjust the current amount of read-ahead: we just read a block that + * was previously ra. + */ + if (mip->ra_current) + mip->ra_current -= geo->fsbcount; + + /* + * Do we need more readahead? + */ + blk_start_plug(&plug); + for (mip->ra_index = mip->ra_offset = i = 0; + mip->ra_want > mip->ra_current && i < mip->map_blocks; + i += geo->fsbcount) { + ASSERT(mip->ra_index < mip->map_valid); + /* + * Read-ahead a contiguous directory block. + */ + if (i > mip->ra_current && + map[mip->ra_index].br_blockcount >= geo->fsbcount) { + xfs_dir3_data_readahead(dp, + map[mip->ra_index].br_startoff + mip->ra_offset, + XFS_FSB_TO_DADDR(dp->i_mount, + map[mip->ra_index].br_startblock + + mip->ra_offset)); + mip->ra_current = i; + } + + /* + * Read-ahead a non-contiguous directory block. This doesn't + * use our mapping, but this is a very rare case. + */ + else if (i > mip->ra_current) { + xfs_dir3_data_readahead(dp, + map[mip->ra_index].br_startoff + + mip->ra_offset, -1); + mip->ra_current = i; + } + + /* + * Advance offset through the mapping table. + */ + for (j = 0; j < geo->fsbcount; j += length ) { + /* + * The rest of this extent but not more than a dir + * block. + */ + length = min_t(int, geo->fsbcount, + map[mip->ra_index].br_blockcount - + mip->ra_offset); + mip->ra_offset += length; + + /* + * Advance to the next mapping if this one is used up. + */ + if (mip->ra_offset == map[mip->ra_index].br_blockcount) { + mip->ra_offset = 0; + mip->ra_index++; + } + } + } + blk_finish_plug(&plug); + +out: + *bpp = bp; + return error; +} + +/* + * Getdents (readdir) for leaf and node directories. + * This reads the data blocks only, so is the same for both forms. + */ +STATIC int +xfs_dir2_leaf_getdents( + struct xfs_da_args *args, + struct dir_context *ctx, + size_t bufsize) +{ + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = NULL; /* data block buffer */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + xfs_dir2_data_entry_t *dep; /* data entry */ + xfs_dir2_data_unused_t *dup; /* unused entry */ + int error = 0; /* error return value */ + int length; /* temporary length value */ + int byteoff; /* offset in current block */ + xfs_dir2_off_t curoff; /* current overall offset */ + xfs_dir2_off_t newoff; /* new curoff after new blk */ + char *ptr = NULL; /* pointer to current data */ + struct xfs_dir2_leaf_map_info *map_info; + struct xfs_da_geometry *geo = args->geo; + + /* + * If the offset is at or past the largest allowed value, + * give up right away. + */ + if (ctx->pos >= XFS_DIR2_MAX_DATAPTR) + return 0; + + /* + * Set up to bmap a number of blocks based on the caller's + * buffer size, the directory block size, and the filesystem + * block size. + */ + length = howmany(bufsize + geo->blksize, (1 << geo->fsblog)); + map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + + (length * sizeof(struct xfs_bmbt_irec)), + KM_SLEEP | KM_NOFS); + map_info->map_size = length; + + /* + * Inside the loop we keep the main offset value as a byte offset + * in the directory file. + */ + curoff = xfs_dir2_dataptr_to_byte(ctx->pos); + + /* + * Force this conversion through db so we truncate the offset + * down to get the start of the data block. + */ + map_info->map_off = xfs_dir2_db_to_da(geo, + xfs_dir2_byte_to_db(geo, curoff)); + + /* + * Loop over directory entries until we reach the end offset. + * Get more blocks and readahead as necessary. + */ + while (curoff < XFS_DIR2_LEAF_OFFSET) { + __uint8_t filetype; + + /* + * If we have no buffer, or we're off the end of the + * current buffer, need to get another one. + */ + if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) { + + error = xfs_dir2_leaf_readbuf(args, bufsize, map_info, + &curoff, &bp); + if (error || !map_info->map_valid) + break; + + /* + * Having done a read, we need to set a new offset. + */ + newoff = xfs_dir2_db_off_to_byte(geo, + map_info->curdb, 0); + /* + * Start of the current block. + */ + if (curoff < newoff) + curoff = newoff; + /* + * Make sure we're in the right block. + */ + else if (curoff > newoff) + ASSERT(xfs_dir2_byte_to_db(geo, curoff) == + map_info->curdb); + hdr = bp->b_addr; + xfs_dir3_data_check(dp, bp); + /* + * Find our position in the block. + */ + ptr = (char *)dp->d_ops->data_entry_p(hdr); + byteoff = xfs_dir2_byte_to_off(geo, curoff); + /* + * Skip past the header. + */ + if (byteoff == 0) + curoff += dp->d_ops->data_entry_offset; + /* + * Skip past entries until we reach our offset. + */ + else { + while ((char *)ptr - (char *)hdr < byteoff) { + dup = (xfs_dir2_data_unused_t *)ptr; + + if (be16_to_cpu(dup->freetag) + == XFS_DIR2_DATA_FREE_TAG) { + + length = be16_to_cpu(dup->length); + ptr += length; + continue; + } + dep = (xfs_dir2_data_entry_t *)ptr; + length = + dp->d_ops->data_entsize(dep->namelen); + ptr += length; + } + /* + * Now set our real offset. + */ + curoff = + xfs_dir2_db_off_to_byte(geo, + xfs_dir2_byte_to_db(geo, curoff), + (char *)ptr - (char *)hdr); + if (ptr >= (char *)hdr + geo->blksize) { + continue; + } + } + } + /* + * We have a pointer to an entry. + * Is it a live one? + */ + dup = (xfs_dir2_data_unused_t *)ptr; + /* + * No, it's unused, skip over it. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + length = be16_to_cpu(dup->length); + ptr += length; + curoff += length; + continue; + } + + dep = (xfs_dir2_data_entry_t *)ptr; + length = dp->d_ops->data_entsize(dep->namelen); + filetype = dp->d_ops->data_get_ftype(dep); + + ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; + if (!dir_emit(ctx, (char *)dep->name, dep->namelen, + be64_to_cpu(dep->inumber), + xfs_dir3_get_dtype(dp->i_mount, filetype))) + break; + + /* + * Advance to next entry in the block. + */ + ptr += length; + curoff += length; + /* bufsize may have just been a guess; don't go negative */ + bufsize = bufsize > length ? bufsize - length : 0; + } + + /* + * All done. Set output offset value to current offset. + */ + if (curoff > xfs_dir2_dataptr_to_byte(XFS_DIR2_MAX_DATAPTR)) + ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; + else + ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; + kmem_free(map_info); + if (bp) + xfs_trans_brelse(NULL, bp); + return error; +} + +/* + * Read a directory. + */ +int +xfs_readdir( + struct xfs_inode *dp, + struct dir_context *ctx, + size_t bufsize) +{ + struct xfs_da_args args = { NULL }; + int rval; + int v; + uint lock_mode; + + trace_xfs_readdir(dp); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return XFS_ERROR(EIO); + + ASSERT(S_ISDIR(dp->i_d.di_mode)); + XFS_STATS_INC(xs_dir_getdents); + + args.dp = dp; + args.geo = dp->i_mount->m_dir_geo; + + lock_mode = xfs_ilock_data_map_shared(dp); + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_getdents(&args, ctx); + else if ((rval = xfs_dir2_isblock(&args, &v))) + ; + else if (v) + rval = xfs_dir2_block_getdents(&args, ctx); + else + rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); + xfs_iunlock(dp, lock_mode); + + return rval; +} diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 1b9fc3ec7e4..53c3be619db 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -17,22 +17,22 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_error.h" #include "xfs_dir2.h" -#include "xfs_dir2_format.h" #include "xfs_dir2_priv.h" #include "xfs_trace.h" +#include "xfs_dinode.h" /* * Prototypes for internal functions. @@ -57,82 +57,6 @@ static void xfs_dir2_sf_toino8(xfs_da_args_t *args); #endif /* XFS_BIG_INUMS */ /* - * Inode numbers in short-form directories can come in two versions, - * either 4 bytes or 8 bytes wide. These helpers deal with the - * two forms transparently by looking at the headers i8count field. - * - * For 64-bit inode number the most significant byte must be zero. - */ -static xfs_ino_t -xfs_dir2_sf_get_ino( - struct xfs_dir2_sf_hdr *hdr, - xfs_dir2_inou_t *from) -{ - if (hdr->i8count) - return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL; - else - return get_unaligned_be32(&from->i4.i); -} - -static void -xfs_dir2_sf_put_ino( - struct xfs_dir2_sf_hdr *hdr, - xfs_dir2_inou_t *to, - xfs_ino_t ino) -{ - ASSERT((ino & 0xff00000000000000ULL) == 0); - - if (hdr->i8count) - put_unaligned_be64(ino, &to->i8.i); - else - put_unaligned_be32(ino, &to->i4.i); -} - -xfs_ino_t -xfs_dir2_sf_get_parent_ino( - struct xfs_dir2_sf_hdr *hdr) -{ - return xfs_dir2_sf_get_ino(hdr, &hdr->parent); -} - -static void -xfs_dir2_sf_put_parent_ino( - struct xfs_dir2_sf_hdr *hdr, - xfs_ino_t ino) -{ - xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino); -} - -/* - * In short-form directory entries the inode numbers are stored at variable - * offset behind the entry name. The inode numbers may only be accessed - * through the helpers below. - */ -static xfs_dir2_inou_t * -xfs_dir2_sfe_inop( - struct xfs_dir2_sf_entry *sfep) -{ - return (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]; -} - -xfs_ino_t -xfs_dir2_sfe_get_ino( - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep) -{ - return xfs_dir2_sf_get_ino(hdr, xfs_dir2_sfe_inop(sfep)); -} - -static void -xfs_dir2_sfe_put_ino( - struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep, - xfs_ino_t ino) -{ - xfs_dir2_sf_put_ino(hdr, xfs_dir2_sfe_inop(sfep), ino); -} - -/* * Given a block directory (dp/block), calculate its size as a shortform (sf) * directory and a header for the sf directory, if it will fit it the * space currently present in the inode. If it won't fit, the output @@ -157,11 +81,20 @@ xfs_dir2_block_sfsize( int namelen; /* total name bytes */ xfs_ino_t parent = 0; /* parent inode number */ int size=0; /* total computed size */ + int has_ftype; + struct xfs_da_geometry *geo; mp = dp->i_mount; + geo = mp->m_dir_geo; + + /* + * if there is a filetype field, add the extra byte to the namelen + * for each entry that we see. + */ + has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0; count = i8count = namelen = 0; - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* @@ -173,8 +106,8 @@ xfs_dir2_block_sfsize( /* * Calculate the pointer to the entry at hand. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + xfs_dir2_dataptr_to_off(mp, addr)); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(geo, addr)); /* * Detect . and .., so we can special-case them. * . is not included in sf directories. @@ -188,9 +121,10 @@ xfs_dir2_block_sfsize( if (!isdot) i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM; #endif + /* take into account the file type field */ if (!isdot && !isdotdot) { count++; - namelen += dep->namelen; + namelen += dep->namelen + has_ftype; } else if (isdotdot) parent = be64_to_cpu(dep->inumber); /* @@ -211,7 +145,7 @@ xfs_dir2_block_sfsize( */ sfhp->count = count; sfhp->i8count = i8count; - xfs_dir2_sf_put_parent_ino(sfhp, parent); + dp->d_ops->sf_put_parent_ino(sfhp, parent); return size; } @@ -238,6 +172,7 @@ xfs_dir2_block_to_sf( char *ptr; /* current data pointer */ xfs_dir2_sf_entry_t *sfep; /* shortform entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */ + xfs_dir2_sf_hdr_t *dst; /* temporary data buffer */ trace_xfs_dir2_block_to_sf(args); @@ -245,40 +180,25 @@ xfs_dir2_block_to_sf( mp = dp->i_mount; /* - * Make a copy of the block data, so we can shrink the inode - * and add local data. + * allocate a temporary destination buffer the size of the inode + * to format the data into. Once we have formatted the data, we + * can free the block and copy the formatted data into the inode literal + * area. */ - hdr = kmem_alloc(mp->m_dirblksize, KM_SLEEP); - memcpy(hdr, bp->b_addr, mp->m_dirblksize); - logflags = XFS_ILOG_CORE; - if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) { - ASSERT(error != ENOSPC); - goto out; - } + dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP); + hdr = bp->b_addr; /* - * The buffer is now unconditionally gone, whether - * xfs_dir2_shrink_inode worked or not. - * - * Convert the inode to local format. - */ - dp->i_df.if_flags &= ~XFS_IFEXTENTS; - dp->i_df.if_flags |= XFS_IFINLINE; - dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; - ASSERT(dp->i_df.if_bytes == 0); - xfs_idata_realloc(dp, size, XFS_DATA_FORK); - logflags |= XFS_ILOG_DDATA; - /* * Copy the header into the newly allocate local space. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dst; memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); - dp->i_d.di_size = size; + /* * Set up to loop over the block's entries. */ - btp = xfs_dir2_block_tail_p(mp, hdr); - ptr = (char *)(hdr + 1); + btp = xfs_dir2_block_tail_p(args->geo, hdr); + ptr = (char *)dp->d_ops->data_entry_p(hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); sfep = xfs_dir2_sf_firstentry(sfp); /* @@ -306,7 +226,7 @@ xfs_dir2_block_to_sf( else if (dep->namelen == 2 && dep->name[0] == '.' && dep->name[1] == '.') ASSERT(be64_to_cpu(dep->inumber) == - xfs_dir2_sf_get_parent_ino(sfp)); + dp->d_ops->sf_get_parent_ino(sfp)); /* * Normal entry, copy it into shortform. */ @@ -316,18 +236,44 @@ xfs_dir2_block_to_sf( (xfs_dir2_data_aoff_t) ((char *)dep - (char *)hdr)); memcpy(sfep->name, dep->name, dep->namelen); - xfs_dir2_sfe_put_ino(sfp, sfep, - be64_to_cpu(dep->inumber)); + dp->d_ops->sf_put_ino(sfp, sfep, + be64_to_cpu(dep->inumber)); + dp->d_ops->sf_put_ftype(sfep, + dp->d_ops->data_get_ftype(dep)); - sfep = xfs_dir2_sf_nextentry(sfp, sfep); + sfep = dp->d_ops->sf_nextentry(sfp, sfep); } - ptr += xfs_dir2_data_entsize(dep->namelen); + ptr += dp->d_ops->data_entsize(dep->namelen); } ASSERT((char *)sfep - (char *)sfp == size); + + /* now we are done with the block, we can shrink the inode */ + logflags = XFS_ILOG_CORE; + error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp); + if (error) { + ASSERT(error != ENOSPC); + goto out; + } + + /* + * The buffer is now unconditionally gone, whether + * xfs_dir2_shrink_inode worked or not. + * + * Convert the inode to local format and copy the data in. + */ + dp->i_df.if_flags &= ~XFS_IFEXTENTS; + dp->i_df.if_flags |= XFS_IFINLINE; + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + ASSERT(dp->i_df.if_bytes == 0); + xfs_idata_realloc(dp, size, XFS_DATA_FORK); + + logflags |= XFS_ILOG_DDATA; + memcpy(dp->i_df.if_u1.if_data, dst, size); + dp->i_d.di_size = size; xfs_dir2_sf_check(args); out: xfs_trans_log_inode(args->trans, dp, logflags); - kmem_free(hdr); + kmem_free(dst); return error; } @@ -341,14 +287,12 @@ int /* error */ xfs_dir2_sf_addname( xfs_da_args_t *args) /* operation arguments */ { - int add_entsize; /* size of the new entry */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return value */ int incr_isize; /* total change in size */ int new_isize; /* di_size after adding name */ int objchange; /* changing to 8-byte inodes */ xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */ - int old_isize; /* di_size before adding name */ int pick; /* which algorithm to use */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ @@ -372,8 +316,7 @@ xfs_dir2_sf_addname( /* * Compute entry (and change in) size. */ - add_entsize = xfs_dir2_sf_entsize(sfp, args->namelen); - incr_isize = add_entsize; + incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen); objchange = 0; #if XFS_BIG_INUMS /* @@ -381,11 +324,8 @@ xfs_dir2_sf_addname( */ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) { /* - * Yes, adjust the entry size and the total size. + * Yes, adjust the inode size. old count + (parent + new) */ - add_entsize += - (uint)sizeof(xfs_dir2_ino8_t) - - (uint)sizeof(xfs_dir2_ino4_t); incr_isize += (sfp->count + 2) * ((uint)sizeof(xfs_dir2_ino8_t) - @@ -393,8 +333,7 @@ xfs_dir2_sf_addname( objchange = 1; } #endif - old_isize = (int)dp->i_d.di_size; - new_isize = old_isize + incr_isize; + new_isize = (int)dp->i_d.di_size + incr_isize; /* * Won't fit as shortform any more (due to size), * or the pick routine says it won't (due to offset values). @@ -466,8 +405,8 @@ xfs_dir2_sf_addname_easy( /* * Grow the in-inode space. */ - xfs_idata_realloc(dp, xfs_dir2_sf_entsize(sfp, args->namelen), - XFS_DATA_FORK); + xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen), + XFS_DATA_FORK); /* * Need to set up again due to realloc of the inode data. */ @@ -479,7 +418,9 @@ xfs_dir2_sf_addname_easy( sfep->namelen = args->namelen; xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, args->name, sfep->namelen); - xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber); + dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); + dp->d_ops->sf_put_ftype(sfep, args->filetype); + /* * Update the header and inode. */ @@ -519,11 +460,13 @@ xfs_dir2_sf_addname_hard( xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */ xfs_dir2_sf_entry_t *sfep; /* entry in new dir */ xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */ + struct xfs_mount *mp; /* * Copy the old directory to the stack buffer. */ dp = args->dp; + mp = dp->i_mount; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; old_isize = (int)dp->i_d.di_size; @@ -535,13 +478,13 @@ xfs_dir2_sf_addname_hard( * to insert the new entry. * If it's going to end up at the end then oldsfep will point there. */ - for (offset = XFS_DIR2_DATA_FIRST_OFFSET, + for (offset = dp->d_ops->data_first_offset, oldsfep = xfs_dir2_sf_firstentry(oldsfp), - add_datasize = xfs_dir2_data_entsize(args->namelen), + add_datasize = dp->d_ops->data_entsize(args->namelen), eof = (char *)oldsfep == &buf[old_isize]; !eof; - offset = new_offset + xfs_dir2_data_entsize(oldsfep->namelen), - oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep), + offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen), + oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep), eof = (char *)oldsfep == &buf[old_isize]) { new_offset = xfs_dir2_sf_get_offset(oldsfep); if (offset + add_datasize <= new_offset) @@ -570,7 +513,8 @@ xfs_dir2_sf_addname_hard( sfep->namelen = args->namelen; xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, args->name, sfep->namelen); - xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber); + dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); + dp->d_ops->sf_put_ftype(sfep, args->filetype); sfp->count++; #if XFS_BIG_INUMS if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) @@ -580,7 +524,7 @@ xfs_dir2_sf_addname_hard( * If there's more left to copy, do that. */ if (!eof) { - sfep = xfs_dir2_sf_nextentry(sfp, sfep); + sfep = dp->d_ops->sf_nextentry(sfp, sfep); memcpy(sfep, oldsfep, old_isize - nbytes); } kmem_free(buf); @@ -616,8 +560,8 @@ xfs_dir2_sf_addname_pick( mp = dp->i_mount; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - size = xfs_dir2_data_entsize(args->namelen); - offset = XFS_DIR2_DATA_FIRST_OFFSET; + size = dp->d_ops->data_entsize(args->namelen); + offset = dp->d_ops->data_first_offset; sfep = xfs_dir2_sf_firstentry(sfp); holefit = 0; /* @@ -629,8 +573,8 @@ xfs_dir2_sf_addname_pick( if (!holefit) holefit = offset + size <= xfs_dir2_sf_get_offset(sfep); offset = xfs_dir2_sf_get_offset(sfep) + - xfs_dir2_data_entsize(sfep->namelen); - sfep = xfs_dir2_sf_nextentry(sfp, sfep); + dp->d_ops->data_entsize(sfep->namelen); + sfep = dp->d_ops->sf_nextentry(sfp, sfep); } /* * Calculate data bytes used excluding the new entry, if this @@ -644,7 +588,7 @@ xfs_dir2_sf_addname_pick( * we'll go back, convert to block, then try the insert and convert * to leaf. */ - if (used + (holefit ? 0 : size) > mp->m_dirblksize) + if (used + (holefit ? 0 : size) > args->geo->blksize) return 0; /* * If changing the inode number size, do it the hard way. @@ -659,7 +603,7 @@ xfs_dir2_sf_addname_pick( /* * If it won't fit at the end then do it the hard way (use the hole). */ - if (used + size > mp->m_dirblksize) + if (used + size > args->geo->blksize) return 2; /* * Do it the easy way. @@ -684,31 +628,33 @@ xfs_dir2_sf_check( int offset; /* data offset */ xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + struct xfs_mount *mp; dp = args->dp; + mp = dp->i_mount; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - offset = XFS_DIR2_DATA_FIRST_OFFSET; - ino = xfs_dir2_sf_get_parent_ino(sfp); + offset = dp->d_ops->data_first_offset; + ino = dp->d_ops->sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; - i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset); - ino = xfs_dir2_sfe_get_ino(sfp, sfep); + ino = dp->d_ops->sf_get_ino(sfp, sfep); i8count += ino > XFS_DIR2_MAX_SHORT_INUM; offset = xfs_dir2_sf_get_offset(sfep) + - xfs_dir2_data_entsize(sfep->namelen); + dp->d_ops->data_entsize(sfep->namelen); + ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX); } ASSERT(i8count == sfp->i8count); ASSERT(XFS_BIG_INUMS || i8count == 0); ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); ASSERT(offset + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + - (uint)sizeof(xfs_dir2_block_tail_t) <= - dp->i_mount->m_dirblksize); + (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize); } #endif /* DEBUG */ @@ -757,7 +703,7 @@ xfs_dir2_sf_create( /* * Now can put in the inode number, since i8count is set. */ - xfs_dir2_sf_put_parent_ino(sfp, pino); + dp->d_ops->sf_put_parent_ino(sfp, pino); sfp->count = 0; dp->i_d.di_size = size; xfs_dir2_sf_check(args); @@ -765,105 +711,6 @@ xfs_dir2_sf_create( return 0; } -int /* error */ -xfs_dir2_sf_getdents( - xfs_inode_t *dp, /* incore directory inode */ - void *dirent, - xfs_off_t *offset, - filldir_t filldir) -{ - int i; /* shortform entry number */ - xfs_mount_t *mp; /* filesystem mount point */ - xfs_dir2_dataptr_t off; /* current entry's offset */ - xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ - xfs_dir2_dataptr_t dot_offset; - xfs_dir2_dataptr_t dotdot_offset; - xfs_ino_t ino; - - mp = dp->i_mount; - - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - /* - * Give up if the directory is way too short. - */ - if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - return XFS_ERROR(EIO); - } - - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); - - /* - * If the block number in the offset is out of range, we're done. - */ - if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) - return 0; - - /* - * Precalculate offsets for . and .. as we will always need them. - * - * XXX(hch): the second argument is sometimes 0 and sometimes - * mp->m_dirdatablk. - */ - dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - XFS_DIR2_DATA_DOT_OFFSET); - dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - XFS_DIR2_DATA_DOTDOT_OFFSET); - - /* - * Put . entry unless we're starting past it. - */ - if (*offset <= dot_offset) { - if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) { - *offset = dot_offset & 0x7fffffff; - return 0; - } - } - - /* - * Put .. entry unless we're starting past it. - */ - if (*offset <= dotdot_offset) { - ino = xfs_dir2_sf_get_parent_ino(sfp); - if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) { - *offset = dotdot_offset & 0x7fffffff; - return 0; - } - } - - /* - * Loop while there are more entries and put'ing works. - */ - sfep = xfs_dir2_sf_firstentry(sfp); - for (i = 0; i < sfp->count; i++) { - off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - xfs_dir2_sf_get_offset(sfep)); - - if (*offset > off) { - sfep = xfs_dir2_sf_nextentry(sfp, sfep); - continue; - } - - ino = xfs_dir2_sfe_get_ino(sfp, sfep); - if (filldir(dirent, (char *)sfep->name, sfep->namelen, - off & 0x7fffffff, ino, DT_UNKNOWN)) { - *offset = off & 0x7fffffff; - return 0; - } - sfep = xfs_dir2_sf_nextentry(sfp, sfep); - } - - *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & - 0x7fffffff; - return 0; -} - /* * Lookup an entry in a shortform directory. * Returns EEXIST if found, ENOENT if not found. @@ -903,6 +750,7 @@ xfs_dir2_sf_lookup( if (args->namelen == 1 && args->name[0] == '.') { args->inumber = dp->i_ino; args->cmpresult = XFS_CMP_EXACT; + args->filetype = XFS_DIR3_FT_DIR; return XFS_ERROR(EEXIST); } /* @@ -910,8 +758,9 @@ xfs_dir2_sf_lookup( */ if (args->namelen == 2 && args->name[0] == '.' && args->name[1] == '.') { - args->inumber = xfs_dir2_sf_get_parent_ino(sfp); + args->inumber = dp->d_ops->sf_get_parent_ino(sfp); args->cmpresult = XFS_CMP_EXACT; + args->filetype = XFS_DIR3_FT_DIR; return XFS_ERROR(EEXIST); } /* @@ -919,7 +768,7 @@ xfs_dir2_sf_lookup( */ ci_sfep = NULL; for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; - i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { /* * Compare name and if it's an exact match, return the inode * number. If it's the first case-insensitive match, store the @@ -929,7 +778,8 @@ xfs_dir2_sf_lookup( sfep->namelen); if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { args->cmpresult = cmp; - args->inumber = xfs_dir2_sfe_get_ino(sfp, sfep); + args->inumber = dp->d_ops->sf_get_ino(sfp, sfep); + args->filetype = dp->d_ops->sf_get_ftype(sfep); if (cmp == XFS_CMP_EXACT) return XFS_ERROR(EEXIST); ci_sfep = sfep; @@ -985,10 +835,10 @@ xfs_dir2_sf_removename( * Find the one we're deleting. */ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; - i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { if (xfs_da_compname(args, sfep->name, sfep->namelen) == XFS_CMP_EXACT) { - ASSERT(xfs_dir2_sfe_get_ino(sfp, sfep) == + ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) == args->inumber); break; } @@ -1002,7 +852,7 @@ xfs_dir2_sf_removename( * Calculate sizes. */ byteoff = (int)((char *)sfep - (char *)sfp); - entsize = xfs_dir2_sf_entsize(sfp, args->namelen); + entsize = dp->d_ops->sf_entsize(sfp, args->namelen); newsize = oldsize - entsize; /* * Copy the part if any after the removed entry, sliding it down. @@ -1109,25 +959,25 @@ xfs_dir2_sf_replace( if (args->namelen == 2 && args->name[0] == '.' && args->name[1] == '.') { #if XFS_BIG_INUMS || defined(DEBUG) - ino = xfs_dir2_sf_get_parent_ino(sfp); + ino = dp->d_ops->sf_get_parent_ino(sfp); ASSERT(args->inumber != ino); #endif - xfs_dir2_sf_put_parent_ino(sfp, args->inumber); + dp->d_ops->sf_put_parent_ino(sfp, args->inumber); } /* * Normal entry, look for the name. */ else { - for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); - i < sfp->count; - i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { if (xfs_da_compname(args, sfep->name, sfep->namelen) == XFS_CMP_EXACT) { #if XFS_BIG_INUMS || defined(DEBUG) - ino = xfs_dir2_sfe_get_ino(sfp, sfep); + ino = dp->d_ops->sf_get_ino(sfp, sfep); ASSERT(args->inumber != ino); #endif - xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber); + dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); + dp->d_ops->sf_put_ftype(sfep, args->filetype); break; } } @@ -1194,10 +1044,12 @@ xfs_dir2_sf_toino4( int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ + struct xfs_mount *mp; trace_xfs_dir2_sf_toino4(args); dp = args->dp; + mp = dp->i_mount; /* * Copy the old directory to the buffer. @@ -1228,20 +1080,21 @@ xfs_dir2_sf_toino4( */ sfp->count = oldsfp->count; sfp->i8count = 0; - xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp)); + dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp)); /* * Copy the entries field by field. */ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), oldsfep = xfs_dir2_sf_firstentry(oldsfp); i < sfp->count; - i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep), - oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) { + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), + oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; sfep->offset = oldsfep->offset; memcpy(sfep->name, oldsfep->name, sfep->namelen); - xfs_dir2_sfe_put_ino(sfp, sfep, - xfs_dir2_sfe_get_ino(oldsfp, oldsfep)); + dp->d_ops->sf_put_ino(sfp, sfep, + dp->d_ops->sf_get_ino(oldsfp, oldsfep)); + dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep)); } /* * Clean up the inode. @@ -1252,9 +1105,9 @@ xfs_dir2_sf_toino4( } /* - * Convert from 4-byte inode numbers to 8-byte inode numbers. - * The new 8-byte inode number is not there yet, we leave with the - * count 1 but no corresponding entry. + * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers. + * The new entry w/ an 8-byte inode number is not there yet; we leave with + * i8count set to 1, but no corresponding 8-byte entry. */ static void xfs_dir2_sf_toino8( @@ -1269,10 +1122,12 @@ xfs_dir2_sf_toino8( int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ + struct xfs_mount *mp; trace_xfs_dir2_sf_toino8(args); dp = args->dp; + mp = dp->i_mount; /* * Copy the old directory to the buffer. @@ -1285,7 +1140,7 @@ xfs_dir2_sf_toino8( ASSERT(oldsfp->i8count == 0); memcpy(buf, oldsfp, oldsize); /* - * Compute the new inode size. + * Compute the new inode size (nb: entry count + 1 for parent) */ newsize = oldsize + @@ -1303,20 +1158,21 @@ xfs_dir2_sf_toino8( */ sfp->count = oldsfp->count; sfp->i8count = 1; - xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp)); + dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp)); /* * Copy the entries field by field. */ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), oldsfep = xfs_dir2_sf_firstentry(oldsfp); i < sfp->count; - i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep), - oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) { + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), + oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; sfep->offset = oldsfep->offset; memcpy(sfep->name, oldsfep->name, sfep->namelen); - xfs_dir2_sfe_put_ino(sfp, sfep, - xfs_dir2_sfe_get_ino(oldsfp, oldsfep)); + dp->d_ops->sf_put_ino(sfp, sfep, + dp->d_ops->sf_get_ino(oldsfp, oldsfep)); + dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep)); } /* * Clean up the inode. diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 69cf4fcde03..4f11ef01113 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -16,22 +16,22 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_quota.h" -#include "xfs_trans.h" -#include "xfs_alloc_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_btree.h" #include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_extent_busy.h" #include "xfs_discard.h" #include "xfs_trace.h" +#include "xfs_log.h" STATIC int xfs_trim_extents( @@ -157,7 +157,7 @@ xfs_ioc_trim( struct xfs_mount *mp, struct fstrim_range __user *urange) { - struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev); unsigned int granularity = q->limits.discard_granularity; struct fstrim_range range; xfs_daddr_t start, end, minlen; @@ -180,7 +180,8 @@ xfs_ioc_trim( * matter as trimming blocks is an advisory interface. */ if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || - range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp))) + range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) || + range.len < mp->m_sb.sb_blocksize) return -XFS_ERROR(EINVAL); start = BTOBB(range.start); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 9e1bf5294c9..3ee0cd43edc 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -17,26 +17,29 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" #include "xfs_inode.h" #include "xfs_bmap.h" -#include "xfs_rtalloc.h" +#include "xfs_bmap_util.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" #include "xfs_error.h" -#include "xfs_itable.h" -#include "xfs_attr.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" #include "xfs_trans_priv.h" #include "xfs_qm.h" +#include "xfs_cksum.h" #include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_bmap_btree.h" /* * Lock order: @@ -61,7 +64,8 @@ int xfs_dqerror_mod = 33; struct kmem_zone *xfs_qm_dqtrxzone; static struct kmem_zone *xfs_qm_dqzone; -static struct lock_class_key xfs_dquot_other_class; +static struct lock_class_key xfs_dquot_group_class; +static struct lock_class_key xfs_dquot_project_class; /* * This is called to free all the memory associated with a dquot @@ -85,17 +89,23 @@ xfs_qm_dqdestroy( */ void xfs_qm_adjust_dqlimits( - xfs_mount_t *mp, - xfs_disk_dquot_t *d) + struct xfs_mount *mp, + struct xfs_dquot *dq) { - xfs_quotainfo_t *q = mp->m_quotainfo; + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_disk_dquot *d = &dq->q_core; + int prealloc = 0; ASSERT(d->d_id); - if (q->qi_bsoftlimit && !d->d_blk_softlimit) + if (q->qi_bsoftlimit && !d->d_blk_softlimit) { d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit); - if (q->qi_bhardlimit && !d->d_blk_hardlimit) + prealloc = 1; + } + if (q->qi_bhardlimit && !d->d_blk_hardlimit) { d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit); + prealloc = 1; + } if (q->qi_isoftlimit && !d->d_ino_softlimit) d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit); if (q->qi_ihardlimit && !d->d_ino_hardlimit) @@ -104,6 +114,9 @@ xfs_qm_adjust_dqlimits( d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit); if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit) d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit); + + if (prealloc) + xfs_dquot_set_prealloc_limits(dq); } /* @@ -239,6 +252,11 @@ xfs_qm_init_dquot_blk( d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_flags = type; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } } xfs_trans_dquot_buf(tp, bp, @@ -248,60 +266,32 @@ xfs_qm_init_dquot_blk( xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } -static void -xfs_dquot_buf_verify( - struct xfs_buf *bp) +/* + * Initialize the dynamic speculative preallocation thresholds. The lo/hi + * watermarks correspond to the soft and hard limits by default. If a soft limit + * is not specified, we use 95% of the hard limit. + */ +void +xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) { - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; - struct xfs_disk_dquot *ddq; - xfs_dqid_t id = 0; - int i; - - /* - * On the first read of the buffer, verify that each dquot is valid. - * We don't know what the id of the dquot is supposed to be, just that - * they should be increasing monotonically within the buffer. If the - * first id is corrupt, then it will fail on the second dquot in the - * buffer so corruptions could point to the wrong dquot in this case. - */ - for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { - int error; - - ddq = &d[i].dd_diskdq; - - if (i == 0) - id = be32_to_cpu(ddq->d_id); - - error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, - "xfs_dquot_read_verify"); - if (error) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d); - xfs_buf_ioerror(bp, EFSCORRUPTED); - break; - } + __uint64_t space; + + dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit); + dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit); + if (!dqp->q_prealloc_lo_wmark) { + dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark; + do_div(dqp->q_prealloc_lo_wmark, 100); + dqp->q_prealloc_lo_wmark *= 95; } -} -static void -xfs_dquot_buf_read_verify( - struct xfs_buf *bp) -{ - xfs_dquot_buf_verify(bp); -} + space = dqp->q_prealloc_hi_wmark; -void -xfs_dquot_buf_write_verify( - struct xfs_buf *bp) -{ - xfs_dquot_buf_verify(bp); + do_div(space, 100); + dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space; + dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3; + dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; } -const struct xfs_buf_ops xfs_dquot_buf_ops = { - .verify_read = xfs_dquot_buf_read_verify, - .verify_write = xfs_dquot_buf_write_verify, -}; - /* * Allocate a block and fill it with dquots. * This is called when the bmapi finds a hole. @@ -363,10 +353,10 @@ xfs_qm_dqalloc( dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0); - - error = xfs_buf_geterror(bp); - if (error) + if (!bp) { + error = ENOMEM; goto error1; + } bp->b_ops = &xfs_dquot_buf_ops; /* @@ -412,6 +402,7 @@ xfs_qm_dqalloc( return (error); } + STATIC int xfs_qm_dqrepair( struct xfs_mount *mp, @@ -445,7 +436,7 @@ xfs_qm_dqrepair( /* Do the actual repair of dquots in this buffer */ for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { ddq = &d[i].dd_diskdq; - error = xfs_qm_dqcheck(mp, ddq, firstid + i, + error = xfs_dqcheck(mp, ddq, firstid + i, dqp->dq_flags & XFS_DQ_ALLTYPES, XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair"); if (error) { @@ -471,23 +462,24 @@ xfs_qm_dqtobp( xfs_buf_t **O_bpp, uint flags) { - xfs_bmbt_irec_t map; - int nmaps = 1, error; - xfs_buf_t *bp; - xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); - xfs_mount_t *mp = dqp->q_mount; - xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); - xfs_trans_t *tp = (tpp ? *tpp : NULL); + struct xfs_bmbt_irec map; + int nmaps = 1, error; + struct xfs_buf *bp; + struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp); + struct xfs_mount *mp = dqp->q_mount; + xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); + struct xfs_trans *tp = (tpp ? *tpp : NULL); + uint lock_mode; dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; - xfs_ilock(quotip, XFS_ILOCK_SHARED); + lock_mode = xfs_ilock_data_map_shared(quotip); if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { /* * Return if this type of quotas is turned off while we * didn't have the quota inode lock. */ - xfs_iunlock(quotip, XFS_ILOCK_SHARED); + xfs_iunlock(quotip, lock_mode); return ESRCH; } @@ -497,7 +489,7 @@ xfs_qm_dqtobp( error = xfs_bmapi_read(quotip, dqp->q_fileoffset, XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); - xfs_iunlock(quotip, XFS_ILOCK_SHARED); + xfs_iunlock(quotip, lock_mode); if (error) return error; @@ -602,8 +594,20 @@ xfs_qm_dqread( * Make sure group quotas have a different lock class than user * quotas. */ - if (!(type & XFS_DQ_USER)) - lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class); + switch (type) { + case XFS_DQ_USER: + /* uses the default lock class */ + break; + case XFS_DQ_GROUP: + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class); + break; + case XFS_DQ_PROJ: + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class); + break; + default: + ASSERT(0); + break; + } XFS_STATS_INC(xs_qm_dquot); @@ -611,16 +615,8 @@ xfs_qm_dqread( if (flags & XFS_QMOPT_DQALLOC) { tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); - error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), - XFS_WRITE_LOG_RES(mp) + - /* - * Round the chunklen up to the next multiple - * of 128 (buf log item chunk size)). - */ - BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + 128, - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc, + XFS_QM_DQALLOC_SPACE_RES(mp), 0); if (error) goto error1; cancelflags = XFS_TRANS_RELEASE_LOG_RES; @@ -654,6 +650,9 @@ xfs_qm_dqread( dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); + /* initialize the dquot speculative prealloc thresholds */ + xfs_dquot_set_prealloc_limits(dqp); + /* Mark the buf so that this will stay incore a little longer */ xfs_buf_set_ref(bp, XFS_DQUOT_REF); @@ -708,7 +707,7 @@ xfs_qm_dqget( xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ { struct xfs_quotainfo *qi = mp->m_quotainfo; - struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; int error; @@ -833,43 +832,6 @@ restart: return (0); } - -STATIC void -xfs_qm_dqput_final( - struct xfs_dquot *dqp) -{ - struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; - struct xfs_dquot *gdqp; - - trace_xfs_dqput_free(dqp); - - mutex_lock(&qi->qi_lru_lock); - if (list_empty(&dqp->q_lru)) { - list_add_tail(&dqp->q_lru, &qi->qi_lru_list); - qi->qi_lru_count++; - XFS_STATS_INC(xs_qm_dquot_unused); - } - mutex_unlock(&qi->qi_lru_lock); - - /* - * If we just added a udquot to the freelist, then we want to release - * the gdquot reference that it (probably) has. Otherwise it'll keep - * the gdquot from getting reclaimed. - */ - gdqp = dqp->q_gdquot; - if (gdqp) { - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - xfs_dqunlock(dqp); - - /* - * If we had a group quota hint, release it now. - */ - if (gdqp) - xfs_qm_dqput(gdqp); -} - /* * Release a reference to the dquot (decrement ref-count) and unlock it. * @@ -885,10 +847,14 @@ xfs_qm_dqput( trace_xfs_dqput(dqp); - if (--dqp->q_nrefs > 0) - xfs_dqunlock(dqp); - else - xfs_qm_dqput_final(dqp); + if (--dqp->q_nrefs == 0) { + struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; + trace_xfs_dqput_free(dqp); + + if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) + XFS_STATS_INC(xs_qm_dquot_unused); + } + xfs_dqunlock(dqp); } /* @@ -1020,7 +986,7 @@ xfs_qm_dqflush( /* * A simple sanity check in case we got a corrupted dquot.. */ - error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, + error = xfs_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, XFS_QMOPT_DOWARN, "dqflush (incore copy)"); if (error) { xfs_buf_relse(bp); @@ -1041,6 +1007,23 @@ xfs_qm_dqflush( &dqp->q_logitem.qli_item.li_lsn); /* + * copy the lsn into the on-disk dquot now while we have the in memory + * dquot here. This can't be done later in the write verifier as we + * can't get access to the log item at that point in time. + * + * We also calculate the CRC here so that the on-disk dquot in the + * buffer always has a valid CRC. This ensures there is no possibility + * of a dquot without an up-to-date CRC getting to disk. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp; + + dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); + xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + + /* * Attach an iodone routine so that we can remove this dquot from the * AIL and release the flush lock once the dquot is synced to disk. */ diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index c694a8469c4..68a68f70483 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -32,6 +32,13 @@ struct xfs_mount; struct xfs_trans; +enum { + XFS_QLOWSP_1_PCNT = 0, + XFS_QLOWSP_3_PCNT, + XFS_QLOWSP_5_PCNT, + XFS_QLOWSP_MAX +}; + /* * The incore dquot structure */ @@ -45,12 +52,14 @@ typedef struct xfs_dquot { int q_bufoffset; /* off of dq in buffer (# dquots) */ xfs_fileoff_t q_fileoffset; /* offset in quotas file */ - struct xfs_dquot*q_gdquot; /* group dquot, hint only */ xfs_disk_dquot_t q_core; /* actual usage & quotas */ xfs_dq_logitem_t q_logitem; /* dquot log item */ xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ + xfs_qcnt_t q_prealloc_lo_wmark;/* prealloc throttle wmark */ + xfs_qcnt_t q_prealloc_hi_wmark;/* prealloc disabled wmark */ + int64_t q_low_space[XFS_QLOWSP_MAX]; struct mutex q_qlock; /* quota lock */ struct completion q_flush; /* flush completion queue */ atomic_t q_pincount; /* dquot pin count */ @@ -108,8 +117,9 @@ static inline int xfs_this_quota_on(struct xfs_mount *mp, int type) case XFS_DQ_USER: return XFS_IS_UQUOTA_ON(mp); case XFS_DQ_GROUP: + return XFS_IS_GQUOTA_ON(mp); case XFS_DQ_PROJ: - return XFS_IS_OQUOTA_ON(mp); + return XFS_IS_PQUOTA_ON(mp); default: return 0; } @@ -121,8 +131,9 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) case XFS_DQ_USER: return ip->i_udquot; case XFS_DQ_GROUP: - case XFS_DQ_PROJ: return ip->i_gdquot; + case XFS_DQ_PROJ: + return ip->i_pdquot; default: return NULL; } @@ -133,10 +144,6 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) #define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) #define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) -#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo) -#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? \ - XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \ - XFS_DQ_TO_QINF(dqp)->qi_gquotaip) extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, uint, struct xfs_dquot **); @@ -145,14 +152,16 @@ extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, xfs_disk_dquot_t *); -extern void xfs_qm_adjust_dqlimits(xfs_mount_t *, - xfs_disk_dquot_t *); +extern void xfs_qm_adjust_dqlimits(struct xfs_mount *, + struct xfs_dquot *); extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, xfs_dqid_t, uint, uint, xfs_dquot_t **); extern void xfs_qm_dqput(xfs_dquot_t *); extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); +extern void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); + static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { xfs_dqlock(dqp); @@ -161,6 +170,4 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) return dqp; } -extern const struct xfs_buf_ops xfs_dquot_buf_ops; - #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c new file mode 100644 index 00000000000..c2ac0c611ad --- /dev/null +++ b/fs/xfs/xfs_dquot_buf.c @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_qm.h" +#include "xfs_error.h" +#include "xfs_cksum.h" +#include "xfs_trace.h" + +int +xfs_calc_dquots_per_chunk( + unsigned int nbblks) /* basic block units */ +{ + unsigned int ndquots; + + ASSERT(nbblks > 0); + ndquots = BBTOB(nbblks); + do_div(ndquots, sizeof(xfs_dqblk_t)); + + return ndquots; +} + +/* + * Do some primitive error checking on ondisk dquot data structures. + */ +int +xfs_dqcheck( + struct xfs_mount *mp, + xfs_disk_dquot_t *ddq, + xfs_dqid_t id, + uint type, /* used only when IO_dorepair is true */ + uint flags, + char *str) +{ + xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; + int errs = 0; + + /* + * We can encounter an uninitialized dquot buffer for 2 reasons: + * 1. If we crash while deleting the quotainode(s), and those blks got + * used for user data. This is because we take the path of regular + * file deletion; however, the size field of quotainodes is never + * updated, so all the tricks that we play in itruncate_finish + * don't quite matter. + * + * 2. We don't play the quota buffers when there's a quotaoff logitem. + * But the allocation will be replayed so we'll end up with an + * uninitialized quota block. + * + * This is all fine; things are still consistent, and we haven't lost + * any quota information. Just don't complain about bad dquot blks. + */ + if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", + str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); + errs++; + } + if (ddq->d_version != XFS_DQUOT_VERSION) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", + str, id, ddq->d_version, XFS_DQUOT_VERSION); + errs++; + } + + if (ddq->d_flags != XFS_DQ_USER && + ddq->d_flags != XFS_DQ_PROJ && + ddq->d_flags != XFS_DQ_GROUP) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, unknown flags 0x%x", + str, id, ddq->d_flags); + errs++; + } + + if (id != -1 && id != be32_to_cpu(ddq->d_id)) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : ondisk-dquot 0x%p, ID mismatch: " + "0x%x expected, found id 0x%x", + str, ddq, id, be32_to_cpu(ddq->d_id)); + errs++; + } + + if (!errs && ddq->d_id) { + if (ddq->d_blk_softlimit && + be64_to_cpu(ddq->d_bcount) > + be64_to_cpu(ddq->d_blk_softlimit)) { + if (!ddq->d_btimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + if (ddq->d_ino_softlimit && + be64_to_cpu(ddq->d_icount) > + be64_to_cpu(ddq->d_ino_softlimit)) { + if (!ddq->d_itimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + if (ddq->d_rtb_softlimit && + be64_to_cpu(ddq->d_rtbcount) > + be64_to_cpu(ddq->d_rtb_softlimit)) { + if (!ddq->d_rtbtimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + } + + if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) + return errs; + + if (flags & XFS_QMOPT_DOWARN) + xfs_notice(mp, "Re-initializing dquot ID 0x%x", id); + + /* + * Typically, a repair is only requested by quotacheck. + */ + ASSERT(id != -1); + ASSERT(flags & XFS_QMOPT_DQREPAIR); + memset(d, 0, sizeof(xfs_dqblk_t)); + + d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + d->dd_diskdq.d_version = XFS_DQUOT_VERSION; + d->dd_diskdq.d_flags = type; + d->dd_diskdq.d_id = cpu_to_be32(id); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + + return errs; +} + +STATIC bool +xfs_dquot_buf_verify_crc( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + int ndquots; + int i; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return true; + + /* + * if we are in log recovery, the quota subsystem has not been + * initialised so we have no quotainfo structure. In that case, we need + * to manually calculate the number of dquots in the buffer. + */ + if (mp->m_quotainfo) + ndquots = mp->m_quotainfo->qi_dqperchunk; + else + ndquots = xfs_calc_dquots_per_chunk( + XFS_BB_TO_FSB(mp, bp->b_length)); + + for (i = 0; i < ndquots; i++, d++) { + if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF)) + return false; + if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid)) + return false; + } + return true; +} + +STATIC bool +xfs_dquot_buf_verify( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + xfs_dqid_t id = 0; + int ndquots; + int i; + + /* + * if we are in log recovery, the quota subsystem has not been + * initialised so we have no quotainfo structure. In that case, we need + * to manually calculate the number of dquots in the buffer. + */ + if (mp->m_quotainfo) + ndquots = mp->m_quotainfo->qi_dqperchunk; + else + ndquots = xfs_calc_dquots_per_chunk(bp->b_length); + + /* + * On the first read of the buffer, verify that each dquot is valid. + * We don't know what the id of the dquot is supposed to be, just that + * they should be increasing monotonically within the buffer. If the + * first id is corrupt, then it will fail on the second dquot in the + * buffer so corruptions could point to the wrong dquot in this case. + */ + for (i = 0; i < ndquots; i++) { + struct xfs_disk_dquot *ddq; + int error; + + ddq = &d[i].dd_diskdq; + + if (i == 0) + id = be32_to_cpu(ddq->d_id); + + error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, + "xfs_dquot_buf_verify"); + if (error) + return false; + } + return true; +} + +static void +xfs_dquot_buf_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify_crc(mp, bp)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dquot_buf_verify(mp, bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +/* + * we don't calculate the CRC here as that is done when the dquot is flushed to + * the buffer after the update is done. This ensures that the dquot in the + * buffer always has an up-to-date CRC value. + */ +static void +xfs_dquot_buf_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify(mp, bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } +} + +const struct xfs_buf_ops xfs_dquot_buf_ops = { + .verify_read = xfs_dquot_buf_read_verify, + .verify_write = xfs_dquot_buf_write_verify, +}; + diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 57aa4b03720..f33fbaaa4d8 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -17,23 +17,20 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" #include "xfs_inode.h" -#include "xfs_bmap.h" -#include "xfs_rtalloc.h" +#include "xfs_quota.h" #include "xfs_error.h" -#include "xfs_itable.h" -#include "xfs_attr.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_qm.h" +#include "xfs_log.h" static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip) { @@ -43,14 +40,15 @@ static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip) /* * returns the number of iovecs needed to log the given dquot item. */ -STATIC uint +STATIC void xfs_qm_dquot_logitem_size( - struct xfs_log_item *lip) + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { - /* - * we need only two iovecs, one for the format, one for the real thing - */ - return 2; + *nvecs += 2; + *nbytes += sizeof(struct xfs_dq_logformat) + + sizeof(struct xfs_disk_dquot); } /* @@ -59,20 +57,24 @@ xfs_qm_dquot_logitem_size( STATIC void xfs_qm_dquot_logitem_format( struct xfs_log_item *lip, - struct xfs_log_iovec *logvec) + struct xfs_log_vec *lv) { struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); - - logvec->i_addr = &qlip->qli_format; - logvec->i_len = sizeof(xfs_dq_logformat_t); - logvec->i_type = XLOG_REG_TYPE_QFORMAT; - logvec++; - logvec->i_addr = &qlip->qli_dquot->q_core; - logvec->i_len = sizeof(xfs_disk_dquot_t); - logvec->i_type = XLOG_REG_TYPE_DQUOT; - - qlip->qli_format.qlf_size = 2; - + struct xfs_log_iovec *vecp = NULL; + struct xfs_dq_logformat *qlf; + + qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT); + qlf->qlf_type = XFS_LI_DQUOT; + qlf->qlf_size = 2; + qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id); + qlf->qlf_blkno = qlip->qli_dquot->q_blkno; + qlf->qlf_len = 1; + qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset; + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat)); + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, + &qlip->qli_dquot->q_core, + sizeof(struct xfs_disk_dquot)); } /* @@ -140,7 +142,8 @@ xfs_qm_dqunpin_wait( STATIC uint xfs_qm_dquot_logitem_push( struct xfs_log_item *lip, - struct list_head *buffer_list) + struct list_head *buffer_list) __releases(&lip->li_ailp->xa_lock) + __acquires(&lip->li_ailp->xa_lock) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; struct xfs_buf *bp = NULL; @@ -258,18 +261,6 @@ xfs_qm_dquot_logitem_init( xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, &xfs_dquot_item_ops); lp->qli_dquot = dqp; - lp->qli_format.qlf_type = XFS_LI_DQUOT; - lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id); - lp->qli_format.qlf_blkno = dqp->q_blkno; - lp->qli_format.qlf_len = 1; - /* - * This is just the offset of this dquot within its buffer - * (which is currently 1 FSB and probably won't change). - * Hence 32 bits for this offset should be just fine. - * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t)) - * here, and recompute it at recovery time. - */ - lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset; } /*------------------ QUOTAOFF LOG ITEMS -------------------*/ @@ -285,33 +276,30 @@ static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip) * We only need 1 iovec for an quotaoff item. It just logs the * quotaoff_log_format structure. */ -STATIC uint +STATIC void xfs_qm_qoff_logitem_size( - struct xfs_log_item *lip) + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { - return 1; + *nvecs += 1; + *nbytes += sizeof(struct xfs_qoff_logitem); } -/* - * This is called to fill in the vector of log iovecs for the - * given quotaoff log item. We use only 1 iovec, and we point that - * at the quotaoff_log_format structure embedded in the quotaoff item. - * It is at this point that we assert that all of the extent - * slots in the quotaoff item have been filled. - */ STATIC void xfs_qm_qoff_logitem_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_vec *lv) { struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); - - ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF); - - log_vector->i_addr = &qflip->qql_format; - log_vector->i_len = sizeof(xfs_qoff_logitem_t); - log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF; - qflip->qql_format.qf_size = 1; + struct xfs_log_iovec *vecp = NULL; + struct xfs_qoff_logformat *qlf; + + qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF); + qlf->qf_type = XFS_LI_QUOTAOFF; + qlf->qf_size = 1; + qlf->qf_flags = qflip->qql_flags; + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem)); } /* @@ -451,8 +439,7 @@ xfs_qm_qoff_logitem_init( xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); qf->qql_item.li_mountp = mp; - qf->qql_format.qf_type = XFS_LI_QUOTAOFF; - qf->qql_format.qf_flags = flags; qf->qql_start_lip = start; + qf->qql_flags = flags; return qf; } diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 5acae2ada70..502e9464634 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -27,13 +27,12 @@ typedef struct xfs_dq_logitem { xfs_log_item_t qli_item; /* common portion */ struct xfs_dquot *qli_dquot; /* dquot ptr */ xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ - xfs_dq_logformat_t qli_format; /* logged structure */ } xfs_dq_logitem_t; typedef struct xfs_qoff_logitem { xfs_log_item_t qql_item; /* common portion */ struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ - xfs_qoff_logformat_t qql_format; /* logged structure */ + unsigned int qql_flags; } xfs_qoff_logitem_t; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 610456054dc..edac5b057d2 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -16,17 +16,13 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" +#include "xfs_format.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_utils.h" #include "xfs_error.h" #ifdef DEBUG @@ -66,7 +62,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression, int i; int64_t fsid; - if (random32() % randfactor) + if (prandom_u32() % randfactor) return 0; memcpy(&fsid, fsidp, sizeof(xfs_fsid_t)); @@ -160,7 +156,7 @@ xfs_error_report( { if (level <= xfs_error_level) { xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, - "Internal error %s at line %d of file %s. Caller 0x%p\n", + "Internal error %s at line %d of file %s. Caller %pF", tag, linenum, filename, ra); xfs_stack_trace(); @@ -178,7 +174,32 @@ xfs_corruption_error( inst_t *ra) { if (level <= xfs_error_level) - xfs_hex_dump(p, 16); + xfs_hex_dump(p, 64); xfs_error_report(tag, level, mp, filename, linenum, ra); xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); } + +/* + * Warnings specifically for verifier errors. Differentiate CRC vs. invalid + * values, and omit the stack trace unless the error level is tuned high. + */ +void +xfs_verifier_error( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx", + bp->b_error == EFSBADCRC ? "CRC error" : "corruption", + __return_address, bp->b_bn); + + xfs_alert(mp, "Unmount and run xfs_repair"); + + if (xfs_error_level >= XFS_ERRLEVEL_LOW) { + xfs_alert(mp, "First 64 bytes of corrupted metadata buffer:"); + xfs_hex_dump(xfs_buf_offset(bp, 0), 64); + } + + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); +} diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 079a367f44e..c1c57d4a4b5 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -34,6 +34,7 @@ extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, extern void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, void *p, const char *filename, int linenum, inst_t *ra); +extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERROR_REPORT(e, lvl, mp) \ xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index a83611849ce..753e467aa1a 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -16,20 +16,21 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_dir2.h" #include "xfs_export.h" -#include "xfs_vnodeops.h" -#include "xfs_bmap_btree.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_log.h" /* * Note that we only accept fileids which are long enough rather than allow @@ -48,7 +49,7 @@ static int xfs_fileid_length(int fileid_type) case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: return 6; } - return 255; /* invalid */ + return FILEID_INVALID; } STATIC int @@ -90,7 +91,7 @@ xfs_fs_encode_fh( len = xfs_fileid_length(fileid_type); if (*max_len < len) { *max_len = len; - return 255; + return FILEID_INVALID; } *max_len = len; @@ -236,7 +237,7 @@ xfs_fs_nfs_commit_metadata( if (!lsn) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 85e9f87a1a7..fd22f69049d 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -19,17 +19,18 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" #include "xfs_alloc.h" -#include "xfs_inode.h" #include "xfs_extent_busy.h" #include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_log.h" void xfs_extent_busy_insert( @@ -147,7 +148,7 @@ xfs_extent_busy_search( * extent. If the overlap covers the beginning, the end, or all of the busy * extent, the overlapping portion can be made unbusy and used for the * allocation. We can't split a busy extent because we can't modify a - * transaction/CIL context busy list, but we can update an entries block + * transaction/CIL context busy list, but we can update an entry's block * number or length. * * Returns true if the extent can safely be reused, or false if the search @@ -160,7 +161,8 @@ xfs_extent_busy_update_extent( struct xfs_extent_busy *busyp, xfs_agblock_t fbno, xfs_extlen_t flen, - bool userdata) + bool userdata) __releases(&pag->pagb_lock) + __acquires(&pag->pagb_lock) { xfs_agblock_t fend = fbno + flen; xfs_agblock_t bbno = busyp->bno; diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 985412d65ba..bfff284d2dc 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -20,6 +20,10 @@ #ifndef __XFS_EXTENT_BUSY_H__ #define __XFS_EXTENT_BUSY_H__ +struct xfs_mount; +struct xfs_trans; +struct xfs_alloc_arg; + /* * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that * have been freed but whose transactions aren't committed to disk yet. diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index feb36d7551a..fb7a4c1ce1c 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -17,15 +17,16 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_buf_item.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" +#include "xfs_buf_item.h" #include "xfs_extfree_item.h" +#include "xfs_log.h" kmem_zone_t *xfs_efi_zone; @@ -50,9 +51,8 @@ xfs_efi_item_free( * Freeing the efi requires that we remove it from the AIL if it has already * been placed there. However, the EFI may not yet have been placed in the AIL * when called by xfs_efi_release() from EFD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the - * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees - * the EFI. + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the EFI. */ STATIC void __xfs_efi_release( @@ -60,7 +60,7 @@ __xfs_efi_release( { struct xfs_ail *ailp = efip->efi_item.li_ailp; - if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) { + if (atomic_dec_and_test(&efip->efi_refcount)) { spin_lock(&ailp->xa_lock); /* xfs_trans_ail_delete() drops the AIL lock. */ xfs_trans_ail_delete(ailp, &efip->efi_item, @@ -74,11 +74,22 @@ __xfs_efi_release( * We only need 1 iovec for an efi item. It just logs the efi_log_format * structure. */ -STATIC uint +static inline int +xfs_efi_item_sizeof( + struct xfs_efi_log_item *efip) +{ + return sizeof(struct xfs_efi_log_format) + + (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t); +} + +STATIC void xfs_efi_item_size( - struct xfs_log_item *lip) + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { - return 1; + *nvecs += 1; + *nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip)); } /* @@ -91,24 +102,20 @@ xfs_efi_item_size( STATIC void xfs_efi_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_vec *lv) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); - uint size; + struct xfs_log_iovec *vecp = NULL; ASSERT(atomic_read(&efip->efi_next_extent) == efip->efi_format.efi_nextents); efip->efi_format.efi_type = XFS_LI_EFI; - - size = sizeof(xfs_efi_log_format_t); - size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t); efip->efi_format.efi_size = 1; - log_vector->i_addr = &efip->efi_format; - log_vector->i_len = size; - log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT; - ASSERT(size >= sizeof(xfs_efi_log_format_t)); + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT, + &efip->efi_format, + xfs_efi_item_sizeof(efip)); } @@ -126,8 +133,8 @@ xfs_efi_item_pin( * which the EFI is manipulated during a transaction. If we are being asked to * remove the EFI it's because the transaction has been cancelled and by * definition that means the EFI cannot be in the AIL so remove it from the - * transaction and free it. Otherwise coordinate with xfs_efi_release() (via - * XFS_EFI_COMMITTED) to determine who gets to free the EFI. + * transaction and free it. Otherwise coordinate with xfs_efi_release() + * to determine who gets to free the EFI. */ STATIC void xfs_efi_item_unpin( @@ -171,19 +178,13 @@ xfs_efi_item_unlock( /* * The EFI is logged only once and cannot be moved in the log, so simply return - * the lsn at which it's been logged. For bulk transaction committed - * processing, the EFI may be processed but not yet unpinned prior to the EFD - * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected - * when processing the EFD. + * the lsn at which it's been logged. */ STATIC xfs_lsn_t xfs_efi_item_committed( struct xfs_log_item *lip, xfs_lsn_t lsn) { - struct xfs_efi_log_item *efip = EFI_ITEM(lip); - - set_bit(XFS_EFI_COMMITTED, &efip->efi_flags); return lsn; } @@ -241,6 +242,7 @@ xfs_efi_init( efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_id = (__psint_t)(void*)efip; atomic_set(&efip->efi_next_extent, 0); + atomic_set(&efip->efi_refcount, 2); return efip; } @@ -310,8 +312,14 @@ xfs_efi_release(xfs_efi_log_item_t *efip, uint nextents) { ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); - if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) + if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) { + /* recovery needs us to drop the EFI reference, too */ + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) + __xfs_efi_release(efip); + __xfs_efi_release(efip); + /* efip may now have been freed, do not reference it again. */ + } } static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) @@ -333,11 +341,22 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp) * We only need 1 iovec for an efd item. It just logs the efd_log_format * structure. */ -STATIC uint +static inline int +xfs_efd_item_sizeof( + struct xfs_efd_log_item *efdp) +{ + return sizeof(xfs_efd_log_format_t) + + (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t); +} + +STATIC void xfs_efd_item_size( - struct xfs_log_item *lip) + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { - return 1; + *nvecs += 1; + *nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip)); } /* @@ -350,23 +369,19 @@ xfs_efd_item_size( STATIC void xfs_efd_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_vec *lv) { struct xfs_efd_log_item *efdp = EFD_ITEM(lip); - uint size; + struct xfs_log_iovec *vecp = NULL; ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); efdp->efd_format.efd_type = XFS_LI_EFD; - - size = sizeof(xfs_efd_log_format_t); - size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t); efdp->efd_format.efd_size = 1; - log_vector->i_addr = &efdp->efd_format; - log_vector->i_len = size; - log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT; - ASSERT(size >= sizeof(xfs_efd_log_format_t)); + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT, + &efdp->efd_format, + xfs_efd_item_sizeof(efdp)); } /* diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 375f68e4253..0ffbce32d56 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -18,93 +18,11 @@ #ifndef __XFS_EXTFREE_ITEM_H__ #define __XFS_EXTFREE_ITEM_H__ +/* kernel only EFI/EFD definitions */ + struct xfs_mount; struct kmem_zone; -typedef struct xfs_extent { - xfs_dfsbno_t ext_start; - xfs_extlen_t ext_len; -} xfs_extent_t; - -/* - * Since an xfs_extent_t has types (start:64, len: 32) - * there are different alignments on 32 bit and 64 bit kernels. - * So we provide the different variants for use by a - * conversion routine. - */ - -typedef struct xfs_extent_32 { - __uint64_t ext_start; - __uint32_t ext_len; -} __attribute__((packed)) xfs_extent_32_t; - -typedef struct xfs_extent_64 { - __uint64_t ext_start; - __uint32_t ext_len; - __uint32_t ext_pad; -} xfs_extent_64_t; - -/* - * This is the structure used to lay out an efi log item in the - * log. The efi_extents field is a variable size array whose - * size is given by efi_nextents. - */ -typedef struct xfs_efi_log_format { - __uint16_t efi_type; /* efi log item type */ - __uint16_t efi_size; /* size of this item */ - __uint32_t efi_nextents; /* # extents to free */ - __uint64_t efi_id; /* efi identifier */ - xfs_extent_t efi_extents[1]; /* array of extents to free */ -} xfs_efi_log_format_t; - -typedef struct xfs_efi_log_format_32 { - __uint16_t efi_type; /* efi log item type */ - __uint16_t efi_size; /* size of this item */ - __uint32_t efi_nextents; /* # extents to free */ - __uint64_t efi_id; /* efi identifier */ - xfs_extent_32_t efi_extents[1]; /* array of extents to free */ -} __attribute__((packed)) xfs_efi_log_format_32_t; - -typedef struct xfs_efi_log_format_64 { - __uint16_t efi_type; /* efi log item type */ - __uint16_t efi_size; /* size of this item */ - __uint32_t efi_nextents; /* # extents to free */ - __uint64_t efi_id; /* efi identifier */ - xfs_extent_64_t efi_extents[1]; /* array of extents to free */ -} xfs_efi_log_format_64_t; - -/* - * This is the structure used to lay out an efd log item in the - * log. The efd_extents array is a variable size array whose - * size is given by efd_nextents; - */ -typedef struct xfs_efd_log_format { - __uint16_t efd_type; /* efd log item type */ - __uint16_t efd_size; /* size of this item */ - __uint32_t efd_nextents; /* # of extents freed */ - __uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_t efd_extents[1]; /* array of extents freed */ -} xfs_efd_log_format_t; - -typedef struct xfs_efd_log_format_32 { - __uint16_t efd_type; /* efd log item type */ - __uint16_t efd_size; /* size of this item */ - __uint32_t efd_nextents; /* # of extents freed */ - __uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_32_t efd_extents[1]; /* array of extents freed */ -} __attribute__((packed)) xfs_efd_log_format_32_t; - -typedef struct xfs_efd_log_format_64 { - __uint16_t efd_type; /* efd log item type */ - __uint16_t efd_size; /* size of this item */ - __uint32_t efd_nextents; /* # of extents freed */ - __uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_64_t efd_extents[1]; /* array of extents freed */ -} xfs_efd_log_format_64_t; - - -#ifdef __KERNEL__ - /* * Max number of extents in fast allocation path. */ @@ -114,16 +32,20 @@ typedef struct xfs_efd_log_format_64 { * Define EFI flag bits. Manipulated by set/clear/test_bit operators. */ #define XFS_EFI_RECOVERED 1 -#define XFS_EFI_COMMITTED 2 /* - * This is the "extent free intention" log item. It is used - * to log the fact that some extents need to be free. It is - * used in conjunction with the "extent free done" log item - * described below. + * This is the "extent free intention" log item. It is used to log the fact + * that some extents need to be free. It is used in conjunction with the + * "extent free done" log item described below. + * + * The EFI is reference counted so that it is not freed prior to both the EFI + * and EFD being committed and unpinned. This ensures that when the last + * reference goes away the EFI will always be in the AIL as it has been + * unpinned, regardless of whether the EFD is processed before or after the EFI. */ typedef struct xfs_efi_log_item { xfs_log_item_t efi_item; + atomic_t efi_refcount; atomic_t efi_next_extent; unsigned long efi_flags; /* misc flags */ xfs_efi_log_format_t efi_format; @@ -156,6 +78,4 @@ int xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt); void xfs_efi_item_free(xfs_efi_log_item_t *); -#endif /* __KERNEL__ */ - #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 67284edb84d..1f66779d7a4 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -17,25 +17,29 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_log.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_trans.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "xfs_error.h" -#include "xfs_vnodeops.h" -#include "xfs_da_btree.h" -#include "xfs_dir2_format.h" +#include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_ioctl.h" #include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_dinode.h" +#include <linux/aio.h> #include <linux/dcache.h> #include <linux/falloc.h> #include <linux/pagevec.h> @@ -151,7 +155,7 @@ xfs_dir_fsync( if (!lsn) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } STATIC int @@ -225,39 +229,33 @@ xfs_file_fsync( } STATIC ssize_t -xfs_file_aio_read( +xfs_file_read_iter( struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos) + struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - size_t size = 0; + size_t size = iov_iter_count(to); ssize_t ret = 0; int ioflags = 0; xfs_fsize_t n; + loff_t pos = iocb->ki_pos; XFS_STATS_INC(xs_read_calls); - BUG_ON(iocb->ki_pos != pos); - if (unlikely(file->f_flags & O_DIRECT)) ioflags |= IO_ISDIRECT; if (file->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; - ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE); - if (ret < 0) - return ret; - if (unlikely(ioflags & IO_ISDIRECT)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - if ((pos & target->bt_smask) || (size & target->bt_smask)) { + /* DIO must be aligned to device logical sector size */ + if ((pos | size) & target->bt_logical_sectormask) { if (pos == i_size_read(inode)) return 0; return -XFS_ERROR(EINVAL); @@ -290,7 +288,7 @@ xfs_file_aio_read( xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { - ret = -filemap_write_and_wait_range( + ret = filemap_write_and_wait_range( VFS_I(ip)->i_mapping, pos, -1); if (ret) { @@ -304,7 +302,7 @@ xfs_file_aio_read( trace_xfs_file_read(ip, size, pos, ioflags); - ret = generic_file_aio_read(iocb, iovp, nr_segs, pos); + ret = generic_file_read_iter(iocb, to); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -345,47 +343,6 @@ xfs_file_splice_read( } /* - * xfs_file_splice_write() does not use xfs_rw_ilock() because - * generic_file_splice_write() takes the i_mutex itself. This, in theory, - * couuld cause lock inversions between the aio_write path and the splice path - * if someone is doing concurrent splice(2) based writes and write(2) based - * writes to the same inode. The only real way to fix this is to re-implement - * the generic code here with correct locking orders. - */ -STATIC ssize_t -xfs_file_splice_write( - struct pipe_inode_info *pipe, - struct file *outfilp, - loff_t *ppos, - size_t count, - unsigned int flags) -{ - struct inode *inode = outfilp->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - int ioflags = 0; - ssize_t ret; - - XFS_STATS_INC(xs_write_calls); - - if (outfilp->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - trace_xfs_file_splice_write(ip, count, *ppos, ioflags); - - ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_write_bytes, ret); - - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return ret; -} - -/* * This routine is called to handle zeroing any space in the last block of the * file that is beyond the EOF. We do this since the size is being increased * without writing anything to that block and we don't want to read the @@ -620,10 +577,7 @@ restart: STATIC ssize_t xfs_file_dio_aio_write( struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos, - size_t ocount) + struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -631,15 +585,18 @@ xfs_file_dio_aio_write( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t ret = 0; - size_t count = ocount; int unaligned_io = 0; int iolock; + size_t count = iov_iter_count(from); + loff_t pos = iocb->ki_pos; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - if ((pos & target->bt_smask) || (count & target->bt_smask)) + /* DIO must be aligned to device logical sector size */ + if ((pos | count) & target->bt_logical_sectormask) return -XFS_ERROR(EINVAL); + /* "unaligned" here means not aligned to a filesystem block */ if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) unaligned_io = 1; @@ -670,9 +627,10 @@ xfs_file_dio_aio_write( ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); if (ret) goto out; + iov_iter_truncate(from, count); if (mapping->nrpages) { - ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, pos, -1); if (ret) goto out; @@ -691,8 +649,7 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); - ret = generic_file_direct_write(iocb, iovp, - &nr_segs, pos, &iocb->ki_pos, count, ocount); + ret = generic_file_direct_write(iocb, from, pos); out: xfs_rw_iunlock(ip, iolock); @@ -705,10 +662,7 @@ out: STATIC ssize_t xfs_file_buffered_aio_write( struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos, - size_t ocount) + struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -717,7 +671,8 @@ xfs_file_buffered_aio_write( ssize_t ret; int enospc = 0; int iolock = XFS_IOLOCK_EXCL; - size_t count = ocount; + loff_t pos = iocb->ki_pos; + size_t count = iov_iter_count(from); xfs_rw_ilock(ip, iolock); @@ -725,14 +680,15 @@ xfs_file_buffered_aio_write( if (ret) goto out; + iov_iter_truncate(from, count); /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); - ret = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, &iocb->ki_pos, count, 0); - + ret = generic_perform_write(file, from, pos); + if (likely(ret >= 0)) + iocb->ki_pos = pos + ret; /* * If we just got an ENOSPC, try to write back all dirty inodes to * convert delalloc space to free up some of the excess reserved @@ -751,42 +707,29 @@ out: } STATIC ssize_t -xfs_file_aio_write( +xfs_file_write_iter( struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos) + struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; - size_t ocount = 0; + size_t ocount = iov_iter_count(from); XFS_STATS_INC(xs_write_calls); - BUG_ON(iocb->ki_pos != pos); - - ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); - if (ret) - return ret; - if (ocount == 0) return 0; - sb_start_write(inode->i_sb); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { - ret = -EIO; - goto out; - } + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; if (unlikely(file->f_flags & O_DIRECT)) - ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); + ret = xfs_file_dio_aio_write(iocb, from); else - ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, - ocount); + ret = xfs_file_buffered_aio_write(iocb, from); if (ret > 0) { ssize_t err; @@ -794,56 +737,99 @@ xfs_file_aio_write( XFS_STATS_ADD(xs_write_bytes, ret); /* Handle various SYNC-type writes */ - err = generic_write_sync(file, pos, ret); + err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } - -out: - sb_end_write(inode->i_sb); return ret; } STATIC long xfs_file_fallocate( - struct file *file, - int mode, - loff_t offset, - loff_t len) + struct file *file, + int mode, + loff_t offset, + loff_t len) { - struct inode *inode = file->f_path.dentry->d_inode; - long error; - loff_t new_size = 0; - xfs_flock64_t bf; - xfs_inode_t *ip = XFS_I(inode); - int cmd = XFS_IOC_RESVSP; - int attr_flags = XFS_ATTR_NOLOCK; + struct inode *inode = file_inode(file); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_trans *tp; + long error; + loff_t new_size = 0; - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; - bf.l_whence = 0; - bf.l_start = offset; - bf.l_len = len; - xfs_ilock(ip, XFS_IOLOCK_EXCL); + if (mode & FALLOC_FL_PUNCH_HOLE) { + error = xfs_free_file_space(ip, offset, len); + if (error) + goto out_unlock; + } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { + unsigned blksize_mask = (1 << inode->i_blkbits) - 1; - if (mode & FALLOC_FL_PUNCH_HOLE) - cmd = XFS_IOC_UNRESVSP; + if (offset & blksize_mask || len & blksize_mask) { + error = EINVAL; + goto out_unlock; + } - /* check the new inode size is valid before allocating */ - if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { - new_size = offset + len; - error = inode_newsize_ok(inode, new_size); + /* + * There is no need to overlap collapse range with EOF, + * in which case it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + error = EINVAL; + goto out_unlock; + } + + new_size = i_size_read(inode) - len; + + error = xfs_collapse_file_space(ip, offset, len); + if (error) + goto out_unlock; + } else { + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + error = -inode_newsize_ok(inode, new_size); + if (error) + goto out_unlock; + } + + if (mode & FALLOC_FL_ZERO_RANGE) + error = xfs_zero_file_space(ip, offset, len); + else + error = xfs_alloc_file_space(ip, offset, len, + XFS_BMAPI_PREALLOC); if (error) goto out_unlock; } - if (file->f_flags & O_DSYNC) - attr_flags |= XFS_ATTR_SYNC; + tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID); + error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_unlock; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + ip->i_d.di_mode &= ~S_ISUID; + if (ip->i_d.di_mode & S_IXGRP) + ip->i_d.di_mode &= ~S_ISGID; + + if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE))) + ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; - error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (file->f_flags & O_DSYNC) + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); if (error) goto out_unlock; @@ -853,12 +839,12 @@ xfs_file_fallocate( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = new_size; - error = -xfs_setattr_size(ip, &iattr, XFS_ATTR_NOLOCK); + error = xfs_setattr_size(ip, &iattr); } out_unlock: xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; + return -error; } @@ -891,9 +877,9 @@ xfs_dir_open( * If there are any blocks, read-ahead block 0 as we're almost * certain to have the next operation be a read there. */ - mode = xfs_ilock_map_shared(ip); + mode = xfs_ilock_data_map_shared(ip); if (ip->i_d.di_nextents > 0) - xfs_dir2_data_readahead(NULL, ip, 0, -1); + xfs_dir3_data_readahead(ip, 0, -1); xfs_iunlock(ip, mode); return 0; } @@ -908,11 +894,10 @@ xfs_file_release( STATIC int xfs_file_readdir( - struct file *filp, - void *dirent, - filldir_t filldir) + struct file *file, + struct dir_context *ctx) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); xfs_inode_t *ip = XFS_I(inode); int error; size_t bufsize; @@ -931,8 +916,7 @@ xfs_file_readdir( */ bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); - error = xfs_readdir(ip, dirent, bufsize, - (xfs_off_t *)&filp->f_pos, filldir); + error = xfs_readdir(ip, ctx, bufsize); if (error) return -error; return 0; @@ -1196,7 +1180,7 @@ xfs_seek_data( uint lock; int error; - lock = xfs_ilock_map_shared(ip); + lock = xfs_ilock_data_map_shared(ip); isize = i_size_read(inode); if (start >= isize) { @@ -1272,11 +1256,10 @@ xfs_seek_data( } out: - if (offset != file->f_pos) - file->f_pos = offset; + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out_unlock: - xfs_iunlock_map_shared(ip, lock); + xfs_iunlock(ip, lock); if (error) return -error; @@ -1301,7 +1284,7 @@ xfs_seek_hole( if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); - lock = xfs_ilock_map_shared(ip); + lock = xfs_ilock_data_map_shared(ip); isize = i_size_read(inode); if (start >= isize) { @@ -1381,11 +1364,10 @@ out: * situation in particular. */ offset = min_t(loff_t, offset, isize); - if (offset != file->f_pos) - file->f_pos = offset; + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out_unlock: - xfs_iunlock_map_shared(ip, lock); + xfs_iunlock(ip, lock); if (error) return -error; @@ -1414,12 +1396,12 @@ xfs_file_llseek( const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = xfs_file_aio_read, - .aio_write = xfs_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = xfs_file_read_iter, + .write_iter = xfs_file_write_iter, .splice_read = xfs_file_splice_read, - .splice_write = xfs_file_splice_write, + .splice_write = iter_file_splice_write, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, @@ -1434,7 +1416,7 @@ const struct file_operations xfs_file_operations = { const struct file_operations xfs_dir_file_operations = { .open = xfs_dir_open, .read = generic_read_dir, - .readdir = xfs_file_readdir, + .iterate = xfs_file_readdir, .llseek = generic_file_llseek, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT @@ -1445,6 +1427,7 @@ const struct file_operations xfs_dir_file_operations = { static const struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = xfs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 5170306a100..8ec81bed799 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * Copyright (c) 2014 Christoph Hellwig. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -16,116 +17,36 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" -#include "xfs_bmap_btree.h" -#include "xfs_inum.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_ag.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_inum.h" +#include "xfs_inode.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "xfs_alloc.h" -#include "xfs_utils.h" #include "xfs_mru_cache.h" +#include "xfs_dinode.h" #include "xfs_filestream.h" #include "xfs_trace.h" -#ifdef XFS_FILESTREAMS_TRACE - -ktrace_t *xfs_filestreams_trace_buf; - -STATIC void -xfs_filestreams_trace( - xfs_mount_t *mp, /* mount point */ - int type, /* type of trace */ - const char *func, /* source function */ - int line, /* source line number */ - __psunsigned_t arg0, - __psunsigned_t arg1, - __psunsigned_t arg2, - __psunsigned_t arg3, - __psunsigned_t arg4, - __psunsigned_t arg5) -{ - ktrace_enter(xfs_filestreams_trace_buf, - (void *)(__psint_t)(type | (line << 16)), - (void *)func, - (void *)(__psunsigned_t)current_pid(), - (void *)mp, - (void *)(__psunsigned_t)arg0, - (void *)(__psunsigned_t)arg1, - (void *)(__psunsigned_t)arg2, - (void *)(__psunsigned_t)arg3, - (void *)(__psunsigned_t)arg4, - (void *)(__psunsigned_t)arg5, - NULL, NULL, NULL, NULL, NULL, NULL); -} - -#define TRACE0(mp,t) TRACE6(mp,t,0,0,0,0,0,0) -#define TRACE1(mp,t,a0) TRACE6(mp,t,a0,0,0,0,0,0) -#define TRACE2(mp,t,a0,a1) TRACE6(mp,t,a0,a1,0,0,0,0) -#define TRACE3(mp,t,a0,a1,a2) TRACE6(mp,t,a0,a1,a2,0,0,0) -#define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0) -#define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0) -#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \ - xfs_filestreams_trace(mp, t, __func__, __LINE__, \ - (__psunsigned_t)a0, (__psunsigned_t)a1, \ - (__psunsigned_t)a2, (__psunsigned_t)a3, \ - (__psunsigned_t)a4, (__psunsigned_t)a5) - -#define TRACE_AG_SCAN(mp, ag, ag2) \ - TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2); -#define TRACE_AG_PICK1(mp, max_ag, maxfree) \ - TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree); -#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \ - TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \ - cnt, free, scan, flag) -#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \ - TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, ag2, cnt2) -#define TRACE_FREE(mp, ip, pip, ag, cnt) \ - TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt) -#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \ - TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt) -#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \ - TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt) -#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \ - TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt) -#define TRACE_ORPHAN(mp, ip, ag) \ - TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag); - - -#else -#define TRACE_AG_SCAN(mp, ag, ag2) -#define TRACE_AG_PICK1(mp, max_ag, maxfree) -#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) -#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) -#define TRACE_FREE(mp, ip, pip, ag, cnt) -#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) -#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) -#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) -#define TRACE_ORPHAN(mp, ip, ag) -#endif - -static kmem_zone_t *item_zone; +struct xfs_fstrm_item { + struct xfs_mru_cache_elem mru; + struct xfs_inode *ip; + xfs_agnumber_t ag; /* AG in use for this directory */ +}; -/* - * Structure for associating a file or a directory with an allocation group. - * The parent directory pointer is only needed for files, but since there will - * generally be vastly more files than directories in the cache, using the same - * data structure simplifies the code with very little memory overhead. - */ -typedef struct fstrm_item -{ - xfs_agnumber_t ag; /* AG currently in use for the file/directory. */ - xfs_inode_t *ip; /* inode self-pointer. */ - xfs_inode_t *pip; /* Parent directory inode pointer. */ -} fstrm_item_t; +enum xfs_fstrm_alloc { + XFS_PICK_USERDATA = 1, + XFS_PICK_LOWSPACE = 2, +}; /* * Allocation group filestream associations are tracked with per-ag atomic - * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a + * counters. These counters allow xfs_filestream_pick_ag() to tell whether a * particular AG already has active filestreams associated with it. The mount * point's m_peraglock is used to protect these counters from per-ag array * re-allocation during a growfs operation. When xfs_growfs_data_private() is @@ -160,7 +81,7 @@ typedef struct fstrm_item * the cache that reference per-ag array elements that have since been * reallocated. */ -static int +int xfs_filestream_peek_ag( xfs_mount_t *mp, xfs_agnumber_t agno) @@ -200,23 +121,40 @@ xfs_filestream_put_ag( xfs_perag_put(pag); } +static void +xfs_fstrm_free_func( + struct xfs_mru_cache_elem *mru) +{ + struct xfs_fstrm_item *item = + container_of(mru, struct xfs_fstrm_item, mru); + + xfs_filestream_put_ag(item->ip->i_mount, item->ag); + + trace_xfs_filestream_free(item->ip, item->ag); + + kmem_free(item); +} + /* * Scan the AGs starting at startag looking for an AG that isn't in use and has * at least minlen blocks free. */ static int -_xfs_filestream_pick_ag( - xfs_mount_t *mp, - xfs_agnumber_t startag, - xfs_agnumber_t *agp, - int flags, - xfs_extlen_t minlen) +xfs_filestream_pick_ag( + struct xfs_inode *ip, + xfs_agnumber_t startag, + xfs_agnumber_t *agp, + int flags, + xfs_extlen_t minlen) { - int streams, max_streams; - int err, trylock, nscan; - xfs_extlen_t longest, free, minfree, maxfree = 0; - xfs_agnumber_t ag, max_ag = NULLAGNUMBER; - struct xfs_perag *pag; + struct xfs_mount *mp = ip->i_mount; + struct xfs_fstrm_item *item; + struct xfs_perag *pag; + xfs_extlen_t longest, free = 0, minfree, maxfree = 0; + xfs_agnumber_t ag, max_ag = NULLAGNUMBER; + int err, trylock, nscan; + + ASSERT(S_ISDIR(ip->i_d.di_mode)); /* 2% of an AG's blocks must be free for it to be chosen. */ minfree = mp->m_sb.sb_agblocks / 50; @@ -228,8 +166,9 @@ _xfs_filestream_pick_ag( trylock = XFS_ALLOC_FLAG_TRYLOCK; for (nscan = 0; 1; nscan++) { + trace_xfs_filestream_scan(ip, ag); + pag = xfs_perag_get(mp, ag); - TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms)); if (!pag->pagf_init) { err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); @@ -246,7 +185,6 @@ _xfs_filestream_pick_ag( /* Keep track of the AG with the most free blocks. */ if (pag->pagf_freeblks > maxfree) { maxfree = pag->pagf_freeblks; - max_streams = atomic_read(&pag->pagf_fstrms); max_ag = ag; } @@ -269,7 +207,6 @@ _xfs_filestream_pick_ag( /* Break out, retaining the reference on the AG. */ free = pag->pagf_freeblks; - streams = atomic_read(&pag->pagf_fstrms); xfs_perag_put(pag); *agp = ag; break; @@ -305,317 +242,98 @@ next_ag: */ if (max_ag != NULLAGNUMBER) { xfs_filestream_get_ag(mp, max_ag); - TRACE_AG_PICK1(mp, max_ag, maxfree); - streams = max_streams; free = maxfree; *agp = max_ag; break; } /* take AG 0 if none matched */ - TRACE_AG_PICK1(mp, max_ag, maxfree); + trace_xfs_filestream_pick(ip, *agp, free, nscan); *agp = 0; return 0; } - TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags); - - return 0; -} - -/* - * Set the allocation group number for a file or a directory, updating inode - * references and per-AG references as appropriate. - */ -static int -_xfs_filestream_update_ag( - xfs_inode_t *ip, - xfs_inode_t *pip, - xfs_agnumber_t ag) -{ - int err = 0; - xfs_mount_t *mp; - xfs_mru_cache_t *cache; - fstrm_item_t *item; - xfs_agnumber_t old_ag; - xfs_inode_t *old_pip; + trace_xfs_filestream_pick(ip, *agp, free, nscan); - /* - * Either ip is a regular file and pip is a directory, or ip is a - * directory and pip is NULL. - */ - ASSERT(ip && ((S_ISREG(ip->i_d.di_mode) && pip && - S_ISDIR(pip->i_d.di_mode)) || - (S_ISDIR(ip->i_d.di_mode) && !pip))); - - mp = ip->i_mount; - cache = mp->m_filestream; - - item = xfs_mru_cache_lookup(cache, ip->i_ino); - if (item) { - ASSERT(item->ip == ip); - old_ag = item->ag; - item->ag = ag; - old_pip = item->pip; - item->pip = pip; - xfs_mru_cache_done(cache); - - /* - * If the AG has changed, drop the old ref and take a new one, - * effectively transferring the reference from old to new AG. - */ - if (ag != old_ag) { - xfs_filestream_put_ag(mp, old_ag); - xfs_filestream_get_ag(mp, ag); - } - - /* - * If ip is a file and its pip has changed, drop the old ref and - * take a new one. - */ - if (pip && pip != old_pip) { - IRELE(old_pip); - IHOLD(pip); - } - - TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag), - ag, xfs_filestream_peek_ag(mp, ag)); + if (*agp == NULLAGNUMBER) return 0; - } - item = kmem_zone_zalloc(item_zone, KM_MAYFAIL); + err = ENOMEM; + item = kmem_alloc(sizeof(*item), KM_MAYFAIL); if (!item) - return ENOMEM; + goto out_put_ag; - item->ag = ag; + item->ag = *agp; item->ip = ip; - item->pip = pip; - err = xfs_mru_cache_insert(cache, ip->i_ino, item); + err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru); if (err) { - kmem_zone_free(item_zone, item); - return err; + if (err == EEXIST) + err = 0; + goto out_free_item; } - /* Take a reference on the AG. */ - xfs_filestream_get_ag(mp, ag); - - /* - * Take a reference on the inode itself regardless of whether it's a - * regular file or a directory. - */ - IHOLD(ip); - - /* - * In the case of a regular file, take a reference on the parent inode - * as well to ensure it remains in-core. - */ - if (pip) - IHOLD(pip); - - TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag), - ag, xfs_filestream_peek_ag(mp, ag)); - - return 0; -} - -/* xfs_fstrm_free_func(): callback for freeing cached stream items. */ -STATIC void -xfs_fstrm_free_func( - unsigned long ino, - void *data) -{ - fstrm_item_t *item = (fstrm_item_t *)data; - xfs_inode_t *ip = item->ip; - - ASSERT(ip->i_ino == ino); - - xfs_iflags_clear(ip, XFS_IFILESTREAM); - - /* Drop the reference taken on the AG when the item was added. */ - xfs_filestream_put_ag(ip->i_mount, item->ag); - - TRACE_FREE(ip->i_mount, ip, item->pip, item->ag, - xfs_filestream_peek_ag(ip->i_mount, item->ag)); - - /* - * _xfs_filestream_update_ag() always takes a reference on the inode - * itself, whether it's a file or a directory. Release it here. - * This can result in the inode being freed and so we must - * not hold any inode locks when freeing filesstreams objects - * otherwise we can deadlock here. - */ - IRELE(ip); - - /* - * In the case of a regular file, _xfs_filestream_update_ag() also - * takes a ref on the parent inode to keep it in-core. Release that - * too. - */ - if (item->pip) - IRELE(item->pip); - - /* Finally, free the memory allocated for the item. */ - kmem_zone_free(item_zone, item); -} - -/* - * xfs_filestream_init() is called at xfs initialisation time to set up the - * memory zone that will be used for filestream data structure allocation. - */ -int -xfs_filestream_init(void) -{ - item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item"); - if (!item_zone) - return -ENOMEM; - return 0; -} - -/* - * xfs_filestream_uninit() is called at xfs termination time to destroy the - * memory zone that was used for filestream data structure allocation. - */ -void -xfs_filestream_uninit(void) -{ - kmem_zone_destroy(item_zone); -} - -/* - * xfs_filestream_mount() is called when a file system is mounted with the - * filestream option. It is responsible for allocating the data structures - * needed to track the new file system's file streams. - */ -int -xfs_filestream_mount( - xfs_mount_t *mp) -{ - int err; - unsigned int lifetime, grp_count; - - /* - * The filestream timer tunable is currently fixed within the range of - * one second to four minutes, with five seconds being the default. The - * group count is somewhat arbitrary, but it'd be nice to adhere to the - * timer tunable to within about 10 percent. This requires at least 10 - * groups. - */ - lifetime = xfs_fstrm_centisecs * 10; - grp_count = 10; - - err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count, - xfs_fstrm_free_func); +out_free_item: + kmem_free(item); +out_put_ag: + xfs_filestream_put_ag(mp, *agp); return err; } -/* - * xfs_filestream_unmount() is called when a file system that was mounted with - * the filestream option is unmounted. It drains the data structures created - * to track the file system's file streams and frees all the memory that was - * allocated. - */ -void -xfs_filestream_unmount( - xfs_mount_t *mp) +static struct xfs_inode * +xfs_filestream_get_parent( + struct xfs_inode *ip) { - xfs_mru_cache_destroy(mp->m_filestream); -} + struct inode *inode = VFS_I(ip), *dir = NULL; + struct dentry *dentry, *parent; -/* - * Return the AG of the filestream the file or directory belongs to, or - * NULLAGNUMBER otherwise. - */ -xfs_agnumber_t -xfs_filestream_lookup_ag( - xfs_inode_t *ip) -{ - xfs_mru_cache_t *cache; - fstrm_item_t *item; - xfs_agnumber_t ag; - int ref; - - if (!S_ISREG(ip->i_d.di_mode) && !S_ISDIR(ip->i_d.di_mode)) { - ASSERT(0); - return NULLAGNUMBER; - } + dentry = d_find_alias(inode); + if (!dentry) + goto out; - cache = ip->i_mount->m_filestream; - item = xfs_mru_cache_lookup(cache, ip->i_ino); - if (!item) { - TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0); - return NULLAGNUMBER; - } + parent = dget_parent(dentry); + if (!parent) + goto out_dput; - ASSERT(ip == item->ip); - ag = item->ag; - ref = xfs_filestream_peek_ag(ip->i_mount, ag); - xfs_mru_cache_done(cache); + dir = igrab(parent->d_inode); + dput(parent); - TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref); - return ag; +out_dput: + dput(dentry); +out: + return dir ? XFS_I(dir) : NULL; } /* - * xfs_filestream_associate() should only be called to associate a regular file - * with its parent directory. Calling it with a child directory isn't - * appropriate because filestreams don't apply to entire directory hierarchies. - * Creating a file in a child directory of an existing filestream directory - * starts a new filestream with its own allocation group association. + * Find the right allocation group for a file, either by finding an + * existing file stream or creating a new one. * - * Returns < 0 on error, 0 if successful association occurred, > 0 if - * we failed to get an association because of locking issues. + * Returns NULLAGNUMBER in case of an error. */ -int -xfs_filestream_associate( - xfs_inode_t *pip, - xfs_inode_t *ip) +xfs_agnumber_t +xfs_filestream_lookup_ag( + struct xfs_inode *ip) { - xfs_mount_t *mp; - xfs_mru_cache_t *cache; - fstrm_item_t *item; - xfs_agnumber_t ag, rotorstep, startag; - int err = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_inode *pip = NULL; + xfs_agnumber_t startag, ag = NULLAGNUMBER; + struct xfs_mru_cache_elem *mru; - ASSERT(S_ISDIR(pip->i_d.di_mode)); ASSERT(S_ISREG(ip->i_d.di_mode)); - if (!S_ISDIR(pip->i_d.di_mode) || !S_ISREG(ip->i_d.di_mode)) - return -EINVAL; - - mp = pip->i_mount; - cache = mp->m_filestream; - - /* - * We have a problem, Houston. - * - * Taking the iolock here violates inode locking order - we already - * hold the ilock. Hence if we block getting this lock we may never - * wake. Unfortunately, that means if we can't get the lock, we're - * screwed in terms of getting a stream association - we can't spin - * waiting for the lock because someone else is waiting on the lock we - * hold and we cannot drop that as we are in a transaction here. - * - * Lucky for us, this inversion is not a problem because it's a - * directory inode that we are trying to lock here. - * - * So, if we can't get the iolock without sleeping then just give up - */ - if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) - return 1; - /* If the parent directory is already in the cache, use its AG. */ - item = xfs_mru_cache_lookup(cache, pip->i_ino); - if (item) { - ASSERT(item->ip == pip); - ag = item->ag; - xfs_mru_cache_done(cache); + pip = xfs_filestream_get_parent(ip); + if (!pip) + goto out; - TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag)); - err = _xfs_filestream_update_ag(ip, pip, ag); + mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino); + if (mru) { + ag = container_of(mru, struct xfs_fstrm_item, mru)->ag; + xfs_mru_cache_done(mp->m_filestream); - goto exit; + trace_xfs_filestream_lookup(ip, ag); + goto out; } /* @@ -623,202 +341,94 @@ xfs_filestream_associate( * use the directory inode's AG. */ if (mp->m_flags & XFS_MOUNT_32BITINODES) { - rotorstep = xfs_rotorstep; + xfs_agnumber_t rotorstep = xfs_rotorstep; startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; mp->m_agfrotor = (mp->m_agfrotor + 1) % (mp->m_sb.sb_agcount * rotorstep); } else startag = XFS_INO_TO_AGNO(mp, pip->i_ino); - /* Pick a new AG for the parent inode starting at startag. */ - err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0); - if (err || ag == NULLAGNUMBER) - goto exit_did_pick; - - /* Associate the parent inode with the AG. */ - err = _xfs_filestream_update_ag(pip, NULL, ag); - if (err) - goto exit_did_pick; - - /* Associate the file inode with the AG. */ - err = _xfs_filestream_update_ag(ip, pip, ag); - if (err) - goto exit_did_pick; - - TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag)); - -exit_did_pick: - /* - * If _xfs_filestream_pick_ag() returned a valid AG, remove the - * reference it took on it, since the file and directory will have taken - * their own now if they were successfully cached. - */ - if (ag != NULLAGNUMBER) - xfs_filestream_put_ag(mp, ag); - -exit: - xfs_iunlock(pip, XFS_IOLOCK_EXCL); - return -err; + if (xfs_filestream_pick_ag(pip, startag, &ag, 0, 0)) + ag = NULLAGNUMBER; +out: + IRELE(pip); + return ag; } /* - * Pick a new allocation group for the current file and its file stream. This - * function is called by xfs_bmap_filestreams() with the mount point's per-ag - * lock held. + * Pick a new allocation group for the current file and its file stream. + * + * This is called when the allocator can't find a suitable extent in the + * current AG, and we have to move the stream into a new AG with more space. */ int xfs_filestream_new_ag( - xfs_bmalloca_t *ap, - xfs_agnumber_t *agp) + struct xfs_bmalloca *ap, + xfs_agnumber_t *agp) { - int flags, err; - xfs_inode_t *ip, *pip = NULL; - xfs_mount_t *mp; - xfs_mru_cache_t *cache; - xfs_extlen_t minlen; - fstrm_item_t *dir, *file; - xfs_agnumber_t ag = NULLAGNUMBER; - - ip = ap->ip; - mp = ip->i_mount; - cache = mp->m_filestream; - minlen = ap->length; - *agp = NULLAGNUMBER; + struct xfs_inode *ip = ap->ip, *pip; + struct xfs_mount *mp = ip->i_mount; + xfs_extlen_t minlen = ap->length; + xfs_agnumber_t startag = 0; + int flags, err = 0; + struct xfs_mru_cache_elem *mru; - /* - * Look for the file in the cache, removing it if it's found. Doing - * this allows it to be held across the dir lookup that follows. - */ - file = xfs_mru_cache_remove(cache, ip->i_ino); - if (file) { - ASSERT(ip == file->ip); - - /* Save the file's parent inode and old AG number for later. */ - pip = file->pip; - ag = file->ag; - - /* Look for the file's directory in the cache. */ - dir = xfs_mru_cache_lookup(cache, pip->i_ino); - if (dir) { - ASSERT(pip == dir->ip); - - /* - * If the directory has already moved on to a new AG, - * use that AG as the new AG for the file. Don't - * forget to twiddle the AG refcounts to match the - * movement. - */ - if (dir->ag != file->ag) { - xfs_filestream_put_ag(mp, file->ag); - xfs_filestream_get_ag(mp, dir->ag); - *agp = file->ag = dir->ag; - } - - xfs_mru_cache_done(cache); - } + *agp = NULLAGNUMBER; - /* - * Put the file back in the cache. If this fails, the free - * function needs to be called to tidy up in the same way as if - * the item had simply expired from the cache. - */ - err = xfs_mru_cache_insert(cache, ip->i_ino, file); - if (err) { - xfs_fstrm_free_func(ip->i_ino, file); - return err; - } + pip = xfs_filestream_get_parent(ip); + if (!pip) + goto exit; - /* - * If the file's AG was moved to the directory's new AG, there's - * nothing more to be done. - */ - if (*agp != NULLAGNUMBER) { - TRACE_MOVEAG(mp, ip, pip, - ag, xfs_filestream_peek_ag(mp, ag), - *agp, xfs_filestream_peek_ag(mp, *agp)); - return 0; - } + mru = xfs_mru_cache_remove(mp->m_filestream, pip->i_ino); + if (mru) { + struct xfs_fstrm_item *item = + container_of(mru, struct xfs_fstrm_item, mru); + startag = (item->ag + 1) % mp->m_sb.sb_agcount; } - /* - * If the file's parent directory is known, take its iolock in exclusive - * mode to prevent two sibling files from racing each other to migrate - * themselves and their parent to different AGs. - * - * Note that we lock the parent directory iolock inside the child - * iolock here. That's fine as we never hold both parent and child - * iolock in any other place. This is different from the ilock, - * which requires locking of the child after the parent for namespace - * operations. - */ - if (pip) - xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); - - /* - * A new AG needs to be found for the file. If the file's parent - * directory is also known, it will be moved to the new AG as well to - * ensure that files created inside it in future use the new AG. - */ - ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0); - err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); - if (err || *agp == NULLAGNUMBER) - goto exit; + err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen); /* - * If the file wasn't found in the file cache, then its parent directory - * inode isn't known. For this to have happened, the file must either - * be pre-existing, or it was created long enough ago that its cache - * entry has expired. This isn't the sort of usage that the filestreams - * allocator is trying to optimise, so there's no point trying to track - * its new AG somehow in the filestream data structures. + * Only free the item here so we skip over the old AG earlier. */ - if (!pip) { - TRACE_ORPHAN(mp, ip, *agp); - goto exit; - } - - /* Associate the parent inode with the AG. */ - err = _xfs_filestream_update_ag(pip, NULL, *agp); - if (err) - goto exit; - - /* Associate the file inode with the AG. */ - err = _xfs_filestream_update_ag(ip, pip, *agp); - if (err) - goto exit; - - TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0, - *agp, xfs_filestream_peek_ag(mp, *agp)); + if (mru) + xfs_fstrm_free_func(mru); + IRELE(pip); exit: - /* - * If _xfs_filestream_pick_ag() returned a valid AG, remove the - * reference it took on it, since the file and directory will have taken - * their own now if they were successfully cached. - */ - if (*agp != NULLAGNUMBER) - xfs_filestream_put_ag(mp, *agp); - else + if (*agp == NULLAGNUMBER) *agp = 0; - - if (pip) - xfs_iunlock(pip, XFS_IOLOCK_EXCL); - return err; } -/* - * Remove an association between an inode and a filestream object. - * Typically this is done on last close of an unlinked file. - */ void xfs_filestream_deassociate( - xfs_inode_t *ip) + struct xfs_inode *ip) { - xfs_mru_cache_t *cache = ip->i_mount->m_filestream; + xfs_mru_cache_delete(ip->i_mount->m_filestream, ip->i_ino); +} - xfs_mru_cache_delete(cache, ip->i_ino); +int +xfs_filestream_mount( + xfs_mount_t *mp) +{ + /* + * The filestream timer tunable is currently fixed within the range of + * one second to four minutes, with five seconds being the default. The + * group count is somewhat arbitrary, but it'd be nice to adhere to the + * timer tunable to within about 10 percent. This requires at least 10 + * groups. + */ + return xfs_mru_cache_create(&mp->m_filestream, xfs_fstrm_centisecs * 10, + 10, xfs_fstrm_free_func); +} + +void +xfs_filestream_unmount( + xfs_mount_t *mp) +{ + xfs_mru_cache_destroy(mp->m_filestream); } diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h index 09dd9af4543..2ef43406e53 100644 --- a/fs/xfs/xfs_filestream.h +++ b/fs/xfs/xfs_filestream.h @@ -18,57 +18,23 @@ #ifndef __XFS_FILESTREAM_H__ #define __XFS_FILESTREAM_H__ -#ifdef __KERNEL__ - struct xfs_mount; struct xfs_inode; -struct xfs_perag; struct xfs_bmalloca; -#ifdef XFS_FILESTREAMS_TRACE -#define XFS_FSTRM_KTRACE_INFO 1 -#define XFS_FSTRM_KTRACE_AGSCAN 2 -#define XFS_FSTRM_KTRACE_AGPICK1 3 -#define XFS_FSTRM_KTRACE_AGPICK2 4 -#define XFS_FSTRM_KTRACE_UPDATE 5 -#define XFS_FSTRM_KTRACE_FREE 6 -#define XFS_FSTRM_KTRACE_ITEM_LOOKUP 7 -#define XFS_FSTRM_KTRACE_ASSOCIATE 8 -#define XFS_FSTRM_KTRACE_MOVEAG 9 -#define XFS_FSTRM_KTRACE_ORPHAN 10 - -#define XFS_FSTRM_KTRACE_SIZE 16384 -extern ktrace_t *xfs_filestreams_trace_buf; - -#endif - -/* allocation selection flags */ -typedef enum xfs_fstrm_alloc { - XFS_PICK_USERDATA = 1, - XFS_PICK_LOWSPACE = 2, -} xfs_fstrm_alloc_t; - -/* prototypes for filestream.c */ -int xfs_filestream_init(void); -void xfs_filestream_uninit(void); int xfs_filestream_mount(struct xfs_mount *mp); void xfs_filestream_unmount(struct xfs_mount *mp); -xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); -int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip); void xfs_filestream_deassociate(struct xfs_inode *ip); +xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp); +int xfs_filestream_peek_ag(struct xfs_mount *mp, xfs_agnumber_t agno); - -/* filestreams for the inode? */ static inline int xfs_inode_is_filestream( struct xfs_inode *ip) { return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) || - xfs_iflags_test(ip, XFS_IFILESTREAM) || (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM); } -#endif /* __KERNEL__ */ - #endif /* __XFS_FILESTREAM_H__ */ diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h new file mode 100644 index 00000000000..34d85aca305 --- /dev/null +++ b/fs/xfs/xfs_format.h @@ -0,0 +1,428 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FORMAT_H__ +#define __XFS_FORMAT_H__ + +/* + * XFS On Disk Format Definitions + * + * This header file defines all the on-disk format definitions for + * general XFS objects. Directory and attribute related objects are defined in + * xfs_da_format.h, which log and log item formats are defined in + * xfs_log_format.h. Everything else goes here. + */ + +struct xfs_mount; +struct xfs_trans; +struct xfs_inode; +struct xfs_buf; +struct xfs_ifork; + +/* + * RealTime Device format definitions + */ + +/* Min and max rt extent sizes, specified in bytes */ +#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */ +#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */ +#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ + +#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize) +#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask) +#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize) +#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask) + +/* + * RT Summary and bit manipulation macros. + */ +#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb))) +#define XFS_SUMOFFSTOBLOCK(mp,s) \ + (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) +#define XFS_SUMPTR(mp,bp,so) \ + ((xfs_suminfo_t *)((bp)->b_addr + \ + (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) + +#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) +#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log) +#define XFS_BITTOWORD(mp,bi) \ + ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp))) + +#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) +#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) + +#define XFS_RTLOBIT(w) xfs_lowbit32(w) +#define XFS_RTHIBIT(w) xfs_highbit32(w) + +#if XFS_BIG_BLKNOS +#define XFS_RTBLOCKLOG(b) xfs_highbit64(b) +#else +#define XFS_RTBLOCKLOG(b) xfs_highbit32(b) +#endif + +/* + * Dquot and dquot block format definitions + */ +#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ +#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */ + +/* + * This is the main portion of the on-disk representation of quota + * information for a user. This is the q_core of the xfs_dquot_t that + * is kept in kernel memory. We pad this with some more expansion room + * to construct the on disk structure. + */ +typedef struct xfs_disk_dquot { + __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ + __u8 d_version; /* dquot version */ + __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ + __be32 d_id; /* user,project,group id */ + __be64 d_blk_hardlimit;/* absolute limit on disk blks */ + __be64 d_blk_softlimit;/* preferred limit on disk blks */ + __be64 d_ino_hardlimit;/* maximum # allocated inodes */ + __be64 d_ino_softlimit;/* preferred inode limit */ + __be64 d_bcount; /* disk blocks owned by the user */ + __be64 d_icount; /* inodes owned by the user */ + __be32 d_itimer; /* zero if within inode limits if not, + this is when we refuse service */ + __be32 d_btimer; /* similar to above; for disk blocks */ + __be16 d_iwarns; /* warnings issued wrt num inodes */ + __be16 d_bwarns; /* warnings issued wrt disk blocks */ + __be32 d_pad0; /* 64 bit align */ + __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */ + __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */ + __be64 d_rtbcount; /* realtime blocks owned */ + __be32 d_rtbtimer; /* similar to above; for RT disk blocks */ + __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */ + __be16 d_pad; +} xfs_disk_dquot_t; + +/* + * This is what goes on disk. This is separated from the xfs_disk_dquot because + * carrying the unnecessary padding would be a waste of memory. + */ +typedef struct xfs_dqblk { + xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */ + char dd_fill[4]; /* filling for posterity */ + + /* + * These two are only present on filesystems with the CRC bits set. + */ + __be32 dd_crc; /* checksum */ + __be64 dd_lsn; /* last modification in log */ + uuid_t dd_uuid; /* location information */ +} xfs_dqblk_t; + +#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc) + +/* + * Remote symlink format and access functions. + */ +#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */ + +struct xfs_dsymlink_hdr { + __be32 sl_magic; + __be32 sl_offset; + __be32 sl_bytes; + __be32 sl_crc; + uuid_t sl_uuid; + __be64 sl_owner; + __be64 sl_blkno; + __be64 sl_lsn; +}; + +#define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc) + +/* + * The maximum pathlen is 1024 bytes. Since the minimum file system + * blocksize is 512 bytes, we can get a max of 3 extents back from + * bmapi when crc headers are taken into account. + */ +#define XFS_SYMLINK_MAPS 3 + +#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \ + ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + sizeof(struct xfs_dsymlink_hdr) : 0)) + + +/* + * Allocation Btree format definitions + * + * There are two on-disk btrees, one sorted by blockno and one sorted + * by blockcount and blockno. All blocks look the same to make the code + * simpler; if we have time later, we'll make the optimizations. + */ +#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ +#define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */ +#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ +#define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */ + +/* + * Data record/key structure + */ +typedef struct xfs_alloc_rec { + __be32 ar_startblock; /* starting block number */ + __be32 ar_blockcount; /* count of free blocks */ +} xfs_alloc_rec_t, xfs_alloc_key_t; + +typedef struct xfs_alloc_rec_incore { + xfs_agblock_t ar_startblock; /* starting block number */ + xfs_extlen_t ar_blockcount; /* count of free blocks */ +} xfs_alloc_rec_incore_t; + +/* btree pointer type */ +typedef __be32 xfs_alloc_ptr_t; + +/* + * Block numbers in the AG: + * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3. + */ +#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1)) +#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1)) + + +/* + * Inode Allocation Btree format definitions + * + * There is a btree for the inode map per allocation group. + */ +#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ +#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */ +#define XFS_FIBT_MAGIC 0x46494254 /* 'FIBT' */ +#define XFS_FIBT_CRC_MAGIC 0x46494233 /* 'FIB3' */ + +typedef __uint64_t xfs_inofree_t; +#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) +#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) +#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) +#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) + +static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) +{ + return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; +} + +/* + * Data record structure + */ +typedef struct xfs_inobt_rec { + __be32 ir_startino; /* starting inode number */ + __be32 ir_freecount; /* count of free inodes (set bits) */ + __be64 ir_free; /* free inode mask */ +} xfs_inobt_rec_t; + +typedef struct xfs_inobt_rec_incore { + xfs_agino_t ir_startino; /* starting inode number */ + __int32_t ir_freecount; /* count of free inodes (set bits) */ + xfs_inofree_t ir_free; /* free inode mask */ +} xfs_inobt_rec_incore_t; + + +/* + * Key structure + */ +typedef struct xfs_inobt_key { + __be32 ir_startino; /* starting inode number */ +} xfs_inobt_key_t; + +/* btree pointer type */ +typedef __be32 xfs_inobt_ptr_t; + +/* + * block numbers in the AG. + */ +#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) +#define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) + +/* + * The first data block of an AG depends on whether the filesystem was formatted + * with the finobt feature. If so, account for the finobt reserved root btree + * block. + */ +#define XFS_PREALLOC_BLOCKS(mp) \ + (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \ + XFS_FIBT_BLOCK(mp) + 1 : \ + XFS_IBT_BLOCK(mp) + 1) + + + +/* + * BMAP Btree format definitions + * + * This includes both the root block definition that sits inside an inode fork + * and the record/pointer formats for the leaf/node in the blocks. + */ +#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ +#define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */ + +/* + * Bmap root header, on-disk form only. + */ +typedef struct xfs_bmdr_block { + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ +} xfs_bmdr_block_t; + +/* + * Bmap btree record and extent descriptor. + * l0:63 is an extent flag (value 1 indicates non-normal). + * l0:9-62 are startoff. + * l0:0-8 and l1:21-63 are startblock. + * l1:0-20 are blockcount. + */ +#define BMBT_EXNTFLAG_BITLEN 1 +#define BMBT_STARTOFF_BITLEN 54 +#define BMBT_STARTBLOCK_BITLEN 52 +#define BMBT_BLOCKCOUNT_BITLEN 21 + +typedef struct xfs_bmbt_rec { + __be64 l0, l1; +} xfs_bmbt_rec_t; + +typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ +typedef xfs_bmbt_rec_t xfs_bmdr_rec_t; + +typedef struct xfs_bmbt_rec_host { + __uint64_t l0, l1; +} xfs_bmbt_rec_host_t; + +/* + * Values and macros for delayed-allocation startblock fields. + */ +#define STARTBLOCKVALBITS 17 +#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20) +#define DSTARTBLOCKMASKBITS (15 + 20) +#define STARTBLOCKMASK \ + (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) +#define DSTARTBLOCKMASK \ + (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) + +static inline int isnullstartblock(xfs_fsblock_t x) +{ + return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK; +} + +static inline int isnulldstartblock(xfs_dfsbno_t x) +{ + return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK; +} + +static inline xfs_fsblock_t nullstartblock(int k) +{ + ASSERT(k < (1 << STARTBLOCKVALBITS)); + return STARTBLOCKMASK | (k); +} + +static inline xfs_filblks_t startblockval(xfs_fsblock_t x) +{ + return (xfs_filblks_t)((x) & ~STARTBLOCKMASK); +} + +/* + * Possible extent formats. + */ +typedef enum { + XFS_EXTFMT_NOSTATE = 0, + XFS_EXTFMT_HASSTATE +} xfs_exntfmt_t; + +/* + * Possible extent states. + */ +typedef enum { + XFS_EXT_NORM, XFS_EXT_UNWRITTEN, + XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID +} xfs_exntst_t; + +/* + * Incore version of above. + */ +typedef struct xfs_bmbt_irec +{ + xfs_fileoff_t br_startoff; /* starting file offset */ + xfs_fsblock_t br_startblock; /* starting block number */ + xfs_filblks_t br_blockcount; /* number of blocks */ + xfs_exntst_t br_state; /* extent state */ +} xfs_bmbt_irec_t; + +/* + * Key structure for non-leaf levels of the tree. + */ +typedef struct xfs_bmbt_key { + __be64 br_startoff; /* starting file offset */ +} xfs_bmbt_key_t, xfs_bmdr_key_t; + +/* btree pointer type */ +typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; + + +/* + * Generic Btree block format definitions + * + * This is a combination of the actual format used on disk for short and long + * format btrees. The first three fields are shared by both format, but the + * pointers are different and should be used with care. + * + * To get the size of the actual short or long form headers please use the size + * macros below. Never use sizeof(xfs_btree_block). + * + * The blkno, crc, lsn, owner and uuid fields are only available in filesystems + * with the crc feature bit, and all accesses to them must be conditional on + * that flag. + */ +struct xfs_btree_block { + __be32 bb_magic; /* magic number for block type */ + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ + union { + struct { + __be32 bb_leftsib; + __be32 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be32 bb_owner; + __le32 bb_crc; + } s; /* short form pointers */ + struct { + __be64 bb_leftsib; + __be64 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be64 bb_owner; + __le32 bb_crc; + __be32 bb_pad; /* padding for alignment */ + } l; /* long form pointers */ + } bb_u; /* rest */ +}; + +#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ +#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ + +/* sizes of CRC enabled btree blocks */ +#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) +#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) + +#define XFS_BTREE_SBLOCK_CRC_OFF \ + offsetof(struct xfs_btree_block, bb_u.s.bb_crc) +#define XFS_BTREE_LBLOCK_CRC_OFF \ + offsetof(struct xfs_btree_block, bb_u.l.bb_crc) + +#endif /* __XFS_FORMAT_H__ */ diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 6dda3f949b0..d34703dbcb4 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -233,13 +233,17 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ #define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ #define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ -#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ +#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ #define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ - +#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ +#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ +#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ /* - * Minimum and maximum sizes need for growth checks + * Minimum and maximum sizes need for growth checks. + * + * Block counts are in units of filesystem blocks, not basic blocks. */ #define XFS_MIN_AG_BLOCKS 64 #define XFS_MIN_LOG_BLOCKS 512ULL @@ -310,6 +314,17 @@ typedef struct xfs_bstat { } xfs_bstat_t; /* + * Project quota id helpers (previously projid was 16bit only + * and using two 16bit values to hold new 32bit projid was choosen + * to retain compatibility with "old" filesystems). + */ +static inline __uint32_t +bstat_get_projid(struct xfs_bstat *bs) +{ + return (__uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo; +} + +/* * The user-level BulkStat Request interface structure. */ typedef struct xfs_fsop_bulkreq { @@ -343,7 +358,7 @@ typedef struct xfs_error_injection { * Speculative preallocation trimming. */ #define XFS_EOFBLOCKS_VERSION 1 -struct xfs_eofblocks { +struct xfs_fs_eofblocks { __u32 eof_version; __u32 eof_flags; uid_t eof_uid; @@ -449,6 +464,21 @@ typedef struct xfs_handle { + (handle).ha_fid.fid_len) /* + * Structure passed to XFS_IOC_SWAPEXT + */ +typedef struct xfs_swapext +{ + __int64_t sx_version; /* version */ +#define XFS_SX_VERSION 0 + __int64_t sx_fdtarget; /* fd of target file */ + __int64_t sx_fdtmp; /* fd of tmp file */ + xfs_off_t sx_offset; /* offset into file */ + xfs_off_t sx_length; /* leng from offset */ + char sx_pad[16]; /* pad space, unused */ + xfs_bstat_t sx_stat; /* stat of target b4 copy */ +} xfs_swapext_t; + +/* * Flags for going down operation */ #define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ @@ -486,7 +516,7 @@ typedef struct xfs_handle { /* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ #define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) #define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) -#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_eofblocks) +#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks) /* * ioctl commands that replace IRIX syssgi()'s @@ -510,8 +540,14 @@ typedef struct xfs_handle { #define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection) #define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) /* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ + /* XFS_IOC_FREEZE -- FIFREEZE 119 */ /* XFS_IOC_THAW -- FITHAW 120 */ +#ifndef FIFREEZE +#define XFS_IOC_FREEZE _IOWR('X', 119, int) +#define XFS_IOC_THAW _IOWR('X', 120, int) +#endif + #define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) #define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 94eaeedc549..d2295561570 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -17,28 +17,31 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" -#include "xfs_btree.h" #include "xfs_error.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_fsops.h" #include "xfs_itable.h" #include "xfs_trans_space.h" #include "xfs_rtalloc.h" -#include "xfs_filestream.h" #include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_dinode.h" +#include "xfs_filestream.h" /* * File system operations @@ -73,23 +76,18 @@ xfs_fs_geometry( } if (new_version >= 3) { geo->version = XFS_FSOP_GEOM_VERSION; - geo->flags = + geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | + XFS_FSOP_GEOM_FLAGS_DIRV2 | (xfs_sb_version_hasattr(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_ATTR : 0) | - (xfs_sb_version_hasnlink(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_NLINK : 0) | (xfs_sb_version_hasquota(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_QUOTA : 0) | (xfs_sb_version_hasalign(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_IALIGN : 0) | (xfs_sb_version_hasdalign(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_DALIGN : 0) | - (xfs_sb_version_hasshared(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_SHARED : 0) | (xfs_sb_version_hasextflgbit(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) | - (xfs_sb_version_hasdirv2(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) | (xfs_sb_version_hassector(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_SECTOR : 0) | (xfs_sb_version_hasasciici(&mp->m_sb) ? @@ -99,11 +97,17 @@ xfs_fs_geometry( (xfs_sb_version_hasattr2(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) | (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_PROJID32 : 0); + XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) | + (xfs_sb_version_hascrc(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_V5SB : 0) | + (xfs_sb_version_hasftype(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_FTYPE : 0) | + (xfs_sb_version_hasfinobt(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_FINOBT : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; - geo->dirblocksize = mp->m_dirblksize; + geo->dirblocksize = mp->m_dir_geo->blksize; } if (new_version >= 4) { geo->flags |= @@ -151,7 +155,7 @@ xfs_growfs_data_private( xfs_buf_t *bp; int bucket; int dpct; - int error; + int error, saved_error = 0; xfs_agnumber_t nagcount; xfs_agnumber_t nagimax = 0; xfs_rfsblock_t nb, nb_mod; @@ -174,7 +178,7 @@ xfs_growfs_data_private( if (!bp) return EIO; if (bp->b_error) { - int error = bp->b_error; + error = bp->b_error; xfs_buf_relse(bp); return error; } @@ -201,8 +205,9 @@ xfs_growfs_data_private( tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); tp->t_flags |= XFS_TRANS_RESERVE; - if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp), - XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) { + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, + XFS_GROWFS_SPACE_RES(mp), 0); + if (error) { xfs_trans_cancel(tp, 0); return error; } @@ -214,6 +219,8 @@ xfs_growfs_data_private( */ nfree = 0; for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { + __be32 *agfl_bno; + /* * AG freespace header block */ @@ -247,6 +254,9 @@ xfs_growfs_data_private( tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -265,8 +275,15 @@ xfs_growfs_data_private( } agfl = XFS_BUF_TO_AGFL(bp); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); + agfl->agfl_seqno = cpu_to_be32(agno); + uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid); + } + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) - agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); + agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); error = xfs_bwrite(bp); xfs_buf_relse(bp); @@ -296,8 +313,15 @@ xfs_growfs_data_private( agi->agi_freecount = 0; agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid); + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); + agi->agi_free_level = cpu_to_be32(1); + } for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -316,7 +340,13 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, + agno, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( @@ -339,7 +369,13 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, + agno, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( @@ -363,12 +399,45 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, + agno, 0); error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) goto error0; + + /* + * FINO btree root block + */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_inobt_buf_ops); + if (!bp) { + error = ENOMEM; + goto error0; + } + + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC, + 0, 0, agno, + XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0, + 0, agno, 0); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + } + } xfs_trans_agblocks_delta(tp, nfree); /* @@ -465,29 +534,33 @@ xfs_growfs_data_private( error = ENOMEM; } + /* + * If we get an error reading or writing alternate superblocks, + * continue. xfs_repair chooses the "best" superblock based + * on most matches; if we break early, we'll leave more + * superblocks un-updated than updated, and xfs_repair may + * pick them over the properly-updated primary. + */ if (error) { xfs_warn(mp, "error %d reading secondary superblock for ag %d", error, agno); - break; + saved_error = error; + continue; } xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS); - /* - * If we get an error writing out the alternate superblocks, - * just issue a warning and continue. The real work is - * already done and committed. - */ error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) { xfs_warn(mp, "write error %d updating secondary superblock for ag %d", error, agno); - break; /* no point in continuing */ + saved_error = error; + continue; } } - return error; + return saved_error ? saved_error : error; error0: xfs_trans_cancel(tp, XFS_TRANS_ABORT); @@ -709,8 +782,7 @@ xfs_fs_log_dummy( int error; tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); - error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); if (error) { xfs_trans_cancel(tp, 0); return error; diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index a815412eab8..5960e5593fe 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -17,25 +17,30 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" #include "xfs_alloc.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_bmap.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_icreate_item.h" +#include "xfs_icache.h" +#include "xfs_dinode.h" +#include "xfs_trace.h" /* @@ -47,7 +52,7 @@ xfs_ialloc_cluster_alignment( { if (xfs_sb_version_hasalign(&args->mp->m_sb) && args->mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp))) + XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size)) return args->mp->m_sb.sb_inoalignmt; return 1; } @@ -107,6 +112,66 @@ xfs_inobt_get_rec( } /* + * Insert a single inobt record. Cursor must already point to desired location. + */ +STATIC int +xfs_inobt_insert_rec( + struct xfs_btree_cur *cur, + __int32_t freecount, + xfs_inofree_t free, + int *stat) +{ + cur->bc_rec.i.ir_freecount = freecount; + cur->bc_rec.i.ir_free = free; + return xfs_btree_insert(cur, stat); +} + +/* + * Insert records describing a newly allocated inode chunk into the inobt. + */ +STATIC int +xfs_inobt_insert( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agino_t newino, + xfs_agino_t newlen, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + xfs_agino_t thisino; + int i; + int error; + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + + for (thisino = newino; + thisino < newino + newlen; + thisino += XFS_INODES_PER_CHUNK) { + error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + ASSERT(i == 0); + + error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK, + XFS_INOBT_ALL_FREE, &i); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + ASSERT(i == 1); + } + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + + return 0; +} + +/* * Verify that the number of free inodes in the AGI is correct. */ #ifdef DEBUG @@ -148,12 +213,16 @@ xfs_check_agi_freecount( #endif /* - * Initialise a new set of inodes. + * Initialise a new set of inodes. When called without a transaction context + * (e.g. from recovery) we initiate a delayed write of the inode buffers rather + * than logging them (which in a transaction context puts them into the AIL + * for writeback rather than the xfsbufd queue). */ -STATIC int +int xfs_ialloc_inode_init( struct xfs_mount *mp, struct xfs_trans *tp, + struct list_head *buffer_list, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, @@ -161,38 +230,58 @@ xfs_ialloc_inode_init( { struct xfs_buf *fbuf; struct xfs_dinode *free; - int blks_per_cluster, nbufs, ninodes; + int nbufs, blks_per_cluster, inodes_per_cluster; int version; int i, j; xfs_daddr_t d; + xfs_ino_t ino = 0; /* - * Loop over the new block(s), filling in the inodes. - * For small block sizes, manipulate the inodes in buffers - * which are multiples of the blocks size. + * Loop over the new block(s), filling in the inodes. For small block + * sizes, manipulate the inodes in buffers which are multiples of the + * blocks size. */ - if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { - blks_per_cluster = 1; - nbufs = length; - ninodes = mp->m_sb.sb_inopblock; - } else { - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / - mp->m_sb.sb_blocksize; - nbufs = length / blks_per_cluster; - ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; - } + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + nbufs = length / blks_per_cluster; /* - * Figure out what version number to use in the inodes we create. - * If the superblock version has caught up to the one that supports - * the new inode format, then use the new inode version. Otherwise - * use the old version so that old kernels will continue to be - * able to use the file system. + * Figure out what version number to use in the inodes we create. If + * the superblock version has caught up to the one that supports the new + * inode format, then use the new inode version. Otherwise use the old + * version so that old kernels will continue to be able to use the file + * system. + * + * For v3 inodes, we also need to write the inode number into the inode, + * so calculate the first inode number of the chunk here as + * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not + * across multiple filesystem blocks (such as a cluster) and so cannot + * be used in the cluster buffer loop below. + * + * Further, because we are writing the inode directly into the buffer + * and calculating a CRC on the entire inode, we have ot log the entire + * inode so that the entire range the CRC covers is present in the log. + * That means for v3 inode we log the entire buffer rather than just the + * inode cores. */ - if (xfs_sb_version_hasnlink(&mp->m_sb)) + if (xfs_sb_version_hascrc(&mp->m_sb)) { + version = 3; + ino = XFS_AGINO_TO_INO(mp, agno, + XFS_OFFBNO_TO_AGINO(mp, agbno, 0)); + + /* + * log the initialisation that is about to take place as an + * logical operation. This means the transaction does not + * need to log the physical changes to the inode buffers as log + * recovery will know what initialisation is actually needed. + * Hence we only need to log the buffers as "ordered" buffers so + * they track in the AIL as if they were physically logged. + */ + if (tp) + xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos, + mp->m_sb.sb_inodesize, length, gen); + } else version = 2; - else - version = 1; for (j = 0; j < nbufs; j++) { /* @@ -204,27 +293,58 @@ xfs_ialloc_inode_init( XBF_UNMAPPED); if (!fbuf) return ENOMEM; - /* - * Initialize all inodes in this buffer and then log them. - * - * XXX: It would be much better if we had just one transaction - * to log a whole cluster of inodes instead of all the - * individual transactions causing a lot of log traffic. - */ + + /* Initialize the inode buffers and log them appropriately. */ fbuf->b_ops = &xfs_inode_buf_ops; - xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); - for (i = 0; i < ninodes; i++) { + xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); + for (i = 0; i < inodes_per_cluster; i++) { int ioffset = i << mp->m_sb.sb_inodelog; - uint isize = sizeof(struct xfs_dinode); + uint isize = xfs_dinode_size(version); free = xfs_make_iptr(mp, fbuf, i); free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); free->di_version = version; free->di_gen = cpu_to_be32(gen); free->di_next_unlinked = cpu_to_be32(NULLAGINO); - xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1); + + if (version == 3) { + free->di_ino = cpu_to_be64(ino); + ino++; + uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid); + xfs_dinode_calc_crc(mp, free); + } else if (tp) { + /* just log the inode core */ + xfs_trans_log_buf(tp, fbuf, ioffset, + ioffset + isize - 1); + } + } + + if (tp) { + /* + * Mark the buffer as an inode allocation buffer so it + * sticks in AIL at the point of this allocation + * transaction. This ensures the they are on disk before + * the tail of the log can be moved past this + * transaction (i.e. by preventing relogging from moving + * it forward in the log). + */ + xfs_trans_inode_alloc_buf(tp, fbuf); + if (version == 3) { + /* + * Mark the buffer as ordered so that they are + * not physically logged in the transaction but + * still tracked in the AIL as part of the + * transaction and pin the log appropriately. + */ + xfs_trans_ordered_buf(tp, fbuf); + xfs_trans_log_buf(tp, fbuf, 0, + BBTOB(fbuf->b_length) - 1); + } + } else { + fbuf->b_flags |= XBF_DONE; + xfs_buf_delwri_queue(fbuf, buffer_list); + xfs_buf_relse(fbuf); } - xfs_trans_inode_alloc_buf(tp, fbuf); } return 0; } @@ -241,13 +361,10 @@ xfs_ialloc_ag_alloc( { xfs_agi_t *agi; /* allocation group header */ xfs_alloc_arg_t args; /* allocation argument structure */ - xfs_btree_cur_t *cur; /* inode btree cursor */ xfs_agnumber_t agno; int error; - int i; xfs_agino_t newino; /* new first inode's number */ xfs_agino_t newlen; /* new number of inodes */ - xfs_agino_t thisino; /* current inode number, for loop */ int isaligned = 0; /* inode allocation at stripe unit */ /* boundary */ struct xfs_perag *pag; @@ -260,27 +377,25 @@ xfs_ialloc_ag_alloc( * Locking will ensure that we don't have two callers in here * at one time. */ - newlen = XFS_IALLOC_INODES(args.mp); + newlen = args.mp->m_ialloc_inos; if (args.mp->m_maxicount && args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) return XFS_ERROR(ENOSPC); - args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp); + args.minlen = args.maxlen = args.mp->m_ialloc_blks; /* * First try to allocate inodes contiguous with the last-allocated * chunk of inodes. If the filesystem is striped, this will fill * an entire stripe unit with inodes. - */ + */ agi = XFS_BUF_TO_AGI(agbp); newino = be32_to_cpu(agi->agi_newino); agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + - XFS_IALLOC_BLOCKS(args.mp); + args.mp->m_ialloc_blks; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); args.type = XFS_ALLOCTYPE_THIS_BNO; - args.mod = args.total = args.wasdel = args.isfl = - args.userdata = args.minalignslop = 0; args.prod = 1; /* @@ -303,6 +418,18 @@ xfs_ialloc_ag_alloc( args.minleft = args.mp->m_in_maxlevels - 1; if ((error = xfs_alloc_vextent(&args))) return error; + + /* + * This request might have dirtied the transaction if the AG can + * satisfy the request, but the exact block was not available. + * If the allocation did fail, subsequent requests will relax + * the exact agbno requirement and increase the alignment + * instead. It is critical that the total size of the request + * (len + alignment + slop) does not increase from this point + * on, so reset minalignslop to ensure it is not included in + * subsequent requests. + */ + args.minalignslop = 0; } else args.fsbno = NULLFSBLOCK; @@ -333,8 +460,6 @@ xfs_ialloc_ag_alloc( * Allocate a fixed-size extent of inodes. */ args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.mod = args.total = args.wasdel = args.isfl = - args.userdata = args.minalignslop = 0; args.prod = 1; /* * Allow space for the inode btree to split. @@ -372,8 +497,8 @@ xfs_ialloc_ag_alloc( * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, - args.len, random32()); + error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno, + args.len, prandom_u32()); if (error) return error; @@ -389,29 +514,19 @@ xfs_ialloc_ag_alloc( agi->agi_newino = cpu_to_be32(newino); /* - * Insert records describing the new inode chunk into the btree. + * Insert records describing the new inode chunk into the btrees. */ - cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno); - for (thisino = newino; - thisino < newino + newlen; - thisino += XFS_INODES_PER_CHUNK) { - cur->bc_rec.i.ir_startino = thisino; - cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK; - cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE; - error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i); - if (error) { - xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - return error; - } - ASSERT(i == 0); - error = xfs_btree_insert(cur, &i); - if (error) { - xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + XFS_BTNUM_INO); + if (error) + return error; + + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + XFS_BTNUM_FINO); + if (error) return error; - } - ASSERT(i == 1); } - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); /* * Log allocation group header fields */ @@ -443,7 +558,7 @@ xfs_ialloc_next_ag( /* * Select an allocation group to look for a free inode in, based on the parent - * inode and then mode. Return the allocation group buffer. + * inode and the mode. Return the allocation group buffer. */ STATIC xfs_agnumber_t xfs_ialloc_ag_select( @@ -520,7 +635,7 @@ xfs_ialloc_ag_select( * Is there enough free space for the file plus a block of * inodes? (if we need to allocate some)? */ - ineed = XFS_IALLOC_BLOCKS(mp); + ineed = mp->m_ialloc_blks; longest = pag->pagf_longest; if (!longest) longest = pag->pagf_flcount > 0; @@ -585,8 +700,7 @@ xfs_ialloc_get_rec( struct xfs_btree_cur *cur, xfs_agino_t agino, xfs_inobt_rec_incore_t *rec, - int *done, - int left) + int *done) { int error; int i; @@ -606,13 +720,10 @@ xfs_ialloc_get_rec( } /* - * Allocate an inode. - * - * The caller selected an AG for us, and made sure that free inodes are - * available. + * Allocate an inode using the inobt-only algorithm. */ STATIC int -xfs_dialloc_ag( +xfs_dialloc_ag_inobt( struct xfs_trans *tp, struct xfs_buf *agbp, xfs_ino_t parent, @@ -638,7 +749,7 @@ xfs_dialloc_ag( ASSERT(pag->pagi_freecount > 0); restart_pagno: - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. @@ -666,7 +777,7 @@ xfs_dialloc_ag( error = xfs_inobt_get_rec(cur, &rec, &j); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(j == 1, error0); if (rec.ir_freecount > 0) { /* @@ -694,12 +805,12 @@ xfs_dialloc_ag( pag->pagl_leftrec != NULLAGINO && pag->pagl_rightrec != NULLAGINO) { error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec, - &trec, &doneleft, 1); + &trec, &doneleft); if (error) goto error1; error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec, - &rec, &doneright, 0); + &rec, &doneright); if (error) goto error1; } else { @@ -871,6 +982,294 @@ error0: } /* + * Use the free inode btree to allocate an inode based on distance from the + * parent. Note that the provided cursor may be deleted and replaced. + */ +STATIC int +xfs_dialloc_ag_finobt_near( + xfs_agino_t pagino, + struct xfs_btree_cur **ocur, + struct xfs_inobt_rec_incore *rec) +{ + struct xfs_btree_cur *lcur = *ocur; /* left search cursor */ + struct xfs_btree_cur *rcur; /* right search cursor */ + struct xfs_inobt_rec_incore rrec; + int error; + int i, j; + + error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i); + if (error) + return error; + + if (i == 1) { + error = xfs_inobt_get_rec(lcur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + /* + * See if we've landed in the parent inode record. The finobt + * only tracks chunks with at least one free inode, so record + * existence is enough. + */ + if (pagino >= rec->ir_startino && + pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK)) + return 0; + } + + error = xfs_btree_dup_cursor(lcur, &rcur); + if (error) + return error; + + error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j); + if (error) + goto error_rcur; + if (j == 1) { + error = xfs_inobt_get_rec(rcur, &rrec, &j); + if (error) + goto error_rcur; + XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur); + } + + XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur); + if (i == 1 && j == 1) { + /* + * Both the left and right records are valid. Choose the closer + * inode chunk to the target. + */ + if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) > + (rrec.ir_startino - pagino)) { + *rec = rrec; + xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); + *ocur = rcur; + } else { + xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); + } + } else if (j == 1) { + /* only the right record is valid */ + *rec = rrec; + xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); + *ocur = rcur; + } else if (i == 1) { + /* only the left record is valid */ + xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); + } + + return 0; + +error_rcur: + xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Use the free inode btree to find a free inode based on a newino hint. If + * the hint is NULL, find the first free inode in the AG. + */ +STATIC int +xfs_dialloc_ag_finobt_newino( + struct xfs_agi *agi, + struct xfs_btree_cur *cur, + struct xfs_inobt_rec_incore *rec) +{ + int error; + int i; + + if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { + error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ, + &i); + if (error) + return error; + if (i == 1) { + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + return 0; + } + } + + /* + * Find the first inode available in the AG. + */ + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + return 0; +} + +/* + * Update the inobt based on a modification made to the finobt. Also ensure that + * the records from both trees are equivalent post-modification. + */ +STATIC int +xfs_dialloc_ag_update_inobt( + struct xfs_btree_cur *cur, /* inobt cursor */ + struct xfs_inobt_rec_incore *frec, /* finobt record */ + int offset) /* inode offset */ +{ + struct xfs_inobt_rec_incore rec; + int error; + int i; + + error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % + XFS_INODES_PER_CHUNK) == 0); + + rec.ir_free &= ~XFS_INOBT_MASK(offset); + rec.ir_freecount--; + + XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) && + (rec.ir_freecount == frec->ir_freecount)); + + error = xfs_inobt_update(cur, &rec); + if (error) + return error; + + return 0; +} + +/* + * Allocate an inode using the free inode btree, if available. Otherwise, fall + * back to the inobt search algorithm. + * + * The caller selected an AG for us, and made sure that free inodes are + * available. + */ +STATIC int +xfs_dialloc_ag( + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_ino_t parent, + xfs_ino_t *inop) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); + xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); + struct xfs_perag *pag; + struct xfs_btree_cur *cur; /* finobt cursor */ + struct xfs_btree_cur *icur; /* inobt cursor */ + struct xfs_inobt_rec_incore rec; + xfs_ino_t ino; + int error; + int offset; + int i; + + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + return xfs_dialloc_ag_inobt(tp, agbp, parent, inop); + + pag = xfs_perag_get(mp, agno); + + /* + * If pagino is 0 (this is the root inode allocation) use newino. + * This must work because we've just allocated some. + */ + if (!pagino) + pagino = be32_to_cpu(agi->agi_newino); + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error_cur; + + /* + * The search algorithm depends on whether we're in the same AG as the + * parent. If so, find the closest available inode to the parent. If + * not, consider the agi hint or find the first free inode in the AG. + */ + if (agno == pagno) + error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec); + else + error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec); + if (error) + goto error_cur; + + offset = xfs_lowbit64(rec.ir_free); + ASSERT(offset >= 0); + ASSERT(offset < XFS_INODES_PER_CHUNK); + ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % + XFS_INODES_PER_CHUNK) == 0); + ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); + + /* + * Modify or remove the finobt record. + */ + rec.ir_free &= ~XFS_INOBT_MASK(offset); + rec.ir_freecount--; + if (rec.ir_freecount) + error = xfs_inobt_update(cur, &rec); + else + error = xfs_btree_delete(cur, &i); + if (error) + goto error_cur; + + /* + * The finobt has now been updated appropriately. We haven't updated the + * agi and superblock yet, so we can create an inobt cursor and validate + * the original freecount. If all is well, make the equivalent update to + * the inobt using the finobt record and offset information. + */ + icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + + error = xfs_check_agi_freecount(icur, agi); + if (error) + goto error_icur; + + error = xfs_dialloc_ag_update_inobt(icur, &rec, offset); + if (error) + goto error_icur; + + /* + * Both trees have now been updated. We must update the perag and + * superblock before we can check the freecount for each btree. + */ + be32_add_cpu(&agi->agi_freecount, -1); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); + pag->pagi_freecount--; + + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); + + error = xfs_check_agi_freecount(icur, agi); + if (error) + goto error_icur; + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error_icur; + + xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_perag_put(pag); + *inop = ino; + return 0; + +error_icur: + xfs_btree_del_cursor(icur, XFS_BTREE_ERROR); +error_cur: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + xfs_perag_put(pag); + return error; +} + +/* * Allocate an inode on disk. * * Mode is used to tell whether the new inode will need space, and whether it @@ -935,7 +1334,7 @@ xfs_dialloc( * inode. */ if (mp->m_maxicount && - mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { + mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) { noroom = 1; okalloc = 0; } @@ -1029,78 +1428,34 @@ out_error: return XFS_ERROR(error); } -/* - * Free disk inode. Carefully avoids touching the incore inode, all - * manipulations incore are the caller's responsibility. - * The on-disk inode is not changed by this operation, only the - * btree (free inode mask) is changed. - */ -int -xfs_difree( - xfs_trans_t *tp, /* transaction pointer */ - xfs_ino_t inode, /* inode to be freed */ - xfs_bmap_free_t *flist, /* extents to free */ - int *delete, /* set if inode cluster was deleted */ - xfs_ino_t *first_ino) /* first inode in deleted cluster */ +STATIC int +xfs_difree_inobt( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agino_t agino, + struct xfs_bmap_free *flist, + int *deleted, + xfs_ino_t *first_ino, + struct xfs_inobt_rec_incore *orec) { - /* REFERENCED */ - xfs_agblock_t agbno; /* block number containing inode */ - xfs_buf_t *agbp; /* buffer containing allocation group header */ - xfs_agino_t agino; /* inode number relative to allocation group */ - xfs_agnumber_t agno; /* allocation group number */ - xfs_agi_t *agi; /* allocation group header */ - xfs_btree_cur_t *cur; /* inode btree cursor */ - int error; /* error return value */ - int i; /* result code */ - int ilen; /* inodes in an inode cluster */ - xfs_mount_t *mp; /* mount structure for filesystem */ - int off; /* offset of inode in inode chunk */ - xfs_inobt_rec_incore_t rec; /* btree record */ - struct xfs_perag *pag; - - mp = tp->t_mountp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + struct xfs_perag *pag; + struct xfs_btree_cur *cur; + struct xfs_inobt_rec_incore rec; + int ilen; + int error; + int i; + int off; - /* - * Break up inode number into its components. - */ - agno = XFS_INO_TO_AGNO(mp, inode); - if (agno >= mp->m_sb.sb_agcount) { - xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", - __func__, agno, mp->m_sb.sb_agcount); - ASSERT(0); - return XFS_ERROR(EINVAL); - } - agino = XFS_INO_TO_AGINO(mp, inode); - if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { - xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", - __func__, (unsigned long long)inode, - (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); - ASSERT(0); - return XFS_ERROR(EINVAL); - } - agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (agbno >= mp->m_sb.sb_agblocks) { - xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", - __func__, agbno, mp->m_sb.sb_agblocks); - ASSERT(0); - return XFS_ERROR(EINVAL); - } - /* - * Get the allocation group header. - */ - error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); - if (error) { - xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", - __func__, error); - return error; - } - agi = XFS_BUF_TO_AGI(agbp); ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); - ASSERT(agbno < be32_to_cpu(agi->agi_length)); + ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length)); + /* * Initialize the cursor. */ - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); error = xfs_check_agi_freecount(cur, agi); if (error) @@ -1138,9 +1493,9 @@ xfs_difree( * When an inode cluster is free, it becomes eligible for removal */ if (!(mp->m_flags & XFS_MOUNT_IKEEP) && - (rec.ir_freecount == XFS_IALLOC_INODES(mp))) { + (rec.ir_freecount == mp->m_ialloc_inos)) { - *delete = 1; + *deleted = 1; *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); /* @@ -1148,7 +1503,7 @@ xfs_difree( * AGI and Superblock inode counts, and mark the disk space * to be freed when the transaction is committed. */ - ilen = XFS_IALLOC_INODES(mp); + ilen = mp->m_ialloc_inos; be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); @@ -1164,11 +1519,11 @@ xfs_difree( goto error0; } - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, - agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)), - XFS_IALLOC_BLOCKS(mp), flist, mp); + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, + XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)), + mp->m_ialloc_blks, flist, mp); } else { - *delete = 0; + *deleted = 0; error = xfs_inobt_update(cur, &rec); if (error) { @@ -1192,6 +1547,7 @@ xfs_difree( if (error) goto error0; + *orec = rec; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return 0; @@ -1200,6 +1556,182 @@ error0: return error; } +/* + * Free an inode in the free inode btree. + */ +STATIC int +xfs_difree_finobt( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agino_t agino, + struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + struct xfs_btree_cur *cur; + struct xfs_inobt_rec_incore rec; + int offset = agino - ibtrec->ir_startino; + int error; + int i; + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); + + error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + goto error; + if (i == 0) { + /* + * If the record does not exist in the finobt, we must have just + * freed an inode in a previously fully allocated chunk. If not, + * something is out of sync. + */ + XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error); + + error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, + ibtrec->ir_free, &i); + if (error) + goto error; + ASSERT(i == 1); + + goto out; + } + + /* + * Read and update the existing record. We could just copy the ibtrec + * across here, but that would defeat the purpose of having redundant + * metadata. By making the modifications independently, we can catch + * corruptions that we wouldn't see if we just copied from one record + * to another. + */ + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error; + XFS_WANT_CORRUPTED_GOTO(i == 1, error); + + rec.ir_free |= XFS_INOBT_MASK(offset); + rec.ir_freecount++; + + XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) && + (rec.ir_freecount == ibtrec->ir_freecount), + error); + + /* + * The content of inobt records should always match between the inobt + * and finobt. The lifecycle of records in the finobt is different from + * the inobt in that the finobt only tracks records with at least one + * free inode. Hence, if all of the inodes are free and we aren't + * keeping inode chunks permanently on disk, remove the record. + * Otherwise, update the record with the new information. + */ + if (rec.ir_freecount == mp->m_ialloc_inos && + !(mp->m_flags & XFS_MOUNT_IKEEP)) { + error = xfs_btree_delete(cur, &i); + if (error) + goto error; + ASSERT(i == 1); + } else { + error = xfs_inobt_update(cur, &rec); + if (error) + goto error; + } + +out: + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error; + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; + +error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Free disk inode. Carefully avoids touching the incore inode, all + * manipulations incore are the caller's responsibility. + * The on-disk inode is not changed by this operation, only the + * btree (free inode mask) is changed. + */ +int +xfs_difree( + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t inode, /* inode to be freed */ + struct xfs_bmap_free *flist, /* extents to free */ + int *deleted,/* set if inode cluster was deleted */ + xfs_ino_t *first_ino)/* first inode in deleted cluster */ +{ + /* REFERENCED */ + xfs_agblock_t agbno; /* block number containing inode */ + struct xfs_buf *agbp; /* buffer for allocation group header */ + xfs_agino_t agino; /* allocation group inode number */ + xfs_agnumber_t agno; /* allocation group number */ + int error; /* error return value */ + struct xfs_mount *mp; /* mount structure for filesystem */ + struct xfs_inobt_rec_incore rec;/* btree record */ + + mp = tp->t_mountp; + + /* + * Break up inode number into its components. + */ + agno = XFS_INO_TO_AGNO(mp, inode); + if (agno >= mp->m_sb.sb_agcount) { + xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", + __func__, agno, mp->m_sb.sb_agcount); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + agino = XFS_INO_TO_AGINO(mp, inode); + if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { + xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", + __func__, (unsigned long long)inode, + (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + if (agbno >= mp->m_sb.sb_agblocks) { + xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", + __func__, agbno, mp->m_sb.sb_agblocks); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + /* + * Get the allocation group header. + */ + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) { + xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", + __func__, error); + return error; + } + + /* + * Fix up the inode allocation btree. + */ + error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino, + &rec); + if (error) + goto error0; + + /* + * Fix up the free inode btree. + */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + error = xfs_difree_finobt(mp, tp, agbp, agino, &rec); + if (error) + goto error0; + } + + return 0; + +error0: + return error; +} + STATIC int xfs_imap_lookup( struct xfs_mount *mp, @@ -1231,7 +1763,7 @@ xfs_imap_lookup( * we have a record, we need to ensure it contains the inode number * we are looking up. */ - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) @@ -1247,7 +1779,7 @@ xfs_imap_lookup( /* check that the returned record contains the required inode */ if (rec.ir_startino > agino || - rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) + rec.ir_startino + mp->m_ialloc_inos <= agino) return EINVAL; /* for untrusted inodes check it is allocated first */ @@ -1279,7 +1811,7 @@ xfs_imap( xfs_agblock_t cluster_agbno; /* first block in inode cluster */ int error; /* error code */ int offset; /* index of inode in its buffer */ - int offset_agbno; /* blks from chunk start to inode */ + xfs_agblock_t offset_agbno; /* blks from chunk start to inode */ ASSERT(ino != NULLFSINO); @@ -1320,7 +1852,7 @@ xfs_imap( return XFS_ERROR(EINVAL); } - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; + blks_per_cluster = xfs_icluster_size_fsb(mp); /* * For bulkstat and handle lookups, we have an untrusted inode number @@ -1341,7 +1873,7 @@ xfs_imap( * If the inode cluster size is the same as the blocksize or * smaller we get to the buffer by simple arithmetics. */ - if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) { + if (blks_per_cluster == 1) { offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); @@ -1419,7 +1951,16 @@ xfs_ialloc_compute_maxlevels( } /* - * Log specified fields for the ag hdr (inode section) + * Log specified fields for the ag hdr (inode section). The growth of the agi + * structure over time requires that we interpret the buffer as two logical + * regions delineated by the end of the unlinked list. This is due to the size + * of the hash table and its location in the middle of the agi. + * + * For example, a request to log a field before agi_unlinked and a field after + * agi_unlinked could cause us to log the entire hash table and use an excessive + * amount of log space. To avoid this behavior, log the region up through + * agi_unlinked in one call and the region after agi_unlinked through the end of + * the structure in another. */ void xfs_ialloc_log_agi( @@ -1442,6 +1983,8 @@ xfs_ialloc_log_agi( offsetof(xfs_agi_t, agi_newino), offsetof(xfs_agi_t, agi_dirino), offsetof(xfs_agi_t, agi_unlinked), + offsetof(xfs_agi_t, agi_free_root), + offsetof(xfs_agi_t, agi_free_level), sizeof(xfs_agi_t) }; #ifdef DEBUG @@ -1450,14 +1993,30 @@ xfs_ialloc_log_agi( agi = XFS_BUF_TO_AGI(bp); ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); #endif + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); + /* - * Compute byte offsets for the first and last fields. + * Compute byte offsets for the first and last fields in the first + * region and log the agi buffer. This only logs up through + * agi_unlinked. */ - xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last); + if (fields & XFS_AGI_ALL_BITS_R1) { + xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1, + &first, &last); + xfs_trans_log_buf(tp, bp, first, last); + } + /* - * Log the allocation group inode header buffer. + * Mask off the bits in the first region and calculate the first and + * last field offsets for any bits in the second region. */ - xfs_trans_log_buf(tp, bp, first, last); + fields &= ~XFS_AGI_ALL_BITS_R1; + if (fields) { + xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2, + &first, &last); + xfs_trans_log_buf(tp, bp, first, last); + } } #ifdef DEBUG @@ -1474,19 +2033,23 @@ xfs_check_agi_unlinked( #define xfs_check_agi_unlinked(agi) #endif -static void +static bool xfs_agi_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); - int agi_ok; + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid)) + return false; /* * Validate the magic number of the agi block. */ - agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) && - XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); + if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC)) + return false; + if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) + return false; /* * during growfs operations, the perag is not fully initialised, @@ -1494,30 +2057,50 @@ xfs_agi_verify( * use it by using uncached buffers that don't have the perag attached * so we can detect and avoid this problem. */ - if (bp->b_pag) - agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) == - bp->b_pag->pag_agno; + if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno) + return false; - if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, - XFS_RANDOM_IALLOC_READ_AGI))) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } xfs_check_agi_unlinked(agi); + return true; } static void xfs_agi_read_verify( struct xfs_buf *bp) { - xfs_agi_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp, + XFS_ERRTAG_IALLOC_READ_AGI, + XFS_RANDOM_IALLOC_READ_AGI)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); } static void xfs_agi_write_verify( struct xfs_buf *bp) { - xfs_agi_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_agi_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF); } const struct xfs_buf_ops xfs_agi_buf_ops = { @@ -1537,15 +2120,15 @@ xfs_read_agi( { int error; - ASSERT(agno != NULLAGNUMBER); + trace_xfs_read_agi(mp, agno); + ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); if (error) return error; - ASSERT(!xfs_buf_geterror(*bpp)); xfs_buf_set_ref(*bpp, XFS_AGI_REF); return 0; } @@ -1561,6 +2144,8 @@ xfs_ialloc_read_agi( struct xfs_perag *pag; /* per allocation group data */ int error; + trace_xfs_ialloc_read_agi(mp, agno); + error = xfs_read_agi(mp, tp, agno, bpp); if (error) return error; diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index c8da3df271e..95ad1c002d6 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -23,18 +23,20 @@ struct xfs_dinode; struct xfs_imap; struct xfs_mount; struct xfs_trans; +struct xfs_btree_cur; -/* - * Allocation parameters for inode allocation. - */ -#define XFS_IALLOC_INODES(mp) (mp)->m_ialloc_inos -#define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks - -/* - * Move inodes in clusters of this size. - */ +/* Move inodes in clusters of this size */ #define XFS_INODE_BIG_CLUSTER_SIZE 8192 -#define XFS_INODE_CLUSTER_SIZE(mp) (mp)->m_inode_cluster_size + +/* Calculate and return the number of filesystem blocks per inode cluster */ +static inline int +xfs_icluster_size_fsb( + struct xfs_mount *mp) +{ + if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size) + return 1; + return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog; +} /* * Make an inode pointer out of the buffer/offset. @@ -42,7 +44,7 @@ struct xfs_trans; static inline struct xfs_dinode * xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) { - return (xfs_dinode_t *) + return (struct xfs_dinode *) (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog)); } @@ -88,7 +90,7 @@ xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ struct xfs_bmap_free *flist, /* extents to free */ - int *delete, /* set if inode cluster was deleted */ + int *deleted, /* set if inode cluster was deleted */ xfs_ino_t *first_ino); /* first inode in deleted cluster */ /* @@ -150,6 +152,12 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *stat); -extern const struct xfs_buf_ops xfs_agi_buf_ops; +/* + * Inode chunk initialisation routine + */ +int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, + struct list_head *buffer_list, + xfs_agnumber_t agno, xfs_agblock_t agbno, + xfs_agblock_t length, unsigned int gen); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index bec344b3650..726f83a681a 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -17,23 +17,23 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" STATIC int @@ -49,7 +49,8 @@ xfs_inobt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno); + cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_btnum); } STATIC void @@ -66,12 +67,26 @@ xfs_inobt_set_root( xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); } +STATIC void +xfs_finobt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *nptr, + int inc) /* level change */ +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + + agi->agi_free_root = nptr->s; + be32_add_cpu(&agi->agi_free_level, inc); + xfs_ialloc_log_agi(cur->bc_tp, agbp, + XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL); +} + STATIC int xfs_inobt_alloc_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *start, union xfs_btree_ptr *new, - int length, int *stat) { xfs_alloc_arg_t args; /* block allocation args */ @@ -173,6 +188,17 @@ xfs_inobt_init_ptr_from_cur( ptr->s = agi->agi_root; } +STATIC void +xfs_finobt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + ptr->s = agi->agi_free_root; +} + STATIC __int64_t xfs_inobt_key_diff( struct xfs_btree_cur *cur, @@ -182,52 +208,92 @@ xfs_inobt_key_diff( cur->bc_rec.i.ir_startino; } -void +static int xfs_inobt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; unsigned int level; - int sblock_ok; /* block passes checks */ - /* magic number and level verification */ - level = be16_to_cpu(block->bb_level); - sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) && - level < mp->m_in_maxlevels; + /* + * During growfs operations, we can't verify the exact owner as the + * perag is not fully initialised and hence not attached to the buffer. + * + * Similarly, during log recovery we will have a perag structure + * attached, but the agi information will not yet have been initialised + * from the on disk AGI. We don't currently use any of this information, + * but beware of the landmine (i.e. need to check pag->pagi_init) if we + * ever do. + */ + switch (block->bb_magic) { + case cpu_to_be32(XFS_IBT_CRC_MAGIC): + case cpu_to_be32(XFS_FIBT_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ + case cpu_to_be32(XFS_IBT_MAGIC): + case cpu_to_be32(XFS_FIBT_MAGIC): + break; + default: + return 0; + } - /* numrecs verification */ - sblock_ok = sblock_ok && - be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0]; + /* numrecs and level verification */ + level = be16_to_cpu(block->bb_level); + if (level >= mp->m_in_maxlevels) + return false; + if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0]) + return false; /* sibling pointer verification */ - sblock_ok = sblock_ok && - (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_leftsib && - (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_rightsib; - - if (!sblock_ok) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; } static void xfs_inobt_read_verify( struct xfs_buf *bp) { - xfs_inobt_verify(bp); + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_inobt_verify(bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); + } } static void xfs_inobt_write_verify( struct xfs_buf *bp) { - xfs_inobt_verify(bp); + if (!xfs_inobt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + xfs_btree_sblock_calc_crc(bp); + } const struct xfs_buf_ops xfs_inobt_buf_ops = { @@ -235,7 +301,7 @@ const struct xfs_buf_ops xfs_inobt_buf_ops = { .verify_write = xfs_inobt_write_verify, }; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_inobt_keys_inorder( struct xfs_btree_cur *cur, @@ -273,7 +339,29 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, .buf_ops = &xfs_inobt_buf_ops, -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_inobt_keys_inorder, + .recs_inorder = xfs_inobt_recs_inorder, +#endif +}; + +static const struct xfs_btree_ops xfs_finobt_ops = { + .rec_len = sizeof(xfs_inobt_rec_t), + .key_len = sizeof(xfs_inobt_key_t), + + .dup_cursor = xfs_inobt_dup_cursor, + .set_root = xfs_finobt_set_root, + .alloc_block = xfs_inobt_alloc_block, + .free_block = xfs_inobt_free_block, + .get_minrecs = xfs_inobt_get_minrecs, + .get_maxrecs = xfs_inobt_get_maxrecs, + .init_key_from_rec = xfs_inobt_init_key_from_rec, + .init_rec_from_key = xfs_inobt_init_rec_from_key, + .init_rec_from_cur = xfs_inobt_init_rec_from_cur, + .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, + .key_diff = xfs_inobt_key_diff, + .buf_ops = &xfs_inobt_buf_ops, +#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, #endif @@ -287,7 +375,8 @@ xfs_inobt_init_cursor( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *agbp, /* buffer for agi structure */ - xfs_agnumber_t agno) /* allocation group number */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_btnum_t btnum) /* ialloc or free ino btree */ { struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); struct xfs_btree_cur *cur; @@ -296,11 +385,19 @@ xfs_inobt_init_cursor( cur->bc_tp = tp; cur->bc_mp = mp; - cur->bc_nlevels = be32_to_cpu(agi->agi_level); - cur->bc_btnum = XFS_BTNUM_INO; + cur->bc_btnum = btnum; + if (btnum == XFS_BTNUM_INO) { + cur->bc_nlevels = be32_to_cpu(agi->agi_level); + cur->bc_ops = &xfs_inobt_ops; + } else { + cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); + cur->bc_ops = &xfs_finobt_ops; + } + cur->bc_blocklog = mp->m_sb.sb_blocklog; - cur->bc_ops = &xfs_inobt_ops; + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index 25c0239a8ea..d7ebea72c2d 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -27,59 +27,11 @@ struct xfs_btree_cur; struct xfs_mount; /* - * There is a btree for the inode map per allocation group. - */ -#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ - -typedef __uint64_t xfs_inofree_t; -#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) -#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) -#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) -#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) - -static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) -{ - return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; -} - -/* - * Data record structure - */ -typedef struct xfs_inobt_rec { - __be32 ir_startino; /* starting inode number */ - __be32 ir_freecount; /* count of free inodes (set bits) */ - __be64 ir_free; /* free inode mask */ -} xfs_inobt_rec_t; - -typedef struct xfs_inobt_rec_incore { - xfs_agino_t ir_startino; /* starting inode number */ - __int32_t ir_freecount; /* count of free inodes (set bits) */ - xfs_inofree_t ir_free; /* free inode mask */ -} xfs_inobt_rec_incore_t; - - -/* - * Key structure - */ -typedef struct xfs_inobt_key { - __be32 ir_startino; /* starting inode number */ -} xfs_inobt_key_t; - -/* btree pointer type */ -typedef __be32 xfs_inobt_ptr_t; - -/* - * block numbers in the AG. - */ -#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) -#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) - -/* * Btree block header size depends on a superblock flag. - * - * (not quite yet, but soon) */ -#define XFS_INOBT_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN +#define XFS_INOBT_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) /* * Record, key, and pointer address macros for btree blocks. @@ -106,9 +58,8 @@ typedef __be32 xfs_inobt_ptr_t; ((index) - 1) * sizeof(xfs_inobt_ptr_t))) extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, - struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t); + struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, + xfs_btnum_t); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); -extern const struct xfs_buf_ops xfs_inobt_buf_ops; - #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 96e344e3e92..c48df5f25b9 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -17,26 +17,22 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_log_priv.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" #include "xfs_inode.h" -#include "xfs_dinode.h" #include "xfs_error.h" -#include "xfs_filestream.h" -#include "xfs_vnodeops.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" #include "xfs_inode_item.h" #include "xfs_quota.h" #include "xfs_trace.h" -#include "xfs_fsops.h" #include "xfs_icache.h" +#include "xfs_bmap_util.h" #include <linux/kthread.h> #include <linux/freezer.h> @@ -47,7 +43,7 @@ STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, /* * Allocate and initialise an xfs_inode. */ -STATIC struct xfs_inode * +struct xfs_inode * xfs_inode_alloc( struct xfs_mount *mp, xfs_ino_t ino) @@ -97,7 +93,7 @@ xfs_inode_free_callback( kmem_zone_free(xfs_inode_zone, ip); } -STATIC void +void xfs_inode_free( struct xfs_inode *ip) { @@ -118,11 +114,6 @@ xfs_inode_free( ip->i_itemp = NULL; } - /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(!xfs_isiflocked(ip)); - /* * Because we use RCU freeing we need to ensure the inode always * appears to be reclaimed with an invalid inode number when in the @@ -134,6 +125,10 @@ xfs_inode_free( ip->i_ino = 0; spin_unlock(&ip->i_flags_lock); + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!xfs_isiflocked(ip)); + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); } @@ -335,7 +330,9 @@ xfs_iget_cache_miss( iflags = XFS_INEW; if (flags & XFS_IGET_DONTCACHE) iflags |= XFS_IDONTCACHE; - ip->i_udquot = ip->i_gdquot = NULL; + ip->i_udquot = NULL; + ip->i_gdquot = NULL; + ip->i_pdquot = NULL; xfs_iflags_set(ip, iflags); /* insert the new inode */ @@ -498,11 +495,6 @@ xfs_inode_ag_walk_grab( if (!igrab(inode)) return ENOENT; - if (is_bad_inode(inode)) { - IRELE(ip); - return ENOENT; - } - /* inode is valid */ return 0; @@ -515,8 +507,7 @@ STATIC int xfs_inode_ag_walk( struct xfs_mount *mp, struct xfs_perag *pag, - int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags, + int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args, @@ -590,7 +581,7 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - error = execute(batch[i], pag, flags, args); + error = execute(batch[i], flags, args); IRELE(batch[i]); if (error == EAGAIN) { skipped++; @@ -617,7 +608,7 @@ restart: /* * Background scanning to trim post-EOF preallocated space. This is queued - * based on the 'background_prealloc_discard_period' tunable (5m by default). + * based on the 'speculative_prealloc_lifetime' tunable (5m by default). */ STATIC void xfs_queue_eofblocks( @@ -644,8 +635,7 @@ xfs_eofblocks_worker( int xfs_inode_ag_iterator( struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags, + int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args) @@ -672,8 +662,7 @@ xfs_inode_ag_iterator( int xfs_inode_ag_iterator_tag( struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags, + int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args, @@ -916,8 +905,6 @@ restart: xfs_iflock(ip); } - if (is_bad_inode(VFS_I(ip))) - goto reclaim; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); xfs_iflush_abort(ip, false); @@ -1164,7 +1151,7 @@ xfs_reclaim_inodes( * them to be cleaned, which we hope will not be very long due to the * background walker having already kicked the IO off on those dirty inodes. */ -void +long xfs_reclaim_inodes_nr( struct xfs_mount *mp, int nr_to_scan) @@ -1173,7 +1160,7 @@ xfs_reclaim_inodes_nr( xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); - xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); + return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); } /* @@ -1201,15 +1188,15 @@ xfs_inode_match_id( struct xfs_inode *ip, struct xfs_eofblocks *eofb) { - if (eofb->eof_flags & XFS_EOF_FLAGS_UID && - ip->i_d.di_uid != eofb->eof_uid) + if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && + !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) return 0; - if (eofb->eof_flags & XFS_EOF_FLAGS_GID && - ip->i_d.di_gid != eofb->eof_gid) + if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && + !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) return 0; - if (eofb->eof_flags & XFS_EOF_FLAGS_PRID && + if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && xfs_get_projid(ip) != eofb->eof_prid) return 0; @@ -1219,7 +1206,6 @@ xfs_inode_match_id( STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, - struct xfs_perag *pag, int flags, void *args) { diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index e0f138c70a2..9cf017b899b 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -21,17 +21,36 @@ struct xfs_mount; struct xfs_perag; +struct xfs_eofblocks { + __u32 eof_flags; + kuid_t eof_uid; + kgid_t eof_gid; + prid_t eof_prid; + __u64 eof_min_file_size; +}; + #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ +/* + * Flags for xfs_iget() + */ +#define XFS_IGET_CREATE 0x1 +#define XFS_IGET_UNTRUSTED 0x2 +#define XFS_IGET_DONTCACHE 0x4 + int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp); +/* recovery needs direct inode allocation capability */ +struct xfs_inode * xfs_inode_alloc(struct xfs_mount *mp, xfs_ino_t ino); +void xfs_inode_free(struct xfs_inode *ip); + void xfs_reclaim_worker(struct work_struct *work); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); -void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); +long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); @@ -40,14 +59,46 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); void xfs_eofblocks_worker(struct work_struct *); -int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, - int flags, void *args), + int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args); int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, - int flags, void *args), + int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args, int tag); +static inline int +xfs_fs_eofblocks_from_user( + struct xfs_fs_eofblocks *src, + struct xfs_eofblocks *dst) +{ + if (src->eof_version != XFS_EOFBLOCKS_VERSION) + return EINVAL; + + if (src->eof_flags & ~XFS_EOF_FLAGS_VALID) + return EINVAL; + + if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) || + memchr_inv(src->pad64, 0, sizeof(src->pad64))) + return EINVAL; + + dst->eof_flags = src->eof_flags; + dst->eof_prid = src->eof_prid; + dst->eof_min_file_size = src->eof_min_file_size; + + dst->eof_uid = INVALID_UID; + if (src->eof_flags & XFS_EOF_FLAGS_UID) { + dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid); + if (!uid_valid(dst->eof_uid)) + return EINVAL; + } + + dst->eof_gid = INVALID_GID; + if (src->eof_flags & XFS_EOF_FLAGS_GID) { + dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid); + if (!gid_valid(dst->eof_gid)) + return EINVAL; + } + return 0; +} + #endif diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c new file mode 100644 index 00000000000..7e454923325 --- /dev/null +++ b/fs/xfs/xfs_icreate_item.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2008-2010, 2013 Dave Chinner + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_icreate_item.h" +#include "xfs_log.h" + +kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ + +static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_icreate_item, ic_item); +} + +/* + * This returns the number of iovecs needed to log the given inode item. + * + * We only need one iovec for the icreate log structure. + */ +STATIC void +xfs_icreate_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_icreate_log); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given inode create log item. + */ +STATIC void +xfs_icreate_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_icreate_item *icp = ICR_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE, + &icp->ic_format, + sizeof(struct xfs_icreate_log)); +} + + +/* Pinning has no meaning for the create item, so just return. */ +STATIC void +xfs_icreate_item_pin( + struct xfs_log_item *lip) +{ +} + + +/* pinning has no meaning for the create item, so just return. */ +STATIC void +xfs_icreate_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +STATIC void +xfs_icreate_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_icreate_item *icp = ICR_ITEM(lip); + + if (icp->ic_item.li_flags & XFS_LI_ABORTED) + kmem_zone_free(xfs_icreate_zone, icp); + return; +} + +/* + * Because we have ordered buffers being tracked in the AIL for the inode + * creation, we don't need the create item after this. Hence we can free + * the log item and return -1 to tell the caller we're done with the item. + */ +STATIC xfs_lsn_t +xfs_icreate_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_icreate_item *icp = ICR_ITEM(lip); + + kmem_zone_free(xfs_icreate_zone, icp); + return (xfs_lsn_t)-1; +} + +/* item can never get into the AIL */ +STATIC uint +xfs_icreate_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + ASSERT(0); + return XFS_ITEM_SUCCESS; +} + +/* Ordered buffers do the dependency tracking here, so this does nothing. */ +STATIC void +xfs_icreate_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all buf log items. + */ +static struct xfs_item_ops xfs_icreate_item_ops = { + .iop_size = xfs_icreate_item_size, + .iop_format = xfs_icreate_item_format, + .iop_pin = xfs_icreate_item_pin, + .iop_unpin = xfs_icreate_item_unpin, + .iop_push = xfs_icreate_item_push, + .iop_unlock = xfs_icreate_item_unlock, + .iop_committed = xfs_icreate_item_committed, + .iop_committing = xfs_icreate_item_committing, +}; + + +/* + * Initialize the inode log item for a newly allocated (in-core) inode. + * + * Inode extents can only reside within an AG. Hence specify the starting + * block for the inode chunk by offset within an AG as well as the + * length of the allocated extent. + * + * This joins the item to the transaction and marks it dirty so + * that we don't need a separate call to do this, nor does the + * caller need to know anything about the icreate item. + */ +void +xfs_icreate_log( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + unsigned int count, + unsigned int inode_size, + xfs_agblock_t length, + unsigned int generation) +{ + struct xfs_icreate_item *icp; + + icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP); + + xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, + &xfs_icreate_item_ops); + + icp->ic_format.icl_type = XFS_LI_ICREATE; + icp->ic_format.icl_size = 1; /* single vector */ + icp->ic_format.icl_ag = cpu_to_be32(agno); + icp->ic_format.icl_agbno = cpu_to_be32(agbno); + icp->ic_format.icl_count = cpu_to_be32(count); + icp->ic_format.icl_isize = cpu_to_be32(inode_size); + icp->ic_format.icl_length = cpu_to_be32(length); + icp->ic_format.icl_gen = cpu_to_be32(generation); + + xfs_trans_add_item(tp, &icp->ic_item); + tp->t_flags |= XFS_TRANS_DIRTY; + icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h new file mode 100644 index 00000000000..59e89f87c09 --- /dev/null +++ b/fs/xfs/xfs_icreate_item.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2008-2010, Dave Chinner + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef XFS_ICREATE_ITEM_H +#define XFS_ICREATE_ITEM_H 1 + +/* in memory log item structure */ +struct xfs_icreate_item { + struct xfs_log_item ic_item; + struct xfs_icreate_log ic_format; +}; + +extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ + +void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t agbno, unsigned int count, + unsigned int inode_size, xfs_agblock_t length, + unsigned int generation); + +#endif /* XFS_ICREATE_ITEM_H */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 66282dcb821..a6115fe1ac9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -19,35 +19,38 @@ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_attr_sf.h" +#include "xfs_attr.h" +#include "xfs_trans_space.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_inode_item.h" -#include "xfs_btree.h" -#include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "xfs_error.h" -#include "xfs_utils.h" #include "xfs_quota.h" #include "xfs_filestream.h" -#include "xfs_vnodeops.h" +#include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_symlink.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_bmap_btree.h" -kmem_zone_t *xfs_ifork_zone; kmem_zone_t *xfs_inode_zone; /* @@ -57,9 +60,8 @@ kmem_zone_t *xfs_inode_zone; #define XFS_ITRUNC_MAX_EXTENTS 2 STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); -STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); -STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); -STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); + +STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *); /* * helper function to extract extent size hint from inode @@ -76,48 +78,44 @@ xfs_get_extsz_hint( } /* - * This is a wrapper routine around the xfs_ilock() routine used to centralize - * some grungy code. It is used in places that wish to lock the inode solely - * for reading the extents. The reason these places can't just call - * xfs_ilock(SHARED) is that the inode lock also guards to bringing in of the - * extents from disk for a file in b-tree format. If the inode is in b-tree - * format, then we need to lock the inode exclusively until the extents are read - * in. Locking it exclusively all the time would limit our parallelism - * unnecessarily, though. What we do instead is check to see if the extents - * have been read in yet, and only lock the inode exclusively if they have not. + * These two are wrapper routines around the xfs_ilock() routine used to + * centralize some grungy code. They are used in places that wish to lock the + * inode solely for reading the extents. The reason these places can't just + * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards to + * bringing in of the extents from disk for a file in b-tree format. If the + * inode is in b-tree format, then we need to lock the inode exclusively until + * the extents are read in. Locking it exclusively all the time would limit + * our parallelism unnecessarily, though. What we do instead is check to see + * if the extents have been read in yet, and only lock the inode exclusively + * if they have not. * - * The function returns a value which should be given to the corresponding - * xfs_iunlock_map_shared(). This value is the mode in which the lock was - * actually taken. + * The functions return a value which should be given to the corresponding + * xfs_iunlock() call. */ uint -xfs_ilock_map_shared( - xfs_inode_t *ip) +xfs_ilock_data_map_shared( + struct xfs_inode *ip) { - uint lock_mode; + uint lock_mode = XFS_ILOCK_SHARED; - if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && - ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + (ip->i_df.if_flags & XFS_IFEXTENTS) == 0) lock_mode = XFS_ILOCK_EXCL; - } else { - lock_mode = XFS_ILOCK_SHARED; - } - xfs_ilock(ip, lock_mode); - return lock_mode; } -/* - * This is simply the unlock routine to go with xfs_ilock_map_shared(). - * All it does is call xfs_iunlock() with the given lock_mode. - */ -void -xfs_iunlock_map_shared( - xfs_inode_t *ip, - unsigned int lock_mode) +uint +xfs_ilock_attr_map_shared( + struct xfs_inode *ip) { - xfs_iunlock(ip, lock_mode); + uint lock_mode = XFS_ILOCK_SHARED; + + if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE && + (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0) + lock_mode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lock_mode); + return lock_mode; } /* @@ -286,7 +284,7 @@ xfs_ilock_demote( trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); } -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) int xfs_isilocked( xfs_inode_t *ip, @@ -309,599 +307,202 @@ xfs_isilocked( } #endif -void -__xfs_iflock( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); - - do { - prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); - if (xfs_isiflocked(ip)) - io_schedule(); - } while (!xfs_iflock_nowait(ip)); - - finish_wait(wq, &wait.wait); -} - #ifdef DEBUG +int xfs_locked_n; +int xfs_small_retries; +int xfs_middle_retries; +int xfs_lots_retries; +int xfs_lock_delays; +#endif + /* - * Make sure that the extents in the given memory buffer - * are valid. + * Bump the subclass so xfs_lock_inodes() acquires each lock with + * a different value */ -STATIC void -xfs_validate_extents( - xfs_ifork_t *ifp, - int nrecs, - xfs_exntfmt_t fmt) +static inline int +xfs_lock_inumorder(int lock_mode, int subclass) { - xfs_bmbt_irec_t irec; - xfs_bmbt_rec_host_t rec; - int i; + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; + if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; - for (i = 0; i < nrecs; i++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); - rec.l0 = get_unaligned(&ep->l0); - rec.l1 = get_unaligned(&ep->l1); - xfs_bmbt_get_all(&rec, &irec); - if (fmt == XFS_EXTFMT_NOSTATE) - ASSERT(irec.br_state == XFS_EXT_NORM); - } + return lock_mode; } -#else /* DEBUG */ -#define xfs_validate_extents(ifp, nrecs, fmt) -#endif /* DEBUG */ /* - * Check that none of the inode's in the buffer have a next - * unlinked field of 0. + * The following routine will lock n inodes in exclusive mode. + * We assume the caller calls us with the inodes in i_ino order. + * + * We need to detect deadlock where an inode that we lock + * is in the AIL and we start waiting for another inode that is locked + * by a thread in a long running transaction (such as truncate). This can + * result in deadlock since the long running trans might need to wait + * for the inode we just locked in order to push the tail and free space + * in the log. */ -#if defined(DEBUG) void -xfs_inobp_check( - xfs_mount_t *mp, - xfs_buf_t *bp) +xfs_lock_inodes( + xfs_inode_t **ips, + int inodes, + uint lock_mode) { - int i; - int j; - xfs_dinode_t *dip; + int attempts = 0, i, j, try_lock; + xfs_log_item_t *lp; - j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; + ASSERT(ips && (inodes >= 2)); /* we need at least two */ - for (i = 0; i < j; i++) { - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - i * mp->m_sb.sb_inodesize); - if (!dip->di_next_unlinked) { - xfs_alert(mp, - "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.", - bp); - ASSERT(dip->di_next_unlinked); - } - } -} -#endif - -static void -xfs_inode_buf_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_target->bt_mount; - int i; - int ni; - - /* - * Validate the magic number and version of every inode in the buffer - */ - ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; - for (i = 0; i < ni; i++) { - int di_ok; - xfs_dinode_t *dip; - - dip = (struct xfs_dinode *)xfs_buf_offset(bp, - (i << mp->m_sb.sb_inodelog)); - di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && - XFS_DINODE_GOOD_VERSION(dip->di_version); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { - xfs_buf_ioerror(bp, EFSCORRUPTED); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, - mp, dip); -#ifdef DEBUG - xfs_emerg(mp, - "bad inode magic/vsn daddr %lld #%d (magic=%x)", - (unsigned long long)bp->b_bn, i, - be16_to_cpu(dip->di_magic)); - ASSERT(0); -#endif - } - } - xfs_inobp_check(mp, bp); -} - - -static void -xfs_inode_buf_read_verify( - struct xfs_buf *bp) -{ - xfs_inode_buf_verify(bp); -} - -static void -xfs_inode_buf_write_verify( - struct xfs_buf *bp) -{ - xfs_inode_buf_verify(bp); -} + try_lock = 0; + i = 0; -const struct xfs_buf_ops xfs_inode_buf_ops = { - .verify_read = xfs_inode_buf_read_verify, - .verify_write = xfs_inode_buf_write_verify, -}; +again: + for (; i < inodes; i++) { + ASSERT(ips[i]); + if (i && (ips[i] == ips[i-1])) /* Already locked */ + continue; -/* - * This routine is called to map an inode to the buffer containing the on-disk - * version of the inode. It returns a pointer to the buffer containing the - * on-disk inode in the bpp parameter, and in the dipp parameter it returns a - * pointer to the on-disk inode within that buffer. - * - * If a non-zero error is returned, then the contents of bpp and dipp are - * undefined. - */ -int -xfs_imap_to_bp( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_imap *imap, - struct xfs_dinode **dipp, - struct xfs_buf **bpp, - uint buf_flags, - uint iget_flags) -{ - struct xfs_buf *bp; - int error; + /* + * If try_lock is not set yet, make sure all locked inodes + * are not in the AIL. + * If any are, set try_lock to be used later. + */ - buf_flags |= XBF_UNMAPPED; - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - (int)imap->im_len, buf_flags, &bp, - &xfs_inode_buf_ops); - if (error) { - if (error == EAGAIN) { - ASSERT(buf_flags & XBF_TRYLOCK); - return error; + if (!try_lock) { + for (j = (i - 1); j >= 0 && !try_lock; j--) { + lp = (xfs_log_item_t *)ips[j]->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + try_lock++; + } + } } - if (error == EFSCORRUPTED && - (iget_flags & XFS_IGET_UNTRUSTED)) - return XFS_ERROR(EINVAL); - - xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", - __func__, error); - return error; - } - - *bpp = bp; - *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); - return 0; -} - -/* - * Move inode type and inode format specific information from the - * on-disk inode to the in-core inode. For fifos, devs, and sockets - * this means set if_rdev to the proper value. For files, directories, - * and symlinks this means to bring in the in-line data or extent - * pointers. For a file in B-tree format, only the root is immediately - * brought in-core. The rest will be in-lined in if_extents when it - * is first referenced (see xfs_iread_extents()). - */ -STATIC int -xfs_iformat( - xfs_inode_t *ip, - xfs_dinode_t *dip) -{ - xfs_attr_shortform_t *atp; - int size; - int error = 0; - xfs_fsize_t di_size; - - if (unlikely(be32_to_cpu(dip->di_nextents) + - be16_to_cpu(dip->di_anextents) > - be64_to_cpu(dip->di_nblocks))) { - xfs_warn(ip->i_mount, - "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", - (unsigned long long)ip->i_ino, - (int)(be32_to_cpu(dip->di_nextents) + - be16_to_cpu(dip->di_anextents)), - (unsigned long long) - be64_to_cpu(dip->di_nblocks)); - XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - - if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { - xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.", - (unsigned long long)ip->i_ino, - dip->di_forkoff); - XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - - if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && - !ip->i_mount->m_rtdev_targp)) { - xfs_warn(ip->i_mount, - "corrupt dinode %Lu, has realtime flag set.", - ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", - XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - - switch (ip->i_d.di_mode & S_IFMT) { - case S_IFIFO: - case S_IFCHR: - case S_IFBLK: - case S_IFSOCK: - if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) { - XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - ip->i_d.di_size = 0; - ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); - break; + /* + * If any of the previous locks we have locked is in the AIL, + * we must TRY to get the second and subsequent locks. If + * we can't get any, we must release all we have + * and try again. + */ - case S_IFREG: - case S_IFLNK: - case S_IFDIR: - switch (dip->di_format) { - case XFS_DINODE_FMT_LOCAL: + if (try_lock) { + /* try_lock must be 0 if i is 0. */ /* - * no local regular files yet + * try_lock means we have an inode locked + * that is in the AIL. */ - if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) { - xfs_warn(ip->i_mount, - "corrupt inode %Lu (local format for regular file).", - (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat(4)", - XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } + ASSERT(i != 0); + if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) { + attempts++; + + /* + * Unlock all previous guys and try again. + * xfs_iunlock will try to push the tail + * if the inode is in the AIL. + */ + + for(j = i - 1; j >= 0; j--) { + + /* + * Check to see if we've already + * unlocked this one. + * Not the first one going back, + * and the inode ptr is the same. + */ + if ((j != (i - 1)) && ips[j] == + ips[j+1]) + continue; + + xfs_iunlock(ips[j], lock_mode); + } - di_size = be64_to_cpu(dip->di_size); - if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { - xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %Ld for local inode).", - (unsigned long long) ip->i_ino, - (long long) di_size); - XFS_CORRUPTION_ERROR("xfs_iformat(5)", - XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); + if ((attempts % 5) == 0) { + delay(1); /* Don't just spin the CPU */ +#ifdef DEBUG + xfs_lock_delays++; +#endif + } + i = 0; + try_lock = 0; + goto again; } - - size = (int)di_size; - error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); - break; - default: - XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); + } else { + xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); } - break; - - default: - XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); } - if (error) { - return error; - } - if (!XFS_DFORK_Q(dip)) - return 0; - - ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); - - switch (dip->di_aformat) { - case XFS_DINODE_FMT_LOCAL: - atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); - size = be16_to_cpu(atp->hdr.totsize); - - if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { - xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad attr fork size %Ld).", - (unsigned long long) ip->i_ino, - (long long) size); - XFS_CORRUPTION_ERROR("xfs_iformat(8)", - XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); - break; - default: - error = XFS_ERROR(EFSCORRUPTED); - break; - } - if (error) { - kmem_zone_free(xfs_ifork_zone, ip->i_afp); - ip->i_afp = NULL; - xfs_idestroy_fork(ip, XFS_DATA_FORK); +#ifdef DEBUG + if (attempts) { + if (attempts < 5) xfs_small_retries++; + else if (attempts < 100) xfs_middle_retries++; + else xfs_lots_retries++; + } else { + xfs_locked_n++; } - return error; +#endif } /* - * The file is in-lined in the on-disk inode. - * If it fits into if_inline_data, then copy - * it there, otherwise allocate a buffer for it - * and copy the data there. Either way, set - * if_data to point at the data. - * If we allocate a buffer for the data, make - * sure that its size is a multiple of 4 and - * record the real size in i_real_bytes. + * xfs_lock_two_inodes() can only be used to lock one type of lock + * at a time - the iolock or the ilock, but not both at once. If + * we lock both at once, lockdep will report false positives saying + * we have violated locking orders. */ -STATIC int -xfs_iformat_local( - xfs_inode_t *ip, - xfs_dinode_t *dip, - int whichfork, - int size) +void +xfs_lock_two_inodes( + xfs_inode_t *ip0, + xfs_inode_t *ip1, + uint lock_mode) { - xfs_ifork_t *ifp; - int real_size; - - /* - * If the size is unreasonable, then something - * is wrong and we just bail out rather than crash in - * kmem_alloc() or memcpy() below. - */ - if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { - xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %d for local fork, size = %d).", - (unsigned long long) ip->i_ino, size, - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); - XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - ifp = XFS_IFORK_PTR(ip, whichfork); - real_size = 0; - if (size == 0) - ifp->if_u1.if_data = NULL; - else if (size <= sizeof(ifp->if_u2.if_inline_data)) - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - else { - real_size = roundup(size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); - } - ifp->if_bytes = size; - ifp->if_real_bytes = real_size; - if (size) - memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); - ifp->if_flags &= ~XFS_IFEXTENTS; - ifp->if_flags |= XFS_IFINLINE; - return 0; -} + xfs_inode_t *temp; + int attempts = 0; + xfs_log_item_t *lp; -/* - * The file consists of a set of extents all - * of which fit into the on-disk inode. - * If there are few enough extents to fit into - * the if_inline_ext, then copy them there. - * Otherwise allocate a buffer for them and copy - * them into it. Either way, set if_extents - * to point at the extents. - */ -STATIC int -xfs_iformat_extents( - xfs_inode_t *ip, - xfs_dinode_t *dip, - int whichfork) -{ - xfs_bmbt_rec_t *dp; - xfs_ifork_t *ifp; - int nex; - int size; - int i; - - ifp = XFS_IFORK_PTR(ip, whichfork); - nex = XFS_DFORK_NEXTENTS(dip, whichfork); - size = nex * (uint)sizeof(xfs_bmbt_rec_t); - - /* - * If the number of extents is unreasonable, then something - * is wrong and we just bail out rather than crash in - * kmem_alloc() or memcpy() below. - */ - if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { - xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", - (unsigned long long) ip->i_ino, nex); - XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - - ifp->if_real_bytes = 0; - if (nex == 0) - ifp->if_u1.if_extents = NULL; - else if (nex <= XFS_INLINE_EXTS) - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; - else - xfs_iext_add(ifp, 0, nex); - - ifp->if_bytes = size; - if (size) { - dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); - xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); - for (i = 0; i < nex; i++, dp++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); - ep->l0 = get_unaligned_be64(&dp->l0); - ep->l1 = get_unaligned_be64(&dp->l1); - } - XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); - if (whichfork != XFS_DATA_FORK || - XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) - if (unlikely(xfs_check_nostate_extents( - ifp, 0, nex))) { - XFS_ERROR_REPORT("xfs_iformat_extents(2)", - XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); - } + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); + ASSERT(ip0->i_ino != ip1->i_ino); + + if (ip0->i_ino > ip1->i_ino) { + temp = ip0; + ip0 = ip1; + ip1 = temp; } - ifp->if_flags |= XFS_IFEXTENTS; - return 0; -} -/* - * The file has too many extents to fit into - * the inode, so they are in B-tree format. - * Allocate a buffer for the root of the B-tree - * and copy the root into it. The i_extents - * field will remain NULL until all of the - * extents are read in (when they are needed). - */ -STATIC int -xfs_iformat_btree( - xfs_inode_t *ip, - xfs_dinode_t *dip, - int whichfork) -{ - xfs_bmdr_block_t *dfp; - xfs_ifork_t *ifp; - /* REFERENCED */ - int nrecs; - int size; - - ifp = XFS_IFORK_PTR(ip, whichfork); - dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); - size = XFS_BMAP_BROOT_SPACE(dfp); - nrecs = be16_to_cpu(dfp->bb_numrecs); - - /* - * blow out if -- fork has less extents than can fit in - * fork (fork shouldn't be a btree format), root btree - * block has more records than can fit into the fork, - * or the number of extents is greater than the number of - * blocks. - */ - if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork) || - XFS_BMDR_SPACE_CALC(nrecs) > - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) || - XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { - xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", - (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - - ifp->if_broot_bytes = size; - ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); - ASSERT(ifp->if_broot != NULL); - /* - * Copy and convert from the on-disk structure - * to the in-memory structure. - */ - xfs_bmdr_to_bmbt(ip->i_mount, dfp, - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), - ifp->if_broot, size); - ifp->if_flags &= ~XFS_IFEXTENTS; - ifp->if_flags |= XFS_IFBROOT; + again: + xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0)); - return 0; + /* + * If the first lock we have locked is in the AIL, we must TRY to get + * the second lock. If we can't get it, we must release the first one + * and try again. + */ + lp = (xfs_log_item_t *)ip0->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) { + xfs_iunlock(ip0, lock_mode); + if ((++attempts % 5) == 0) + delay(1); /* Don't just spin the CPU */ + goto again; + } + } else { + xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1)); + } } -STATIC void -xfs_dinode_from_disk( - xfs_icdinode_t *to, - xfs_dinode_t *from) -{ - to->di_magic = be16_to_cpu(from->di_magic); - to->di_mode = be16_to_cpu(from->di_mode); - to->di_version = from ->di_version; - to->di_format = from->di_format; - to->di_onlink = be16_to_cpu(from->di_onlink); - to->di_uid = be32_to_cpu(from->di_uid); - to->di_gid = be32_to_cpu(from->di_gid); - to->di_nlink = be32_to_cpu(from->di_nlink); - to->di_projid_lo = be16_to_cpu(from->di_projid_lo); - to->di_projid_hi = be16_to_cpu(from->di_projid_hi); - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); - to->di_flushiter = be16_to_cpu(from->di_flushiter); - to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); - to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec); - to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec); - to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec); - to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec); - to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec); - to->di_size = be64_to_cpu(from->di_size); - to->di_nblocks = be64_to_cpu(from->di_nblocks); - to->di_extsize = be32_to_cpu(from->di_extsize); - to->di_nextents = be32_to_cpu(from->di_nextents); - to->di_anextents = be16_to_cpu(from->di_anextents); - to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; - to->di_dmevmask = be32_to_cpu(from->di_dmevmask); - to->di_dmstate = be16_to_cpu(from->di_dmstate); - to->di_flags = be16_to_cpu(from->di_flags); - to->di_gen = be32_to_cpu(from->di_gen); -} void -xfs_dinode_to_disk( - xfs_dinode_t *to, - xfs_icdinode_t *from) +__xfs_iflock( + struct xfs_inode *ip) { - to->di_magic = cpu_to_be16(from->di_magic); - to->di_mode = cpu_to_be16(from->di_mode); - to->di_version = from ->di_version; - to->di_format = from->di_format; - to->di_onlink = cpu_to_be16(from->di_onlink); - to->di_uid = cpu_to_be32(from->di_uid); - to->di_gid = cpu_to_be32(from->di_gid); - to->di_nlink = cpu_to_be32(from->di_nlink); - to->di_projid_lo = cpu_to_be16(from->di_projid_lo); - to->di_projid_hi = cpu_to_be16(from->di_projid_hi); - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); - to->di_flushiter = cpu_to_be16(from->di_flushiter); - to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); - to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); - to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); - to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); - to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); - to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); - to->di_size = cpu_to_be64(from->di_size); - to->di_nblocks = cpu_to_be64(from->di_nblocks); - to->di_extsize = cpu_to_be32(from->di_extsize); - to->di_nextents = cpu_to_be32(from->di_nextents); - to->di_anextents = cpu_to_be16(from->di_anextents); - to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; - to->di_dmevmask = cpu_to_be32(from->di_dmevmask); - to->di_dmstate = cpu_to_be16(from->di_dmstate); - to->di_flags = cpu_to_be16(from->di_flags); - to->di_gen = cpu_to_be32(from->di_gen); + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); + + do { + prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_isiflocked(ip)) + io_schedule(); + } while (!xfs_iflock_nowait(ip)); + + finish_wait(wq, &wait.wait); } STATIC uint @@ -963,162 +564,49 @@ xfs_dic2xflags( } /* - * Read the disk inode attributes into the in-core inode structure. + * Lookups up an inode from "name". If ci_name is not NULL, then a CI match + * is allowed, otherwise it has to be an exact match. If a CI match is found, + * ci_name->name will point to a the actual name (caller must free) or + * will be set to NULL if an exact match is found. */ int -xfs_iread( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_inode_t *ip, - uint iget_flags) +xfs_lookup( + xfs_inode_t *dp, + struct xfs_name *name, + xfs_inode_t **ipp, + struct xfs_name *ci_name) { - xfs_buf_t *bp; - xfs_dinode_t *dip; - int error; + xfs_ino_t inum; + int error; + uint lock_mode; - /* - * Fill in the location information in the in-core inode. - */ - error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); - if (error) - return error; + trace_xfs_lookup(dp, name); - /* - * Get pointers to the on-disk inode and the buffer containing it. - */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags); - if (error) - return error; + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return XFS_ERROR(EIO); - /* - * If we got something that isn't an inode it means someone - * (nfs or dmi) has a stale handle. - */ - if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) { -#ifdef DEBUG - xfs_alert(mp, - "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)", - __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC); -#endif /* DEBUG */ - error = XFS_ERROR(EINVAL); - goto out_brelse; - } + lock_mode = xfs_ilock_data_map_shared(dp); + error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); + xfs_iunlock(dp, lock_mode); - /* - * If the on-disk inode is already linked to a directory - * entry, copy all of the inode into the in-core inode. - * xfs_iformat() handles copying in the inode format - * specific information. - * Otherwise, just get the truly permanent information. - */ - if (dip->di_mode) { - xfs_dinode_from_disk(&ip->i_d, dip); - error = xfs_iformat(ip, dip); - if (error) { -#ifdef DEBUG - xfs_alert(mp, "%s: xfs_iformat() returned error %d", - __func__, error); -#endif /* DEBUG */ - goto out_brelse; - } - } else { - ip->i_d.di_magic = be16_to_cpu(dip->di_magic); - ip->i_d.di_version = dip->di_version; - ip->i_d.di_gen = be32_to_cpu(dip->di_gen); - ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); - /* - * Make sure to pull in the mode here as well in - * case the inode is released without being used. - * This ensures that xfs_inactive() will see that - * the inode is already free and not try to mess - * with the uninitialized part of it. - */ - ip->i_d.di_mode = 0; - } - - /* - * The inode format changed when we moved the link count and - * made it 32 bits long. If this is an old format inode, - * convert it in memory to look like a new one. If it gets - * flushed to disk we will convert back before flushing or - * logging it. We zero out the new projid field and the old link - * count field. We'll handle clearing the pad field (the remains - * of the old uuid field) when we actually convert the inode to - * the new format. We don't change the version number so that we - * can distinguish this from a real new format inode. - */ - if (ip->i_d.di_version == 1) { - ip->i_d.di_nlink = ip->i_d.di_onlink; - ip->i_d.di_onlink = 0; - xfs_set_projid(ip, 0); - } + if (error) + goto out; - ip->i_delayed_blks = 0; + error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp); + if (error) + goto out_free_name; - /* - * Mark the buffer containing the inode as something to keep - * around for a while. This helps to keep recently accessed - * meta-data in-core longer. - */ - xfs_buf_set_ref(bp, XFS_INO_REF); + return 0; - /* - * Use xfs_trans_brelse() to release the buffer containing the - * on-disk inode, because it was acquired with xfs_trans_read_buf() - * in xfs_imap_to_bp() above. If tp is NULL, this is just a normal - * brelse(). If we're within a transaction, then xfs_trans_brelse() - * will only release the buffer if it is not dirty within the - * transaction. It will be OK to release the buffer in this case, - * because inodes on disk are never destroyed and we will be - * locking the new in-core inode before putting it in the hash - * table where other processes can find it. Thus we don't have - * to worry about the inode being changed just because we released - * the buffer. - */ - out_brelse: - xfs_trans_brelse(tp, bp); +out_free_name: + if (ci_name) + kmem_free(ci_name->name); +out: + *ipp = NULL; return error; } /* - * Read in extents from a btree-format inode. - * Allocate and fill in if_extents. Real work is done in xfs_bmap.c. - */ -int -xfs_iread_extents( - xfs_trans_t *tp, - xfs_inode_t *ip, - int whichfork) -{ - int error; - xfs_ifork_t *ifp; - xfs_extnum_t nextents; - - if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { - XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); - } - nextents = XFS_IFORK_NEXTENTS(ip, whichfork); - ifp = XFS_IFORK_PTR(ip, whichfork); - - /* - * We know that the size is valid (it's checked in iformat_btree) - */ - ifp->if_bytes = ifp->if_real_bytes = 0; - ifp->if_flags |= XFS_IFEXTENTS; - xfs_iext_add(ifp, 0, nextents); - error = xfs_bmap_read_extents(tp, ip, whichfork); - if (error) { - xfs_iext_destroy(ifp); - ifp->if_flags &= ~XFS_IFEXTENTS; - return error; - } - xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); - return 0; -} - -/* * Allocate an inode on disk and return a copy of its in-core version. * The in-core inode is locked exclusively. Set mode, nlink, and rdev * appropriately within the inode. The uid and gid for the inode are @@ -1161,12 +649,12 @@ xfs_ialloc( xfs_buf_t **ialloc_context, xfs_inode_t **ipp) { + struct xfs_mount *mp = tp->t_mountp; xfs_ino_t ino; xfs_inode_t *ip; uint flags; int error; timespec_t tv; - int filestreams = 0; /* * Call the space management code to pick @@ -1187,42 +675,29 @@ xfs_ialloc( * This is because we're setting fields here we need * to prevent others from looking at until we're done. */ - error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE, + error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); if (error) return error; ASSERT(ip != NULL); + /* + * We always convert v1 inodes to v2 now - we only support filesystems + * with >= v2 inode capability, so there is no reason for ever leaving + * an inode in v1 format. + */ + if (ip->i_d.di_version == 1) + ip->i_d.di_version = 2; + ip->i_d.di_mode = mode; ip->i_d.di_onlink = 0; ip->i_d.di_nlink = nlink; ASSERT(ip->i_d.di_nlink == nlink); - ip->i_d.di_uid = current_fsuid(); - ip->i_d.di_gid = current_fsgid(); + ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); + ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); xfs_set_projid(ip, prid); memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - /* - * If the superblock version is up to where we support new format - * inodes and this is currently an old format inode, then change - * the inode version number now. This way we only do the conversion - * here rather than here and in the flush/logging code. - */ - if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) && - ip->i_d.di_version == 1) { - ip->i_d.di_version = 2; - /* - * We've already zeroed the old link count, the projid field, - * and the pad field. - */ - } - - /* - * Project ids won't be stored on disk if we are using a version 1 inode. - */ - if ((prid != 0) && (ip->i_d.di_version == 1)) - xfs_bump_ino_vers2(tp, ip); - if (pip && XFS_INHERIT_GID(pip)) { ip->i_d.di_gid = pip->i_d.di_gid; if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) { @@ -1237,7 +712,7 @@ xfs_ialloc( */ if ((irix_sgid_inherit) && (ip->i_d.di_mode & S_ISGID) && - (!in_group_p((gid_t)ip->i_d.di_gid))) { + (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) { ip->i_d.di_mode &= ~S_ISGID; } @@ -1258,6 +733,19 @@ xfs_ialloc( ip->i_d.di_dmevmask = 0; ip->i_d.di_dmstate = 0; ip->i_d.di_flags = 0; + + if (ip->i_d.di_version == 3) { + ASSERT(ip->i_d.di_ino == ino); + ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid)); + ip->i_d.di_crc = 0; + ip->i_d.di_changecount = 1; + ip->i_d.di_lsn = 0; + ip->i_d.di_flags2 = 0; + memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2)); + ip->i_d.di_crtime = ip->i_d.di_mtime; + } + + flags = XFS_ILOG_CORE; switch (mode & S_IFMT) { case S_IFIFO: @@ -1270,13 +758,6 @@ xfs_ialloc( flags |= XFS_ILOG_DEV; break; case S_IFREG: - /* - * we can't set up filestreams until after the VFS inode - * is set up properly. - */ - if (pip && xfs_inode_is_filestream(pip)) - filestreams = 1; - /* fall through */ case S_IFDIR: if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { uint di_flags = 0; @@ -1342,20 +823,653 @@ xfs_ialloc( /* now that we have an i_mode we can setup inode ops and unlock */ xfs_setup_inode(ip); - /* now we have set up the vfs inode we can associate the filestream */ - if (filestreams) { - error = xfs_filestream_associate(pip, ip); - if (error < 0) - return -error; - if (!error) - xfs_iflags_set(ip, XFS_IFILESTREAM); + *ipp = ip; + return 0; +} + +/* + * Allocates a new inode from disk and return a pointer to the + * incore copy. This routine will internally commit the current + * transaction and allocate a new one if the Space Manager needed + * to do an allocation to replenish the inode free-list. + * + * This routine is designed to be called from xfs_create and + * xfs_create_dir. + * + */ +int +xfs_dir_ialloc( + xfs_trans_t **tpp, /* input: current transaction; + output: may be a new transaction. */ + xfs_inode_t *dp, /* directory within whose allocate + the inode. */ + umode_t mode, + xfs_nlink_t nlink, + xfs_dev_t rdev, + prid_t prid, /* project id */ + int okalloc, /* ok to allocate new space */ + xfs_inode_t **ipp, /* pointer to inode; it will be + locked. */ + int *committed) + +{ + xfs_trans_t *tp; + xfs_trans_t *ntp; + xfs_inode_t *ip; + xfs_buf_t *ialloc_context = NULL; + int code; + void *dqinfo; + uint tflags; + + tp = *tpp; + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + + /* + * xfs_ialloc will return a pointer to an incore inode if + * the Space Manager has an available inode on the free + * list. Otherwise, it will do an allocation and replenish + * the freelist. Since we can only do one allocation per + * transaction without deadlocks, we will need to commit the + * current transaction and start a new one. We will then + * need to call xfs_ialloc again to get the inode. + * + * If xfs_ialloc did an allocation to replenish the freelist, + * it returns the bp containing the head of the freelist as + * ialloc_context. We will hold a lock on it across the + * transaction commit so that no other process can steal + * the inode(s) that we've just allocated. + */ + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, + &ialloc_context, &ip); + + /* + * Return an error if we were unable to allocate a new inode. + * This should only happen if we run out of space on disk or + * encounter a disk error. + */ + if (code) { + *ipp = NULL; + return code; + } + if (!ialloc_context && !ip) { + *ipp = NULL; + return XFS_ERROR(ENOSPC); + } + + /* + * If the AGI buffer is non-NULL, then we were unable to get an + * inode in one operation. We need to commit the current + * transaction and call xfs_ialloc() again. It is guaranteed + * to succeed the second time. + */ + if (ialloc_context) { + struct xfs_trans_res tres; + + /* + * Normally, xfs_trans_commit releases all the locks. + * We call bhold to hang on to the ialloc_context across + * the commit. Holding this buffer prevents any other + * processes from doing any allocations in this + * allocation group. + */ + xfs_trans_bhold(tp, ialloc_context); + /* + * Save the log reservation so we can use + * them in the next transaction. + */ + tres.tr_logres = xfs_trans_get_log_res(tp); + tres.tr_logcount = xfs_trans_get_log_count(tp); + + /* + * We want the quota changes to be associated with the next + * transaction, NOT this one. So, detach the dqinfo from this + * and attach it to the next transaction. + */ + dqinfo = NULL; + tflags = 0; + if (tp->t_dqinfo) { + dqinfo = (void *)tp->t_dqinfo; + tp->t_dqinfo = NULL; + tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY; + tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); + } + + ntp = xfs_trans_dup(tp); + code = xfs_trans_commit(tp, 0); + tp = ntp; + if (committed != NULL) { + *committed = 1; + } + /* + * If we get an error during the commit processing, + * release the buffer that is still held and return + * to the caller. + */ + if (code) { + xfs_buf_relse(ialloc_context); + if (dqinfo) { + tp->t_dqinfo = dqinfo; + xfs_trans_free_dqinfo(tp); + } + *tpp = ntp; + *ipp = NULL; + return code; + } + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(tp->t_ticket); + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + code = xfs_trans_reserve(tp, &tres, 0, 0); + + /* + * Re-attach the quota info that we detached from prev trx. + */ + if (dqinfo) { + tp->t_dqinfo = dqinfo; + tp->t_flags |= tflags; + } + + if (code) { + xfs_buf_relse(ialloc_context); + *tpp = ntp; + *ipp = NULL; + return code; + } + xfs_trans_bjoin(tp, ialloc_context); + + /* + * Call ialloc again. Since we've locked out all + * other allocations in this allocation group, + * this call should always succeed. + */ + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, + okalloc, &ialloc_context, &ip); + + /* + * If we get an error at this point, return to the caller + * so that the current transaction can be aborted. + */ + if (code) { + *tpp = tp; + *ipp = NULL; + return code; + } + ASSERT(!ialloc_context && ip); + + } else { + if (committed != NULL) + *committed = 0; } *ipp = ip; + *tpp = tp; + return 0; } /* + * Decrement the link count on an inode & log the change. + * If this causes the link count to go to zero, initiate the + * logging activity required to truncate a file. + */ +int /* error */ +xfs_droplink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + int error; + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + ASSERT (ip->i_d.di_nlink > 0); + ip->i_d.di_nlink--; + drop_nlink(VFS_I(ip)); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = 0; + if (ip->i_d.di_nlink == 0) { + /* + * We're dropping the last link to this file. + * Move the on-disk inode to the AGI unlinked list. + * From xfs_inactive() we will pull the inode from + * the list and free it. + */ + error = xfs_iunlink(tp, ip); + } + return error; +} + +/* + * Increment the link count on an inode & log the change. + */ +int +xfs_bumplink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + ASSERT(ip->i_d.di_version > 1); + ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE)); + ip->i_d.di_nlink++; + inc_nlink(VFS_I(ip)); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} + +int +xfs_create( + xfs_inode_t *dp, + struct xfs_name *name, + umode_t mode, + xfs_dev_t rdev, + xfs_inode_t **ipp) +{ + int is_dir = S_ISDIR(mode); + struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *ip = NULL; + struct xfs_trans *tp = NULL; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + bool unlock_dp_on_error = false; + uint cancel_flags; + int committed; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + struct xfs_trans_res tres; + uint resblks; + + trace_xfs_create(dp, name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + prid = xfs_get_initial_prid(dp); + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), + xfs_kgid_to_gid(current_fsgid()), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); + if (error) + return error; + + if (is_dir) { + rdev = 0; + resblks = XFS_MKDIR_SPACE_RES(mp, name->len); + tres.tr_logres = M_RES(mp)->tr_mkdir.tr_logres; + tres.tr_logcount = XFS_MKDIR_LOG_COUNT; + tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); + } else { + resblks = XFS_CREATE_SPACE_RES(mp, name->len); + tres.tr_logres = M_RES(mp)->tr_create.tr_logres; + tres.tr_logcount = XFS_CREATE_LOG_COUNT; + tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); + } + + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + + /* + * Initially assume that the file does not exist and + * reserve the resources for that case. If that is not + * the case we'll drop the one we have and get a more + * appropriate transaction later. + */ + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + error = xfs_trans_reserve(tp, &tres, resblks, 0); + if (error == ENOSPC) { + /* flush outstanding delalloc blocks and retry */ + xfs_flush_inodes(mp); + error = xfs_trans_reserve(tp, &tres, resblks, 0); + } + if (error == ENOSPC) { + /* No space at all so try a "no-allocation" reservation */ + resblks = 0; + error = xfs_trans_reserve(tp, &tres, 0, 0); + } + if (error) { + cancel_flags = 0; + goto out_trans_cancel; + } + + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + unlock_dp_on_error = true; + + xfs_bmap_init(&free_list, &first_block); + + /* + * Reserve disk quota and the inode. + */ + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, + pdqp, resblks, 1, 0); + if (error) + goto out_trans_cancel; + + error = xfs_dir_canenter(tp, dp, name, resblks); + if (error) + goto out_trans_cancel; + + /* + * A newly created regular or special file just has one directory + * entry pointing to them, but a directory also the "." entry + * pointing to itself. + */ + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, + prid, resblks > 0, &ip, &committed); + if (error) { + if (error == ENOSPC) + goto out_trans_cancel; + goto out_trans_abort; + } + + /* + * Now we join the directory inode to the transaction. We do not do it + * earlier because xfs_dir_ialloc might commit the previous transaction + * (and release all the locks). An error from here on will result in + * the transaction cancel unlocking dp so don't do it explicitly in the + * error path. + */ + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + unlock_dp_on_error = false; + + error = xfs_dir_createname(tp, dp, name, ip->i_ino, + &first_block, &free_list, resblks ? + resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + if (error) { + ASSERT(error != ENOSPC); + goto out_trans_abort; + } + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + if (is_dir) { + error = xfs_dir_init(tp, ip, dp); + if (error) + goto out_bmap_cancel; + + error = xfs_bumplink(tp, dp); + if (error) + goto out_bmap_cancel; + } + + /* + * If this is a synchronous mount, make sure that the + * create transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + /* + * Attach the dquot(s) to the inodes and modify them incore. + * These ids of the inode couldn't have changed since the new + * inode has been locked ever since it was created. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + *ipp = ip; + return 0; + + out_bmap_cancel: + xfs_bmap_cancel(&free_list); + out_trans_abort: + cancel_flags |= XFS_TRANS_ABORT; + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + out_release_inode: + /* + * Wait until after the current transaction is aborted to + * release the inode. This prevents recursive transactions + * and deadlocks from xfs_inactive. + */ + if (ip) + IRELE(ip); + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + if (unlock_dp_on_error) + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return error; +} + +int +xfs_create_tmpfile( + struct xfs_inode *dp, + struct dentry *dentry, + umode_t mode, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *ip = NULL; + struct xfs_trans *tp = NULL; + int error; + uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + struct xfs_trans_res *tres; + uint resblks; + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + prid = xfs_get_initial_prid(dp); + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), + xfs_kgid_to_gid(current_fsgid()), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); + if (error) + return error; + + resblks = XFS_IALLOC_SPACE_RES(mp); + tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE); + + tres = &M_RES(mp)->tr_create_tmpfile; + error = xfs_trans_reserve(tp, tres, resblks, 0); + if (error == ENOSPC) { + /* No space at all so try a "no-allocation" reservation */ + resblks = 0; + error = xfs_trans_reserve(tp, tres, 0, 0); + } + if (error) { + cancel_flags = 0; + goto out_trans_cancel; + } + + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, + pdqp, resblks, 1, 0); + if (error) + goto out_trans_cancel; + + error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, + prid, resblks > 0, &ip, NULL); + if (error) { + if (error == ENOSPC) + goto out_trans_cancel; + goto out_trans_abort; + } + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + /* + * Attach the dquot(s) to the inodes and modify them incore. + * These ids of the inode couldn't have changed since the new + * inode has been locked ever since it was created. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + + ip->i_d.di_nlink--; + error = xfs_iunlink(tp, ip); + if (error) + goto out_trans_abort; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + *ipp = ip; + return 0; + + out_trans_abort: + cancel_flags |= XFS_TRANS_ABORT; + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + out_release_inode: + /* + * Wait until after the current transaction is aborted to + * release the inode. This prevents recursive transactions + * and deadlocks from xfs_inactive. + */ + if (ip) + IRELE(ip); + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + return error; +} + +int +xfs_link( + xfs_inode_t *tdp, + xfs_inode_t *sip, + struct xfs_name *target_name) +{ + xfs_mount_t *mp = tdp->i_mount; + xfs_trans_t *tp; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + int resblks; + + trace_xfs_link(tdp, target_name); + + ASSERT(!S_ISDIR(sip->i_d.di_mode)); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + error = xfs_qm_dqattach(sip, 0); + if (error) + goto std_return; + + error = xfs_qm_dqattach(tdp, 0); + if (error) + goto std_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + resblks = XFS_LINK_SPACE_RES(mp, target_name->len); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); + if (error == ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); + } + if (error) { + cancel_flags = 0; + goto error_return; + } + + xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); + + /* + * If we are using project inheritance, we only allow hard link + * creation in our tree when the project IDs are the same; else + * the tree quota mechanism could be circumvented. + */ + if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (xfs_get_projid(tdp) != xfs_get_projid(sip)))) { + error = XFS_ERROR(EXDEV); + goto error_return; + } + + error = xfs_dir_canenter(tp, tdp, target_name, resblks); + if (error) + goto error_return; + + xfs_bmap_init(&free_list, &first_block); + + if (sip->i_d.di_nlink == 0) { + error = xfs_iunlink_remove(tp, sip); + if (error) + goto abort_return; + } + + error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, + &first_block, &free_list, resblks); + if (error) + goto abort_return; + xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); + + error = xfs_bumplink(tp, sip); + if (error) + goto abort_return; + + /* + * If this is a synchronous mount, make sure that the + * link transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish (&tp, &free_list, &committed); + if (error) { + xfs_bmap_cancel(&free_list); + goto abort_return; + } + + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + + abort_return: + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; +} + +/* * Free up the underlying blocks past new_size. The new size must be smaller * than the current size. This routine can be used both for the attribute and * data fork, and does not modify the inode size, which is left to the caller. @@ -1465,10 +1579,7 @@ xfs_itruncate_extents( * reference that we gained in xfs_trans_dup() */ xfs_log_ticket_put(tp->t_ticket); - error = xfs_trans_reserve(tp, 0, - XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) goto out; } @@ -1494,6 +1605,320 @@ out_bmap_cancel: goto out; } +int +xfs_release( + xfs_inode_t *ip) +{ + xfs_mount_t *mp = ip->i_mount; + int error; + + if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0)) + return 0; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return 0; + + if (!XFS_FORCED_SHUTDOWN(mp)) { + int truncated; + + /* + * If we previously truncated this file and removed old data + * in the process, we want to initiate "early" writeout on + * the last close. This is an attempt to combat the notorious + * NULL files problem which is particularly noticeable from a + * truncate down, buffered (re-)write (delalloc), followed by + * a crash. What we are effectively doing here is + * significantly reducing the time window where we'd otherwise + * be exposed to that problem. + */ + truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); + if (truncated) { + xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); + if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) { + error = -filemap_flush(VFS_I(ip)->i_mapping); + if (error) + return error; + } + } + } + + if (ip->i_d.di_nlink == 0) + return 0; + + if (xfs_can_free_eofblocks(ip, false)) { + + /* + * If we can't get the iolock just skip truncating the blocks + * past EOF because we could deadlock with the mmap_sem + * otherwise. We'll get another chance to drop them once the + * last reference to the inode is dropped, so we'll never leak + * blocks permanently. + * + * Further, check if the inode is being opened, written and + * closed frequently and we have delayed allocation blocks + * outstanding (e.g. streaming writes from the NFS server), + * truncating the blocks past EOF will cause fragmentation to + * occur. + * + * In this case don't do the truncation, either, but we have to + * be careful how we detect this case. Blocks beyond EOF show + * up as i_delayed_blks even when the inode is clean, so we + * need to truncate them away first before checking for a dirty + * release. Hence on the first dirty close we will still remove + * the speculative allocation, but after that we will leave it + * in place. + */ + if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) + return 0; + + error = xfs_free_eofblocks(mp, ip, true); + if (error && error != EAGAIN) + return error; + + /* delalloc blocks after truncation means it really is dirty */ + if (ip->i_delayed_blks) + xfs_iflags_set(ip, XFS_IDIRTY_RELEASE); + } + return 0; +} + +/* + * xfs_inactive_truncate + * + * Called to perform a truncate when an inode becomes unlinked. + */ +STATIC int +xfs_inactive_truncate( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Log the inode size first to prevent stale data exposure in the event + * of a system crash before the truncate completes. See the related + * comment in xfs_setattr_size() for details. + */ + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); + if (error) + goto error_trans_cancel; + + ASSERT(ip->i_d.di_nextents == 0); + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error_unlock; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; + +error_trans_cancel: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); +error_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * xfs_inactive_ifree() + * + * Perform the inode free when an inode is unlinked. + */ +STATIC int +xfs_inactive_ifree( + struct xfs_inode *ip) +{ + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int committed; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + + /* + * The ifree transaction might need to allocate blocks for record + * insertion to the finobt. We don't want to fail here at ENOSPC, so + * allow ifree to dip into the reserved block pool if necessary. + * + * Freeing large sets of inodes generally means freeing inode chunks, + * directory and file data blocks, so this should be relatively safe. + * Only under severe circumstances should it be possible to free enough + * inodes to exhaust the reserve block pool via finobt expansion while + * at the same time not creating free space in the filesystem. + * + * Send a warning if the reservation does happen to fail, as the inode + * now remains allocated and sits on the unlinked list until the fs is + * repaired. + */ + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, + XFS_IFREE_SPACE_RES(mp), 0); + if (error) { + if (error == ENOSPC) { + xfs_warn_ratelimited(mp, + "Failed to remove inode(s) from unlinked list. " + "Please free space, unmount and run xfs_repair."); + } else { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + } + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + xfs_bmap_init(&free_list, &first_block); + error = xfs_ifree(tp, ip, &free_list); + if (error) { + /* + * If we fail to free the inode, shut down. The cancel + * might do that, we need to make sure. Otherwise the + * inode might be lost for a long time or forever. + */ + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_notice(mp, "%s: xfs_ifree returned error %d", + __func__, error); + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + } + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + } + + /* + * Credit the quota account(s). The inode is gone. + */ + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); + + /* + * Just ignore errors at this point. There is nothing we can + * do except to try to keep going. Make sure it's not a silent + * error. + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", + __func__, error); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + xfs_notice(mp, "%s: xfs_trans_commit returned error %d", + __func__, error); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + +/* + * xfs_inactive + * + * This is called when the vnode reference count for the vnode + * goes to zero. If the file has been unlinked, then it must + * now be truncated. Also, we clear all of the read-ahead state + * kept for the inode here since the file is now closed. + */ +void +xfs_inactive( + xfs_inode_t *ip) +{ + struct xfs_mount *mp; + int error; + int truncate = 0; + + /* + * If the inode is already free, then there can be nothing + * to clean up here. + */ + if (ip->i_d.di_mode == 0) { + ASSERT(ip->i_df.if_real_bytes == 0); + ASSERT(ip->i_df.if_broot_bytes == 0); + return; + } + + mp = ip->i_mount; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return; + + if (ip->i_d.di_nlink != 0) { + /* + * force is true because we are evicting an inode from the + * cache. Post-eof blocks must be freed, lest we end up with + * broken free space accounting. + */ + if (xfs_can_free_eofblocks(ip, true)) + xfs_free_eofblocks(mp, ip, false); + + return; + } + + if (S_ISREG(ip->i_d.di_mode) && + (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || + ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) + truncate = 1; + + error = xfs_qm_dqattach(ip, 0); + if (error) + return; + + if (S_ISLNK(ip->i_d.di_mode)) + error = xfs_inactive_symlink(ip); + else if (truncate) + error = xfs_inactive_truncate(ip); + if (error) + return; + + /* + * If there are attributes associated with the file then blow them away + * now. The code calls a routine that recursively deconstructs the + * attribute fork. We need to just commit the current transaction + * because we can't use it for xfs_attr_inactive(). + */ + if (ip->i_d.di_anextents > 0) { + ASSERT(ip->i_d.di_forkoff != 0); + + error = xfs_attr_inactive(ip); + if (error) + return; + } + + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + + ASSERT(ip->i_d.di_anextents == 0); + + /* + * Free the inode. + */ + error = xfs_inactive_ifree(ip); + if (error) + return; + + /* + * Release the dquots held by inode, if any. + */ + xfs_qm_dqdetach(ip); +} + /* * This is called when the inode's link count goes to 0. * We place the on-disk inode on a list in the AGI. It @@ -1554,6 +1979,10 @@ xfs_iunlink( dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1639,6 +2068,10 @@ xfs_iunlink_remove( dip->di_next_unlinked = cpu_to_be32(NULLAGINO); offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1712,6 +2145,10 @@ xfs_iunlink_remove( dip->di_next_unlinked = cpu_to_be32(NULLAGINO); offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1725,6 +2162,10 @@ xfs_iunlink_remove( last_dip->di_next_unlinked = cpu_to_be32(next_agino); ASSERT(next_agino != 0); offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, last_dip); + xfs_trans_inode_buf(tp, last_ibp); xfs_trans_log_buf(tp, last_ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1734,7 +2175,7 @@ xfs_iunlink_remove( } /* - * A big issue when freeing the inode cluster is is that we _cannot_ skip any + * A big issue when freeing the inode cluster is that we _cannot_ skip any * inodes that are in memory - they all must be marked stale and attached to * the cluster buffer. */ @@ -1746,8 +2187,8 @@ xfs_ifree_cluster( { xfs_mount_t *mp = free_ip->i_mount; int blks_per_cluster; + int inodes_per_cluster; int nbufs; - int ninodes; int i, j; xfs_daddr_t blkno; xfs_buf_t *bp; @@ -1757,18 +2198,11 @@ xfs_ifree_cluster( struct xfs_perag *pag; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); - if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { - blks_per_cluster = 1; - ninodes = mp->m_sb.sb_inopblock; - nbufs = XFS_IALLOC_BLOCKS(mp); - } else { - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / - mp->m_sb.sb_blocksize; - ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; - nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; - } + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + nbufs = mp->m_ialloc_blks / blks_per_cluster; - for (j = 0; j < nbufs; j++, inum += ninodes) { + for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); @@ -1830,7 +2264,7 @@ xfs_ifree_cluster( * transaction stale above, which means there is no point in * even trying to lock them. */ - for (i = 0; i < ninodes; i++) { + for (i = 0; i < inodes_per_cluster; i++) { retry: rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, @@ -1928,8 +2362,6 @@ xfs_ifree( int error; int delete; xfs_ino_t first_ino; - xfs_dinode_t *dip; - xfs_buf_t *ibp; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_d.di_nlink == 0); @@ -1942,14 +2374,13 @@ xfs_ifree( * Pull the on-disk inode from the AGI unlinked list. */ error = xfs_iunlink_remove(tp, ip); - if (error != 0) { + if (error) return error; - } error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); - if (error != 0) { + if (error) return error; - } + ip->i_d.di_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0; @@ -1961,300 +2392,15 @@ xfs_ifree( * by reincarnations of this inode. */ ip->i_d.di_gen++; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) - return error; - - /* - * Clear the on-disk di_mode. This is to prevent xfs_bulkstat - * from picking up this inode when it is reclaimed (its incore state - * initialzed but not flushed to disk yet). The in-core di_mode is - * already cleared and a corresponding transaction logged. - * The hack here just synchronizes the in-core to on-disk - * di_mode value in advance before the actual inode sync to disk. - * This is OK because the inode is already unlinked and would never - * change its di_mode again for this inode generation. - * This is a temporary hack that would require a proper fix - * in the future. - */ - dip->di_mode = 0; - - if (delete) { + if (delete) error = xfs_ifree_cluster(ip, tp, first_ino); - } return error; } /* - * Reallocate the space for if_broot based on the number of records - * being added or deleted as indicated in rec_diff. Move the records - * and pointers in if_broot to fit the new size. When shrinking this - * will eliminate holes between the records and pointers created by - * the caller. When growing this will create holes to be filled in - * by the caller. - * - * The caller must not request to add more records than would fit in - * the on-disk inode root. If the if_broot is currently NULL, then - * if we adding records one will be allocated. The caller must also - * not request that the number of records go below zero, although - * it can go to zero. - * - * ip -- the inode whose if_broot area is changing - * ext_diff -- the change in the number of records, positive or negative, - * requested for the if_broot array. - */ -void -xfs_iroot_realloc( - xfs_inode_t *ip, - int rec_diff, - int whichfork) -{ - struct xfs_mount *mp = ip->i_mount; - int cur_max; - xfs_ifork_t *ifp; - struct xfs_btree_block *new_broot; - int new_max; - size_t new_size; - char *np; - char *op; - - /* - * Handle the degenerate case quietly. - */ - if (rec_diff == 0) { - return; - } - - ifp = XFS_IFORK_PTR(ip, whichfork); - if (rec_diff > 0) { - /* - * If there wasn't any memory allocated before, just - * allocate it now and get out. - */ - if (ifp->if_broot_bytes == 0) { - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); - ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); - ifp->if_broot_bytes = (int)new_size; - return; - } - - /* - * If there is already an existing if_broot, then we need - * to realloc() it and shift the pointers to their new - * location. The records don't change location because - * they are kept butted up against the btree block header. - */ - cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); - new_max = cur_max + rec_diff; - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); - ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ - KM_SLEEP | KM_NOFS); - op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, - ifp->if_broot_bytes); - np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, - (int)new_size); - ifp->if_broot_bytes = (int)new_size; - ASSERT(ifp->if_broot_bytes <= - XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); - memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); - return; - } - - /* - * rec_diff is less than 0. In this case, we are shrinking the - * if_broot buffer. It must already exist. If we go to zero - * records, just get rid of the root and clear the status bit. - */ - ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); - cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); - new_max = cur_max + rec_diff; - ASSERT(new_max >= 0); - if (new_max > 0) - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); - else - new_size = 0; - if (new_size > 0) { - new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); - /* - * First copy over the btree block header. - */ - memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN); - } else { - new_broot = NULL; - ifp->if_flags &= ~XFS_IFBROOT; - } - - /* - * Only copy the records and pointers if there are any. - */ - if (new_max > 0) { - /* - * First copy the records. - */ - op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); - np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); - memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); - - /* - * Then copy the pointers. - */ - op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, - ifp->if_broot_bytes); - np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, - (int)new_size); - memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); - } - kmem_free(ifp->if_broot); - ifp->if_broot = new_broot; - ifp->if_broot_bytes = (int)new_size; - ASSERT(ifp->if_broot_bytes <= - XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); - return; -} - - -/* - * This is called when the amount of space needed for if_data - * is increased or decreased. The change in size is indicated by - * the number of bytes that need to be added or deleted in the - * byte_diff parameter. - * - * If the amount of space needed has decreased below the size of the - * inline buffer, then switch to using the inline buffer. Otherwise, - * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer - * to what is needed. - * - * ip -- the inode whose if_data area is changing - * byte_diff -- the change in the number of bytes, positive or negative, - * requested for the if_data array. - */ -void -xfs_idata_realloc( - xfs_inode_t *ip, - int byte_diff, - int whichfork) -{ - xfs_ifork_t *ifp; - int new_size; - int real_size; - - if (byte_diff == 0) { - return; - } - - ifp = XFS_IFORK_PTR(ip, whichfork); - new_size = (int)ifp->if_bytes + byte_diff; - ASSERT(new_size >= 0); - - if (new_size == 0) { - if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { - kmem_free(ifp->if_u1.if_data); - } - ifp->if_u1.if_data = NULL; - real_size = 0; - } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { - /* - * If the valid extents/data can fit in if_inline_ext/data, - * copy them from the malloc'd vector and free it. - */ - if (ifp->if_u1.if_data == NULL) { - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { - ASSERT(ifp->if_real_bytes != 0); - memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, - new_size); - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - } - real_size = 0; - } else { - /* - * Stuck with malloc/realloc. - * For inline data, the underlying buffer must be - * a multiple of 4 bytes in size so that it can be - * logged and stay on word boundaries. We enforce - * that here. - */ - real_size = roundup(new_size, 4); - if (ifp->if_u1.if_data == NULL) { - ASSERT(ifp->if_real_bytes == 0); - ifp->if_u1.if_data = kmem_alloc(real_size, - KM_SLEEP | KM_NOFS); - } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { - /* - * Only do the realloc if the underlying size - * is really changing. - */ - if (ifp->if_real_bytes != real_size) { - ifp->if_u1.if_data = - kmem_realloc(ifp->if_u1.if_data, - real_size, - ifp->if_real_bytes, - KM_SLEEP | KM_NOFS); - } - } else { - ASSERT(ifp->if_real_bytes == 0); - ifp->if_u1.if_data = kmem_alloc(real_size, - KM_SLEEP | KM_NOFS); - memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, - ifp->if_bytes); - } - } - ifp->if_real_bytes = real_size; - ifp->if_bytes = new_size; - ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); -} - -void -xfs_idestroy_fork( - xfs_inode_t *ip, - int whichfork) -{ - xfs_ifork_t *ifp; - - ifp = XFS_IFORK_PTR(ip, whichfork); - if (ifp->if_broot != NULL) { - kmem_free(ifp->if_broot); - ifp->if_broot = NULL; - } - - /* - * If the format is local, then we can't have an extents - * array so just look for an inline data array. If we're - * not local then we may or may not have an extents list, - * so check and free it up if we do. - */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && - (ifp->if_u1.if_data != NULL)) { - ASSERT(ifp->if_real_bytes != 0); - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = NULL; - ifp->if_real_bytes = 0; - } - } else if ((ifp->if_flags & XFS_IFEXTENTS) && - ((ifp->if_flags & XFS_IFEXTIREC) || - ((ifp->if_u1.if_extents != NULL) && - (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) { - ASSERT(ifp->if_real_bytes != 0); - xfs_iext_destroy(ifp); - } - ASSERT(ifp->if_u1.if_extents == NULL || - ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); - ASSERT(ifp->if_real_bytes == 0); - if (whichfork == XFS_ATTR_FORK) { - kmem_zone_free(xfs_ifork_zone, ip->i_afp); - ip->i_afp = NULL; - } -} - -/* * This is called to unpin an inode. The caller must have the inode locked * in at least shared mode so that the buffer cannot be subsequently pinned * once someone is waiting for it to be unpinned. @@ -2298,165 +2444,480 @@ xfs_iunpin_wait( } /* - * xfs_iextents_copy() + * Removing an inode from the namespace involves removing the directory entry + * and dropping the link count on the inode. Removing the directory entry can + * result in locking an AGF (directory blocks were freed) and removing a link + * count can result in placing the inode on an unlinked list which results in + * locking an AGI. * - * This is called to copy the REAL extents (as opposed to the delayed - * allocation extents) from the inode into the given buffer. It - * returns the number of bytes copied into the buffer. + * The big problem here is that we have an ordering constraint on AGF and AGI + * locking - inode allocation locks the AGI, then can allocate a new extent for + * new inodes, locking the AGF after the AGI. Similarly, freeing the inode + * removes the inode from the unlinked list, requiring that we lock the AGI + * first, and then freeing the inode can result in an inode chunk being freed + * and hence freeing disk space requiring that we lock an AGF. * - * If there are no delayed allocation extents, then we can just - * memcpy() the extents into the buffer. Otherwise, we need to - * examine each extent in turn and skip those which are delayed. + * Hence the ordering that is imposed by other parts of the code is AGI before + * AGF. This means we cannot remove the directory entry before we drop the inode + * reference count and put it on the unlinked list as this results in a lock + * order of AGF then AGI, and this can deadlock against inode allocation and + * freeing. Therefore we must drop the link counts before we remove the + * directory entry. + * + * This is still safe from a transactional point of view - it is not until we + * get to xfs_bmap_finish() that we have the possibility of multiple + * transactions in this operation. Hence as long as we remove the directory + * entry and drop the link count in the first transaction of the remove + * operation, there are no transactional constraints on the ordering here. */ int -xfs_iextents_copy( - xfs_inode_t *ip, - xfs_bmbt_rec_t *dp, - int whichfork) +xfs_remove( + xfs_inode_t *dp, + struct xfs_name *name, + xfs_inode_t *ip) { - int copied; - int i; - xfs_ifork_t *ifp; - int nrecs; - xfs_fsblock_t start_block; + xfs_mount_t *mp = dp->i_mount; + xfs_trans_t *tp = NULL; + int is_dir = S_ISDIR(ip->i_d.di_mode); + int error = 0; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + int link_zero; + uint resblks; + uint log_count; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(ifp->if_bytes > 0); + trace_xfs_remove(dp, name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + error = xfs_qm_dqattach(dp, 0); + if (error) + goto std_return; - nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); - ASSERT(nrecs > 0); + error = xfs_qm_dqattach(ip, 0); + if (error) + goto std_return; + + if (is_dir) { + tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); + log_count = XFS_DEFAULT_LOG_COUNT; + } else { + tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); + log_count = XFS_REMOVE_LOG_COUNT; + } + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* - * There are some delayed allocation extents in the - * inode, so copy the extents one at a time and skip - * the delayed ones. There must be at least one - * non-delayed extent. + * We try to get the real space reservation first, + * allowing for directory btree deletion(s) implying + * possible bmap insert(s). If we can't get the space + * reservation then we use 0 instead, and avoid the bmap + * btree insert(s) in the directory code by, if the bmap + * insert tries to happen, instead trimming the LAST + * block from the directory. */ - copied = 0; - for (i = 0; i < nrecs; i++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); - start_block = xfs_bmbt_get_startblock(ep); - if (isnullstartblock(start_block)) { - /* - * It's a delayed allocation extent, so skip it. - */ - continue; + resblks = XFS_REMOVE_SPACE_RES(mp); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0); + if (error == ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0); + } + if (error) { + ASSERT(error != ENOSPC); + cancel_flags = 0; + goto out_trans_cancel; + } + + xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + /* + * If we're removing a directory perform some additional validation. + */ + cancel_flags |= XFS_TRANS_ABORT; + if (is_dir) { + ASSERT(ip->i_d.di_nlink >= 2); + if (ip->i_d.di_nlink != 2) { + error = XFS_ERROR(ENOTEMPTY); + goto out_trans_cancel; + } + if (!xfs_dir_isempty(ip)) { + error = XFS_ERROR(ENOTEMPTY); + goto out_trans_cancel; } - /* Translate to on disk format */ - put_unaligned(cpu_to_be64(ep->l0), &dp->l0); - put_unaligned(cpu_to_be64(ep->l1), &dp->l1); - dp++; - copied++; + /* Drop the link from ip's "..". */ + error = xfs_droplink(tp, dp); + if (error) + goto out_trans_cancel; + + /* Drop the "." link from ip to self. */ + error = xfs_droplink(tp, ip); + if (error) + goto out_trans_cancel; + } else { + /* + * When removing a non-directory we need to log the parent + * inode here. For a directory this is done implicitly + * by the xfs_droplink call for the ".." entry. + */ + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + } + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* Drop the link from dp to ip. */ + error = xfs_droplink(tp, ip); + if (error) + goto out_trans_cancel; + + /* Determine if this is the last link while the inode is locked */ + link_zero = (ip->i_d.di_nlink == 0); + + xfs_bmap_init(&free_list, &first_block); + error = xfs_dir_removename(tp, dp, name, ip->i_ino, + &first_block, &free_list, resblks); + if (error) { + ASSERT(error != ENOENT); + goto out_bmap_cancel; } - ASSERT(copied != 0); - xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); - return (copied * (uint)sizeof(xfs_bmbt_rec_t)); + /* + * If this is a synchronous mount, make sure that the + * remove transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto std_return; + + if (is_dir && xfs_inode_is_filestream(ip)) + xfs_filestream_deassociate(ip); + + return 0; + + out_bmap_cancel: + xfs_bmap_cancel(&free_list); + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; } /* - * Each of the following cases stores data into the same region - * of the on-disk inode, so only one of them can be valid at - * any given time. While it is possible to have conflicting formats - * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is - * in EXTENTS format, this can only happen when the fork has - * changed formats after being modified but before being flushed. - * In these cases, the format always takes precedence, because the - * format indicates the current state of the fork. + * Enter all inodes for a rename transaction into a sorted array. */ -/*ARGSUSED*/ STATIC void -xfs_iflush_fork( - xfs_inode_t *ip, - xfs_dinode_t *dip, - xfs_inode_log_item_t *iip, - int whichfork, - xfs_buf_t *bp) -{ - char *cp; - xfs_ifork_t *ifp; - xfs_mount_t *mp; -#ifdef XFS_TRANS_DEBUG - int first; -#endif - static const short brootflag[2] = - { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; - static const short dataflag[2] = - { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; - static const short extflag[2] = - { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; - - if (!iip) - return; - ifp = XFS_IFORK_PTR(ip, whichfork); +xfs_sort_for_rename( + xfs_inode_t *dp1, /* in: old (source) directory inode */ + xfs_inode_t *dp2, /* in: new (target) directory inode */ + xfs_inode_t *ip1, /* in: inode of old entry */ + xfs_inode_t *ip2, /* in: inode of new entry, if it + already exists, NULL otherwise. */ + xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ + int *num_inodes) /* out: number of inodes in array */ +{ + xfs_inode_t *temp; + int i, j; + /* - * This can happen if we gave up in iformat in an error path, - * for the attribute fork. - */ - if (!ifp) { - ASSERT(whichfork == XFS_ATTR_FORK); - return; + * i_tab contains a list of pointers to inodes. We initialize + * the table here & we'll sort it. We will then use it to + * order the acquisition of the inode locks. + * + * Note that the table may contain duplicates. e.g., dp1 == dp2. + */ + i_tab[0] = dp1; + i_tab[1] = dp2; + i_tab[2] = ip1; + if (ip2) { + *num_inodes = 4; + i_tab[3] = ip2; + } else { + *num_inodes = 3; + i_tab[3] = NULL; } - cp = XFS_DFORK_PTR(dip, whichfork); - mp = ip->i_mount; - switch (XFS_IFORK_FORMAT(ip, whichfork)) { - case XFS_DINODE_FMT_LOCAL: - if ((iip->ili_fields & dataflag[whichfork]) && - (ifp->if_bytes > 0)) { - ASSERT(ifp->if_u1.if_data != NULL); - ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); - memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); - } - break; - case XFS_DINODE_FMT_EXTENTS: - ASSERT((ifp->if_flags & XFS_IFEXTENTS) || - !(iip->ili_fields & extflag[whichfork])); - if ((iip->ili_fields & extflag[whichfork]) && - (ifp->if_bytes > 0)) { - ASSERT(xfs_iext_get_ext(ifp, 0)); - ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); - (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, - whichfork); + /* + * Sort the elements via bubble sort. (Remember, there are at + * most 4 elements to sort, so this is adequate.) + */ + for (i = 0; i < *num_inodes; i++) { + for (j = 1; j < *num_inodes; j++) { + if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { + temp = i_tab[j]; + i_tab[j] = i_tab[j-1]; + i_tab[j-1] = temp; + } } - break; + } +} - case XFS_DINODE_FMT_BTREE: - if ((iip->ili_fields & brootflag[whichfork]) && - (ifp->if_broot_bytes > 0)) { - ASSERT(ifp->if_broot != NULL); - ASSERT(ifp->if_broot_bytes <= - (XFS_IFORK_SIZE(ip, whichfork) + - XFS_BROOT_SIZE_ADJ)); - xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, - (xfs_bmdr_block_t *)cp, - XFS_DFORK_SIZE(dip, mp, whichfork)); - } - break; +/* + * xfs_rename + */ +int +xfs_rename( + xfs_inode_t *src_dp, + struct xfs_name *src_name, + xfs_inode_t *src_ip, + xfs_inode_t *target_dp, + struct xfs_name *target_name, + xfs_inode_t *target_ip) +{ + xfs_trans_t *tp = NULL; + xfs_mount_t *mp = src_dp->i_mount; + int new_parent; /* moving to a new dir */ + int src_is_directory; /* src_name is a directory */ + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + xfs_inode_t *inodes[4]; + int spaceres; + int num_inodes; + + trace_xfs_rename(src_dp, target_dp, src_name, target_name); + + new_parent = (src_dp != target_dp); + src_is_directory = S_ISDIR(src_ip->i_d.di_mode); + + xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, + inodes, &num_inodes); + + xfs_bmap_init(&free_list, &first_block); + tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); + if (error == ENOSPC) { + spaceres = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); + } + if (error) { + xfs_trans_cancel(tp, 0); + goto std_return; + } + + /* + * Attach the dquots to the inodes + */ + error = xfs_qm_vop_rename_dqattach(inodes); + if (error) { + xfs_trans_cancel(tp, cancel_flags); + goto std_return; + } + + /* + * Lock all the participating inodes. Depending upon whether + * the target_name exists in the target directory, and + * whether the target directory is the same as the source + * directory, we can lock from 2 to 4 inodes. + */ + xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); + + /* + * Join all the inodes to the transaction. From this point on, + * we can rely on either trans_commit or trans_cancel to unlock + * them. + */ + xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); + if (new_parent) + xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); + if (target_ip) + xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); + + /* + * If we are using project inheritance, we only allow renames + * into our tree when the project IDs are the same; else the + * tree quota mechanism would be circumvented. + */ + if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { + error = XFS_ERROR(EXDEV); + goto error_return; + } + + /* + * Set up the target. + */ + if (target_ip == NULL) { + /* + * If there's no space reservation, check the entry will + * fit before actually inserting it. + */ + error = xfs_dir_canenter(tp, target_dp, target_name, spaceres); + if (error) + goto error_return; + /* + * If target does not exist and the rename crosses + * directories, adjust the target directory link count + * to account for the ".." reference from the new entry. + */ + error = xfs_dir_createname(tp, target_dp, target_name, + src_ip->i_ino, &first_block, + &free_list, spaceres); + if (error == ENOSPC) + goto error_return; + if (error) + goto abort_return; - case XFS_DINODE_FMT_DEV: - if (iip->ili_fields & XFS_ILOG_DEV) { - ASSERT(whichfork == XFS_DATA_FORK); - xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + if (new_parent && src_is_directory) { + error = xfs_bumplink(tp, target_dp); + if (error) + goto abort_return; + } + } else { /* target_ip != NULL */ + /* + * If target exists and it's a directory, check that both + * target and source are directories and that target can be + * destroyed, or that neither is a directory. + */ + if (S_ISDIR(target_ip->i_d.di_mode)) { + /* + * Make sure target dir is empty. + */ + if (!(xfs_dir_isempty(target_ip)) || + (target_ip->i_d.di_nlink > 2)) { + error = XFS_ERROR(EEXIST); + goto error_return; + } } - break; - case XFS_DINODE_FMT_UUID: - if (iip->ili_fields & XFS_ILOG_UUID) { - ASSERT(whichfork == XFS_DATA_FORK); - memcpy(XFS_DFORK_DPTR(dip), - &ip->i_df.if_u2.if_uuid, - sizeof(uuid_t)); + /* + * Link the source inode under the target name. + * If the source inode is a directory and we are moving + * it across directories, its ".." entry will be + * inconsistent until we replace that down below. + * + * In case there is already an entry with the same + * name at the destination directory, remove it first. + */ + error = xfs_dir_replace(tp, target_dp, target_name, + src_ip->i_ino, + &first_block, &free_list, spaceres); + if (error) + goto abort_return; + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* + * Decrement the link count on the target since the target + * dir no longer points to it. + */ + error = xfs_droplink(tp, target_ip); + if (error) + goto abort_return; + + if (src_is_directory) { + /* + * Drop the link from the old "." entry. + */ + error = xfs_droplink(tp, target_ip); + if (error) + goto abort_return; } - break; + } /* target_ip != NULL */ - default: - ASSERT(0); - break; + /* + * Remove the source. + */ + if (new_parent && src_is_directory) { + /* + * Rewrite the ".." entry to point to the new + * directory. + */ + error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, + target_dp->i_ino, + &first_block, &free_list, spaceres); + ASSERT(error != EEXIST); + if (error) + goto abort_return; } + + /* + * We always want to hit the ctime on the source inode. + * + * This isn't strictly required by the standards since the source + * inode isn't really being changed, but old unix file systems did + * it and some incremental backup programs won't work without it. + */ + xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); + + /* + * Adjust the link count on src_dp. This is necessary when + * renaming a directory, either within one parent when + * the target existed, or across two parent directories. + */ + if (src_is_directory && (new_parent || target_ip != NULL)) { + + /* + * Decrement link count on src_directory since the + * entry that's moved no longer points to it. + */ + error = xfs_droplink(tp, src_dp); + if (error) + goto abort_return; + } + + error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + &first_block, &free_list, spaceres); + if (error) + goto abort_return; + + xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); + if (new_parent) + xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); + + /* + * If this is a synchronous mount, make sure that the + * rename transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT)); + goto std_return; + } + + /* + * trans_commit will unlock src_ip, target_ip & decrement + * the vnode references. + */ + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + + abort_return: + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; } STATIC int @@ -2478,13 +2939,13 @@ xfs_iflush_cluster( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; + inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); if (!ilist) goto out_put; - mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); + mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; rcu_read_lock(); /* really need a gang lookup range call here */ @@ -2715,26 +3176,21 @@ abort_out: return error; } - STATIC int xfs_iflush_int( - xfs_inode_t *ip, - xfs_buf_t *bp) + struct xfs_inode *ip, + struct xfs_buf *bp) { - xfs_inode_log_item_t *iip; - xfs_dinode_t *dip; - xfs_mount_t *mp; -#ifdef XFS_TRANS_DEBUG - int first; -#endif + struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_dinode *dip; + struct xfs_mount *mp = ip->i_mount; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - - iip = ip->i_itemp; - mp = ip->i_mount; + ASSERT(iip != NULL && iip->ili_fields != 0); + ASSERT(ip->i_d.di_version > 1); /* set *dip = inode's place in the buffer */ dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -2793,12 +3249,18 @@ xfs_iflush_int( __func__, ip->i_ino, ip->i_d.di_forkoff, ip); goto corrupt_out; } + /* - * bump the flush iteration count, used to detect flushes which - * postdate a log record during recovery. + * Inode item log recovery for v2 inodes are dependent on the + * di_flushiter count for correct sequencing. We bump the flush + * iteration count so we can detect flushes which postdate a log record + * during recovery. This is redundant as we now log every change and + * hence this can't happen but we need to still do it to ensure + * backwards compatibility with old kernels that predate logging all + * inode changes. */ - - ip->i_d.di_flushiter++; + if (ip->i_d.di_version < 3) + ip->i_d.di_flushiter++; /* * Copy the dirty parts of the inode into the on-disk @@ -2812,40 +3274,9 @@ xfs_iflush_int( if (ip->i_d.di_flushiter == DI_MAX_FLUSH) ip->i_d.di_flushiter = 0; - /* - * If this is really an old format inode and the superblock version - * has not been updated to support only new format inodes, then - * convert back to the old inode format. If the superblock version - * has been updated, then make the conversion permanent. - */ - ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); - if (ip->i_d.di_version == 1) { - if (!xfs_sb_version_hasnlink(&mp->m_sb)) { - /* - * Convert it back. - */ - ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); - dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink); - } else { - /* - * The superblock version has already been bumped, - * so just make the conversion to the new inode - * format permanent. - */ - ip->i_d.di_version = 2; - dip->di_version = 2; - ip->i_d.di_onlink = 0; - dip->di_onlink = 0; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - memset(&(dip->di_pad[0]), 0, - sizeof(dip->di_pad)); - ASSERT(xfs_get_projid(ip) == 0); - } - } - - xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp); + xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); if (XFS_IFORK_Q(ip)) - xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); + xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); xfs_inobp_check(mp, bp); /* @@ -2873,1112 +3304,32 @@ xfs_iflush_int( * need the AIL lock, because it is a 64 bit value that cannot be read * atomically. */ - if (iip != NULL && iip->ili_fields != 0) { - iip->ili_last_fields = iip->ili_fields; - iip->ili_fields = 0; - iip->ili_logged = 1; - - xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, - &iip->ili_item.li_lsn); - - /* - * Attach the function xfs_iflush_done to the inode's - * buffer. This will remove the inode from the AIL - * and unlock the inode's flush lock when the inode is - * completely written to disk. - */ - xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); - - ASSERT(bp->b_fspriv != NULL); - ASSERT(bp->b_iodone != NULL); - } else { - /* - * We're flushing an inode which is not in the AIL and has - * not been logged. For this case we can immediately drop - * the inode flush lock because we can avoid the whole - * AIL state thing. It's OK to drop the flush lock now, - * because we've already locked the buffer and to do anything - * you really need both. - */ - if (iip != NULL) { - ASSERT(iip->ili_logged == 0); - ASSERT(iip->ili_last_fields == 0); - ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0); - } - xfs_ifunlock(ip); - } - - return 0; - -corrupt_out: - return XFS_ERROR(EFSCORRUPTED); -} - -/* - * Return a pointer to the extent record at file index idx. - */ -xfs_bmbt_rec_host_t * -xfs_iext_get_ext( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx) /* index of target extent */ -{ - ASSERT(idx >= 0); - ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); - - if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { - return ifp->if_u1.if_ext_irec->er_extbuf; - } else if (ifp->if_flags & XFS_IFEXTIREC) { - xfs_ext_irec_t *erp; /* irec pointer */ - int erp_idx = 0; /* irec index */ - xfs_extnum_t page_idx = idx; /* ext index in target list */ - - erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); - return &erp->er_extbuf[page_idx]; - } else if (ifp->if_bytes) { - return &ifp->if_u1.if_extents[idx]; - } else { - return NULL; - } -} - -/* - * Insert new item(s) into the extent records for incore inode - * fork 'ifp'. 'count' new items are inserted at index 'idx'. - */ -void -xfs_iext_insert( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* starting index of new items */ - xfs_extnum_t count, /* number of inserted items */ - xfs_bmbt_irec_t *new, /* items to insert */ - int state) /* type of extent conversion */ -{ - xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; - xfs_extnum_t i; /* extent record index */ - - trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); - - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - xfs_iext_add(ifp, idx, count); - for (i = idx; i < idx + count; i++, new++) - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new); -} - -/* - * This is called when the amount of space required for incore file - * extents needs to be increased. The ext_diff parameter stores the - * number of new extents being added and the idx parameter contains - * the extent index where the new extents will be added. If the new - * extents are being appended, then we just need to (re)allocate and - * initialize the space. Otherwise, if the new extents are being - * inserted into the middle of the existing entries, a bit more work - * is required to make room for the new extents to be inserted. The - * caller is responsible for filling in the new extent entries upon - * return. - */ -void -xfs_iext_add( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin adding exts */ - int ext_diff) /* number of extents to add */ -{ - int byte_diff; /* new bytes being added */ - int new_size; /* size of extents after adding */ - xfs_extnum_t nextents; /* number of extents in file */ - - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT((idx >= 0) && (idx <= nextents)); - byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); - new_size = ifp->if_bytes + byte_diff; - /* - * If the new number of extents (nextents + ext_diff) - * fits inside the inode, then continue to use the inline - * extent buffer. - */ - if (nextents + ext_diff <= XFS_INLINE_EXTS) { - if (idx < nextents) { - memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], - &ifp->if_u2.if_inline_ext[idx], - (nextents - idx) * sizeof(xfs_bmbt_rec_t)); - memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); - } - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; - ifp->if_real_bytes = 0; - } - /* - * Otherwise use a linear (direct) extent list. - * If the extents are currently inside the inode, - * xfs_iext_realloc_direct will switch us from - * inline to direct extent allocation mode. - */ - else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { - xfs_iext_realloc_direct(ifp, new_size); - if (idx < nextents) { - memmove(&ifp->if_u1.if_extents[idx + ext_diff], - &ifp->if_u1.if_extents[idx], - (nextents - idx) * sizeof(xfs_bmbt_rec_t)); - memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); - } - } - /* Indirection array */ - else { - xfs_ext_irec_t *erp; - int erp_idx = 0; - int page_idx = idx; - - ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); - if (ifp->if_flags & XFS_IFEXTIREC) { - erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); - } else { - xfs_iext_irec_init(ifp); - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - erp = ifp->if_u1.if_ext_irec; - } - /* Extents fit in target extent page */ - if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { - if (page_idx < erp->er_extcount) { - memmove(&erp->er_extbuf[page_idx + ext_diff], - &erp->er_extbuf[page_idx], - (erp->er_extcount - page_idx) * - sizeof(xfs_bmbt_rec_t)); - memset(&erp->er_extbuf[page_idx], 0, byte_diff); - } - erp->er_extcount += ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); - } - /* Insert a new extent page */ - else if (erp) { - xfs_iext_add_indirect_multi(ifp, - erp_idx, page_idx, ext_diff); - } - /* - * If extent(s) are being appended to the last page in - * the indirection array and the new extent(s) don't fit - * in the page, then erp is NULL and erp_idx is set to - * the next index needed in the indirection array. - */ - else { - int count = ext_diff; - - while (count) { - erp = xfs_iext_irec_new(ifp, erp_idx); - erp->er_extcount = count; - count -= MIN(count, (int)XFS_LINEAR_EXTS); - if (count) { - erp_idx++; - } - } - } - } - ifp->if_bytes = new_size; -} + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; + iip->ili_logged = 1; -/* - * This is called when incore extents are being added to the indirection - * array and the new extents do not fit in the target extent list. The - * erp_idx parameter contains the irec index for the target extent list - * in the indirection array, and the idx parameter contains the extent - * index within the list. The number of extents being added is stored - * in the count parameter. - * - * |-------| |-------| - * | | | | idx - number of extents before idx - * | idx | | count | - * | | | | count - number of extents being inserted at idx - * |-------| |-------| - * | count | | nex2 | nex2 - number of extents after idx + count - * |-------| |-------| - */ -void -xfs_iext_add_indirect_multi( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx, /* target extent irec index */ - xfs_extnum_t idx, /* index within target list */ - int count) /* new extents being added */ -{ - int byte_diff; /* new bytes being added */ - xfs_ext_irec_t *erp; /* pointer to irec entry */ - xfs_extnum_t ext_diff; /* number of extents to add */ - xfs_extnum_t ext_cnt; /* new extents still needed */ - xfs_extnum_t nex2; /* extents after idx + count */ - xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */ - int nlists; /* number of irec's (lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - nex2 = erp->er_extcount - idx; - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - - /* - * Save second part of target extent list - * (all extents past */ - if (nex2) { - byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); - nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); - memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); - erp->er_extcount -= nex2; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); - memset(&erp->er_extbuf[idx], 0, byte_diff); - } - - /* - * Add the new extents to the end of the target - * list, then allocate new irec record(s) and - * extent buffer(s) as needed to store the rest - * of the new extents. - */ - ext_cnt = count; - ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount); - if (ext_diff) { - erp->er_extcount += ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); - ext_cnt -= ext_diff; - } - while (ext_cnt) { - erp_idx++; - erp = xfs_iext_irec_new(ifp, erp_idx); - ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS); - erp->er_extcount = ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); - ext_cnt -= ext_diff; - } - - /* Add nex2 extents back to indirection array */ - if (nex2) { - xfs_extnum_t ext_avail; - int i; - - byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); - ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; - i = 0; - /* - * If nex2 extents fit in the current page, append - * nex2_ep after the new extents. - */ - if (nex2 <= ext_avail) { - i = erp->er_extcount; - } - /* - * Otherwise, check if space is available in the - * next page. - */ - else if ((erp_idx < nlists - 1) && - (nex2 <= (ext_avail = XFS_LINEAR_EXTS - - ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) { - erp_idx++; - erp++; - /* Create a hole for nex2 extents */ - memmove(&erp->er_extbuf[nex2], erp->er_extbuf, - erp->er_extcount * sizeof(xfs_bmbt_rec_t)); - } - /* - * Final choice, create a new extent page for - * nex2 extents. - */ - else { - erp_idx++; - erp = xfs_iext_irec_new(ifp, erp_idx); - } - memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); - kmem_free(nex2_ep); - erp->er_extcount += nex2; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); - } -} + xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); -/* - * This is called when the amount of space required for incore file - * extents needs to be decreased. The ext_diff parameter stores the - * number of extents to be removed and the idx parameter contains - * the extent index where the extents will be removed from. - * - * If the amount of space needed has decreased below the linear - * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous - * extent array. Otherwise, use kmem_realloc() to adjust the - * size to what is needed. - */ -void -xfs_iext_remove( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff, /* number of extents to remove */ - int state) /* type of extent conversion */ -{ - xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; - xfs_extnum_t nextents; /* number of extents in file */ - int new_size; /* size of extents after removal */ - - trace_xfs_iext_remove(ip, idx, state, _RET_IP_); - - ASSERT(ext_diff > 0); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); - - if (new_size == 0) { - xfs_iext_destroy(ifp); - } else if (ifp->if_flags & XFS_IFEXTIREC) { - xfs_iext_remove_indirect(ifp, idx, ext_diff); - } else if (ifp->if_real_bytes) { - xfs_iext_remove_direct(ifp, idx, ext_diff); - } else { - xfs_iext_remove_inline(ifp, idx, ext_diff); - } - ifp->if_bytes = new_size; -} - -/* - * This removes ext_diff extents from the inline buffer, beginning - * at extent index idx. - */ -void -xfs_iext_remove_inline( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff) /* number of extents to remove */ -{ - int nextents; /* number of extents in file */ - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - ASSERT(idx < XFS_INLINE_EXTS); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(((nextents - ext_diff) > 0) && - (nextents - ext_diff) < XFS_INLINE_EXTS); - - if (idx + ext_diff < nextents) { - memmove(&ifp->if_u2.if_inline_ext[idx], - &ifp->if_u2.if_inline_ext[idx + ext_diff], - (nextents - (idx + ext_diff)) * - sizeof(xfs_bmbt_rec_t)); - memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], - 0, ext_diff * sizeof(xfs_bmbt_rec_t)); - } else { - memset(&ifp->if_u2.if_inline_ext[idx], 0, - ext_diff * sizeof(xfs_bmbt_rec_t)); - } -} - -/* - * This removes ext_diff extents from a linear (direct) extent list, - * beginning at extent index idx. If the extents are being removed - * from the end of the list (ie. truncate) then we just need to re- - * allocate the list to remove the extra space. Otherwise, if the - * extents are being removed from the middle of the existing extent - * entries, then we first need to move the extent records beginning - * at idx + ext_diff up in the list to overwrite the records being - * removed, then remove the extra space via kmem_realloc. - */ -void -xfs_iext_remove_direct( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff) /* number of extents to remove */ -{ - xfs_extnum_t nextents; /* number of extents in file */ - int new_size; /* size of extents after removal */ - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - new_size = ifp->if_bytes - - (ext_diff * sizeof(xfs_bmbt_rec_t)); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - - if (new_size == 0) { - xfs_iext_destroy(ifp); - return; - } - /* Move extents up in the list (if needed) */ - if (idx + ext_diff < nextents) { - memmove(&ifp->if_u1.if_extents[idx], - &ifp->if_u1.if_extents[idx + ext_diff], - (nextents - (idx + ext_diff)) * - sizeof(xfs_bmbt_rec_t)); - } - memset(&ifp->if_u1.if_extents[nextents - ext_diff], - 0, ext_diff * sizeof(xfs_bmbt_rec_t)); /* - * Reallocate the direct extent list. If the extents - * will fit inside the inode then xfs_iext_realloc_direct - * will switch from direct to inline extent allocation - * mode for us. + * Attach the function xfs_iflush_done to the inode's + * buffer. This will remove the inode from the AIL + * and unlock the inode's flush lock when the inode is + * completely written to disk. */ - xfs_iext_realloc_direct(ifp, new_size); - ifp->if_bytes = new_size; -} + xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); -/* - * This is called when incore extents are being removed from the - * indirection array and the extents being removed span multiple extent - * buffers. The idx parameter contains the file extent index where we - * want to begin removing extents, and the count parameter contains - * how many extents need to be removed. - * - * |-------| |-------| - * | nex1 | | | nex1 - number of extents before idx - * |-------| | count | - * | | | | count - number of extents being removed at idx - * | count | |-------| - * | | | nex2 | nex2 - number of extents after idx + count - * |-------| |-------| - */ -void -xfs_iext_remove_indirect( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing extents */ - int count) /* number of extents to remove */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - int erp_idx = 0; /* indirection array index */ - xfs_extnum_t ext_cnt; /* extents left to remove */ - xfs_extnum_t ext_diff; /* extents to remove in current list */ - xfs_extnum_t nex1; /* number of extents before idx */ - xfs_extnum_t nex2; /* extents after idx + count */ - int page_idx = idx; /* index in target extent list */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); - ASSERT(erp != NULL); - nex1 = page_idx; - ext_cnt = count; - while (ext_cnt) { - nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); - ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); - /* - * Check for deletion of entire list; - * xfs_iext_irec_remove() updates extent offsets. - */ - if (ext_diff == erp->er_extcount) { - xfs_iext_irec_remove(ifp, erp_idx); - ext_cnt -= ext_diff; - nex1 = 0; - if (ext_cnt) { - ASSERT(erp_idx < ifp->if_real_bytes / - XFS_IEXT_BUFSZ); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - nex1 = 0; - continue; - } else { - break; - } - } - /* Move extents up (if needed) */ - if (nex2) { - memmove(&erp->er_extbuf[nex1], - &erp->er_extbuf[nex1 + ext_diff], - nex2 * sizeof(xfs_bmbt_rec_t)); - } - /* Zero out rest of page */ - memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - - ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); - /* Update remaining counters */ - erp->er_extcount -= ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); - ext_cnt -= ext_diff; - nex1 = 0; - erp_idx++; - erp++; - } - ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); - xfs_iext_irec_compact(ifp); -} + /* update the lsn in the on disk inode if required */ + if (ip->i_d.di_version == 3) + dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn); -/* - * Create, destroy, or resize a linear (direct) block of extents. - */ -void -xfs_iext_realloc_direct( - xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* new size of extents */ -{ - int rnew_size; /* real new size of extents */ - - rnew_size = new_size; - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || - ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && - (new_size != ifp->if_real_bytes))); - - /* Free extent records */ - if (new_size == 0) { - xfs_iext_destroy(ifp); - } - /* Resize direct extent list and zero any new bytes */ - else if (ifp->if_real_bytes) { - /* Check if extents will fit inside the inode */ - if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { - xfs_iext_direct_to_inline(ifp, new_size / - (uint)sizeof(xfs_bmbt_rec_t)); - ifp->if_bytes = new_size; - return; - } - if (!is_power_of_2(new_size)){ - rnew_size = roundup_pow_of_two(new_size); - } - if (rnew_size != ifp->if_real_bytes) { - ifp->if_u1.if_extents = - kmem_realloc(ifp->if_u1.if_extents, - rnew_size, - ifp->if_real_bytes, KM_NOFS); - } - if (rnew_size > ifp->if_real_bytes) { - memset(&ifp->if_u1.if_extents[ifp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t)], 0, - rnew_size - ifp->if_real_bytes); - } - } - /* - * Switch from the inline extent buffer to a direct - * extent list. Be sure to include the inline extent - * bytes in new_size. - */ - else { - new_size += ifp->if_bytes; - if (!is_power_of_2(new_size)) { - rnew_size = roundup_pow_of_two(new_size); - } - xfs_iext_inline_to_direct(ifp, rnew_size); - } - ifp->if_real_bytes = rnew_size; - ifp->if_bytes = new_size; -} - -/* - * Switch from linear (direct) extent records to inline buffer. - */ -void -xfs_iext_direct_to_inline( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t nextents) /* number of extents in file */ -{ - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ASSERT(nextents <= XFS_INLINE_EXTS); - /* - * The inline buffer was zeroed when we switched - * from inline to direct extent allocation mode, - * so we don't need to clear it here. - */ - memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, - nextents * sizeof(xfs_bmbt_rec_t)); - kmem_free(ifp->if_u1.if_extents); - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; - ifp->if_real_bytes = 0; -} - -/* - * Switch from inline buffer to linear (direct) extent records. - * new_size should already be rounded up to the next power of 2 - * by the caller (when appropriate), so use new_size as it is. - * However, since new_size may be rounded up, we can't update - * if_bytes here. It is the caller's responsibility to update - * if_bytes upon return. - */ -void -xfs_iext_inline_to_direct( - xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* number of extents in file */ -{ - ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); - memset(ifp->if_u1.if_extents, 0, new_size); - if (ifp->if_bytes) { - memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, - ifp->if_bytes); - memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * - sizeof(xfs_bmbt_rec_t)); - } - ifp->if_real_bytes = new_size; -} - -/* - * Resize an extent indirection array to new_size bytes. - */ -STATIC void -xfs_iext_realloc_indirect( - xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* new indirection array size */ -{ - int nlists; /* number of irec's (ex lists) */ - int size; /* current indirection array size */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - size = nlists * sizeof(xfs_ext_irec_t); - ASSERT(ifp->if_real_bytes); - ASSERT((new_size >= 0) && (new_size != size)); - if (new_size == 0) { - xfs_iext_destroy(ifp); - } else { - ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) - kmem_realloc(ifp->if_u1.if_ext_irec, - new_size, size, KM_NOFS); - } -} - -/* - * Switch from indirection array to linear (direct) extent allocations. - */ -STATIC void -xfs_iext_indirect_to_direct( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - xfs_extnum_t nextents; /* number of extents in file */ - int size; /* size of file extents */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(nextents <= XFS_LINEAR_EXTS); - size = nextents * sizeof(xfs_bmbt_rec_t); - - xfs_iext_irec_compact_pages(ifp); - ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); - - ep = ifp->if_u1.if_ext_irec->er_extbuf; - kmem_free(ifp->if_u1.if_ext_irec); - ifp->if_flags &= ~XFS_IFEXTIREC; - ifp->if_u1.if_extents = ep; - ifp->if_bytes = size; - if (nextents < XFS_LINEAR_EXTS) { - xfs_iext_realloc_direct(ifp, size); - } -} - -/* - * Free incore file extents. - */ -void -xfs_iext_destroy( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - if (ifp->if_flags & XFS_IFEXTIREC) { - int erp_idx; - int nlists; - - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { - xfs_iext_irec_remove(ifp, erp_idx); - } - ifp->if_flags &= ~XFS_IFEXTIREC; - } else if (ifp->if_real_bytes) { - kmem_free(ifp->if_u1.if_extents); - } else if (ifp->if_bytes) { - memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * - sizeof(xfs_bmbt_rec_t)); - } - ifp->if_u1.if_extents = NULL; - ifp->if_real_bytes = 0; - ifp->if_bytes = 0; -} - -/* - * Return a pointer to the extent record for file system block bno. - */ -xfs_bmbt_rec_host_t * /* pointer to found extent record */ -xfs_iext_bno_to_ext( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number to search for */ - xfs_extnum_t *idxp) /* index of target extent */ -{ - xfs_bmbt_rec_host_t *base; /* pointer to first extent */ - xfs_filblks_t blockcount = 0; /* number of blocks in extent */ - xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */ - xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ - int high; /* upper boundary in search */ - xfs_extnum_t idx = 0; /* index of target extent */ - int low; /* lower boundary in search */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_fileoff_t startoff = 0; /* start offset of extent */ - - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *idxp = 0; - return NULL; - } - low = 0; - if (ifp->if_flags & XFS_IFEXTIREC) { - /* Find target extent list */ - int erp_idx = 0; - erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx); - base = erp->er_extbuf; - high = erp->er_extcount - 1; - } else { - base = ifp->if_u1.if_extents; - high = nextents - 1; - } - /* Binary search extent records */ - while (low <= high) { - idx = (low + high) >> 1; - ep = base + idx; - startoff = xfs_bmbt_get_startoff(ep); - blockcount = xfs_bmbt_get_blockcount(ep); - if (bno < startoff) { - high = idx - 1; - } else if (bno >= startoff + blockcount) { - low = idx + 1; - } else { - /* Convert back to file-based extent index */ - if (ifp->if_flags & XFS_IFEXTIREC) { - idx += erp->er_extoff; - } - *idxp = idx; - return ep; - } - } - /* Convert back to file-based extent index */ - if (ifp->if_flags & XFS_IFEXTIREC) { - idx += erp->er_extoff; - } - if (bno >= startoff + blockcount) { - if (++idx == nextents) { - ep = NULL; - } else { - ep = xfs_iext_get_ext(ifp, idx); - } - } - *idxp = idx; - return ep; -} - -/* - * Return a pointer to the indirection array entry containing the - * extent record for filesystem block bno. Store the index of the - * target irec in *erp_idxp. - */ -xfs_ext_irec_t * /* pointer to found extent record */ -xfs_iext_bno_to_irec( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number to search for */ - int *erp_idxp) /* irec index of target ext list */ -{ - xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ - xfs_ext_irec_t *erp_next; /* next indirection array entry */ - int erp_idx; /* indirection array index */ - int nlists; /* number of extent irec's (lists) */ - int high; /* binary search upper limit */ - int low; /* binary search lower limit */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp_idx = 0; - low = 0; - high = nlists - 1; - while (low <= high) { - erp_idx = (low + high) >> 1; - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL; - if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) { - high = erp_idx - 1; - } else if (erp_next && bno >= - xfs_bmbt_get_startoff(erp_next->er_extbuf)) { - low = erp_idx + 1; - } else { - break; - } - } - *erp_idxp = erp_idx; - return erp; -} + /* generate the checksum. */ + xfs_dinode_calc_crc(mp, dip); -/* - * Return a pointer to the indirection array entry containing the - * extent record at file extent index *idxp. Store the index of the - * target irec in *erp_idxp and store the page index of the target - * extent record in *idxp. - */ -xfs_ext_irec_t * -xfs_iext_idx_to_irec( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t *idxp, /* extent index (file -> page) */ - int *erp_idxp, /* pointer to target irec */ - int realloc) /* new bytes were just added */ -{ - xfs_ext_irec_t *prev; /* pointer to previous irec */ - xfs_ext_irec_t *erp = NULL; /* pointer to current irec */ - int erp_idx; /* indirection array index */ - int nlists; /* number of irec's (ex lists) */ - int high; /* binary search upper limit */ - int low; /* binary search lower limit */ - xfs_extnum_t page_idx = *idxp; /* extent index in target list */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - ASSERT(page_idx >= 0); - ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); - ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); - - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp_idx = 0; - low = 0; - high = nlists - 1; - - /* Binary search extent irec's */ - while (low <= high) { - erp_idx = (low + high) >> 1; - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - prev = erp_idx > 0 ? erp - 1 : NULL; - if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff && - realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) { - high = erp_idx - 1; - } else if (page_idx > erp->er_extoff + erp->er_extcount || - (page_idx == erp->er_extoff + erp->er_extcount && - !realloc)) { - low = erp_idx + 1; - } else if (page_idx == erp->er_extoff + erp->er_extcount && - erp->er_extcount == XFS_LINEAR_EXTS) { - ASSERT(realloc); - page_idx = 0; - erp_idx++; - erp = erp_idx < nlists ? erp + 1 : NULL; - break; - } else { - page_idx -= erp->er_extoff; - break; - } - } - *idxp = page_idx; - *erp_idxp = erp_idx; - return(erp); -} - -/* - * Allocate and initialize an indirection array once the space needed - * for incore extents increases above XFS_IEXT_BUFSZ. - */ -void -xfs_iext_irec_init( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - xfs_extnum_t nextents; /* number of extents in file */ - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(nextents <= XFS_LINEAR_EXTS); - - erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); - - if (nextents == 0) { - ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); - } else if (!ifp->if_real_bytes) { - xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); - } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { - xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ); - } - erp->er_extbuf = ifp->if_u1.if_extents; - erp->er_extcount = nextents; - erp->er_extoff = 0; - - ifp->if_flags |= XFS_IFEXTIREC; - ifp->if_real_bytes = XFS_IEXT_BUFSZ; - ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t); - ifp->if_u1.if_ext_irec = erp; - - return; -} - -/* - * Allocate and initialize a new entry in the indirection array. - */ -xfs_ext_irec_t * -xfs_iext_irec_new( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx) /* index for new irec */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - int i; /* loop counter */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - - /* Resize indirection array */ - xfs_iext_realloc_indirect(ifp, ++nlists * - sizeof(xfs_ext_irec_t)); - /* - * Move records down in the array so the - * new page can use erp_idx. - */ - erp = ifp->if_u1.if_ext_irec; - for (i = nlists - 1; i > erp_idx; i--) { - memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t)); - } - ASSERT(i == erp_idx); - - /* Initialize new extent record */ - erp = ifp->if_u1.if_ext_irec; - erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); - ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; - memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); - erp[erp_idx].er_extcount = 0; - erp[erp_idx].er_extoff = erp_idx > 0 ? - erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0; - return (&erp[erp_idx]); -} - -/* - * Remove a record from the indirection array. - */ -void -xfs_iext_irec_remove( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx) /* irec index to remove */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - int i; /* loop counter */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - if (erp->er_extbuf) { - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, - -erp->er_extcount); - kmem_free(erp->er_extbuf); - } - /* Compact extent records */ - erp = ifp->if_u1.if_ext_irec; - for (i = erp_idx; i < nlists - 1; i++) { - memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t)); - } - /* - * Manually free the last extent record from the indirection - * array. A call to xfs_iext_realloc_indirect() with a size - * of zero would result in a call to xfs_iext_destroy() which - * would in turn call this function again, creating a nasty - * infinite loop. - */ - if (--nlists) { - xfs_iext_realloc_indirect(ifp, - nlists * sizeof(xfs_ext_irec_t)); - } else { - kmem_free(ifp->if_u1.if_ext_irec); - } - ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; -} - -/* - * This is called to clean up large amounts of unused memory allocated - * by the indirection array. Before compacting anything though, verify - * that the indirection array is still needed and switch back to the - * linear extent list (or even the inline buffer) if possible. The - * compaction policy is as follows: - * - * Full Compaction: Extents fit into a single page (or inline buffer) - * Partial Compaction: Extents occupy less than 50% of allocated space - * No Compaction: Extents occupy at least 50% of allocated space - */ -void -xfs_iext_irec_compact( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_extnum_t nextents; /* number of extents in file */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - - if (nextents == 0) { - xfs_iext_destroy(ifp); - } else if (nextents <= XFS_INLINE_EXTS) { - xfs_iext_indirect_to_direct(ifp); - xfs_iext_direct_to_inline(ifp, nextents); - } else if (nextents <= XFS_LINEAR_EXTS) { - xfs_iext_indirect_to_direct(ifp); - } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { - xfs_iext_irec_compact_pages(ifp); - } -} - -/* - * Combine extents from neighboring extent pages. - */ -void -xfs_iext_irec_compact_pages( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */ - int erp_idx = 0; /* indirection array index */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - while (erp_idx < nlists - 1) { - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - erp_next = erp + 1; - if (erp_next->er_extcount <= - (XFS_LINEAR_EXTS - erp->er_extcount)) { - memcpy(&erp->er_extbuf[erp->er_extcount], - erp_next->er_extbuf, erp_next->er_extcount * - sizeof(xfs_bmbt_rec_t)); - erp->er_extcount += erp_next->er_extcount; - /* - * Free page before removing extent record - * so er_extoffs don't get modified in - * xfs_iext_irec_remove. - */ - kmem_free(erp_next->er_extbuf); - erp_next->er_extbuf = NULL; - xfs_iext_irec_remove(ifp, erp_idx + 1); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - } else { - erp_idx++; - } - } -} - -/* - * This is called to update the er_extoff field in the indirection - * array when extents have been added or removed from one of the - * extent lists. erp_idx contains the irec index to begin updating - * at and ext_diff contains the number of extents that were added - * or removed. - */ -void -xfs_iext_irec_update_extoffs( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx, /* irec index to update */ - int ext_diff) /* number of new extents */ -{ - int i; /* loop counter */ - int nlists; /* number of irec's (ex lists */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - for (i = erp_idx; i < nlists; i++) { - ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; - } -} - -/* - * Test whether it is appropriate to check an inode for and free post EOF - * blocks. The 'force' parameter determines whether we should also consider - * regular files that are marked preallocated or append-only. - */ -bool -xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) -{ - /* prealloc/delalloc exists only on regular files */ - if (!S_ISREG(ip->i_d.di_mode)) - return false; - - /* - * Zero sized files with no cached pages and delalloc blocks will not - * have speculative prealloc/delalloc blocks to remove. - */ - if (VFS_I(ip)->i_size == 0 && - VN_CACHED(VFS_I(ip)) == 0 && - ip->i_delayed_blks == 0) - return false; - - /* If we haven't read in the extent list, then don't do it now. */ - if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) - return false; - - /* - * Do not free real preallocated or append-only files unless the file - * has delalloc blocks and we are forced to remove them. - */ - if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) - if (!force || ip->i_delayed_blks == 0) - return false; + ASSERT(bp->b_fspriv != NULL); + ASSERT(bp->b_iodone != NULL); + return 0; - return true; +corrupt_out: + return XFS_ERROR(EFSCORRUPTED); } - diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 22baf6ea4fa..f72bffa6726 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -18,199 +18,15 @@ #ifndef __XFS_INODE_H__ #define __XFS_INODE_H__ -struct posix_acl; -struct xfs_dinode; -struct xfs_inode; - -/* - * Fork identifiers. - */ -#define XFS_DATA_FORK 0 -#define XFS_ATTR_FORK 1 - -/* - * The following xfs_ext_irec_t struct introduces a second (top) level - * to the in-core extent allocation scheme. These structs are allocated - * in a contiguous block, creating an indirection array where each entry - * (irec) contains a pointer to a buffer of in-core extent records which - * it manages. Each extent buffer is 4k in size, since 4k is the system - * page size on Linux i386 and systems with larger page sizes don't seem - * to gain much, if anything, by using their native page size as the - * extent buffer size. Also, using 4k extent buffers everywhere provides - * a consistent interface for CXFS across different platforms. - * - * There is currently no limit on the number of irec's (extent lists) - * allowed, so heavily fragmented files may require an indirection array - * which spans multiple system pages of memory. The number of extents - * which would require this amount of contiguous memory is very large - * and should not cause problems in the foreseeable future. However, - * if the memory needed for the contiguous array ever becomes a problem, - * it is possible that a third level of indirection may be required. - */ -typedef struct xfs_ext_irec { - xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */ - xfs_extnum_t er_extoff; /* extent offset in file */ - xfs_extnum_t er_extcount; /* number of extents in page/block */ -} xfs_ext_irec_t; - -/* - * File incore extent information, present for each of data & attr forks. - */ -#define XFS_IEXT_BUFSZ 4096 -#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t)) -#define XFS_INLINE_EXTS 2 -#define XFS_INLINE_DATA 32 -typedef struct xfs_ifork { - int if_bytes; /* bytes in if_u1 */ - int if_real_bytes; /* bytes allocated in if_u1 */ - struct xfs_btree_block *if_broot; /* file's incore btree root */ - short if_broot_bytes; /* bytes allocated for root */ - unsigned char if_flags; /* per-fork flags */ - union { - xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ - xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ - char *if_data; /* inline file data */ - } if_u1; - union { - xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS]; - /* very small file extents */ - char if_inline_data[XFS_INLINE_DATA]; - /* very small file data */ - xfs_dev_t if_rdev; /* dev number if special */ - uuid_t if_uuid; /* mount point value */ - } if_u2; -} xfs_ifork_t; - -/* - * Inode location information. Stored in the inode and passed to - * xfs_imap_to_bp() to get a buffer and dinode for a given inode. - */ -struct xfs_imap { - xfs_daddr_t im_blkno; /* starting BB of inode chunk */ - ushort im_len; /* length in BBs of inode chunk */ - ushort im_boffset; /* inode offset in block in bytes */ -}; - -/* - * This is the xfs in-core inode structure. - * Most of the on-disk inode is embedded in the i_d field. - * - * The extent pointers/inline file space, however, are managed - * separately. The memory for this information is pointed to by - * the if_u1 unions depending on the type of the data. - * This is used to linearize the array of extents for fast in-core - * access. This is used until the file's number of extents - * surpasses XFS_MAX_INCORE_EXTENTS, at which point all extent pointers - * are accessed through the buffer cache. - * - * Other state kept in the in-core inode is used for identification, - * locking, transactional updating, etc of the inode. - * - * Generally, we do not want to hold the i_rlock while holding the - * i_ilock. Hierarchy is i_iolock followed by i_rlock. - * - * xfs_iptr_t contains all the inode fields up to and including the - * i_mnext and i_mprev fields, it is used as a marker in the inode - * chain off the mount structure by xfs_sync calls. - */ - -typedef struct xfs_ictimestamp { - __int32_t t_sec; /* timestamp seconds */ - __int32_t t_nsec; /* timestamp nanoseconds */ -} xfs_ictimestamp_t; - -/* - * NOTE: This structure must be kept identical to struct xfs_dinode - * in xfs_dinode.h except for the endianness annotations. - */ -typedef struct xfs_icdinode { - __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ - __uint16_t di_mode; /* mode and type of file */ - __int8_t di_version; /* inode version */ - __int8_t di_format; /* format of di_c data */ - __uint16_t di_onlink; /* old number of links to file */ - __uint32_t di_uid; /* owner's user id */ - __uint32_t di_gid; /* owner's group id */ - __uint32_t di_nlink; /* number of links to file */ - __uint16_t di_projid_lo; /* lower part of owner's project id */ - __uint16_t di_projid_hi; /* higher part of owner's project id */ - __uint8_t di_pad[6]; /* unused, zeroed space */ - __uint16_t di_flushiter; /* incremented on flush */ - xfs_ictimestamp_t di_atime; /* time last accessed */ - xfs_ictimestamp_t di_mtime; /* time last modified */ - xfs_ictimestamp_t di_ctime; /* time created/inode modified */ - xfs_fsize_t di_size; /* number of bytes in file */ - xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */ - xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ - xfs_extnum_t di_nextents; /* number of extents in data fork */ - xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ - __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ - __int8_t di_aformat; /* format of attr fork's data */ - __uint32_t di_dmevmask; /* DMIG event mask */ - __uint16_t di_dmstate; /* DMIG state info */ - __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ - __uint32_t di_gen; /* generation number */ -} xfs_icdinode_t; - -/* - * Flags for xfs_ichgtime(). - */ -#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ -#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ +#include "xfs_inode_buf.h" +#include "xfs_inode_fork.h" +#include "xfs_dinode.h" /* - * Per-fork incore inode flags. + * Kernel only inode definitions */ -#define XFS_IFINLINE 0x01 /* Inline data is read in */ -#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */ -#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */ -#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */ - -/* - * Fork handling. - */ - -#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0) -#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3)) - -#define XFS_IFORK_PTR(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - &(ip)->i_df : \ - (ip)->i_afp) -#define XFS_IFORK_DSIZE(ip) \ - (XFS_IFORK_Q(ip) ? \ - XFS_IFORK_BOFF(ip) : \ - XFS_LITINO((ip)->i_mount)) -#define XFS_IFORK_ASIZE(ip) \ - (XFS_IFORK_Q(ip) ? \ - XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \ - 0) -#define XFS_IFORK_SIZE(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - XFS_IFORK_DSIZE(ip) : \ - XFS_IFORK_ASIZE(ip)) -#define XFS_IFORK_FORMAT(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - (ip)->i_d.di_format : \ - (ip)->i_d.di_aformat) -#define XFS_IFORK_FMT_SET(ip,w,n) \ - ((w) == XFS_DATA_FORK ? \ - ((ip)->i_d.di_format = (n)) : \ - ((ip)->i_d.di_aformat = (n))) -#define XFS_IFORK_NEXTENTS(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - (ip)->i_d.di_nextents : \ - (ip)->i_d.di_anextents) -#define XFS_IFORK_NEXT_SET(ip,w,n) \ - ((w) == XFS_DATA_FORK ? \ - ((ip)->i_d.di_nextents = (n)) : \ - ((ip)->i_d.di_anextents = (n))) -#define XFS_IFORK_MAXEXT(ip, w) \ - (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) - - -#ifdef __KERNEL__ - +struct xfs_dinode; +struct xfs_inode; struct xfs_buf; struct xfs_bmap_free; struct xfs_bmbt_irec; @@ -224,6 +40,7 @@ typedef struct xfs_inode { struct xfs_mount *i_mount; /* fs mount struct ptr */ struct xfs_dquot *i_udquot; /* user dquot */ struct xfs_dquot *i_gdquot; /* group dquot */ + struct xfs_dquot *i_pdquot; /* project dquot */ /* Inode location stuff */ xfs_ino_t i_ino; /* inode number (agno/agino)*/ @@ -233,6 +50,9 @@ typedef struct xfs_inode { xfs_ifork_t *i_afp; /* attribute fork pointer */ xfs_ifork_t i_df; /* data fork */ + /* operations vectors */ + const struct xfs_dir_ops *d_ops; /* directory ops vector */ + /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ @@ -373,6 +193,15 @@ xfs_set_projid(struct xfs_inode *ip, ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff); } +static inline prid_t +xfs_get_initial_prid(struct xfs_inode *dp) +{ + if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + return xfs_get_projid(dp); + + return XFS_PROJID_DEFAULT; +} + /* * In-core inode flags. */ @@ -380,7 +209,6 @@ xfs_set_projid(struct xfs_inode *ip, #define XFS_ISTALE (1 << 1) /* inode has been staled */ #define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ #define XFS_INEW (1 << 3) /* inode has just been allocated */ -#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */ #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ #define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ @@ -396,8 +224,7 @@ xfs_set_projid(struct xfs_inode *ip, */ #define XFS_IRECLAIM_RESET_FLAGS \ (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ - XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | \ - XFS_IFILESTREAM); + XFS_IDIRTY_RELEASE | XFS_ITRUNCATED) /* * Synchronize processes attempting to flush the in-core inode back to disk. @@ -419,6 +246,7 @@ static inline void xfs_iflock(struct xfs_inode *ip) static inline void xfs_ifunlock(struct xfs_inode *ip) { xfs_iflags_clear(ip, XFS_IFLOCK); + smp_mb(); wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); } @@ -497,16 +325,30 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) ((pip)->i_d.di_mode & S_ISGID)) -/* - * xfs_inode.c prototypes. - */ +int xfs_release(struct xfs_inode *ip); +void xfs_inactive(struct xfs_inode *ip); +int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode **ipp, struct xfs_name *ci_name); +int xfs_create(struct xfs_inode *dp, struct xfs_name *name, + umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); +int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, + umode_t mode, struct xfs_inode **ipp); +int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode *ip); +int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, + struct xfs_name *target_name); +int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, + struct xfs_inode *src_ip, struct xfs_inode *target_dp, + struct xfs_name *target_name, + struct xfs_inode *target_ip); + void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); -uint xfs_ilock_map_shared(xfs_inode_t *); -void xfs_iunlock_map_shared(xfs_inode_t *, uint); +uint xfs_ilock_data_map_shared(struct xfs_inode *); +uint xfs_ilock_attr_map_shared(struct xfs_inode *); int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_buf **, xfs_inode_t **); @@ -520,13 +362,27 @@ int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); + void xfs_iunpin_wait(xfs_inode_t *); +#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) + int xfs_iflush(struct xfs_inode *, struct xfs_buf **); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); +int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, + xfs_nlink_t, xfs_dev_t, prid_t, int, + struct xfs_inode **, int *); +int xfs_droplink(struct xfs_trans *, struct xfs_inode *); +int xfs_bumplink(struct xfs_trans *, struct xfs_inode *); + +/* from xfs_file.c */ +int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); +int xfs_iozero(struct xfs_inode *, loff_t, size_t); + + #define IHOLD(ip) \ do { \ ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ @@ -540,64 +396,6 @@ do { \ iput(VFS_I(ip)); \ } while (0) -#endif /* __KERNEL__ */ - -/* - * Flags for xfs_iget() - */ -#define XFS_IGET_CREATE 0x1 -#define XFS_IGET_UNTRUSTED 0x2 -#define XFS_IGET_DONTCACHE 0x4 - -int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, - struct xfs_imap *, struct xfs_dinode **, - struct xfs_buf **, uint, uint); -int xfs_iread(struct xfs_mount *, struct xfs_trans *, - struct xfs_inode *, uint); -void xfs_dinode_to_disk(struct xfs_dinode *, - struct xfs_icdinode *); -void xfs_idestroy_fork(struct xfs_inode *, int); -void xfs_idata_realloc(struct xfs_inode *, int, int); -void xfs_iroot_realloc(struct xfs_inode *, int, int); -int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); -int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int); - -xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t); -void xfs_iext_insert(xfs_inode_t *, xfs_extnum_t, xfs_extnum_t, - xfs_bmbt_irec_t *, int); -void xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int); -void xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int); -void xfs_iext_remove(xfs_inode_t *, xfs_extnum_t, int, int); -void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int); -void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int); -void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int); -void xfs_iext_realloc_direct(xfs_ifork_t *, int); -void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t); -void xfs_iext_inline_to_direct(xfs_ifork_t *, int); -void xfs_iext_destroy(xfs_ifork_t *); -xfs_bmbt_rec_host_t *xfs_iext_bno_to_ext(xfs_ifork_t *, xfs_fileoff_t, int *); -xfs_ext_irec_t *xfs_iext_bno_to_irec(xfs_ifork_t *, xfs_fileoff_t, int *); -xfs_ext_irec_t *xfs_iext_idx_to_irec(xfs_ifork_t *, xfs_extnum_t *, int *, int); -void xfs_iext_irec_init(xfs_ifork_t *); -xfs_ext_irec_t *xfs_iext_irec_new(xfs_ifork_t *, int); -void xfs_iext_irec_remove(xfs_ifork_t *, int); -void xfs_iext_irec_compact(xfs_ifork_t *); -void xfs_iext_irec_compact_pages(xfs_ifork_t *); -void xfs_iext_irec_compact_full(xfs_ifork_t *); -void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); -bool xfs_can_free_eofblocks(struct xfs_inode *, bool); - -#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) - -#if defined(DEBUG) -void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); -#else -#define xfs_inobp_check(mp, bp) -#endif /* DEBUG */ - -extern struct kmem_zone *xfs_ifork_zone; extern struct kmem_zone *xfs_inode_zone; -extern struct kmem_zone *xfs_ili_zone; -extern const struct xfs_buf_ops xfs_inode_buf_ops; #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c new file mode 100644 index 00000000000..cb35ae41d4a --- /dev/null +++ b/fs/xfs/xfs_inode_buf.c @@ -0,0 +1,479 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_cksum.h" +#include "xfs_icache.h" +#include "xfs_trans.h" +#include "xfs_ialloc.h" +#include "xfs_dinode.h" + +/* + * Check that none of the inode's in the buffer have a next + * unlinked field of 0. + */ +#if defined(DEBUG) +void +xfs_inobp_check( + xfs_mount_t *mp, + xfs_buf_t *bp) +{ + int i; + int j; + xfs_dinode_t *dip; + + j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; + + for (i = 0; i < j; i++) { + dip = (xfs_dinode_t *)xfs_buf_offset(bp, + i * mp->m_sb.sb_inodesize); + if (!dip->di_next_unlinked) { + xfs_alert(mp, + "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.", + i, (long long)bp->b_bn); + } + } +} +#endif + +/* + * If we are doing readahead on an inode buffer, we might be in log recovery + * reading an inode allocation buffer that hasn't yet been replayed, and hence + * has not had the inode cores stamped into it. Hence for readahead, the buffer + * may be potentially invalid. + * + * If the readahead buffer is invalid, we don't want to mark it with an error, + * but we do want to clear the DONE status of the buffer so that a followup read + * will re-read it from disk. This will ensure that we don't get an unnecessary + * warnings during log recovery and we don't get unnecssary panics on debug + * kernels. + */ +static void +xfs_inode_buf_verify( + struct xfs_buf *bp, + bool readahead) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int i; + int ni; + + /* + * Validate the magic number and version of every inode in the buffer + */ + ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; + for (i = 0; i < ni; i++) { + int di_ok; + xfs_dinode_t *dip; + + dip = (struct xfs_dinode *)xfs_buf_offset(bp, + (i << mp->m_sb.sb_inodelog)); + di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + XFS_DINODE_GOOD_VERSION(dip->di_version); + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, + XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { + if (readahead) { + bp->b_flags &= ~XBF_DONE; + return; + } + + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); +#ifdef DEBUG + xfs_alert(mp, + "bad inode magic/vsn daddr %lld #%d (magic=%x)", + (unsigned long long)bp->b_bn, i, + be16_to_cpu(dip->di_magic)); +#endif + } + } + xfs_inobp_check(mp, bp); +} + + +static void +xfs_inode_buf_read_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp, false); +} + +static void +xfs_inode_buf_readahead_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp, true); +} + +static void +xfs_inode_buf_write_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp, false); +} + +const struct xfs_buf_ops xfs_inode_buf_ops = { + .verify_read = xfs_inode_buf_read_verify, + .verify_write = xfs_inode_buf_write_verify, +}; + +const struct xfs_buf_ops xfs_inode_buf_ra_ops = { + .verify_read = xfs_inode_buf_readahead_verify, + .verify_write = xfs_inode_buf_write_verify, +}; + + +/* + * This routine is called to map an inode to the buffer containing the on-disk + * version of the inode. It returns a pointer to the buffer containing the + * on-disk inode in the bpp parameter, and in the dipp parameter it returns a + * pointer to the on-disk inode within that buffer. + * + * If a non-zero error is returned, then the contents of bpp and dipp are + * undefined. + */ +int +xfs_imap_to_bp( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp, + uint buf_flags, + uint iget_flags) +{ + struct xfs_buf *bp; + int error; + + buf_flags |= XBF_UNMAPPED; + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, + (int)imap->im_len, buf_flags, &bp, + &xfs_inode_buf_ops); + if (error) { + if (error == EAGAIN) { + ASSERT(buf_flags & XBF_TRYLOCK); + return error; + } + + if (error == EFSCORRUPTED && + (iget_flags & XFS_IGET_UNTRUSTED)) + return XFS_ERROR(EINVAL); + + xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", + __func__, error); + return error; + } + + *bpp = bp; + *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); + return 0; +} + +void +xfs_dinode_from_disk( + xfs_icdinode_t *to, + xfs_dinode_t *from) +{ + to->di_magic = be16_to_cpu(from->di_magic); + to->di_mode = be16_to_cpu(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = be16_to_cpu(from->di_onlink); + to->di_uid = be32_to_cpu(from->di_uid); + to->di_gid = be32_to_cpu(from->di_gid); + to->di_nlink = be32_to_cpu(from->di_nlink); + to->di_projid_lo = be16_to_cpu(from->di_projid_lo); + to->di_projid_hi = be16_to_cpu(from->di_projid_hi); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + to->di_flushiter = be16_to_cpu(from->di_flushiter); + to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); + to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec); + to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec); + to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec); + to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec); + to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec); + to->di_size = be64_to_cpu(from->di_size); + to->di_nblocks = be64_to_cpu(from->di_nblocks); + to->di_extsize = be32_to_cpu(from->di_extsize); + to->di_nextents = be32_to_cpu(from->di_nextents); + to->di_anextents = be16_to_cpu(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = be32_to_cpu(from->di_dmevmask); + to->di_dmstate = be16_to_cpu(from->di_dmstate); + to->di_flags = be16_to_cpu(from->di_flags); + to->di_gen = be32_to_cpu(from->di_gen); + + if (to->di_version == 3) { + to->di_changecount = be64_to_cpu(from->di_changecount); + to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); + to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); + to->di_flags2 = be64_to_cpu(from->di_flags2); + to->di_ino = be64_to_cpu(from->di_ino); + to->di_lsn = be64_to_cpu(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + } +} + +void +xfs_dinode_to_disk( + xfs_dinode_t *to, + xfs_icdinode_t *from) +{ + to->di_magic = cpu_to_be16(from->di_magic); + to->di_mode = cpu_to_be16(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = cpu_to_be16(from->di_onlink); + to->di_uid = cpu_to_be32(from->di_uid); + to->di_gid = cpu_to_be32(from->di_gid); + to->di_nlink = cpu_to_be32(from->di_nlink); + to->di_projid_lo = cpu_to_be16(from->di_projid_lo); + to->di_projid_hi = cpu_to_be16(from->di_projid_hi); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); + to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); + to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); + to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); + to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); + to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); + to->di_size = cpu_to_be64(from->di_size); + to->di_nblocks = cpu_to_be64(from->di_nblocks); + to->di_extsize = cpu_to_be32(from->di_extsize); + to->di_nextents = cpu_to_be32(from->di_nextents); + to->di_anextents = cpu_to_be16(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = cpu_to_be32(from->di_dmevmask); + to->di_dmstate = cpu_to_be16(from->di_dmstate); + to->di_flags = cpu_to_be16(from->di_flags); + to->di_gen = cpu_to_be32(from->di_gen); + + if (from->di_version == 3) { + to->di_changecount = cpu_to_be64(from->di_changecount); + to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); + to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); + to->di_flags2 = cpu_to_be64(from->di_flags2); + to->di_ino = cpu_to_be64(from->di_ino); + to->di_lsn = cpu_to_be64(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + to->di_flushiter = 0; + } else { + to->di_flushiter = cpu_to_be16(from->di_flushiter); + } +} + +static bool +xfs_dinode_verify( + struct xfs_mount *mp, + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) + return false; + + /* only version 3 or greater inodes are extensively verified here */ + if (dip->di_version < 3) + return true; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, + XFS_DINODE_CRC_OFF)) + return false; + if (be64_to_cpu(dip->di_ino) != ip->i_ino) + return false; + if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid)) + return false; + return true; +} + +void +xfs_dinode_calc_crc( + struct xfs_mount *mp, + struct xfs_dinode *dip) +{ + __uint32_t crc; + + if (dip->di_version < 3) + return; + + ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); + crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, + XFS_DINODE_CRC_OFF); + dip->di_crc = xfs_end_cksum(crc); +} + +/* + * Read the disk inode attributes into the in-core inode structure. + * + * For version 5 superblocks, if we are initialising a new inode and we are not + * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new + * inode core with a random generation number. If we are keeping inodes around, + * we need to read the inode cluster to get the existing generation number off + * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode + * format) then log recovery is dependent on the di_flushiter field being + * initialised from the current on-disk value and hence we must also read the + * inode off disk. + */ +int +xfs_iread( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_inode_t *ip, + uint iget_flags) +{ + xfs_buf_t *bp; + xfs_dinode_t *dip; + int error; + + /* + * Fill in the location information in the in-core inode. + */ + error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); + if (error) + return error; + + /* shortcut IO on inode allocation if possible */ + if ((iget_flags & XFS_IGET_CREATE) && + xfs_sb_version_hascrc(&mp->m_sb) && + !(mp->m_flags & XFS_MOUNT_IKEEP)) { + /* initialise the on-disk inode core */ + memset(&ip->i_d, 0, sizeof(ip->i_d)); + ip->i_d.di_magic = XFS_DINODE_MAGIC; + ip->i_d.di_gen = prandom_u32(); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + ip->i_d.di_version = 3; + ip->i_d.di_ino = ip->i_ino; + uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid); + } else + ip->i_d.di_version = 2; + return 0; + } + + /* + * Get pointers to the on-disk inode and the buffer containing it. + */ + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags); + if (error) + return error; + + /* even unallocated inodes are verified */ + if (!xfs_dinode_verify(mp, ip, dip)) { + xfs_alert(mp, "%s: validation failed for inode %lld failed", + __func__, ip->i_ino); + + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); + error = XFS_ERROR(EFSCORRUPTED); + goto out_brelse; + } + + /* + * If the on-disk inode is already linked to a directory + * entry, copy all of the inode into the in-core inode. + * xfs_iformat_fork() handles copying in the inode format + * specific information. + * Otherwise, just get the truly permanent information. + */ + if (dip->di_mode) { + xfs_dinode_from_disk(&ip->i_d, dip); + error = xfs_iformat_fork(ip, dip); + if (error) { +#ifdef DEBUG + xfs_alert(mp, "%s: xfs_iformat() returned error %d", + __func__, error); +#endif /* DEBUG */ + goto out_brelse; + } + } else { + /* + * Partial initialisation of the in-core inode. Just the bits + * that xfs_ialloc won't overwrite or relies on being correct. + */ + ip->i_d.di_magic = be16_to_cpu(dip->di_magic); + ip->i_d.di_version = dip->di_version; + ip->i_d.di_gen = be32_to_cpu(dip->di_gen); + ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); + + if (dip->di_version == 3) { + ip->i_d.di_ino = be64_to_cpu(dip->di_ino); + uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid); + } + + /* + * Make sure to pull in the mode here as well in + * case the inode is released without being used. + * This ensures that xfs_inactive() will see that + * the inode is already free and not try to mess + * with the uninitialized part of it. + */ + ip->i_d.di_mode = 0; + } + + /* + * Automatically convert version 1 inode formats in memory to version 2 + * inode format. If the inode is modified, it will get logged and + * rewritten as a version 2 inode. We can do this because we set the + * superblock feature bit for v2 inodes unconditionally during mount + * and it means the reast of the code can assume the inode version is 2 + * or higher. + */ + if (ip->i_d.di_version == 1) { + ip->i_d.di_version = 2; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + ip->i_d.di_nlink = ip->i_d.di_onlink; + ip->i_d.di_onlink = 0; + xfs_set_projid(ip, 0); + } + + ip->i_delayed_blks = 0; + + /* + * Mark the buffer containing the inode as something to keep + * around for a while. This helps to keep recently accessed + * meta-data in-core longer. + */ + xfs_buf_set_ref(bp, XFS_INO_REF); + + /* + * Use xfs_trans_brelse() to release the buffer containing the on-disk + * inode, because it was acquired with xfs_trans_read_buf() in + * xfs_imap_to_bp() above. If tp is NULL, this is just a normal + * brelse(). If we're within a transaction, then xfs_trans_brelse() + * will only release the buffer if it is not dirty within the + * transaction. It will be OK to release the buffer in this case, + * because inodes on disk are never destroyed and we will be locking the + * new in-core inode before putting it in the cache where other + * processes can find it. Thus we don't have to worry about the inode + * being changed just because we released the buffer. + */ + out_brelse: + xfs_trans_brelse(tp, bp); + return error; +} diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/xfs_inode_buf.h new file mode 100644 index 00000000000..9308c47f2a5 --- /dev/null +++ b/fs/xfs/xfs_inode_buf.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INODE_BUF_H__ +#define __XFS_INODE_BUF_H__ + +struct xfs_inode; +struct xfs_dinode; +struct xfs_icdinode; + +/* + * Inode location information. Stored in the inode and passed to + * xfs_imap_to_bp() to get a buffer and dinode for a given inode. + */ +struct xfs_imap { + xfs_daddr_t im_blkno; /* starting BB of inode chunk */ + ushort im_len; /* length in BBs of inode chunk */ + ushort im_boffset; /* inode offset in block in bytes */ +}; + +int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, + struct xfs_imap *, struct xfs_dinode **, + struct xfs_buf **, uint, uint); +int xfs_iread(struct xfs_mount *, struct xfs_trans *, + struct xfs_inode *, uint); +void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); +void xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from); +void xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from); + +#if defined(DEBUG) +void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); +#else +#define xfs_inobp_check(mp, bp) +#endif /* DEBUG */ + +#endif /* __XFS_INODE_BUF_H__ */ diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c new file mode 100644 index 00000000000..b031e8d0d92 --- /dev/null +++ b/fs/xfs/xfs_inode_fork.c @@ -0,0 +1,1906 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <linux/log2.h> + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_inum.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_attr_sf.h" +#include "xfs_dinode.h" + +kmem_zone_t *xfs_ifork_zone; + +STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); +STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); +STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); + +#ifdef DEBUG +/* + * Make sure that the extents in the given memory buffer + * are valid. + */ +void +xfs_validate_extents( + xfs_ifork_t *ifp, + int nrecs, + xfs_exntfmt_t fmt) +{ + xfs_bmbt_irec_t irec; + xfs_bmbt_rec_host_t rec; + int i; + + for (i = 0; i < nrecs; i++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + rec.l0 = get_unaligned(&ep->l0); + rec.l1 = get_unaligned(&ep->l1); + xfs_bmbt_get_all(&rec, &irec); + if (fmt == XFS_EXTFMT_NOSTATE) + ASSERT(irec.br_state == XFS_EXT_NORM); + } +} +#else /* DEBUG */ +#define xfs_validate_extents(ifp, nrecs, fmt) +#endif /* DEBUG */ + + +/* + * Move inode type and inode format specific information from the + * on-disk inode to the in-core inode. For fifos, devs, and sockets + * this means set if_rdev to the proper value. For files, directories, + * and symlinks this means to bring in the in-line data or extent + * pointers. For a file in B-tree format, only the root is immediately + * brought in-core. The rest will be in-lined in if_extents when it + * is first referenced (see xfs_iread_extents()). + */ +int +xfs_iformat_fork( + xfs_inode_t *ip, + xfs_dinode_t *dip) +{ + xfs_attr_shortform_t *atp; + int size; + int error = 0; + xfs_fsize_t di_size; + + if (unlikely(be32_to_cpu(dip->di_nextents) + + be16_to_cpu(dip->di_anextents) > + be64_to_cpu(dip->di_nblocks))) { + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", + (unsigned long long)ip->i_ino, + (int)(be32_to_cpu(dip->di_nextents) + + be16_to_cpu(dip->di_anextents)), + (unsigned long long) + be64_to_cpu(dip->di_nblocks)); + XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { + xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.", + (unsigned long long)ip->i_ino, + dip->di_forkoff); + XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && + !ip->i_mount->m_rtdev_targp)) { + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, has realtime flag set.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) { + XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + ip->i_d.di_size = 0; + ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); + break; + + case S_IFREG: + case S_IFLNK: + case S_IFDIR: + switch (dip->di_format) { + case XFS_DINODE_FMT_LOCAL: + /* + * no local regular files yet + */ + if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (local format for regular file).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(4)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + di_size = be64_to_cpu(dip->di_size); + if (unlikely(di_size < 0 || + di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad size %Ld for local inode).", + (unsigned long long) ip->i_ino, + (long long) di_size); + XFS_CORRUPTION_ERROR("xfs_iformat(5)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + size = (int)di_size; + error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); + break; + default: + XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + break; + + default: + XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + if (error) { + return error; + } + if (!XFS_DFORK_Q(dip)) + return 0; + + ASSERT(ip->i_afp == NULL); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); + + switch (dip->di_aformat) { + case XFS_DINODE_FMT_LOCAL: + atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); + size = be16_to_cpu(atp->hdr.totsize); + + if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad attr fork size %Ld).", + (unsigned long long) ip->i_ino, + (long long) size); + XFS_CORRUPTION_ERROR("xfs_iformat(8)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); + break; + default: + error = XFS_ERROR(EFSCORRUPTED); + break; + } + if (error) { + kmem_zone_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + xfs_idestroy_fork(ip, XFS_DATA_FORK); + } + return error; +} + +/* + * The file is in-lined in the on-disk inode. + * If it fits into if_inline_data, then copy + * it there, otherwise allocate a buffer for it + * and copy the data there. Either way, set + * if_data to point at the data. + * If we allocate a buffer for the data, make + * sure that its size is a multiple of 4 and + * record the real size in i_real_bytes. + */ +STATIC int +xfs_iformat_local( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork, + int size) +{ + xfs_ifork_t *ifp; + int real_size; + + /* + * If the size is unreasonable, then something + * is wrong and we just bail out rather than crash in + * kmem_alloc() or memcpy() below. + */ + if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad size %d for local fork, size = %d).", + (unsigned long long) ip->i_ino, size, + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); + XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + ifp = XFS_IFORK_PTR(ip, whichfork); + real_size = 0; + if (size == 0) + ifp->if_u1.if_data = NULL; + else if (size <= sizeof(ifp->if_u2.if_inline_data)) + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + else { + real_size = roundup(size, 4); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); + } + ifp->if_bytes = size; + ifp->if_real_bytes = real_size; + if (size) + memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); + ifp->if_flags &= ~XFS_IFEXTENTS; + ifp->if_flags |= XFS_IFINLINE; + return 0; +} + +/* + * The file consists of a set of extents all + * of which fit into the on-disk inode. + * If there are few enough extents to fit into + * the if_inline_ext, then copy them there. + * Otherwise allocate a buffer for them and copy + * them into it. Either way, set if_extents + * to point at the extents. + */ +STATIC int +xfs_iformat_extents( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork) +{ + xfs_bmbt_rec_t *dp; + xfs_ifork_t *ifp; + int nex; + int size; + int i; + + ifp = XFS_IFORK_PTR(ip, whichfork); + nex = XFS_DFORK_NEXTENTS(dip, whichfork); + size = nex * (uint)sizeof(xfs_bmbt_rec_t); + + /* + * If the number of extents is unreasonable, then something + * is wrong and we just bail out rather than crash in + * kmem_alloc() or memcpy() below. + */ + if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", + (unsigned long long) ip->i_ino, nex); + XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + ifp->if_real_bytes = 0; + if (nex == 0) + ifp->if_u1.if_extents = NULL; + else if (nex <= XFS_INLINE_EXTS) + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + else + xfs_iext_add(ifp, 0, nex); + + ifp->if_bytes = size; + if (size) { + dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); + xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); + for (i = 0; i < nex; i++, dp++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + ep->l0 = get_unaligned_be64(&dp->l0); + ep->l1 = get_unaligned_be64(&dp->l1); + } + XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); + if (whichfork != XFS_DATA_FORK || + XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) + if (unlikely(xfs_check_nostate_extents( + ifp, 0, nex))) { + XFS_ERROR_REPORT("xfs_iformat_extents(2)", + XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + } + ifp->if_flags |= XFS_IFEXTENTS; + return 0; +} + +/* + * The file has too many extents to fit into + * the inode, so they are in B-tree format. + * Allocate a buffer for the root of the B-tree + * and copy the root into it. The i_extents + * field will remain NULL until all of the + * extents are read in (when they are needed). + */ +STATIC int +xfs_iformat_btree( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_bmdr_block_t *dfp; + xfs_ifork_t *ifp; + /* REFERENCED */ + int nrecs; + int size; + + ifp = XFS_IFORK_PTR(ip, whichfork); + dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); + size = XFS_BMAP_BROOT_SPACE(mp, dfp); + nrecs = be16_to_cpu(dfp->bb_numrecs); + + /* + * blow out if -- fork has less extents than can fit in + * fork (fork shouldn't be a btree format), root btree + * block has more records than can fit into the fork, + * or the number of extents is greater than the number of + * blocks. + */ + if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= + XFS_IFORK_MAXEXT(ip, whichfork) || + XFS_BMDR_SPACE_CALC(nrecs) > + XFS_DFORK_SIZE(dip, mp, whichfork) || + XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { + xfs_warn(mp, "corrupt inode %Lu (btree).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, + mp, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + ifp->if_broot_bytes = size; + ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); + ASSERT(ifp->if_broot != NULL); + /* + * Copy and convert from the on-disk structure + * to the in-memory structure. + */ + xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), + ifp->if_broot, size); + ifp->if_flags &= ~XFS_IFEXTENTS; + ifp->if_flags |= XFS_IFBROOT; + + return 0; +} + +/* + * Read in extents from a btree-format inode. + * Allocate and fill in if_extents. Real work is done in xfs_bmap.c. + */ +int +xfs_iread_extents( + xfs_trans_t *tp, + xfs_inode_t *ip, + int whichfork) +{ + int error; + xfs_ifork_t *ifp; + xfs_extnum_t nextents; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { + XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + nextents = XFS_IFORK_NEXTENTS(ip, whichfork); + ifp = XFS_IFORK_PTR(ip, whichfork); + + /* + * We know that the size is valid (it's checked in iformat_btree) + */ + ifp->if_bytes = ifp->if_real_bytes = 0; + ifp->if_flags |= XFS_IFEXTENTS; + xfs_iext_add(ifp, 0, nextents); + error = xfs_bmap_read_extents(tp, ip, whichfork); + if (error) { + xfs_iext_destroy(ifp); + ifp->if_flags &= ~XFS_IFEXTENTS; + return error; + } + xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); + return 0; +} +/* + * Reallocate the space for if_broot based on the number of records + * being added or deleted as indicated in rec_diff. Move the records + * and pointers in if_broot to fit the new size. When shrinking this + * will eliminate holes between the records and pointers created by + * the caller. When growing this will create holes to be filled in + * by the caller. + * + * The caller must not request to add more records than would fit in + * the on-disk inode root. If the if_broot is currently NULL, then + * if we are adding records, one will be allocated. The caller must also + * not request that the number of records go below zero, although + * it can go to zero. + * + * ip -- the inode whose if_broot area is changing + * ext_diff -- the change in the number of records, positive or negative, + * requested for the if_broot array. + */ +void +xfs_iroot_realloc( + xfs_inode_t *ip, + int rec_diff, + int whichfork) +{ + struct xfs_mount *mp = ip->i_mount; + int cur_max; + xfs_ifork_t *ifp; + struct xfs_btree_block *new_broot; + int new_max; + size_t new_size; + char *np; + char *op; + + /* + * Handle the degenerate case quietly. + */ + if (rec_diff == 0) { + return; + } + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (rec_diff > 0) { + /* + * If there wasn't any memory allocated before, just + * allocate it now and get out. + */ + if (ifp->if_broot_bytes == 0) { + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); + ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); + ifp->if_broot_bytes = (int)new_size; + return; + } + + /* + * If there is already an existing if_broot, then we need + * to realloc() it and shift the pointers to their new + * location. The records don't change location because + * they are kept butted up against the btree block header. + */ + cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); + new_max = cur_max + rec_diff; + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); + ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, + XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max), + KM_SLEEP | KM_NOFS); + op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + ifp->if_broot_bytes); + np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + (int)new_size); + ifp->if_broot_bytes = (int)new_size; + ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + XFS_IFORK_SIZE(ip, whichfork)); + memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); + return; + } + + /* + * rec_diff is less than 0. In this case, we are shrinking the + * if_broot buffer. It must already exist. If we go to zero + * records, just get rid of the root and clear the status bit. + */ + ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); + cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); + new_max = cur_max + rec_diff; + ASSERT(new_max >= 0); + if (new_max > 0) + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); + else + new_size = 0; + if (new_size > 0) { + new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); + /* + * First copy over the btree block header. + */ + memcpy(new_broot, ifp->if_broot, + XFS_BMBT_BLOCK_LEN(ip->i_mount)); + } else { + new_broot = NULL; + ifp->if_flags &= ~XFS_IFBROOT; + } + + /* + * Only copy the records and pointers if there are any. + */ + if (new_max > 0) { + /* + * First copy the records. + */ + op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); + np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); + memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); + + /* + * Then copy the pointers. + */ + op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + ifp->if_broot_bytes); + np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, + (int)new_size); + memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); + } + kmem_free(ifp->if_broot); + ifp->if_broot = new_broot; + ifp->if_broot_bytes = (int)new_size; + if (ifp->if_broot) + ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + XFS_IFORK_SIZE(ip, whichfork)); + return; +} + + +/* + * This is called when the amount of space needed for if_data + * is increased or decreased. The change in size is indicated by + * the number of bytes that need to be added or deleted in the + * byte_diff parameter. + * + * If the amount of space needed has decreased below the size of the + * inline buffer, then switch to using the inline buffer. Otherwise, + * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer + * to what is needed. + * + * ip -- the inode whose if_data area is changing + * byte_diff -- the change in the number of bytes, positive or negative, + * requested for the if_data array. + */ +void +xfs_idata_realloc( + xfs_inode_t *ip, + int byte_diff, + int whichfork) +{ + xfs_ifork_t *ifp; + int new_size; + int real_size; + + if (byte_diff == 0) { + return; + } + + ifp = XFS_IFORK_PTR(ip, whichfork); + new_size = (int)ifp->if_bytes + byte_diff; + ASSERT(new_size >= 0); + + if (new_size == 0) { + if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + kmem_free(ifp->if_u1.if_data); + } + ifp->if_u1.if_data = NULL; + real_size = 0; + } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { + /* + * If the valid extents/data can fit in if_inline_ext/data, + * copy them from the malloc'd vector and free it. + */ + if (ifp->if_u1.if_data == NULL) { + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + ASSERT(ifp->if_real_bytes != 0); + memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, + new_size); + kmem_free(ifp->if_u1.if_data); + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + } + real_size = 0; + } else { + /* + * Stuck with malloc/realloc. + * For inline data, the underlying buffer must be + * a multiple of 4 bytes in size so that it can be + * logged and stay on word boundaries. We enforce + * that here. + */ + real_size = roundup(new_size, 4); + if (ifp->if_u1.if_data == NULL) { + ASSERT(ifp->if_real_bytes == 0); + ifp->if_u1.if_data = kmem_alloc(real_size, + KM_SLEEP | KM_NOFS); + } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + /* + * Only do the realloc if the underlying size + * is really changing. + */ + if (ifp->if_real_bytes != real_size) { + ifp->if_u1.if_data = + kmem_realloc(ifp->if_u1.if_data, + real_size, + ifp->if_real_bytes, + KM_SLEEP | KM_NOFS); + } + } else { + ASSERT(ifp->if_real_bytes == 0); + ifp->if_u1.if_data = kmem_alloc(real_size, + KM_SLEEP | KM_NOFS); + memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, + ifp->if_bytes); + } + } + ifp->if_real_bytes = real_size; + ifp->if_bytes = new_size; + ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); +} + +void +xfs_idestroy_fork( + xfs_inode_t *ip, + int whichfork) +{ + xfs_ifork_t *ifp; + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (ifp->if_broot != NULL) { + kmem_free(ifp->if_broot); + ifp->if_broot = NULL; + } + + /* + * If the format is local, then we can't have an extents + * array so just look for an inline data array. If we're + * not local then we may or may not have an extents list, + * so check and free it up if we do. + */ + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && + (ifp->if_u1.if_data != NULL)) { + ASSERT(ifp->if_real_bytes != 0); + kmem_free(ifp->if_u1.if_data); + ifp->if_u1.if_data = NULL; + ifp->if_real_bytes = 0; + } + } else if ((ifp->if_flags & XFS_IFEXTENTS) && + ((ifp->if_flags & XFS_IFEXTIREC) || + ((ifp->if_u1.if_extents != NULL) && + (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) { + ASSERT(ifp->if_real_bytes != 0); + xfs_iext_destroy(ifp); + } + ASSERT(ifp->if_u1.if_extents == NULL || + ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); + ASSERT(ifp->if_real_bytes == 0); + if (whichfork == XFS_ATTR_FORK) { + kmem_zone_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + } +} + +/* + * Convert in-core extents to on-disk form + * + * For either the data or attr fork in extent format, we need to endian convert + * the in-core extent as we place them into the on-disk inode. + * + * In the case of the data fork, the in-core and on-disk fork sizes can be + * different due to delayed allocation extents. We only copy on-disk extents + * here, so callers must always use the physical fork size to determine the + * size of the buffer passed to this routine. We will return the size actually + * used. + */ +int +xfs_iextents_copy( + xfs_inode_t *ip, + xfs_bmbt_rec_t *dp, + int whichfork) +{ + int copied; + int i; + xfs_ifork_t *ifp; + int nrecs; + xfs_fsblock_t start_block; + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(ifp->if_bytes > 0); + + nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); + ASSERT(nrecs > 0); + + /* + * There are some delayed allocation extents in the + * inode, so copy the extents one at a time and skip + * the delayed ones. There must be at least one + * non-delayed extent. + */ + copied = 0; + for (i = 0; i < nrecs; i++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + start_block = xfs_bmbt_get_startblock(ep); + if (isnullstartblock(start_block)) { + /* + * It's a delayed allocation extent, so skip it. + */ + continue; + } + + /* Translate to on disk format */ + put_unaligned_be64(ep->l0, &dp->l0); + put_unaligned_be64(ep->l1, &dp->l1); + dp++; + copied++; + } + ASSERT(copied != 0); + xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); + + return (copied * (uint)sizeof(xfs_bmbt_rec_t)); +} + +/* + * Each of the following cases stores data into the same region + * of the on-disk inode, so only one of them can be valid at + * any given time. While it is possible to have conflicting formats + * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is + * in EXTENTS format, this can only happen when the fork has + * changed formats after being modified but before being flushed. + * In these cases, the format always takes precedence, because the + * format indicates the current state of the fork. + */ +void +xfs_iflush_fork( + xfs_inode_t *ip, + xfs_dinode_t *dip, + xfs_inode_log_item_t *iip, + int whichfork) +{ + char *cp; + xfs_ifork_t *ifp; + xfs_mount_t *mp; + static const short brootflag[2] = + { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; + static const short dataflag[2] = + { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; + static const short extflag[2] = + { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; + + if (!iip) + return; + ifp = XFS_IFORK_PTR(ip, whichfork); + /* + * This can happen if we gave up in iformat in an error path, + * for the attribute fork. + */ + if (!ifp) { + ASSERT(whichfork == XFS_ATTR_FORK); + return; + } + cp = XFS_DFORK_PTR(dip, whichfork); + mp = ip->i_mount; + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_LOCAL: + if ((iip->ili_fields & dataflag[whichfork]) && + (ifp->if_bytes > 0)) { + ASSERT(ifp->if_u1.if_data != NULL); + ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); + memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); + } + break; + + case XFS_DINODE_FMT_EXTENTS: + ASSERT((ifp->if_flags & XFS_IFEXTENTS) || + !(iip->ili_fields & extflag[whichfork])); + if ((iip->ili_fields & extflag[whichfork]) && + (ifp->if_bytes > 0)) { + ASSERT(xfs_iext_get_ext(ifp, 0)); + ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); + (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, + whichfork); + } + break; + + case XFS_DINODE_FMT_BTREE: + if ((iip->ili_fields & brootflag[whichfork]) && + (ifp->if_broot_bytes > 0)) { + ASSERT(ifp->if_broot != NULL); + ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + XFS_IFORK_SIZE(ip, whichfork)); + xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, + (xfs_bmdr_block_t *)cp, + XFS_DFORK_SIZE(dip, mp, whichfork)); + } + break; + + case XFS_DINODE_FMT_DEV: + if (iip->ili_fields & XFS_ILOG_DEV) { + ASSERT(whichfork == XFS_DATA_FORK); + xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); + } + break; + + case XFS_DINODE_FMT_UUID: + if (iip->ili_fields & XFS_ILOG_UUID) { + ASSERT(whichfork == XFS_DATA_FORK); + memcpy(XFS_DFORK_DPTR(dip), + &ip->i_df.if_u2.if_uuid, + sizeof(uuid_t)); + } + break; + + default: + ASSERT(0); + break; + } +} + +/* + * Return a pointer to the extent record at file index idx. + */ +xfs_bmbt_rec_host_t * +xfs_iext_get_ext( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx) /* index of target extent */ +{ + ASSERT(idx >= 0); + ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + + if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { + return ifp->if_u1.if_ext_irec->er_extbuf; + } else if (ifp->if_flags & XFS_IFEXTIREC) { + xfs_ext_irec_t *erp; /* irec pointer */ + int erp_idx = 0; /* irec index */ + xfs_extnum_t page_idx = idx; /* ext index in target list */ + + erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); + return &erp->er_extbuf[page_idx]; + } else if (ifp->if_bytes) { + return &ifp->if_u1.if_extents[idx]; + } else { + return NULL; + } +} + +/* + * Insert new item(s) into the extent records for incore inode + * fork 'ifp'. 'count' new items are inserted at index 'idx'. + */ +void +xfs_iext_insert( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* starting index of new items */ + xfs_extnum_t count, /* number of inserted items */ + xfs_bmbt_irec_t *new, /* items to insert */ + int state) /* type of extent conversion */ +{ + xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_extnum_t i; /* extent record index */ + + trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); + + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + xfs_iext_add(ifp, idx, count); + for (i = idx; i < idx + count; i++, new++) + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new); +} + +/* + * This is called when the amount of space required for incore file + * extents needs to be increased. The ext_diff parameter stores the + * number of new extents being added and the idx parameter contains + * the extent index where the new extents will be added. If the new + * extents are being appended, then we just need to (re)allocate and + * initialize the space. Otherwise, if the new extents are being + * inserted into the middle of the existing entries, a bit more work + * is required to make room for the new extents to be inserted. The + * caller is responsible for filling in the new extent entries upon + * return. + */ +void +xfs_iext_add( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin adding exts */ + int ext_diff) /* number of extents to add */ +{ + int byte_diff; /* new bytes being added */ + int new_size; /* size of extents after adding */ + xfs_extnum_t nextents; /* number of extents in file */ + + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT((idx >= 0) && (idx <= nextents)); + byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); + new_size = ifp->if_bytes + byte_diff; + /* + * If the new number of extents (nextents + ext_diff) + * fits inside the inode, then continue to use the inline + * extent buffer. + */ + if (nextents + ext_diff <= XFS_INLINE_EXTS) { + if (idx < nextents) { + memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], + &ifp->if_u2.if_inline_ext[idx], + (nextents - idx) * sizeof(xfs_bmbt_rec_t)); + memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); + } + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + ifp->if_real_bytes = 0; + } + /* + * Otherwise use a linear (direct) extent list. + * If the extents are currently inside the inode, + * xfs_iext_realloc_direct will switch us from + * inline to direct extent allocation mode. + */ + else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { + xfs_iext_realloc_direct(ifp, new_size); + if (idx < nextents) { + memmove(&ifp->if_u1.if_extents[idx + ext_diff], + &ifp->if_u1.if_extents[idx], + (nextents - idx) * sizeof(xfs_bmbt_rec_t)); + memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); + } + } + /* Indirection array */ + else { + xfs_ext_irec_t *erp; + int erp_idx = 0; + int page_idx = idx; + + ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); + if (ifp->if_flags & XFS_IFEXTIREC) { + erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); + } else { + xfs_iext_irec_init(ifp); + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + erp = ifp->if_u1.if_ext_irec; + } + /* Extents fit in target extent page */ + if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { + if (page_idx < erp->er_extcount) { + memmove(&erp->er_extbuf[page_idx + ext_diff], + &erp->er_extbuf[page_idx], + (erp->er_extcount - page_idx) * + sizeof(xfs_bmbt_rec_t)); + memset(&erp->er_extbuf[page_idx], 0, byte_diff); + } + erp->er_extcount += ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); + } + /* Insert a new extent page */ + else if (erp) { + xfs_iext_add_indirect_multi(ifp, + erp_idx, page_idx, ext_diff); + } + /* + * If extent(s) are being appended to the last page in + * the indirection array and the new extent(s) don't fit + * in the page, then erp is NULL and erp_idx is set to + * the next index needed in the indirection array. + */ + else { + uint count = ext_diff; + + while (count) { + erp = xfs_iext_irec_new(ifp, erp_idx); + erp->er_extcount = min(count, XFS_LINEAR_EXTS); + count -= erp->er_extcount; + if (count) + erp_idx++; + } + } + } + ifp->if_bytes = new_size; +} + +/* + * This is called when incore extents are being added to the indirection + * array and the new extents do not fit in the target extent list. The + * erp_idx parameter contains the irec index for the target extent list + * in the indirection array, and the idx parameter contains the extent + * index within the list. The number of extents being added is stored + * in the count parameter. + * + * |-------| |-------| + * | | | | idx - number of extents before idx + * | idx | | count | + * | | | | count - number of extents being inserted at idx + * |-------| |-------| + * | count | | nex2 | nex2 - number of extents after idx + count + * |-------| |-------| + */ +void +xfs_iext_add_indirect_multi( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx, /* target extent irec index */ + xfs_extnum_t idx, /* index within target list */ + int count) /* new extents being added */ +{ + int byte_diff; /* new bytes being added */ + xfs_ext_irec_t *erp; /* pointer to irec entry */ + xfs_extnum_t ext_diff; /* number of extents to add */ + xfs_extnum_t ext_cnt; /* new extents still needed */ + xfs_extnum_t nex2; /* extents after idx + count */ + xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */ + int nlists; /* number of irec's (lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + nex2 = erp->er_extcount - idx; + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + + /* + * Save second part of target extent list + * (all extents past */ + if (nex2) { + byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); + nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); + memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); + erp->er_extcount -= nex2; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); + memset(&erp->er_extbuf[idx], 0, byte_diff); + } + + /* + * Add the new extents to the end of the target + * list, then allocate new irec record(s) and + * extent buffer(s) as needed to store the rest + * of the new extents. + */ + ext_cnt = count; + ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount); + if (ext_diff) { + erp->er_extcount += ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); + ext_cnt -= ext_diff; + } + while (ext_cnt) { + erp_idx++; + erp = xfs_iext_irec_new(ifp, erp_idx); + ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS); + erp->er_extcount = ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); + ext_cnt -= ext_diff; + } + + /* Add nex2 extents back to indirection array */ + if (nex2) { + xfs_extnum_t ext_avail; + int i; + + byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); + ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; + i = 0; + /* + * If nex2 extents fit in the current page, append + * nex2_ep after the new extents. + */ + if (nex2 <= ext_avail) { + i = erp->er_extcount; + } + /* + * Otherwise, check if space is available in the + * next page. + */ + else if ((erp_idx < nlists - 1) && + (nex2 <= (ext_avail = XFS_LINEAR_EXTS - + ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) { + erp_idx++; + erp++; + /* Create a hole for nex2 extents */ + memmove(&erp->er_extbuf[nex2], erp->er_extbuf, + erp->er_extcount * sizeof(xfs_bmbt_rec_t)); + } + /* + * Final choice, create a new extent page for + * nex2 extents. + */ + else { + erp_idx++; + erp = xfs_iext_irec_new(ifp, erp_idx); + } + memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); + kmem_free(nex2_ep); + erp->er_extcount += nex2; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); + } +} + +/* + * This is called when the amount of space required for incore file + * extents needs to be decreased. The ext_diff parameter stores the + * number of extents to be removed and the idx parameter contains + * the extent index where the extents will be removed from. + * + * If the amount of space needed has decreased below the linear + * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous + * extent array. Otherwise, use kmem_realloc() to adjust the + * size to what is needed. + */ +void +xfs_iext_remove( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index to begin removing exts */ + int ext_diff, /* number of extents to remove */ + int state) /* type of extent conversion */ +{ + xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_extnum_t nextents; /* number of extents in file */ + int new_size; /* size of extents after removal */ + + trace_xfs_iext_remove(ip, idx, state, _RET_IP_); + + ASSERT(ext_diff > 0); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); + + if (new_size == 0) { + xfs_iext_destroy(ifp); + } else if (ifp->if_flags & XFS_IFEXTIREC) { + xfs_iext_remove_indirect(ifp, idx, ext_diff); + } else if (ifp->if_real_bytes) { + xfs_iext_remove_direct(ifp, idx, ext_diff); + } else { + xfs_iext_remove_inline(ifp, idx, ext_diff); + } + ifp->if_bytes = new_size; +} + +/* + * This removes ext_diff extents from the inline buffer, beginning + * at extent index idx. + */ +void +xfs_iext_remove_inline( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin removing exts */ + int ext_diff) /* number of extents to remove */ +{ + int nextents; /* number of extents in file */ + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); + ASSERT(idx < XFS_INLINE_EXTS); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(((nextents - ext_diff) > 0) && + (nextents - ext_diff) < XFS_INLINE_EXTS); + + if (idx + ext_diff < nextents) { + memmove(&ifp->if_u2.if_inline_ext[idx], + &ifp->if_u2.if_inline_ext[idx + ext_diff], + (nextents - (idx + ext_diff)) * + sizeof(xfs_bmbt_rec_t)); + memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], + 0, ext_diff * sizeof(xfs_bmbt_rec_t)); + } else { + memset(&ifp->if_u2.if_inline_ext[idx], 0, + ext_diff * sizeof(xfs_bmbt_rec_t)); + } +} + +/* + * This removes ext_diff extents from a linear (direct) extent list, + * beginning at extent index idx. If the extents are being removed + * from the end of the list (ie. truncate) then we just need to re- + * allocate the list to remove the extra space. Otherwise, if the + * extents are being removed from the middle of the existing extent + * entries, then we first need to move the extent records beginning + * at idx + ext_diff up in the list to overwrite the records being + * removed, then remove the extra space via kmem_realloc. + */ +void +xfs_iext_remove_direct( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin removing exts */ + int ext_diff) /* number of extents to remove */ +{ + xfs_extnum_t nextents; /* number of extents in file */ + int new_size; /* size of extents after removal */ + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); + new_size = ifp->if_bytes - + (ext_diff * sizeof(xfs_bmbt_rec_t)); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + + if (new_size == 0) { + xfs_iext_destroy(ifp); + return; + } + /* Move extents up in the list (if needed) */ + if (idx + ext_diff < nextents) { + memmove(&ifp->if_u1.if_extents[idx], + &ifp->if_u1.if_extents[idx + ext_diff], + (nextents - (idx + ext_diff)) * + sizeof(xfs_bmbt_rec_t)); + } + memset(&ifp->if_u1.if_extents[nextents - ext_diff], + 0, ext_diff * sizeof(xfs_bmbt_rec_t)); + /* + * Reallocate the direct extent list. If the extents + * will fit inside the inode then xfs_iext_realloc_direct + * will switch from direct to inline extent allocation + * mode for us. + */ + xfs_iext_realloc_direct(ifp, new_size); + ifp->if_bytes = new_size; +} + +/* + * This is called when incore extents are being removed from the + * indirection array and the extents being removed span multiple extent + * buffers. The idx parameter contains the file extent index where we + * want to begin removing extents, and the count parameter contains + * how many extents need to be removed. + * + * |-------| |-------| + * | nex1 | | | nex1 - number of extents before idx + * |-------| | count | + * | | | | count - number of extents being removed at idx + * | count | |-------| + * | | | nex2 | nex2 - number of extents after idx + count + * |-------| |-------| + */ +void +xfs_iext_remove_indirect( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin removing extents */ + int count) /* number of extents to remove */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + int erp_idx = 0; /* indirection array index */ + xfs_extnum_t ext_cnt; /* extents left to remove */ + xfs_extnum_t ext_diff; /* extents to remove in current list */ + xfs_extnum_t nex1; /* number of extents before idx */ + xfs_extnum_t nex2; /* extents after idx + count */ + int page_idx = idx; /* index in target extent list */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); + ASSERT(erp != NULL); + nex1 = page_idx; + ext_cnt = count; + while (ext_cnt) { + nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); + ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); + /* + * Check for deletion of entire list; + * xfs_iext_irec_remove() updates extent offsets. + */ + if (ext_diff == erp->er_extcount) { + xfs_iext_irec_remove(ifp, erp_idx); + ext_cnt -= ext_diff; + nex1 = 0; + if (ext_cnt) { + ASSERT(erp_idx < ifp->if_real_bytes / + XFS_IEXT_BUFSZ); + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + nex1 = 0; + continue; + } else { + break; + } + } + /* Move extents up (if needed) */ + if (nex2) { + memmove(&erp->er_extbuf[nex1], + &erp->er_extbuf[nex1 + ext_diff], + nex2 * sizeof(xfs_bmbt_rec_t)); + } + /* Zero out rest of page */ + memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - + ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); + /* Update remaining counters */ + erp->er_extcount -= ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); + ext_cnt -= ext_diff; + nex1 = 0; + erp_idx++; + erp++; + } + ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); + xfs_iext_irec_compact(ifp); +} + +/* + * Create, destroy, or resize a linear (direct) block of extents. + */ +void +xfs_iext_realloc_direct( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* new size of extents after adding */ +{ + int rnew_size; /* real new size of extents */ + + rnew_size = new_size; + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || + ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && + (new_size != ifp->if_real_bytes))); + + /* Free extent records */ + if (new_size == 0) { + xfs_iext_destroy(ifp); + } + /* Resize direct extent list and zero any new bytes */ + else if (ifp->if_real_bytes) { + /* Check if extents will fit inside the inode */ + if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { + xfs_iext_direct_to_inline(ifp, new_size / + (uint)sizeof(xfs_bmbt_rec_t)); + ifp->if_bytes = new_size; + return; + } + if (!is_power_of_2(new_size)){ + rnew_size = roundup_pow_of_two(new_size); + } + if (rnew_size != ifp->if_real_bytes) { + ifp->if_u1.if_extents = + kmem_realloc(ifp->if_u1.if_extents, + rnew_size, + ifp->if_real_bytes, KM_NOFS); + } + if (rnew_size > ifp->if_real_bytes) { + memset(&ifp->if_u1.if_extents[ifp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t)], 0, + rnew_size - ifp->if_real_bytes); + } + } + /* Switch from the inline extent buffer to a direct extent list */ + else { + if (!is_power_of_2(new_size)) { + rnew_size = roundup_pow_of_two(new_size); + } + xfs_iext_inline_to_direct(ifp, rnew_size); + } + ifp->if_real_bytes = rnew_size; + ifp->if_bytes = new_size; +} + +/* + * Switch from linear (direct) extent records to inline buffer. + */ +void +xfs_iext_direct_to_inline( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t nextents) /* number of extents in file */ +{ + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(nextents <= XFS_INLINE_EXTS); + /* + * The inline buffer was zeroed when we switched + * from inline to direct extent allocation mode, + * so we don't need to clear it here. + */ + memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, + nextents * sizeof(xfs_bmbt_rec_t)); + kmem_free(ifp->if_u1.if_extents); + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + ifp->if_real_bytes = 0; +} + +/* + * Switch from inline buffer to linear (direct) extent records. + * new_size should already be rounded up to the next power of 2 + * by the caller (when appropriate), so use new_size as it is. + * However, since new_size may be rounded up, we can't update + * if_bytes here. It is the caller's responsibility to update + * if_bytes upon return. + */ +void +xfs_iext_inline_to_direct( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* number of extents in file */ +{ + ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); + memset(ifp->if_u1.if_extents, 0, new_size); + if (ifp->if_bytes) { + memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, + ifp->if_bytes); + memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * + sizeof(xfs_bmbt_rec_t)); + } + ifp->if_real_bytes = new_size; +} + +/* + * Resize an extent indirection array to new_size bytes. + */ +STATIC void +xfs_iext_realloc_indirect( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* new indirection array size */ +{ + int nlists; /* number of irec's (ex lists) */ + int size; /* current indirection array size */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + size = nlists * sizeof(xfs_ext_irec_t); + ASSERT(ifp->if_real_bytes); + ASSERT((new_size >= 0) && (new_size != size)); + if (new_size == 0) { + xfs_iext_destroy(ifp); + } else { + ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) + kmem_realloc(ifp->if_u1.if_ext_irec, + new_size, size, KM_NOFS); + } +} + +/* + * Switch from indirection array to linear (direct) extent allocations. + */ +STATIC void +xfs_iext_indirect_to_direct( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + xfs_extnum_t nextents; /* number of extents in file */ + int size; /* size of file extents */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(nextents <= XFS_LINEAR_EXTS); + size = nextents * sizeof(xfs_bmbt_rec_t); + + xfs_iext_irec_compact_pages(ifp); + ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); + + ep = ifp->if_u1.if_ext_irec->er_extbuf; + kmem_free(ifp->if_u1.if_ext_irec); + ifp->if_flags &= ~XFS_IFEXTIREC; + ifp->if_u1.if_extents = ep; + ifp->if_bytes = size; + if (nextents < XFS_LINEAR_EXTS) { + xfs_iext_realloc_direct(ifp, size); + } +} + +/* + * Free incore file extents. + */ +void +xfs_iext_destroy( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + if (ifp->if_flags & XFS_IFEXTIREC) { + int erp_idx; + int nlists; + + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { + xfs_iext_irec_remove(ifp, erp_idx); + } + ifp->if_flags &= ~XFS_IFEXTIREC; + } else if (ifp->if_real_bytes) { + kmem_free(ifp->if_u1.if_extents); + } else if (ifp->if_bytes) { + memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * + sizeof(xfs_bmbt_rec_t)); + } + ifp->if_u1.if_extents = NULL; + ifp->if_real_bytes = 0; + ifp->if_bytes = 0; +} + +/* + * Return a pointer to the extent record for file system block bno. + */ +xfs_bmbt_rec_host_t * /* pointer to found extent record */ +xfs_iext_bno_to_ext( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number to search for */ + xfs_extnum_t *idxp) /* index of target extent */ +{ + xfs_bmbt_rec_host_t *base; /* pointer to first extent */ + xfs_filblks_t blockcount = 0; /* number of blocks in extent */ + xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */ + xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ + int high; /* upper boundary in search */ + xfs_extnum_t idx = 0; /* index of target extent */ + int low; /* lower boundary in search */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_fileoff_t startoff = 0; /* start offset of extent */ + + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *idxp = 0; + return NULL; + } + low = 0; + if (ifp->if_flags & XFS_IFEXTIREC) { + /* Find target extent list */ + int erp_idx = 0; + erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx); + base = erp->er_extbuf; + high = erp->er_extcount - 1; + } else { + base = ifp->if_u1.if_extents; + high = nextents - 1; + } + /* Binary search extent records */ + while (low <= high) { + idx = (low + high) >> 1; + ep = base + idx; + startoff = xfs_bmbt_get_startoff(ep); + blockcount = xfs_bmbt_get_blockcount(ep); + if (bno < startoff) { + high = idx - 1; + } else if (bno >= startoff + blockcount) { + low = idx + 1; + } else { + /* Convert back to file-based extent index */ + if (ifp->if_flags & XFS_IFEXTIREC) { + idx += erp->er_extoff; + } + *idxp = idx; + return ep; + } + } + /* Convert back to file-based extent index */ + if (ifp->if_flags & XFS_IFEXTIREC) { + idx += erp->er_extoff; + } + if (bno >= startoff + blockcount) { + if (++idx == nextents) { + ep = NULL; + } else { + ep = xfs_iext_get_ext(ifp, idx); + } + } + *idxp = idx; + return ep; +} + +/* + * Return a pointer to the indirection array entry containing the + * extent record for filesystem block bno. Store the index of the + * target irec in *erp_idxp. + */ +xfs_ext_irec_t * /* pointer to found extent record */ +xfs_iext_bno_to_irec( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number to search for */ + int *erp_idxp) /* irec index of target ext list */ +{ + xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ + xfs_ext_irec_t *erp_next; /* next indirection array entry */ + int erp_idx; /* indirection array index */ + int nlists; /* number of extent irec's (lists) */ + int high; /* binary search upper limit */ + int low; /* binary search lower limit */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + erp_idx = 0; + low = 0; + high = nlists - 1; + while (low <= high) { + erp_idx = (low + high) >> 1; + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL; + if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) { + high = erp_idx - 1; + } else if (erp_next && bno >= + xfs_bmbt_get_startoff(erp_next->er_extbuf)) { + low = erp_idx + 1; + } else { + break; + } + } + *erp_idxp = erp_idx; + return erp; +} + +/* + * Return a pointer to the indirection array entry containing the + * extent record at file extent index *idxp. Store the index of the + * target irec in *erp_idxp and store the page index of the target + * extent record in *idxp. + */ +xfs_ext_irec_t * +xfs_iext_idx_to_irec( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t *idxp, /* extent index (file -> page) */ + int *erp_idxp, /* pointer to target irec */ + int realloc) /* new bytes were just added */ +{ + xfs_ext_irec_t *prev; /* pointer to previous irec */ + xfs_ext_irec_t *erp = NULL; /* pointer to current irec */ + int erp_idx; /* indirection array index */ + int nlists; /* number of irec's (ex lists) */ + int high; /* binary search upper limit */ + int low; /* binary search lower limit */ + xfs_extnum_t page_idx = *idxp; /* extent index in target list */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + ASSERT(page_idx >= 0); + ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); + + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + erp_idx = 0; + low = 0; + high = nlists - 1; + + /* Binary search extent irec's */ + while (low <= high) { + erp_idx = (low + high) >> 1; + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + prev = erp_idx > 0 ? erp - 1 : NULL; + if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff && + realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) { + high = erp_idx - 1; + } else if (page_idx > erp->er_extoff + erp->er_extcount || + (page_idx == erp->er_extoff + erp->er_extcount && + !realloc)) { + low = erp_idx + 1; + } else if (page_idx == erp->er_extoff + erp->er_extcount && + erp->er_extcount == XFS_LINEAR_EXTS) { + ASSERT(realloc); + page_idx = 0; + erp_idx++; + erp = erp_idx < nlists ? erp + 1 : NULL; + break; + } else { + page_idx -= erp->er_extoff; + break; + } + } + *idxp = page_idx; + *erp_idxp = erp_idx; + return(erp); +} + +/* + * Allocate and initialize an indirection array once the space needed + * for incore extents increases above XFS_IEXT_BUFSZ. + */ +void +xfs_iext_irec_init( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + xfs_extnum_t nextents; /* number of extents in file */ + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(nextents <= XFS_LINEAR_EXTS); + + erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); + + if (nextents == 0) { + ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); + } else if (!ifp->if_real_bytes) { + xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); + } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { + xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ); + } + erp->er_extbuf = ifp->if_u1.if_extents; + erp->er_extcount = nextents; + erp->er_extoff = 0; + + ifp->if_flags |= XFS_IFEXTIREC; + ifp->if_real_bytes = XFS_IEXT_BUFSZ; + ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t); + ifp->if_u1.if_ext_irec = erp; + + return; +} + +/* + * Allocate and initialize a new entry in the indirection array. + */ +xfs_ext_irec_t * +xfs_iext_irec_new( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx) /* index for new irec */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + int i; /* loop counter */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + + /* Resize indirection array */ + xfs_iext_realloc_indirect(ifp, ++nlists * + sizeof(xfs_ext_irec_t)); + /* + * Move records down in the array so the + * new page can use erp_idx. + */ + erp = ifp->if_u1.if_ext_irec; + for (i = nlists - 1; i > erp_idx; i--) { + memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t)); + } + ASSERT(i == erp_idx); + + /* Initialize new extent record */ + erp = ifp->if_u1.if_ext_irec; + erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); + ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; + memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); + erp[erp_idx].er_extcount = 0; + erp[erp_idx].er_extoff = erp_idx > 0 ? + erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0; + return (&erp[erp_idx]); +} + +/* + * Remove a record from the indirection array. + */ +void +xfs_iext_irec_remove( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx) /* irec index to remove */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + int i; /* loop counter */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + if (erp->er_extbuf) { + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, + -erp->er_extcount); + kmem_free(erp->er_extbuf); + } + /* Compact extent records */ + erp = ifp->if_u1.if_ext_irec; + for (i = erp_idx; i < nlists - 1; i++) { + memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t)); + } + /* + * Manually free the last extent record from the indirection + * array. A call to xfs_iext_realloc_indirect() with a size + * of zero would result in a call to xfs_iext_destroy() which + * would in turn call this function again, creating a nasty + * infinite loop. + */ + if (--nlists) { + xfs_iext_realloc_indirect(ifp, + nlists * sizeof(xfs_ext_irec_t)); + } else { + kmem_free(ifp->if_u1.if_ext_irec); + } + ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; +} + +/* + * This is called to clean up large amounts of unused memory allocated + * by the indirection array. Before compacting anything though, verify + * that the indirection array is still needed and switch back to the + * linear extent list (or even the inline buffer) if possible. The + * compaction policy is as follows: + * + * Full Compaction: Extents fit into a single page (or inline buffer) + * Partial Compaction: Extents occupy less than 50% of allocated space + * No Compaction: Extents occupy at least 50% of allocated space + */ +void +xfs_iext_irec_compact( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_extnum_t nextents; /* number of extents in file */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + + if (nextents == 0) { + xfs_iext_destroy(ifp); + } else if (nextents <= XFS_INLINE_EXTS) { + xfs_iext_indirect_to_direct(ifp); + xfs_iext_direct_to_inline(ifp, nextents); + } else if (nextents <= XFS_LINEAR_EXTS) { + xfs_iext_indirect_to_direct(ifp); + } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { + xfs_iext_irec_compact_pages(ifp); + } +} + +/* + * Combine extents from neighboring extent pages. + */ +void +xfs_iext_irec_compact_pages( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */ + int erp_idx = 0; /* indirection array index */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + while (erp_idx < nlists - 1) { + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + erp_next = erp + 1; + if (erp_next->er_extcount <= + (XFS_LINEAR_EXTS - erp->er_extcount)) { + memcpy(&erp->er_extbuf[erp->er_extcount], + erp_next->er_extbuf, erp_next->er_extcount * + sizeof(xfs_bmbt_rec_t)); + erp->er_extcount += erp_next->er_extcount; + /* + * Free page before removing extent record + * so er_extoffs don't get modified in + * xfs_iext_irec_remove. + */ + kmem_free(erp_next->er_extbuf); + erp_next->er_extbuf = NULL; + xfs_iext_irec_remove(ifp, erp_idx + 1); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + } else { + erp_idx++; + } + } +} + +/* + * This is called to update the er_extoff field in the indirection + * array when extents have been added or removed from one of the + * extent lists. erp_idx contains the irec index to begin updating + * at and ext_diff contains the number of extents that were added + * or removed. + */ +void +xfs_iext_irec_update_extoffs( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx, /* irec index to update */ + int ext_diff) /* number of new extents */ +{ + int i; /* loop counter */ + int nlists; /* number of irec's (ex lists */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + for (i = erp_idx; i < nlists; i++) { + ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; + } +} diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/xfs_inode_fork.h new file mode 100644 index 00000000000..7d3b1ed6dcb --- /dev/null +++ b/fs/xfs/xfs_inode_fork.h @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INODE_FORK_H__ +#define __XFS_INODE_FORK_H__ + +struct xfs_inode_log_item; +struct xfs_dinode; + +/* + * The following xfs_ext_irec_t struct introduces a second (top) level + * to the in-core extent allocation scheme. These structs are allocated + * in a contiguous block, creating an indirection array where each entry + * (irec) contains a pointer to a buffer of in-core extent records which + * it manages. Each extent buffer is 4k in size, since 4k is the system + * page size on Linux i386 and systems with larger page sizes don't seem + * to gain much, if anything, by using their native page size as the + * extent buffer size. Also, using 4k extent buffers everywhere provides + * a consistent interface for CXFS across different platforms. + * + * There is currently no limit on the number of irec's (extent lists) + * allowed, so heavily fragmented files may require an indirection array + * which spans multiple system pages of memory. The number of extents + * which would require this amount of contiguous memory is very large + * and should not cause problems in the foreseeable future. However, + * if the memory needed for the contiguous array ever becomes a problem, + * it is possible that a third level of indirection may be required. + */ +typedef struct xfs_ext_irec { + xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */ + xfs_extnum_t er_extoff; /* extent offset in file */ + xfs_extnum_t er_extcount; /* number of extents in page/block */ +} xfs_ext_irec_t; + +/* + * File incore extent information, present for each of data & attr forks. + */ +#define XFS_IEXT_BUFSZ 4096 +#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t)) +#define XFS_INLINE_EXTS 2 +#define XFS_INLINE_DATA 32 +typedef struct xfs_ifork { + int if_bytes; /* bytes in if_u1 */ + int if_real_bytes; /* bytes allocated in if_u1 */ + struct xfs_btree_block *if_broot; /* file's incore btree root */ + short if_broot_bytes; /* bytes allocated for root */ + unsigned char if_flags; /* per-fork flags */ + union { + xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ + xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ + char *if_data; /* inline file data */ + } if_u1; + union { + xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS]; + /* very small file extents */ + char if_inline_data[XFS_INLINE_DATA]; + /* very small file data */ + xfs_dev_t if_rdev; /* dev number if special */ + uuid_t if_uuid; /* mount point value */ + } if_u2; +} xfs_ifork_t; + +/* + * Per-fork incore inode flags. + */ +#define XFS_IFINLINE 0x01 /* Inline data is read in */ +#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */ +#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */ +#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */ + +/* + * Fork handling. + */ + +#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0) +#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3)) + +#define XFS_IFORK_PTR(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + &(ip)->i_df : \ + (ip)->i_afp) +#define XFS_IFORK_DSIZE(ip) \ + (XFS_IFORK_Q(ip) ? \ + XFS_IFORK_BOFF(ip) : \ + XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version)) +#define XFS_IFORK_ASIZE(ip) \ + (XFS_IFORK_Q(ip) ? \ + XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \ + XFS_IFORK_BOFF(ip) : \ + 0) +#define XFS_IFORK_SIZE(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_IFORK_DSIZE(ip) : \ + XFS_IFORK_ASIZE(ip)) +#define XFS_IFORK_FORMAT(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + (ip)->i_d.di_format : \ + (ip)->i_d.di_aformat) +#define XFS_IFORK_FMT_SET(ip,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((ip)->i_d.di_format = (n)) : \ + ((ip)->i_d.di_aformat = (n))) +#define XFS_IFORK_NEXTENTS(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + (ip)->i_d.di_nextents : \ + (ip)->i_d.di_anextents) +#define XFS_IFORK_NEXT_SET(ip,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((ip)->i_d.di_nextents = (n)) : \ + ((ip)->i_d.di_anextents = (n))) +#define XFS_IFORK_MAXEXT(ip, w) \ + (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) + +int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); +void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, + struct xfs_inode_log_item *, int); +void xfs_idestroy_fork(struct xfs_inode *, int); +void xfs_idata_realloc(struct xfs_inode *, int, int); +void xfs_iroot_realloc(struct xfs_inode *, int, int); +int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); +int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, + int); + +struct xfs_bmbt_rec_host * + xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t); +void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t, + struct xfs_bmbt_irec *, int); +void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_add_indirect_multi(struct xfs_ifork *, int, + xfs_extnum_t, int); +void xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int); +void xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_realloc_direct(struct xfs_ifork *, int); +void xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t); +void xfs_iext_inline_to_direct(struct xfs_ifork *, int); +void xfs_iext_destroy(struct xfs_ifork *); +struct xfs_bmbt_rec_host * + xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *); +struct xfs_ext_irec * + xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *); +struct xfs_ext_irec * + xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *, + int); +void xfs_iext_irec_init(struct xfs_ifork *); +struct xfs_ext_irec * + xfs_iext_irec_new(struct xfs_ifork *, int); +void xfs_iext_irec_remove(struct xfs_ifork *, int); +void xfs_iext_irec_compact(struct xfs_ifork *); +void xfs_iext_irec_compact_pages(struct xfs_ifork *); +void xfs_iext_irec_compact_full(struct xfs_ifork *); +void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int); + +extern struct kmem_zone *xfs_ifork_zone; + +#endif /* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index d041d47d9d8..a640137b357 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -17,19 +17,20 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_trans_priv.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_trans_priv.h" +#include "xfs_dinode.h" +#include "xfs_log.h" kmem_zone_t *xfs_ili_zone; /* inode log item zone */ @@ -39,179 +40,120 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_inode_log_item, ili_item); } - -/* - * This returns the number of iovecs needed to log the given inode item. - * - * We need one iovec for the inode log format structure, one for the - * inode core, and possibly one for the inode data/extents/b-tree root - * and one for the inode attribute data/extents/b-tree root. - */ -STATIC uint -xfs_inode_item_size( - struct xfs_log_item *lip) +STATIC void +xfs_inode_item_data_fork_size( + struct xfs_inode_log_item *iip, + int *nvecs, + int *nbytes) { - struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - uint nvecs = 2; switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_DEXT) && ip->i_d.di_nextents > 0 && - ip->i_df.if_bytes > 0) - nvecs++; + ip->i_df.if_bytes > 0) { + /* worst case, doesn't subtract delalloc extents */ + *nbytes += XFS_IFORK_DSIZE(ip); + *nvecs += 1; + } break; - case XFS_DINODE_FMT_BTREE: if ((iip->ili_fields & XFS_ILOG_DBROOT) && - ip->i_df.if_broot_bytes > 0) - nvecs++; + ip->i_df.if_broot_bytes > 0) { + *nbytes += ip->i_df.if_broot_bytes; + *nvecs += 1; + } break; - case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_DDATA) && - ip->i_df.if_bytes > 0) - nvecs++; + ip->i_df.if_bytes > 0) { + *nbytes += roundup(ip->i_df.if_bytes, 4); + *nvecs += 1; + } break; case XFS_DINODE_FMT_DEV: case XFS_DINODE_FMT_UUID: break; - default: ASSERT(0); break; } +} - if (!XFS_IFORK_Q(ip)) - return nvecs; - +STATIC void +xfs_inode_item_attr_fork_size( + struct xfs_inode_log_item *iip, + int *nvecs, + int *nbytes) +{ + struct xfs_inode *ip = iip->ili_inode; - /* - * Log any necessary attribute data. - */ switch (ip->i_d.di_aformat) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_AEXT) && ip->i_d.di_anextents > 0 && - ip->i_afp->if_bytes > 0) - nvecs++; + ip->i_afp->if_bytes > 0) { + /* worst case, doesn't subtract unused space */ + *nbytes += XFS_IFORK_ASIZE(ip); + *nvecs += 1; + } break; - case XFS_DINODE_FMT_BTREE: if ((iip->ili_fields & XFS_ILOG_ABROOT) && - ip->i_afp->if_broot_bytes > 0) - nvecs++; + ip->i_afp->if_broot_bytes > 0) { + *nbytes += ip->i_afp->if_broot_bytes; + *nvecs += 1; + } break; - case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_ADATA) && - ip->i_afp->if_bytes > 0) - nvecs++; + ip->i_afp->if_bytes > 0) { + *nbytes += roundup(ip->i_afp->if_bytes, 4); + *nvecs += 1; + } break; - default: ASSERT(0); break; } - - return nvecs; } /* - * xfs_inode_item_format_extents - convert in-core extents to on-disk form - * - * For either the data or attr fork in extent format, we need to endian convert - * the in-core extent as we place them into the on-disk inode. In this case, we - * need to do this conversion before we write the extents into the log. Because - * we don't have the disk inode to write into here, we allocate a buffer and - * format the extents into it via xfs_iextents_copy(). We free the buffer in - * the unlock routine after the copy for the log has been made. + * This returns the number of iovecs needed to log the given inode item. * - * In the case of the data fork, the in-core and on-disk fork sizes can be - * different due to delayed allocation extents. We only log on-disk extents - * here, so always use the physical fork size to determine the size of the - * buffer we need to allocate. + * We need one iovec for the inode log format structure, one for the + * inode core, and possibly one for the inode data/extents/b-tree root + * and one for the inode attribute data/extents/b-tree root. */ STATIC void -xfs_inode_item_format_extents( - struct xfs_inode *ip, - struct xfs_log_iovec *vecp, - int whichfork, - int type) +xfs_inode_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { - xfs_bmbt_rec_t *ext_buffer; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; - ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP); - if (whichfork == XFS_DATA_FORK) - ip->i_itemp->ili_extents_buf = ext_buffer; - else - ip->i_itemp->ili_aextents_buf = ext_buffer; + *nvecs += 2; + *nbytes += sizeof(struct xfs_inode_log_format) + + xfs_icdinode_size(ip->i_d.di_version); - vecp->i_addr = ext_buffer; - vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork); - vecp->i_type = type; + xfs_inode_item_data_fork_size(iip, nvecs, nbytes); + if (XFS_IFORK_Q(ip)) + xfs_inode_item_attr_fork_size(iip, nvecs, nbytes); } -/* - * This is called to fill in the vector of log iovecs for the - * given inode log item. It fills the first item with an inode - * log format structure, the second with the on-disk inode structure, - * and a possible third and/or fourth with the inode data/extents/b-tree - * root and inode attributes data/extents/b-tree root. - */ STATIC void -xfs_inode_item_format( - struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) +xfs_inode_item_format_data_fork( + struct xfs_inode_log_item *iip, + struct xfs_inode_log_format *ilf, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp) { - struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - uint nvecs; size_t data_bytes; - xfs_mount_t *mp; - - vecp->i_addr = &iip->ili_format; - vecp->i_len = sizeof(xfs_inode_log_format_t); - vecp->i_type = XLOG_REG_TYPE_IFORMAT; - vecp++; - nvecs = 1; - - vecp->i_addr = &ip->i_d; - vecp->i_len = sizeof(struct xfs_icdinode); - vecp->i_type = XLOG_REG_TYPE_ICORE; - vecp++; - nvecs++; - - /* - * If this is really an old format inode, then we need to - * log it as such. This means that we have to copy the link - * count from the new field to the old. We don't have to worry - * about the new fields, because nothing trusts them as long as - * the old inode version number is there. If the superblock already - * has a new version number, then we don't bother converting back. - */ - mp = ip->i_mount; - ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); - if (ip->i_d.di_version == 1) { - if (!xfs_sb_version_hasnlink(&mp->m_sb)) { - /* - * Convert it back. - */ - ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); - ip->i_d.di_onlink = ip->i_d.di_nlink; - } else { - /* - * The superblock version has already been bumped, - * so just make the conversion to the new inode - * format permanent. - */ - ip->i_d.di_version = 2; - ip->i_d.di_onlink = 0; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - } - } switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: @@ -222,36 +164,23 @@ xfs_inode_item_format( if ((iip->ili_fields & XFS_ILOG_DEXT) && ip->i_d.di_nextents > 0 && ip->i_df.if_bytes > 0) { + struct xfs_bmbt_rec *p; + ASSERT(ip->i_df.if_u1.if_extents != NULL); ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); - ASSERT(iip->ili_extents_buf == NULL); - -#ifdef XFS_NATIVE_HOST - if (ip->i_d.di_nextents == ip->i_df.if_bytes / - (uint)sizeof(xfs_bmbt_rec_t)) { - /* - * There are no delayed allocation - * extents, so just point to the - * real extents array. - */ - vecp->i_addr = ip->i_df.if_u1.if_extents; - vecp->i_len = ip->i_df.if_bytes; - vecp->i_type = XLOG_REG_TYPE_IEXT; - } else -#endif - { - xfs_inode_item_format_extents(ip, vecp, - XFS_DATA_FORK, XLOG_REG_TYPE_IEXT); - } - ASSERT(vecp->i_len <= ip->i_df.if_bytes); - iip->ili_format.ilf_dsize = vecp->i_len; - vecp++; - nvecs++; + + p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT); + data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK); + xlog_finish_iovec(lv, *vecp, data_bytes); + + ASSERT(data_bytes <= ip->i_df.if_bytes); + + ilf->ilf_dsize = data_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_DEXT; } break; - case XFS_DINODE_FMT_BTREE: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | @@ -260,91 +189,70 @@ xfs_inode_item_format( if ((iip->ili_fields & XFS_ILOG_DBROOT) && ip->i_df.if_broot_bytes > 0) { ASSERT(ip->i_df.if_broot != NULL); - vecp->i_addr = ip->i_df.if_broot; - vecp->i_len = ip->i_df.if_broot_bytes; - vecp->i_type = XLOG_REG_TYPE_IBROOT; - vecp++; - nvecs++; - iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IBROOT, + ip->i_df.if_broot, + ip->i_df.if_broot_bytes); + ilf->ilf_dsize = ip->i_df.if_broot_bytes; + ilf->ilf_size++; } else { ASSERT(!(iip->ili_fields & XFS_ILOG_DBROOT)); -#ifdef XFS_TRANS_DEBUG - if (iip->ili_root_size > 0) { - ASSERT(iip->ili_root_size == - ip->i_df.if_broot_bytes); - ASSERT(memcmp(iip->ili_orig_root, - ip->i_df.if_broot, - iip->ili_root_size) == 0); - } else { - ASSERT(ip->i_df.if_broot_bytes == 0); - } -#endif iip->ili_fields &= ~XFS_ILOG_DBROOT; } break; - case XFS_DINODE_FMT_LOCAL: iip->ili_fields &= ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV | XFS_ILOG_UUID); if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - ASSERT(ip->i_df.if_u1.if_data != NULL); - ASSERT(ip->i_d.di_size > 0); - - vecp->i_addr = ip->i_df.if_u1.if_data; /* * Round i_bytes up to a word boundary. * The underlying memory is guaranteed to * to be there by xfs_idata_realloc(). */ data_bytes = roundup(ip->i_df.if_bytes, 4); - ASSERT((ip->i_df.if_real_bytes == 0) || - (ip->i_df.if_real_bytes == data_bytes)); - vecp->i_len = (int)data_bytes; - vecp->i_type = XLOG_REG_TYPE_ILOCAL; - vecp++; - nvecs++; - iip->ili_format.ilf_dsize = (unsigned)data_bytes; + ASSERT(ip->i_df.if_real_bytes == 0 || + ip->i_df.if_real_bytes == data_bytes); + ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_d.di_size > 0); + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, + ip->i_df.if_u1.if_data, data_bytes); + ilf->ilf_dsize = (unsigned)data_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_DDATA; } break; - case XFS_DINODE_FMT_DEV: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT | XFS_ILOG_UUID); - if (iip->ili_fields & XFS_ILOG_DEV) { - iip->ili_format.ilf_u.ilfu_rdev = - ip->i_df.if_u2.if_rdev; - } + if (iip->ili_fields & XFS_ILOG_DEV) + ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev; break; - case XFS_DINODE_FMT_UUID: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT | XFS_ILOG_DEV); - if (iip->ili_fields & XFS_ILOG_UUID) { - iip->ili_format.ilf_u.ilfu_uuid = - ip->i_df.if_u2.if_uuid; - } + if (iip->ili_fields & XFS_ILOG_UUID) + ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid; break; - default: ASSERT(0); break; } +} - /* - * If there are no attributes associated with the file, then we're done. - */ - if (!XFS_IFORK_Q(ip)) { - iip->ili_fields &= - ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); - goto out; - } +STATIC void +xfs_inode_item_format_attr_fork( + struct xfs_inode_log_item *iip, + struct xfs_inode_log_format *ilf, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp) +{ + struct xfs_inode *ip = iip->ili_inode; + size_t data_bytes; switch (ip->i_d.di_aformat) { case XFS_DINODE_FMT_EXTENTS: @@ -354,30 +262,22 @@ xfs_inode_item_format( if ((iip->ili_fields & XFS_ILOG_AEXT) && ip->i_d.di_anextents > 0 && ip->i_afp->if_bytes > 0) { + struct xfs_bmbt_rec *p; + ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == ip->i_d.di_anextents); ASSERT(ip->i_afp->if_u1.if_extents != NULL); -#ifdef XFS_NATIVE_HOST - /* - * There are not delayed allocation extents - * for attributes, so just point at the array. - */ - vecp->i_addr = ip->i_afp->if_u1.if_extents; - vecp->i_len = ip->i_afp->if_bytes; - vecp->i_type = XLOG_REG_TYPE_IATTR_EXT; -#else - ASSERT(iip->ili_aextents_buf == NULL); - xfs_inode_item_format_extents(ip, vecp, - XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT); -#endif - iip->ili_format.ilf_asize = vecp->i_len; - vecp++; - nvecs++; + + p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT); + data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK); + xlog_finish_iovec(lv, *vecp, data_bytes); + + ilf->ilf_asize = data_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_AEXT; } break; - case XFS_DINODE_FMT_BTREE: iip->ili_fields &= ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); @@ -386,61 +286,89 @@ xfs_inode_item_format( ip->i_afp->if_broot_bytes > 0) { ASSERT(ip->i_afp->if_broot != NULL); - vecp->i_addr = ip->i_afp->if_broot; - vecp->i_len = ip->i_afp->if_broot_bytes; - vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; - vecp++; - nvecs++; - iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT, + ip->i_afp->if_broot, + ip->i_afp->if_broot_bytes); + ilf->ilf_asize = ip->i_afp->if_broot_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ABROOT; } break; - case XFS_DINODE_FMT_LOCAL: iip->ili_fields &= ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_afp->if_bytes > 0) { - ASSERT(ip->i_afp->if_u1.if_data != NULL); - - vecp->i_addr = ip->i_afp->if_u1.if_data; /* * Round i_bytes up to a word boundary. * The underlying memory is guaranteed to * to be there by xfs_idata_realloc(). */ data_bytes = roundup(ip->i_afp->if_bytes, 4); - ASSERT((ip->i_afp->if_real_bytes == 0) || - (ip->i_afp->if_real_bytes == data_bytes)); - vecp->i_len = (int)data_bytes; - vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL; - vecp++; - nvecs++; - iip->ili_format.ilf_asize = (unsigned)data_bytes; + ASSERT(ip->i_afp->if_real_bytes == 0 || + ip->i_afp->if_real_bytes == data_bytes); + ASSERT(ip->i_afp->if_u1.if_data != NULL); + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, + ip->i_afp->if_u1.if_data, + data_bytes); + ilf->ilf_asize = (unsigned)data_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ADATA; } break; - default: ASSERT(0); break; } - -out: - /* - * Now update the log format that goes out to disk from the in-core - * values. We always write the inode core to make the arithmetic - * games in recovery easier, which isn't a big deal as just about any - * transaction would dirty it anyway. - */ - iip->ili_format.ilf_fields = XFS_ILOG_CORE | - (iip->ili_fields & ~XFS_ILOG_TIMESTAMP); - iip->ili_format.ilf_size = nvecs; } +/* + * This is called to fill in the vector of log iovecs for the given inode + * log item. It fills the first item with an inode log format structure, + * the second with the on-disk inode structure, and a possible third and/or + * fourth with the inode data/extents/b-tree root and inode attributes + * data/extents/b-tree root. + */ +STATIC void +xfs_inode_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + struct xfs_inode_log_format *ilf; + struct xfs_log_iovec *vecp = NULL; + + ASSERT(ip->i_d.di_version > 1); + + ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT); + ilf->ilf_type = XFS_LI_INODE; + ilf->ilf_ino = ip->i_ino; + ilf->ilf_blkno = ip->i_imap.im_blkno; + ilf->ilf_len = ip->i_imap.im_len; + ilf->ilf_boffset = ip->i_imap.im_boffset; + ilf->ilf_fields = XFS_ILOG_CORE; + ilf->ilf_size = 2; /* format + core */ + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format)); + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE, + &ip->i_d, + xfs_icdinode_size(ip->i_d.di_version)); + + xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); + if (XFS_IFORK_Q(ip)) { + xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp); + } else { + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); + } + + /* update the format with the exact fields we actually logged */ + ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP); +} /* * This is called to pin the inode associated with the inode log @@ -557,27 +485,6 @@ xfs_inode_item_unlock( ASSERT(ip->i_itemp != NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - /* - * If the inode needed a separate buffer with which to log - * its extents, then free it now. - */ - if (iip->ili_extents_buf != NULL) { - ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS); - ASSERT(ip->i_d.di_nextents > 0); - ASSERT(iip->ili_fields & XFS_ILOG_DEXT); - ASSERT(ip->i_df.if_bytes > 0); - kmem_free(iip->ili_extents_buf); - iip->ili_extents_buf = NULL; - } - if (iip->ili_aextents_buf != NULL) { - ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS); - ASSERT(ip->i_d.di_anextents > 0); - ASSERT(iip->ili_fields & XFS_ILOG_AEXT); - ASSERT(ip->i_afp->if_bytes > 0); - kmem_free(iip->ili_aextents_buf); - iip->ili_aextents_buf = NULL; - } - lock_flags = iip->ili_lock_flags; iip->ili_lock_flags = 0; if (lock_flags) @@ -664,11 +571,6 @@ xfs_inode_item_init( iip->ili_inode = ip; xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, &xfs_inode_item_ops); - iip->ili_format.ilf_type = XFS_LI_INODE; - iip->ili_format.ilf_ino = ip->i_ino; - iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; - iip->ili_format.ilf_len = ip->i_imap.im_len; - iip->ili_format.ilf_boffset = ip->i_imap.im_boffset; } /* @@ -678,11 +580,6 @@ void xfs_inode_item_destroy( xfs_inode_t *ip) { -#ifdef XFS_TRANS_DEBUG - if (ip->i_itemp->ili_root_size != 0) { - kmem_free(ip->i_itemp->ili_orig_root); - } -#endif kmem_zone_free(xfs_ili_zone, ip->i_itemp); } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 376d4d0b263..488d81254e2 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -18,123 +18,13 @@ #ifndef __XFS_INODE_ITEM_H__ #define __XFS_INODE_ITEM_H__ -/* - * This is the structure used to lay out an inode log item in the - * log. The size of the inline data/extents/b-tree root to be logged - * (if any) is indicated in the ilf_dsize field. Changes to this structure - * must be added on to the end. - */ -typedef struct xfs_inode_log_format { - __uint16_t ilf_type; /* inode log item type */ - __uint16_t ilf_size; /* size of this item */ - __uint32_t ilf_fields; /* flags for fields logged */ - __uint16_t ilf_asize; /* size of attr d/ext/root */ - __uint16_t ilf_dsize; /* size of data/ext/root */ - __uint64_t ilf_ino; /* inode number */ - union { - __uint32_t ilfu_rdev; /* rdev value for dev inode*/ - uuid_t ilfu_uuid; /* mount point value */ - } ilf_u; - __int64_t ilf_blkno; /* blkno of inode buffer */ - __int32_t ilf_len; /* len of inode buffer */ - __int32_t ilf_boffset; /* off of inode in buffer */ -} xfs_inode_log_format_t; - -typedef struct xfs_inode_log_format_32 { - __uint16_t ilf_type; /* inode log item type */ - __uint16_t ilf_size; /* size of this item */ - __uint32_t ilf_fields; /* flags for fields logged */ - __uint16_t ilf_asize; /* size of attr d/ext/root */ - __uint16_t ilf_dsize; /* size of data/ext/root */ - __uint64_t ilf_ino; /* inode number */ - union { - __uint32_t ilfu_rdev; /* rdev value for dev inode*/ - uuid_t ilfu_uuid; /* mount point value */ - } ilf_u; - __int64_t ilf_blkno; /* blkno of inode buffer */ - __int32_t ilf_len; /* len of inode buffer */ - __int32_t ilf_boffset; /* off of inode in buffer */ -} __attribute__((packed)) xfs_inode_log_format_32_t; - -typedef struct xfs_inode_log_format_64 { - __uint16_t ilf_type; /* inode log item type */ - __uint16_t ilf_size; /* size of this item */ - __uint32_t ilf_fields; /* flags for fields logged */ - __uint16_t ilf_asize; /* size of attr d/ext/root */ - __uint16_t ilf_dsize; /* size of data/ext/root */ - __uint32_t ilf_pad; /* pad for 64 bit boundary */ - __uint64_t ilf_ino; /* inode number */ - union { - __uint32_t ilfu_rdev; /* rdev value for dev inode*/ - uuid_t ilfu_uuid; /* mount point value */ - } ilf_u; - __int64_t ilf_blkno; /* blkno of inode buffer */ - __int32_t ilf_len; /* len of inode buffer */ - __int32_t ilf_boffset; /* off of inode in buffer */ -} xfs_inode_log_format_64_t; - -/* - * Flags for xfs_trans_log_inode flags field. - */ -#define XFS_ILOG_CORE 0x001 /* log standard inode fields */ -#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */ -#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */ -#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */ -#define XFS_ILOG_DEV 0x010 /* log the dev field */ -#define XFS_ILOG_UUID 0x020 /* log the uuid field */ -#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ -#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ -#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ - - -/* - * The timestamps are dirty, but not necessarily anything else in the inode - * core. Unlike the other fields above this one must never make it to disk - * in the ilf_fields of the inode_log_format, but is purely store in-memory in - * ili_fields in the inode_log_item. - */ -#define XFS_ILOG_TIMESTAMP 0x4000 - -#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ - XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ - XFS_ILOG_UUID | XFS_ILOG_ADATA | \ - XFS_ILOG_AEXT | XFS_ILOG_ABROOT) - -#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ - XFS_ILOG_DBROOT) - -#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ - XFS_ILOG_ABROOT) - -#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \ - XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ - XFS_ILOG_DEV | XFS_ILOG_UUID | \ - XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ - XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP) - -static inline int xfs_ilog_fbroot(int w) -{ - return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); -} - -static inline int xfs_ilog_fext(int w) -{ - return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); -} - -static inline int xfs_ilog_fdata(int w) -{ - return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA); -} - -#ifdef __KERNEL__ +/* kernel only definitions */ struct xfs_buf; struct xfs_bmbt_rec; struct xfs_inode; struct xfs_mount; - typedef struct xfs_inode_log_item { xfs_log_item_t ili_item; /* common portion */ struct xfs_inode *ili_inode; /* inode ptr */ @@ -144,18 +34,8 @@ typedef struct xfs_inode_log_item { unsigned short ili_logged; /* flushed logged data */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ - struct xfs_bmbt_rec *ili_extents_buf; /* array of logged - data exts */ - struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged - attr exts */ -#ifdef XFS_TRANS_DEBUG - int ili_root_size; - char *ili_orig_root; -#endif - xfs_inode_log_format_t ili_format; /* logged structure */ } xfs_inode_log_item_t; - static inline int xfs_inode_clean(xfs_inode_t *ip) { return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL); @@ -169,6 +49,6 @@ extern void xfs_iflush_abort(struct xfs_inode *, bool); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, xfs_inode_log_format_t *); -#endif /* __KERNEL__ */ +extern struct kmem_zone *xfs_ili_zone; #endif /* __XFS_INODE_ITEM_H__ */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index c1c3ef88a26..8bc1bbce745 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -17,32 +17,31 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_ioctl.h" +#include "xfs_alloc.h" #include "xfs_rtalloc.h" #include "xfs_itable.h" #include "xfs_error.h" #include "xfs_attr.h" #include "xfs_bmap.h" -#include "xfs_buf_item.h" -#include "xfs_utils.h" -#include "xfs_dfrag.h" +#include "xfs_bmap_util.h" #include "xfs_fsops.h" -#include "xfs_vnodeops.h" #include "xfs_discard.h" #include "xfs_quota.h" -#include "xfs_inode_item.h" #include "xfs_export.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_symlink.h" +#include "xfs_dinode.h" +#include "xfs_trans.h" #include <linux/capability.h> #include <linux/dcache.h> @@ -71,7 +70,7 @@ xfs_find_handle( int hsize; xfs_handle_t handle; struct inode *inode; - struct fd f = {0}; + struct fd f = {NULL}; struct path path; int error; struct xfs_inode *ip; @@ -80,7 +79,7 @@ xfs_find_handle( f = fdget(hreq->fd); if (!f.file) return -EBADF; - inode = f.file->f_path.dentry->d_inode; + inode = file_inode(f.file); } else { error = user_lpath((const char __user *)hreq->path, &path); if (error) @@ -113,15 +112,11 @@ xfs_find_handle( memset(&handle.ha_fid, 0, sizeof(handle.ha_fid)); hsize = sizeof(xfs_fsid_t); } else { - int lock_mode; - - lock_mode = xfs_ilock_map_shared(ip); handle.ha_fid.fid_len = sizeof(xfs_fid_t) - sizeof(handle.ha_fid.fid_len); handle.ha_fid.fid_pad = 0; handle.ha_fid.fid_gen = ip->i_d.di_gen; handle.ha_fid.fid_ino = ip->i_ino; - xfs_iunlock_map_shared(ip, lock_mode); hsize = XFS_HSIZE(handle); } @@ -168,7 +163,7 @@ xfs_handle_to_dentry( /* * Only allow handle opens under a directory. */ - if (!S_ISDIR(parfilp->f_path.dentry->d_inode->i_mode)) + if (!S_ISDIR(file_inode(parfilp)->i_mode)) return ERR_PTR(-ENOTDIR); if (hlen != sizeof(xfs_handle_t)) @@ -248,7 +243,7 @@ xfs_open_by_handle( goto out_dput; } - fd = get_unused_fd(); + fd = get_unused_fd_flags(0); if (fd < 0) { error = fd; goto out_dput; @@ -276,32 +271,6 @@ xfs_open_by_handle( return error; } -/* - * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's - * unused first argument. - */ -STATIC int -do_readlink( - char __user *buffer, - int buflen, - const char *link) -{ - int len; - - len = PTR_ERR(link); - if (IS_ERR(link)) - goto out; - - len = strlen(link); - if (len > (unsigned) buflen) - len = buflen; - if (copy_to_user(buffer, link, len)) - len = -EFAULT; - out: - return len; -} - - int xfs_readlink_by_handle( struct file *parfilp, @@ -339,7 +308,7 @@ xfs_readlink_by_handle( error = -xfs_readlink(XFS_I(dentry->d_inode), link); if (error) goto out_kfree; - error = do_readlink(hreq->ohandle, olen, link); + error = readlink_copy(hreq->ohandle, olen, link); if (error) goto out_kfree; @@ -350,6 +319,40 @@ xfs_readlink_by_handle( return error; } +int +xfs_set_dmattrs( + xfs_inode_t *ip, + u_int evmask, + u_int16_t state) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_trans_t *tp; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + ip->i_d.di_dmevmask = evmask; + ip->i_d.di_dmstate = state; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp, 0); + + return error; +} + STATIC int xfs_fssetdm_by_handle( struct file *parfilp, @@ -409,7 +412,8 @@ xfs_attrlist_by_handle( return -XFS_ERROR(EPERM); if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) return -XFS_ERROR(EFAULT); - if (al_hreq.buflen > XATTR_LIST_MAX) + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) return -XFS_ERROR(EINVAL); /* @@ -422,7 +426,7 @@ xfs_attrlist_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); - kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL); + kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); if (!kbuf) goto out_dput; @@ -435,9 +439,9 @@ xfs_attrlist_by_handle( if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen)) error = -EFAULT; - out_kfree: - kfree(kbuf); - out_dput: +out_kfree: + kmem_free(kbuf); +out_dput: dput(dentry); return error; } @@ -455,12 +459,9 @@ xfs_attrmulti_attr_get( if (*len > XATTR_SIZE_MAX) return EINVAL; - kbuf = kmem_zalloc(*len, KM_SLEEP | KM_MAYFAIL); - if (!kbuf) { - kbuf = kmem_zalloc_large(*len); - if (!kbuf) - return ENOMEM; - } + kbuf = kmem_zalloc_large(*len, KM_SLEEP); + if (!kbuf) + return ENOMEM; error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); if (error) @@ -469,11 +470,8 @@ xfs_attrmulti_attr_get( if (copy_to_user(ubuf, kbuf, *len)) error = EFAULT; - out_kfree: - if (is_vmalloc_addr(kbuf)) - kmem_free_large(kbuf); - else - kmem_free(kbuf); +out_kfree: + kmem_free(kbuf); return error; } @@ -545,10 +543,11 @@ xfs_attrmulti_by_handle( ops = memdup_user(am_hreq.ops, size); if (IS_ERR(ops)) { - error = PTR_ERR(ops); + error = -PTR_ERR(ops); goto out_dput; } + error = ENOMEM; attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); if (!attr_name) goto out_kfree_ops; @@ -558,7 +557,7 @@ xfs_attrmulti_by_handle( ops[i].am_error = strncpy_from_user((char *)attr_name, ops[i].am_attrname, MAXNAMELEN); if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) - error = -ERANGE; + error = ERANGE; if (ops[i].am_error < 0) break; @@ -613,7 +612,11 @@ xfs_ioc_space( unsigned int cmd, xfs_flock64_t *bf) { - int attr_flags = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct iattr iattr; + bool setprealloc = false; + bool clrprealloc = false; int error; /* @@ -633,19 +636,128 @@ xfs_ioc_space( if (!S_ISREG(inode->i_mode)) return -XFS_ERROR(EINVAL); - if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) - attr_flags |= XFS_ATTR_NONBLOCK; + error = mnt_want_write_file(filp); + if (error) + return error; - if (filp->f_flags & O_DSYNC) - attr_flags |= XFS_ATTR_SYNC; + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + switch (bf->l_whence) { + case 0: /*SEEK_SET*/ + break; + case 1: /*SEEK_CUR*/ + bf->l_start += filp->f_pos; + break; + case 2: /*SEEK_END*/ + bf->l_start += XFS_ISIZE(ip); + break; + default: + error = XFS_ERROR(EINVAL); + goto out_unlock; + } - if (ioflags & IO_INVIS) - attr_flags |= XFS_ATTR_DMI; + /* + * length of <= 0 for resv/unresv/zero is invalid. length for + * alloc/free is ignored completely and we have no idea what userspace + * might have set it to, so set it to zero to allow range + * checks to pass. + */ + switch (cmd) { + case XFS_IOC_ZERO_RANGE: + case XFS_IOC_RESVSP: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP: + case XFS_IOC_UNRESVSP64: + if (bf->l_len <= 0) { + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + break; + default: + bf->l_len = 0; + break; + } + + if (bf->l_start < 0 || + bf->l_start > mp->m_super->s_maxbytes || + bf->l_start + bf->l_len < 0 || + bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) { + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + + switch (cmd) { + case XFS_IOC_ZERO_RANGE: + error = xfs_zero_file_space(ip, bf->l_start, bf->l_len); + if (!error) + setprealloc = true; + break; + case XFS_IOC_RESVSP: + case XFS_IOC_RESVSP64: + error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len, + XFS_BMAPI_PREALLOC); + if (!error) + setprealloc = true; + break; + case XFS_IOC_UNRESVSP: + case XFS_IOC_UNRESVSP64: + error = xfs_free_file_space(ip, bf->l_start, bf->l_len); + break; + case XFS_IOC_ALLOCSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP: + case XFS_IOC_FREESP64: + if (bf->l_start > XFS_ISIZE(ip)) { + error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), + bf->l_start - XFS_ISIZE(ip), 0); + if (error) + goto out_unlock; + } + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = bf->l_start; + + error = xfs_setattr_size(ip, &iattr); + if (!error) + clrprealloc = true; + break; + default: + ASSERT(0); + error = XFS_ERROR(EINVAL); + } - error = mnt_want_write_file(filp); if (error) - return error; - error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags); + goto out_unlock; + + tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_unlock; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + if (!(ioflags & IO_INVIS)) { + ip->i_d.di_mode &= ~S_ISUID; + if (ip->i_d.di_mode & S_IXGRP) + ip->i_d.di_mode &= ~S_ISGID; + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + } + + if (setprealloc) + ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; + else if (clrprealloc) + ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + if (filp->f_flags & O_DSYNC) + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); mnt_drop_write_file(filp); return -error; } @@ -922,7 +1034,7 @@ xfs_ioctl_setattr( struct xfs_trans *tp; unsigned int lock_flags = 0; struct xfs_dquot *udqp = NULL; - struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; struct xfs_dquot *olddquot = NULL; int code; @@ -951,7 +1063,7 @@ xfs_ioctl_setattr( if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) { code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, ip->i_d.di_gid, fa->fsx_projid, - XFS_QMOPT_PQUOTA, &udqp, &gdqp); + XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); if (code) return code; } @@ -961,7 +1073,7 @@ xfs_ioctl_setattr( * first do an error checking pass. */ tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + code = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (code) goto error_return; @@ -975,21 +1087,28 @@ xfs_ioctl_setattr( * to the file owner ID, except in cases where the * CAP_FSETID capability is applicable. */ - if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) { + if (!inode_owner_or_capable(VFS_I(ip))) { code = XFS_ERROR(EPERM); goto error_return; } /* * Do a quota reservation only if projid is actually going to change. + * Only allow changing of projid from init_user_ns since it is a + * non user namespace aware identifier. */ if (mask & FSX_PROJID) { + if (current_user_ns() != &init_user_ns) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && xfs_get_projid(ip) != fa->fsx_projid) { ASSERT(tp); - code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, - capable(CAP_FOWNER) ? + code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, + pdqp, capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); if (code) /* out of quota */ goto error_return; @@ -1097,7 +1216,7 @@ xfs_ioctl_setattr( * cleared upon successful return from chown() */ if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && - !capable(CAP_FSETID)) + !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID)) ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); /* @@ -1107,17 +1226,10 @@ xfs_ioctl_setattr( if (xfs_get_projid(ip) != fa->fsx_projid) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { olddquot = xfs_qm_vop_chown(tp, ip, - &ip->i_gdquot, gdqp); + &ip->i_pdquot, pdqp); } + ASSERT(ip->i_d.di_version > 1); xfs_set_projid(ip, fa->fsx_projid); - - /* - * We may have to rev the inode as well as - * the superblock version number since projids didn't - * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. - */ - if (ip->i_d.di_version == 1) - xfs_bump_ino_vers2(tp, ip); } } @@ -1154,13 +1266,13 @@ xfs_ioctl_setattr( */ xfs_qm_dqrele(olddquot); xfs_qm_dqrele(udqp); - xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); return code; error_return: xfs_qm_dqrele(udqp); - xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); xfs_trans_cancel(tp, 0); if (lock_flags) xfs_iunlock(ip, lock_flags); @@ -1322,6 +1434,75 @@ xfs_ioc_getbmapx( return 0; } +int +xfs_ioc_swapext( + xfs_swapext_t *sxp) +{ + xfs_inode_t *ip, *tip; + struct fd f, tmp; + int error = 0; + + /* Pull information for the target fd */ + f = fdget((int)sxp->sx_fdtarget); + if (!f.file) { + error = XFS_ERROR(EINVAL); + goto out; + } + + if (!(f.file->f_mode & FMODE_WRITE) || + !(f.file->f_mode & FMODE_READ) || + (f.file->f_flags & O_APPEND)) { + error = XFS_ERROR(EBADF); + goto out_put_file; + } + + tmp = fdget((int)sxp->sx_fdtmp); + if (!tmp.file) { + error = XFS_ERROR(EINVAL); + goto out_put_file; + } + + if (!(tmp.file->f_mode & FMODE_WRITE) || + !(tmp.file->f_mode & FMODE_READ) || + (tmp.file->f_flags & O_APPEND)) { + error = XFS_ERROR(EBADF); + goto out_put_tmp_file; + } + + if (IS_SWAPFILE(file_inode(f.file)) || + IS_SWAPFILE(file_inode(tmp.file))) { + error = XFS_ERROR(EINVAL); + goto out_put_tmp_file; + } + + ip = XFS_I(file_inode(f.file)); + tip = XFS_I(file_inode(tmp.file)); + + if (ip->i_mount != tip->i_mount) { + error = XFS_ERROR(EINVAL); + goto out_put_tmp_file; + } + + if (ip->i_ino == tip->i_ino) { + error = XFS_ERROR(EINVAL); + goto out_put_tmp_file; + } + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + error = XFS_ERROR(EIO); + goto out_put_tmp_file; + } + + error = xfs_swap_extents(ip, tip, sxp); + + out_put_tmp_file: + fdput(tmp); + out_put_file: + fdput(f); + out: + return error; +} + /* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. @@ -1334,7 +1515,7 @@ xfs_file_ioctl( unsigned int cmd, unsigned long p) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; @@ -1370,7 +1551,7 @@ xfs_file_ioctl( XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - da.d_mem = da.d_miniosz = 1 << target->bt_sshift; + da.d_mem = da.d_miniosz = target->bt_logical_sectorsize; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); if (copy_to_user(arg, &da, sizeof(da))) @@ -1466,7 +1647,7 @@ xfs_file_ioctl( error = mnt_want_write_file(filp); if (error) return error; - error = xfs_swapext(&sxp); + error = xfs_ioc_swapext(&sxp); mnt_drop_write_file(filp); return -error; } @@ -1604,23 +1785,23 @@ xfs_file_ioctl( return -error; case XFS_IOC_FREE_EOFBLOCKS: { - struct xfs_eofblocks eofb; + struct xfs_fs_eofblocks eofb; + struct xfs_eofblocks keofb; - if (copy_from_user(&eofb, arg, sizeof(eofb))) - return -XFS_ERROR(EFAULT); + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; - if (eofb.eof_version != XFS_EOFBLOCKS_VERSION) - return -XFS_ERROR(EINVAL); + if (mp->m_flags & XFS_MOUNT_RDONLY) + return -XFS_ERROR(EROFS); - if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID) - return -XFS_ERROR(EINVAL); + if (copy_from_user(&eofb, arg, sizeof(eofb))) + return -XFS_ERROR(EFAULT); - if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) || - memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64))) - return -XFS_ERROR(EINVAL); + error = xfs_fs_eofblocks_from_user(&eofb, &keofb); + if (error) + return -error; - error = xfs_icache_free_eofblocks(mp, &eofb); - return -error; + return -xfs_icache_free_eofblocks(mp, &keofb); } default: diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index d56173b34a2..77c02c7900b 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -27,6 +27,10 @@ xfs_ioc_space( unsigned int cmd, xfs_flock64_t *bf); +int +xfs_ioc_swapext( + xfs_swapext_t *sxp); + extern int xfs_find_handle( unsigned int cmd, @@ -82,4 +86,10 @@ xfs_file_compat_ioctl( unsigned int cmd, unsigned long arg); +extern int +xfs_set_dmattrs( + struct xfs_inode *ip, + u_int evmask, + u_int16_t state); + #endif diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 1244274a567..944d5baa710 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -22,19 +22,16 @@ #include <asm/uaccess.h> #include "xfs.h" #include "xfs_fs.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" #include "xfs_vnode.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_itable.h" #include "xfs_error.h" -#include "xfs_dfrag.h" -#include "xfs_vnodeops.h" #include "xfs_fsops.h" #include "xfs_alloc.h" #include "xfs_rtalloc.h" @@ -359,7 +356,8 @@ xfs_compat_attrlist_by_handle( if (copy_from_user(&al_hreq, arg, sizeof(compat_xfs_fsop_attrlist_handlereq_t))) return -XFS_ERROR(EFAULT); - if (al_hreq.buflen > XATTR_LIST_MAX) + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) return -XFS_ERROR(EINVAL); /* @@ -373,7 +371,7 @@ xfs_compat_attrlist_by_handle( return PTR_ERR(dentry); error = -ENOMEM; - kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); + kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); if (!kbuf) goto out_dput; @@ -386,9 +384,9 @@ xfs_compat_attrlist_by_handle( if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen)) error = -EFAULT; - out_kfree: - kfree(kbuf); - out_dput: +out_kfree: + kmem_free(kbuf); +out_dput: dput(dentry); return error; } @@ -426,10 +424,11 @@ xfs_compat_attrmulti_by_handle( ops = memdup_user(compat_ptr(am_hreq.ops), size); if (IS_ERR(ops)) { - error = PTR_ERR(ops); + error = -PTR_ERR(ops); goto out_dput; } + error = ENOMEM; attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); if (!attr_name) goto out_kfree_ops; @@ -440,7 +439,7 @@ xfs_compat_attrmulti_by_handle( compat_ptr(ops[i].am_attrname), MAXNAMELEN); if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) - error = -ERANGE; + error = ERANGE; if (ops[i].am_error < 0) break; @@ -530,7 +529,7 @@ xfs_file_compat_ioctl( unsigned cmd, unsigned long p) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; @@ -638,7 +637,7 @@ xfs_file_compat_ioctl( error = mnt_want_write_file(filp); if (error) return error; - error = xfs_swapext(&sxp); + error = xfs_ioc_swapext(&sxp); mnt_drop_write_file(filp); return -error; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index add06b4e9a6..6d3ec2b6ee2 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -17,31 +17,28 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_inode_item.h" #include "xfs_btree.h" +#include "xfs_bmap_btree.h" #include "xfs_bmap.h" -#include "xfs_rtalloc.h" +#include "xfs_bmap_util.h" #include "xfs_error.h" -#include "xfs_itable.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" +#include "xfs_trans.h" #include "xfs_trans_space.h" -#include "xfs_utils.h" #include "xfs_iomap.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_quota.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" +#include "xfs_dinode.h" #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ @@ -107,7 +104,7 @@ xfs_alert_fsblock_zero( xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, "Access to block zero in inode %llu " "start_block: %llx start_off: %llx " - "blkcnt: %llx extent-state: %x\n", + "blkcnt: %llx extent-state: %x", (unsigned long long)ip->i_ino, (unsigned long long)imap->br_startblock, (unsigned long long)imap->br_startoff, @@ -131,7 +128,6 @@ xfs_iomap_write_direct( xfs_fsblock_t firstfsb; xfs_extlen_t extsz, temp; int nimaps; - int bmapi_flag; int quota_flag; int rt; xfs_trans_t *tp; @@ -185,10 +181,8 @@ xfs_iomap_write_direct( * Allocate and setup the transaction */ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - error = xfs_trans_reserve(tp, resblks, - XFS_WRITE_LOG_RES(mp), resrtextents, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + resblks, resrtextents); /* * Check for running out of space, note: need lock to return */ @@ -205,18 +199,15 @@ xfs_iomap_write_direct( xfs_trans_ijoin(tp, ip, 0); - bmapi_flag = 0; - if (offset < XFS_ISIZE(ip) || extsz) - bmapi_flag |= XFS_BMAPI_PREALLOC; - /* * From this point onwards we overwrite the imap pointer that the * caller gave to us. */ xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; - error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag, - &firstfsb, 0, imap, &nimaps, &free_list); + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_PREALLOC, &firstfsb, 0, + imap, &nimaps, &free_list); if (error) goto out_bmap_cancel; @@ -282,6 +273,15 @@ xfs_iomap_eof_want_preallocate( return 0; /* + * If the file is smaller than the minimum prealloc and we are using + * dynamic preallocation, don't do any preallocation at all as it is + * likely this is the only write to the file that is going to be done. + */ + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) && + XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks)) + return 0; + + /* * If there are any real blocks past eof, then don't * do any speculative allocation. */ @@ -311,6 +311,121 @@ xfs_iomap_eof_want_preallocate( } /* + * Determine the initial size of the preallocation. We are beyond the current + * EOF here, but we need to take into account whether this is a sparse write or + * an extending write when determining the preallocation size. Hence we need to + * look up the extent that ends at the current write offset and use the result + * to determine the preallocation size. + * + * If the extent is a hole, then preallocation is essentially disabled. + * Otherwise we take the size of the preceeding data extent as the basis for the + * preallocation size. If the size of the extent is greater than half the + * maximum extent length, then use the current offset as the basis. This ensures + * that for large files the preallocation size always extends to MAXEXTLEN + * rather than falling short due to things like stripe unit/width alignment of + * real extents. + */ +STATIC xfs_fsblock_t +xfs_iomap_eof_prealloc_initial_size( + struct xfs_mount *mp, + struct xfs_inode *ip, + xfs_off_t offset, + xfs_bmbt_irec_t *imap, + int nimaps) +{ + xfs_fileoff_t start_fsb; + int imaps = 1; + int error; + + ASSERT(nimaps >= imaps); + + /* if we are using a specific prealloc size, return now */ + if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) + return 0; + + /* If the file is small, then use the minimum prealloc */ + if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign)) + return 0; + + /* + * As we write multiple pages, the offset will always align to the + * start of a page and hence point to a hole at EOF. i.e. if the size is + * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096) + * will return FSB 1. Hence if there are blocks in the file, we want to + * point to the block prior to the EOF block and not the hole that maps + * directly at @offset. + */ + start_fsb = XFS_B_TO_FSB(mp, offset); + if (start_fsb) + start_fsb--; + error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE); + if (error) + return 0; + + ASSERT(imaps == 1); + if (imap[0].br_startblock == HOLESTARTBLOCK) + return 0; + if (imap[0].br_blockcount <= (MAXEXTLEN >> 1)) + return imap[0].br_blockcount << 1; + return XFS_B_TO_FSB(mp, offset); +} + +STATIC bool +xfs_quota_need_throttle( + struct xfs_inode *ip, + int type, + xfs_fsblock_t alloc_blocks) +{ + struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + + if (!dq || !xfs_this_quota_on(ip->i_mount, type)) + return false; + + /* no hi watermark, no throttle */ + if (!dq->q_prealloc_hi_wmark) + return false; + + /* under the lo watermark, no throttle */ + if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark) + return false; + + return true; +} + +STATIC void +xfs_quota_calc_throttle( + struct xfs_inode *ip, + int type, + xfs_fsblock_t *qblocks, + int *qshift) +{ + int64_t freesp; + int shift = 0; + struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + + /* over hi wmark, squash the prealloc completely */ + if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) { + *qblocks = 0; + return; + } + + freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount; + if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) { + shift = 2; + if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT]) + shift += 2; + if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT]) + shift += 2; + } + + /* only overwrite the throttle values if we are more aggressive */ + if ((freesp >> shift) < (*qblocks >> *qshift)) { + *qblocks = freesp; + *qshift = shift; + } +} + +/* * If we don't have a user specified preallocation size, dynamically increase * the preallocation size as the size of the file grows. Cap the maximum size * at a single extent or less if the filesystem is near full. The closer the @@ -319,43 +434,95 @@ xfs_iomap_eof_want_preallocate( STATIC xfs_fsblock_t xfs_iomap_prealloc_size( struct xfs_mount *mp, - struct xfs_inode *ip) + struct xfs_inode *ip, + xfs_off_t offset, + struct xfs_bmbt_irec *imap, + int nimaps) { xfs_fsblock_t alloc_blocks = 0; + int shift = 0; + int64_t freesp; + xfs_fsblock_t qblocks; + int qshift = 0; - if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { - int shift = 0; - int64_t freesp; + alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset, + imap, nimaps); + if (!alloc_blocks) + goto check_writeio; + qblocks = alloc_blocks; - /* - * rounddown_pow_of_two() returns an undefined result - * if we pass in alloc_blocks = 0. Hence the "+ 1" to - * ensure we always pass in a non-zero value. - */ - alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1; - alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, - rounddown_pow_of_two(alloc_blocks)); - - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); - freesp = mp->m_sb.sb_fdblocks; - if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { - shift = 2; - if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) - shift++; - } - if (shift) - alloc_blocks >>= shift; + /* + * MAXEXTLEN is not a power of two value but we round the prealloc down + * to the nearest power of two value after throttling. To prevent the + * round down from unconditionally reducing the maximum supported prealloc + * size, we round up first, apply appropriate throttling, round down and + * cap the value to MAXEXTLEN. + */ + alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), + alloc_blocks); + + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + freesp = mp->m_sb.sb_fdblocks; + if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { + shift = 2; + if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) + shift++; } + /* + * Check each quota to cap the prealloc size and provide a shift + * value to throttle with. + */ + if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift); + if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift); + if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift); + + /* + * The final prealloc size is set to the minimum of free space available + * in each of the quotas and the overall filesystem. + * + * The shift throttle value is set to the maximum value as determined by + * the global low free space values and per-quota low free space values. + */ + alloc_blocks = MIN(alloc_blocks, qblocks); + shift = MAX(shift, qshift); + + if (shift) + alloc_blocks >>= shift; + /* + * rounddown_pow_of_two() returns an undefined result if we pass in + * alloc_blocks = 0. + */ + if (alloc_blocks) + alloc_blocks = rounddown_pow_of_two(alloc_blocks); + if (alloc_blocks > MAXEXTLEN) + alloc_blocks = MAXEXTLEN; + + /* + * If we are still trying to allocate more space than is + * available, squash the prealloc hard. This can happen if we + * have a large file on a small filesystem and the above + * lowspace thresholds are smaller than MAXEXTLEN. + */ + while (alloc_blocks && alloc_blocks >= freesp) + alloc_blocks >>= 4; + +check_writeio: if (alloc_blocks < mp->m_writeio_blocks) alloc_blocks = mp->m_writeio_blocks; + trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift, + mp->m_writeio_blocks); + return alloc_blocks; } @@ -390,7 +557,6 @@ xfs_iomap_write_delay( extsz = xfs_get_extsz_hint(ip); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, imap, XFS_WRITE_IMAPS, &prealloc); if (error) @@ -398,7 +564,10 @@ xfs_iomap_write_delay( retry: if (prealloc) { - xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip); + xfs_fsblock_t alloc_blocks; + + alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap, + XFS_WRITE_IMAPS); aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); ioalign = XFS_B_TO_FSBT(mp, aligned_offset); @@ -476,7 +645,6 @@ int xfs_iomap_write_allocate( xfs_inode_t *ip, xfs_off_t offset, - size_t count, xfs_bmbt_irec_t *imap) { xfs_mount_t *mp = ip->i_mount; @@ -518,10 +686,8 @@ xfs_iomap_write_allocate( tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); tp->t_flags |= XFS_TRANS_RESERVE; nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - error = xfs_trans_reserve(tp, nres, - XFS_WRITE_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + nres, 0); if (error) { xfs_trans_cancel(tp, 0); return XFS_ERROR(error); @@ -564,7 +730,7 @@ xfs_iomap_write_allocate( */ nimaps = 1; end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); - error = xfs_bmap_last_offset(NULL, ip, &last_block, + error = xfs_bmap_last_offset(ip, &last_block, XFS_DATA_FORK); if (error) goto trans_cancel; @@ -583,8 +749,7 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. */ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, - XFS_BMAPI_STACK_SWITCH, + count_fsb, 0, &first_block, 1, imap, &nimaps, &free_list); if (error) @@ -684,10 +849,8 @@ xfs_iomap_write_unwritten( sb_start_intwrite(mp->m_super); tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT; - error = xfs_trans_reserve(tp, resblks, - XFS_WRITE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + resblks, 0); if (error) { xfs_trans_cancel(tp, 0); return XFS_ERROR(error); diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 80615760959..411fbb8919e 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -21,12 +21,12 @@ struct xfs_inode; struct xfs_bmbt_irec; -extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, +int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); -extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, +int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *); -extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, +int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, struct xfs_bmbt_irec *); -extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); +int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index d82efaa2ac7..205613a0606 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -17,28 +17,29 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_acl.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" #include "xfs_inode.h" #include "xfs_bmap.h" -#include "xfs_rtalloc.h" +#include "xfs_bmap_util.h" +#include "xfs_acl.h" +#include "xfs_quota.h" #include "xfs_error.h" -#include "xfs_itable.h" #include "xfs_attr.h" -#include "xfs_buf_item.h" -#include "xfs_utils.h" -#include "xfs_vnodeops.h" -#include "xfs_inode_item.h" +#include "xfs_trans.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_symlink.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_priv.h" +#include "xfs_dinode.h" +#include "xfs_trans_space.h" #include <linux/capability.h> #include <linux/xattr.h> @@ -48,6 +49,18 @@ #include <linux/fiemap.h> #include <linux/slab.h> +/* + * Directories have different lock order w.r.t. mmap_sem compared to regular + * files. This is due to readdir potentially triggering page faults on a user + * buffer inside filldir(), and this happens with the ilock on the directory + * held. For regular files, the lock order is the other way around - the + * mmap_sem is taken during the page fault, and then we lock the ilock to do + * block mapping. Hence we need a different class for the directory ilock so + * that lockdep can tell them apart. + */ +static struct lock_class_key xfs_nondir_ilock_class; +static struct lock_class_key xfs_dir_ilock_class; + static int xfs_initxattrs( struct inode *inode, @@ -59,8 +72,8 @@ xfs_initxattrs( int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - error = xfs_attr_set(ip, xattr->name, xattr->value, - xattr->value_len, ATTR_SECURE); + error = -xfs_attr_set(ip, xattr->name, xattr->value, + xattr->value_len, ATTR_SECURE); if (error < 0) break; } @@ -80,17 +93,19 @@ xfs_init_security( struct inode *dir, const struct qstr *qstr) { - return security_inode_init_security(inode, dir, qstr, - &xfs_initxattrs, NULL); + return -security_inode_init_security(inode, dir, qstr, + &xfs_initxattrs, NULL); } static void xfs_dentry_to_name( struct xfs_name *namep, - struct dentry *dentry) + struct dentry *dentry, + int mode) { namep->name = dentry->d_name.name; namep->len = dentry->d_name.len; + namep->type = xfs_mode_to_ftype[(mode & S_IFMT) >> S_SHIFT]; } STATIC void @@ -106,22 +121,22 @@ xfs_cleanup_inode( * xfs_init_security we must back out. * ENOSPC can hit here, among other things. */ - xfs_dentry_to_name(&teardown, dentry); + xfs_dentry_to_name(&teardown, dentry, 0); xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); - iput(inode); } STATIC int -xfs_vn_mknod( +xfs_generic_create( struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev) + dev_t rdev, + bool tmpfile) /* unnamed file */ { struct inode *inode; struct xfs_inode *ip = NULL; - struct posix_acl *default_acl = NULL; + struct posix_acl *default_acl, *acl; struct xfs_name name; int error; @@ -137,17 +152,16 @@ xfs_vn_mknod( rdev = 0; } - if (IS_POSIXACL(dir)) { - default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(default_acl)) - return PTR_ERR(default_acl); + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) + return error; - if (!default_acl) - mode &= ~current_umask(); + if (!tmpfile) { + xfs_dentry_to_name(&name, dentry, mode); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + } else { + error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); } - - xfs_dentry_to_name(&name, dentry); - error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); if (unlikely(error)) goto out_free_acl; @@ -157,22 +171,46 @@ xfs_vn_mknod( if (unlikely(error)) goto out_cleanup_inode; +#ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { - error = -xfs_inherit_acl(inode, default_acl); - default_acl = NULL; - if (unlikely(error)) + error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (error) + goto out_cleanup_inode; + } + if (acl) { + error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); + if (error) goto out_cleanup_inode; } +#endif + if (tmpfile) + d_tmpfile(dentry, inode); + else + d_instantiate(dentry, inode); - d_instantiate(dentry, inode); + out_free_acl: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); return -error; out_cleanup_inode: - xfs_cleanup_inode(dir, inode, dentry); - out_free_acl: - posix_acl_release(default_acl); - return -error; + if (!tmpfile) + xfs_cleanup_inode(dir, inode, dentry); + iput(inode); + goto out_free_acl; +} + +STATIC int +xfs_vn_mknod( + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev) +{ + return xfs_generic_create(dir, dentry, mode, rdev, false); } STATIC int @@ -207,7 +245,7 @@ xfs_vn_lookup( if (dentry->d_name.len >= MAXNAMELEN) return ERR_PTR(-ENAMETOOLONG); - xfs_dentry_to_name(&name, dentry); + xfs_dentry_to_name(&name, dentry, 0); error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); if (unlikely(error)) { if (unlikely(error != ENOENT)) @@ -234,7 +272,7 @@ xfs_vn_ci_lookup( if (dentry->d_name.len >= MAXNAMELEN) return ERR_PTR(-ENAMETOOLONG); - xfs_dentry_to_name(&xname, dentry); + xfs_dentry_to_name(&xname, dentry, 0); error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name); if (unlikely(error)) { if (unlikely(error != ENOENT)) @@ -269,7 +307,7 @@ xfs_vn_link( struct xfs_name name; int error; - xfs_dentry_to_name(&name, dentry); + xfs_dentry_to_name(&name, dentry, inode->i_mode); error = xfs_link(XFS_I(dir), XFS_I(inode), &name); if (unlikely(error)) @@ -288,7 +326,7 @@ xfs_vn_unlink( struct xfs_name name; int error; - xfs_dentry_to_name(&name, dentry); + xfs_dentry_to_name(&name, dentry, 0); error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode)); if (error) @@ -318,7 +356,7 @@ xfs_vn_symlink( mode = S_IFLNK | (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); - xfs_dentry_to_name(&name, dentry); + xfs_dentry_to_name(&name, dentry, mode); error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip); if (unlikely(error)) @@ -335,6 +373,7 @@ xfs_vn_symlink( out_cleanup_inode: xfs_cleanup_inode(dir, inode, dentry); + iput(inode); out: return -error; } @@ -350,12 +389,12 @@ xfs_vn_rename( struct xfs_name oname; struct xfs_name nname; - xfs_dentry_to_name(&oname, odentry); - xfs_dentry_to_name(&nname, ndentry); + xfs_dentry_to_name(&oname, odentry, 0); + xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode); return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), XFS_I(ndir), &nname, new_inode ? - XFS_I(new_inode) : NULL); + XFS_I(new_inode) : NULL); } /* @@ -389,18 +428,6 @@ xfs_vn_follow_link( return NULL; } -STATIC void -xfs_vn_put_link( - struct dentry *dentry, - struct nameidata *nd, - void *p) -{ - char *s = nd_get_link(nd); - - if (!IS_ERR(s)) - kfree(s); -} - STATIC int xfs_vn_getattr( struct vfsmount *mnt, @@ -420,8 +447,8 @@ xfs_vn_getattr( stat->dev = inode->i_sb->s_dev; stat->mode = ip->i_d.di_mode; stat->nlink = ip->i_d.di_nlink; - stat->uid = ip->i_d.di_uid; - stat->gid = ip->i_d.di_gid; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; stat->ino = ip->i_ino; stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; @@ -455,6 +482,49 @@ xfs_vn_getattr( return 0; } +static void +xfs_setattr_mode( + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct inode *inode = VFS_I(ip); + umode_t mode = iattr->ia_mode; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + ip->i_d.di_mode &= S_IFMT; + ip->i_d.di_mode |= mode & ~S_IFMT; + + inode->i_mode &= S_IFMT; + inode->i_mode |= mode & ~S_IFMT; +} + +static void +xfs_setattr_time( + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct inode *inode = VFS_I(ip); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (iattr->ia_valid & ATTR_ATIME) { + inode->i_atime = iattr->ia_atime; + ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; + ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; + } + if (iattr->ia_valid & ATTR_CTIME) { + inode->i_ctime = iattr->ia_ctime; + ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; + } + if (iattr->ia_valid & ATTR_MTIME) { + inode->i_mtime = iattr->ia_mtime; + ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; + } +} + int xfs_setattr_nonsize( struct xfs_inode *ip, @@ -466,22 +536,25 @@ xfs_setattr_nonsize( int mask = iattr->ia_valid; xfs_trans_t *tp; int error; - uid_t uid = 0, iuid = 0; - gid_t gid = 0, igid = 0; + kuid_t uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID; + kgid_t gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID; struct xfs_dquot *udqp = NULL, *gdqp = NULL; struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL; trace_xfs_setattr(ip); - if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); + /* If acls are being inherited, we already have this checked */ + if (!(flags & XFS_ATTR_NOACL)) { + if (mp->m_flags & XFS_MOUNT_RDONLY) + return XFS_ERROR(EROFS); - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); - error = -inode_change_ok(inode, iattr); - if (error) - return XFS_ERROR(error); + error = -inode_change_ok(inode, iattr); + if (error) + return XFS_ERROR(error); + } ASSERT((mask & ATTR_SIZE) == 0); @@ -500,13 +573,13 @@ xfs_setattr_nonsize( uid = iattr->ia_uid; qflags |= XFS_QMOPT_UQUOTA; } else { - uid = ip->i_d.di_uid; + uid = inode->i_uid; } if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { gid = iattr->ia_gid; qflags |= XFS_QMOPT_GQUOTA; } else { - gid = ip->i_d.di_gid; + gid = inode->i_gid; } /* @@ -516,14 +589,16 @@ xfs_setattr_nonsize( */ ASSERT(udqp == NULL); ASSERT(gdqp == NULL); - error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip), - qflags, &udqp, &gdqp); + error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid), + xfs_kgid_to_gid(gid), + xfs_get_projid(ip), + qflags, &udqp, &gdqp, NULL); if (error) return error; } tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) goto out_dqrele; @@ -539,8 +614,8 @@ xfs_setattr_nonsize( * while we didn't have the inode locked, inode's dquot(s) * would have changed also. */ - iuid = ip->i_d.di_uid; - igid = ip->i_d.di_gid; + iuid = inode->i_uid; + igid = inode->i_gid; gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; @@ -549,11 +624,11 @@ xfs_setattr_nonsize( * going to change. */ if (XFS_IS_QUOTA_RUNNING(mp) && - ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || - (XFS_IS_GQUOTA_ON(mp) && igid != gid))) { + ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) || + (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) { ASSERT(tp); error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, - capable(CAP_FOWNER) ? + NULL, capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); if (error) /* out of quota */ goto out_trans_cancel; @@ -580,63 +655,34 @@ xfs_setattr_nonsize( * Change the ownerships and register quota modifications * in the transaction. */ - if (iuid != uid) { + if (!uid_eq(iuid, uid)) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) { ASSERT(mask & ATTR_UID); ASSERT(udqp); olddquot1 = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } - ip->i_d.di_uid = uid; + ip->i_d.di_uid = xfs_kuid_to_uid(uid); inode->i_uid = uid; } - if (igid != gid) { + if (!gid_eq(igid, gid)) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { - ASSERT(!XFS_IS_PQUOTA_ON(mp)); + ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) || + !XFS_IS_PQUOTA_ON(mp)); ASSERT(mask & ATTR_GID); ASSERT(gdqp); olddquot2 = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } - ip->i_d.di_gid = gid; + ip->i_d.di_gid = xfs_kgid_to_gid(gid); inode->i_gid = gid; } } - /* - * Change file access modes. - */ - if (mask & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - mode &= ~S_ISGID; - - ip->i_d.di_mode &= S_IFMT; - ip->i_d.di_mode |= mode & ~S_IFMT; - - inode->i_mode &= S_IFMT; - inode->i_mode |= mode & ~S_IFMT; - } - - /* - * Change file access or modified times. - */ - if (mask & ATTR_ATIME) { - inode->i_atime = iattr->ia_atime; - ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; - ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; - } - if (mask & ATTR_CTIME) { - inode->i_ctime = iattr->ia_ctime; - ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - } - if (mask & ATTR_MTIME) { - inode->i_mtime = iattr->ia_mtime; - ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - } + if (mask & ATTR_MODE) + xfs_setattr_mode(ip, iattr); + if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_setattr_time(ip, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -667,7 +713,7 @@ xfs_setattr_nonsize( * Posix ACL code seems to care about this issue either. */ if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { - error = -xfs_acl_chmod(inode); + error = -posix_acl_chmod(inode, inode->i_mode); if (error) return XFS_ERROR(error); } @@ -689,12 +735,10 @@ out_dqrele: int xfs_setattr_size( struct xfs_inode *ip, - struct iattr *iattr, - int flags) + struct iattr *iattr) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); - int mask = iattr->ia_valid; xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; @@ -713,15 +757,10 @@ xfs_setattr_size( if (error) return XFS_ERROR(error); + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(S_ISREG(ip->i_d.di_mode)); - ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| - ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID| - ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); - - if (!(flags & XFS_ATTR_NOLOCK)) { - lock_flags |= XFS_IOLOCK_EXCL; - xfs_ilock(ip, lock_flags); - } + ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| + ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); oldsize = inode->i_size; newsize = iattr->ia_size; @@ -730,13 +769,12 @@ xfs_setattr_size( * Short circuit the truncate case for zero length files. */ if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { - if (!(mask & (ATTR_CTIME|ATTR_MTIME))) - goto out_unlock; + if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME))) + return 0; /* * Use the regular setattr path to update the timestamps. */ - xfs_iunlock(ip, lock_flags); iattr->ia_valid &= ~ATTR_SIZE; return xfs_setattr_nonsize(ip, iattr, 0); } @@ -746,7 +784,7 @@ xfs_setattr_size( */ error = xfs_qm_dqattach(ip, 0); if (error) - goto out_unlock; + return error; /* * Now we can make the changes. Before we join the inode to the @@ -764,7 +802,7 @@ xfs_setattr_size( */ error = xfs_zero_eof(ip, newsize, oldsize); if (error) - goto out_unlock; + return error; } /* @@ -783,7 +821,7 @@ xfs_setattr_size( error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ip->i_d.di_size, newsize); if (error) - goto out_unlock; + return error; } /* @@ -791,24 +829,34 @@ xfs_setattr_size( */ inode_dio_wait(inode); + /* + * Do all the page cache truncate work outside the transaction context + * as the "lock" order is page lock->log space reservation. i.e. + * locking pages inside the transaction can ABBA deadlock with + * writeback. We have to do the VFS inode size update before we truncate + * the pagecache, however, to avoid racing with page faults beyond the + * new EOF they are not serialised against truncate operations except by + * page locks and size updates. + * + * Hence we are in a situation where a truncate can fail with ENOMEM + * from xfs_trans_reserve(), but having already truncated the in-memory + * version of the file (i.e. made user visible changes). There's not + * much we can do about this, except to hope that the caller sees ENOMEM + * and retries the truncate operation. + */ error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); if (error) - goto out_unlock; + return error; + truncate_setsize(inode, newsize); tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); - error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) goto out_trans_cancel; - truncate_setsize(inode, newsize); - commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); /* @@ -821,10 +869,11 @@ xfs_setattr_size( * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ - if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { + if (newsize != oldsize && + !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) { iattr->ia_ctime = iattr->ia_mtime = current_fs_time(inode->i_sb); - mask |= ATTR_CTIME | ATTR_MTIME; + iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; } /* @@ -860,16 +909,10 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } - if (mask & ATTR_CTIME) { - inode->i_ctime = iattr->ia_ctime; - ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - } - if (mask & ATTR_MTIME) { - inode->i_mtime = iattr->ia_mtime; - ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - } + if (iattr->ia_valid & ATTR_MODE) + xfs_setattr_mode(ip, iattr); + if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_setattr_time(ip, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -893,12 +936,21 @@ out_trans_cancel: STATIC int xfs_vn_setattr( - struct dentry *dentry, - struct iattr *iattr) + struct dentry *dentry, + struct iattr *iattr) { - if (iattr->ia_valid & ATTR_SIZE) - return -xfs_setattr_size(XFS_I(dentry->d_inode), iattr, 0); - return -xfs_setattr_nonsize(XFS_I(dentry->d_inode), iattr, 0); + struct xfs_inode *ip = XFS_I(dentry->d_inode); + int error; + + if (iattr->ia_valid & ATTR_SIZE) { + xfs_ilock(ip, XFS_IOLOCK_EXCL); + error = xfs_setattr_size(ip, iattr); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } else { + error = xfs_setattr_nonsize(ip, iattr, 0); + } + + return -error; } STATIC int @@ -915,7 +967,7 @@ xfs_vn_update_time( trace_xfs_update_time(ip); tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); if (error) { xfs_trans_cancel(tp, 0); return -error; @@ -970,7 +1022,8 @@ xfs_fiemap_format( if (bmv->bmv_oflags & BMV_OF_PREALLOC) fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { - fiemap_flags |= FIEMAP_EXTENT_DELALLOC; + fiemap_flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); physical = 0; /* no block yet */ } if (bmv->bmv_oflags & BMV_OF_LAST) @@ -1027,8 +1080,18 @@ xfs_vn_fiemap( return 0; } +STATIC int +xfs_vn_tmpfile( + struct inode *dir, + struct dentry *dentry, + umode_t mode) +{ + return xfs_generic_create(dir, dentry, mode, 0, true); +} + static const struct inode_operations xfs_inode_operations = { .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1056,6 +1119,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1063,6 +1127,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, + .tmpfile = xfs_vn_tmpfile, }; static const struct inode_operations xfs_dir_ci_inode_operations = { @@ -1082,6 +1147,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1089,13 +1155,13 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, + .tmpfile = xfs_vn_tmpfile, }; static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = xfs_vn_follow_link, - .put_link = xfs_vn_put_link, - .get_acl = xfs_get_acl, + .put_link = kfree_put_link, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1145,6 +1211,7 @@ xfs_setup_inode( struct xfs_inode *ip) { struct inode *inode = &ip->i_vnode; + gfp_t gfp_mask; inode->i_ino = ip->i_ino; inode->i_state = I_NEW; @@ -1155,8 +1222,8 @@ xfs_setup_inode( inode->i_mode = ip->i_d.di_mode; set_nlink(inode, ip->i_d.di_nlink); - inode->i_uid = ip->i_d.di_uid; - inode->i_gid = ip->i_d.di_gid; + inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid); + inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid); switch (inode->i_mode & S_IFMT) { case S_IFBLK: @@ -1180,6 +1247,8 @@ xfs_setup_inode( inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; xfs_diflags_to_iflags(inode, ip); + ip->d_ops = ip->i_mount->m_nondir_inode_ops; + lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &xfs_inode_operations; @@ -1187,11 +1256,13 @@ xfs_setup_inode( inode->i_mapping->a_ops = &xfs_address_space_operations; break; case S_IFDIR: + lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class); if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) inode->i_op = &xfs_dir_ci_inode_operations; else inode->i_op = &xfs_dir_inode_operations; inode->i_fop = &xfs_dir_file_operations; + ip->d_ops = ip->i_mount->m_dir_inode_ops; break; case S_IFLNK: inode->i_op = &xfs_symlink_inode_operations; @@ -1205,6 +1276,14 @@ xfs_setup_inode( } /* + * Ensure all page cache allocations are done from GFP_NOFS context to + * prevent direct reclaim recursion back into the filesystem and blowing + * stacks or deadlocking. + */ + gfp_mask = mapping_gfp_mask(inode->i_mapping); + mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS))); + + /* * If there is no attribute fork no ACL can exist on this inode, * and it can't have any file capabilities attached to it either. */ diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index ef41c92ce66..1c34e433592 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -27,4 +27,13 @@ extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); extern void xfs_setup_inode(struct xfs_inode *); +/* + * Internal setattr interfaces. + */ +#define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */ + +extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, + int flags); +extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap); + #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 2ea7d402188..cb64f222d60 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -17,24 +17,23 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_btree.h" #include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" #include "xfs_itable.h" #include "xfs_error.h" -#include "xfs_btree.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_dinode.h" STATIC int xfs_internal_inum( @@ -43,7 +42,7 @@ xfs_internal_inum( { return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || (xfs_sb_version_hasquota(&mp->m_sb) && - (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))); + xfs_is_quota_inode(&mp->m_sb, ino))); } /* @@ -210,9 +209,8 @@ xfs_bulkstat( xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ xfs_ino_t lastino; /* last inode number returned */ - int nbcluster; /* # of blocks in a cluster */ - int nicluster; /* # of inodes in a cluster */ - int nimask; /* mask for inode clusters */ + int blks_per_cluster; /* # of blocks per cluster */ + int inodes_per_cluster;/* # of inodes per cluster */ int nirbuf; /* size of irbuf */ int rval; /* return value error code */ int tmp; /* result value from btree calls */ @@ -221,7 +219,6 @@ xfs_bulkstat( char __user *ubufp; /* pointer into user's buffer */ int ubelem; /* spaces used in user's buffer */ int ubused; /* bytes used by formatter */ - xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */ /* * Get the last inode value, see if there's nothing to do. @@ -245,11 +242,8 @@ xfs_bulkstat( *done = 0; fmterror = 0; ubufp = ubuffer; - nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ? - mp->m_sb.sb_inopblock : - (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); - nimask = ~(nicluster - 1); - nbcluster = nicluster >> mp->m_sb.sb_inopblog; + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); if (!irbuf) return ENOMEM; @@ -263,7 +257,6 @@ xfs_bulkstat( rval = 0; while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { cond_resched(); - bp = NULL; error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); if (error) { /* @@ -277,7 +270,8 @@ xfs_bulkstat( /* * Allocate and initialize a btree cursor for ialloc btree. */ - cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, + XFS_BTNUM_INO); irbp = irbuf; irbufend = irbuf + nirbuf; end_of_ag = 0; @@ -383,22 +377,25 @@ xfs_bulkstat( * Also start read-ahead now for this chunk. */ if (r.ir_freecount < XFS_INODES_PER_CHUNK) { + struct blk_plug plug; /* * Loop over all clusters in the next chunk. * Do a readahead if there are any allocated * inodes in that cluster. */ + blk_start_plug(&plug); agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK; - chunkidx += nicluster, - agbno += nbcluster) { - if (xfs_inobt_maskn(chunkidx, nicluster) - & ~r.ir_free) + chunkidx += inodes_per_cluster, + agbno += blks_per_cluster) { + if (xfs_inobt_maskn(chunkidx, + inodes_per_cluster) & ~r.ir_free) xfs_btree_reada_bufs(mp, agno, - agbno, nbcluster, + agbno, blks_per_cluster, &xfs_inode_buf_ops); } + blk_finish_plug(&plug); irbp->ir_startino = r.ir_startino; irbp->ir_freecount = r.ir_freecount; irbp->ir_free = r.ir_free; @@ -433,27 +430,7 @@ xfs_bulkstat( irbp->ir_freecount < XFS_INODES_PER_CHUNK; chunkidx++, clustidx++, agino++) { ASSERT(chunkidx < XFS_INODES_PER_CHUNK); - /* - * Recompute agbno if this is the - * first inode of the cluster. - * - * Careful with clustidx. There can be - * multiple clusters per chunk, a single - * cluster per chunk or a cluster that has - * inodes represented from several different - * chunks (if blocksize is large). - * - * Because of this, the starting clustidx is - * initialized to zero in this loop but must - * later be reset after reading in the cluster - * buffer. - */ - if ((chunkidx & (nicluster - 1)) == 0) { - agbno = XFS_AGINO_TO_AGBNO(mp, - irbp->ir_startino) + - ((chunkidx & nimask) >> - mp->m_sb.sb_inopblog); - } + ino = XFS_AGINO_TO_INO(mp, agno, agino); /* * Skip if this inode is free. @@ -499,10 +476,6 @@ xfs_bulkstat( cond_resched(); } - - if (bp) - xfs_buf_relse(bp); - /* * Set up for the next loop iteration. */ @@ -518,7 +491,7 @@ xfs_bulkstat( /* * Done, we're either out of filesystem or space to put the data. */ - kmem_free_large(irbuf); + kmem_free(irbuf); *ubcountp = ubelem; /* * Found some inodes, return them now and return the error next time. @@ -564,8 +537,9 @@ xfs_bulkstat_single( * at the expense of the error case. */ - ino = (xfs_ino_t)*lastinop; - error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), 0, &res); + ino = *lastinop; + error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), + NULL, &res); if (error) { /* * Special case way failed, do it the "long" way @@ -648,7 +622,8 @@ xfs_inumbers( agino = 0; continue; } - cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, + XFS_BTNUM_INO); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, &tmp); if (error) { diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index fe7e4df85a7..825249d2dfc 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -32,6 +32,38 @@ # define XFS_BIG_INUMS 0 #endif +/* + * Kernel specific type declarations for XFS + */ +typedef signed char __int8_t; +typedef unsigned char __uint8_t; +typedef signed short int __int16_t; +typedef unsigned short int __uint16_t; +typedef signed int __int32_t; +typedef unsigned int __uint32_t; +typedef signed long long int __int64_t; +typedef unsigned long long int __uint64_t; + +typedef __uint32_t inst_t; /* an instruction */ + +typedef __s64 xfs_off_t; /* <file offset> type */ +typedef unsigned long long xfs_ino_t; /* <inode> type */ +typedef __s64 xfs_daddr_t; /* <disk address> type */ +typedef char * xfs_caddr_t; /* <core address> type */ +typedef __u32 xfs_dev_t; +typedef __u32 xfs_nlink_t; + +/* __psint_t is the same size as a pointer */ +#if (BITS_PER_LONG == 32) +typedef __int32_t __psint_t; +typedef __uint32_t __psunsigned_t; +#elif (BITS_PER_LONG == 64) +typedef __int64_t __psint_t; +typedef __uint64_t __psunsigned_t; +#else +#error BITS_PER_LONG must be 32 or 64 +#endif + #include "xfs_types.h" #include "kmem.h" @@ -72,6 +104,7 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/list_sort.h> +#include <linux/ratelimit.h> #include <asm/page.h> #include <asm/div64.h> @@ -86,6 +119,7 @@ #include "xfs_iops.h" #include "xfs_aops.h" #include "xfs_super.h" +#include "xfs_cksum.h" #include "xfs_buf.h" #include "xfs_message.h" @@ -113,8 +147,6 @@ #define xfs_inherit_sync xfs_params.inherit_sync.val #define xfs_inherit_nodump xfs_params.inherit_nodump.val #define xfs_inherit_noatime xfs_params.inherit_noatim.val -#define xfs_buf_timer_centisecs xfs_params.xfs_buf_timer.val -#define xfs_buf_age_centisecs xfs_params.xfs_buf_age.val #define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val #define xfs_rotorstep xfs_params.rotorstep.val #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val @@ -147,6 +179,7 @@ #define ENOATTR ENODATA /* Attribute not found */ #define EWRONGFS EINVAL /* Mount with wrong filesystem type */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define SYNCHRONIZE() barrier() #define __return_address __builtin_return_address(0) @@ -158,6 +191,32 @@ #define MAX(a,b) (max(a,b)) #define howmany(x, y) (((x)+((y)-1))/(y)) +/* Kernel uid/gid conversion. These are used to convert to/from the on disk + * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally. + * The conversion here is type only, the value will remain the same since we + * are converting to the init_user_ns. The uid is later mapped to a particular + * user namespace value when crossing the kernel/user boundary. + */ +static inline __uint32_t xfs_kuid_to_uid(kuid_t uid) +{ + return from_kuid(&init_user_ns, uid); +} + +static inline kuid_t xfs_uid_to_kuid(__uint32_t uid) +{ + return make_kuid(&init_user_ns, uid); +} + +static inline __uint32_t xfs_kgid_to_gid(kgid_t gid) +{ + return from_kgid(&init_user_ns, gid); +} + +static inline kgid_t xfs_gid_to_kgid(__uint32_t gid) +{ + return make_kgid(&init_user_ns, gid); +} + /* * Various platform dependent calls that don't fit anywhere else */ @@ -292,22 +351,34 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) #define ASSERT_ALWAYS(expr) \ (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) -#ifndef DEBUG -#define ASSERT(expr) ((void)0) +#ifdef DEBUG +#define ASSERT(expr) \ + (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) #ifndef STATIC -# define STATIC static noinline +# define STATIC noinline #endif -#else /* DEBUG */ +#else /* !DEBUG */ + +#ifdef XFS_WARN #define ASSERT(expr) \ - (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + (unlikely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__)) #ifndef STATIC -# define STATIC noinline +# define STATIC static noinline +#endif + +#else /* !DEBUG && !XFS_WARN */ + +#define ASSERT(expr) ((void)0) + +#ifndef STATIC +# define STATIC static noinline #endif +#endif /* XFS_WARN */ #endif /* DEBUG */ #endif /* __XFS_LINUX__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 46bd9d52ab5..292308dede6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -17,21 +17,19 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" #include "xfs_log_priv.h" -#include "xfs_buf_item.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" #include "xfs_log_recover.h" -#include "xfs_trans_priv.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_trace.h" #include "xfs_fsops.h" @@ -120,7 +118,7 @@ xlog_verify_iclog( struct xlog *log, struct xlog_in_core *iclog, int count, - boolean_t syncing); + bool syncing); STATIC void xlog_verify_tail_lsn( struct xlog *log, @@ -257,7 +255,8 @@ xlog_grant_head_wait( struct xlog *log, struct xlog_grant_head *head, struct xlog_ticket *tic, - int need_bytes) + int need_bytes) __releases(&head->lock) + __acquires(&head->lock) { list_add_tail(&tic->t_queue, &head->waiters); @@ -614,13 +613,16 @@ xfs_log_mount( xfs_daddr_t blk_offset, int num_bblks) { - int error; + int error = 0; + int min_logfsbs; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) - xfs_notice(mp, "Mounting Filesystem"); - else { + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + xfs_notice(mp, "Mounting V%d Filesystem", + XFS_SB_VERSION_NUM(&mp->m_sb)); + } else { xfs_notice(mp, -"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent."); +"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", + XFS_SB_VERSION_NUM(&mp->m_sb)); ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } @@ -631,6 +633,50 @@ xfs_log_mount( } /* + * Validate the given log space and drop a critical message via syslog + * if the log size is too small that would lead to some unexpected + * situations in transaction log space reservation stage. + * + * Note: we can't just reject the mount if the validation fails. This + * would mean that people would have to downgrade their kernel just to + * remedy the situation as there is no way to grow the log (short of + * black magic surgery with xfs_db). + * + * We can, however, reject mounts for CRC format filesystems, as the + * mkfs binary being used to make the filesystem should never create a + * filesystem with a log that is too small. + */ + min_logfsbs = xfs_log_calc_minimum_size(mp); + + if (mp->m_sb.sb_logblocks < min_logfsbs) { + xfs_warn(mp, + "Log size %d blocks too small, minimum size is %d blocks", + mp->m_sb.sb_logblocks, min_logfsbs); + error = EINVAL; + } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) { + xfs_warn(mp, + "Log size %d blocks too large, maximum size is %lld blocks", + mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS); + error = EINVAL; + } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) { + xfs_warn(mp, + "log size %lld bytes too large, maximum size is %lld bytes", + XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), + XFS_MAX_LOG_BYTES); + error = EINVAL; + } + if (error) { + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!"); + ASSERT(0); + goto out_free_log; + } + xfs_crit(mp, +"Log size out of supported range. Continuing onwards, but if log hangs are\n" +"experienced then please report this message in the bug report."); + } + + /* * Initialize the AIL now we have a log. */ error = xfs_trans_ail_init(mp); @@ -720,7 +766,7 @@ xfs_log_mount_finish(xfs_mount_t *mp) * Unmount record used to have a string "Unmount filesystem--" in the * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). * We just write the magic number now since that particular field isn't - * currently architecture converted and "nUmount" is a bit foo. + * currently architecture converted and "Unmount" is a bit foo. * As far as I know, there weren't any dependencies on the old behaviour. */ @@ -954,27 +1000,34 @@ xfs_log_space_wake( } /* - * Determine if we have a transaction that has gone to disk - * that needs to be covered. To begin the transition to the idle state - * firstly the log needs to be idle (no AIL and nothing in the iclogs). - * If we are then in a state where covering is needed, the caller is informed - * that dummy transactions are required to move the log into the idle state. + * Determine if we have a transaction that has gone to disk that needs to be + * covered. To begin the transition to the idle state firstly the log needs to + * be idle. That means the CIL, the AIL and the iclogs needs to be empty before + * we start attempting to cover the log. * - * Because this is called as part of the sync process, we should also indicate - * that dummy transactions should be issued in anything but the covered or - * idle states. This ensures that the log tail is accurately reflected in - * the log at the end of the sync, hence if a crash occurrs avoids replay - * of transactions where the metadata is already on disk. + * Only if we are then in a state where covering is needed, the caller is + * informed that dummy transactions are required to move the log into the idle + * state. + * + * If there are any items in the AIl or CIL, then we do not want to attempt to + * cover the log as we may be in a situation where there isn't log space + * available to run a dummy transaction and this can lead to deadlocks when the + * tail of the log is pinned by an item that is modified in the CIL. Hence + * there's no point in running a dummy transaction at this point because we + * can't start trying to idle the log until both the CIL and AIL are empty. */ int xfs_log_need_covered(xfs_mount_t *mp) { - int needed = 0; struct xlog *log = mp->m_log; + int needed = 0; if (!xfs_fs_writable(mp)) return 0; + if (!xlog_cil_empty(log)) + return 0; + spin_lock(&log->l_icloglock); switch (log->l_covered_state) { case XLOG_STATE_COVER_DONE: @@ -983,14 +1036,17 @@ xfs_log_need_covered(xfs_mount_t *mp) break; case XLOG_STATE_COVER_NEED: case XLOG_STATE_COVER_NEED2: - if (!xfs_ail_min_lsn(log->l_ailp) && - xlog_iclogs_empty(log)) { - if (log->l_covered_state == XLOG_STATE_COVER_NEED) - log->l_covered_state = XLOG_STATE_COVER_DONE; - else - log->l_covered_state = XLOG_STATE_COVER_DONE2; - } - /* FALLTHRU */ + if (xfs_ail_min_lsn(log->l_ailp)) + break; + if (!xlog_iclogs_empty(log)) + break; + + needed = 1; + if (log->l_covered_state == XLOG_STATE_COVER_NEED) + log->l_covered_state = XLOG_STATE_COVER_DONE; + else + log->l_covered_state = XLOG_STATE_COVER_DONE2; + break; default: needed = 1; break; @@ -1022,6 +1078,7 @@ xlog_assign_tail_lsn_locked( tail_lsn = lip->li_lsn; else tail_lsn = atomic64_read(&log->l_last_sync_lsn); + trace_xfs_log_assign_tail_lsn(log, tail_lsn); atomic64_set(&log->l_tail_lsn, tail_lsn); return tail_lsn; } @@ -1108,7 +1165,7 @@ xlog_iodone(xfs_buf_t *bp) /* * Race to shutdown the filesystem if we see an error. */ - if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp, + if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { xfs_buf_ioerror_alert(bp, __func__); xfs_buf_stale(bp); @@ -1126,11 +1183,14 @@ xlog_iodone(xfs_buf_t *bp) /* log I/O is always issued ASYNC */ ASSERT(XFS_BUF_ISASYNC(bp)); xlog_state_done_syncing(iclog, aborted); + /* - * do not reference the buffer (bp) here as we could race - * with it being freed after writing the unmount record to the - * log. + * drop the buffer lock now that we are done. Nothing references + * the buffer after this, so an unmount waiting on this lock can now + * tear it down safely. As such, it is unsafe to reference the buffer + * (bp) after the unlock as we could race with it being freed. */ + xfs_buf_unlock(bp); } /* @@ -1313,8 +1373,16 @@ xlog_alloc_log( bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_log; - bp->b_iodone = xlog_iodone; + + /* + * The iclogbuf buffer locks are held over IO but we are not going to do + * IO yet. Hence unlock the buffer so that the log IO path can grab it + * when appropriately. + */ ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + + bp->b_iodone = xlog_iodone; log->l_xbuf = bp; spin_lock_init(&log->l_icloglock); @@ -1343,6 +1411,9 @@ xlog_alloc_log( if (!bp) goto out_free_iclog; + ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + bp->b_iodone = xlog_iodone; iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; @@ -1367,7 +1438,6 @@ xlog_alloc_log( iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; - ASSERT(xfs_buf_islocked(iclog->ic_bp)); init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1576,6 +1646,12 @@ xlog_cksum( * we transition the iclogs to IOERROR state *after* flushing all existing * iclogs to disk. This is because we don't want anymore new transactions to be * started or completed afterwards. + * + * We lock the iclogbufs here so that we can serialise against IO completion + * during unmount. We might be processing a shutdown triggered during unmount, + * and that can occur asynchronously to the unmount thread, and hence we need to + * ensure that completes before tearing down the iclogbufs. Hence we need to + * hold the buffer lock across the log IO to acheive that. */ STATIC int xlog_bdstrat( @@ -1583,6 +1659,7 @@ xlog_bdstrat( { struct xlog_in_core *iclog = bp->b_fspriv; + xfs_buf_lock(bp); if (iclog->ic_state & XLOG_STATE_IOERROR) { xfs_buf_ioerror(bp, EIO); xfs_buf_stale(bp); @@ -1590,7 +1667,8 @@ xlog_bdstrat( /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of - * doing it here. + * doing it here. Similarly, IO completion will unlock the + * buffer, so we don't do it here. */ return 0; } @@ -1737,7 +1815,7 @@ xlog_sync( ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); - xlog_verify_iclog(log, iclog, count, B_TRUE); + xlog_verify_iclog(log, iclog, count, true); /* account for log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); @@ -1792,14 +1870,28 @@ xlog_dealloc_log( xlog_cil_destroy(log); /* - * always need to ensure that the extra buffer does not point to memory - * owned by another log buffer before we free it. + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. + */ + iclog = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++) { + xfs_buf_lock(iclog->ic_bp); + xfs_buf_unlock(iclog->ic_bp); + iclog = iclog->ic_next; + } + + /* + * Always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. Also, cycle the lock + * first to ensure we've completed IO on it. */ + xfs_buf_lock(log->l_xbuf); + xfs_buf_unlock(log->l_xbuf); xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); xfs_buf_free(log->l_xbuf); iclog = log->l_iclog; - for (i=0; i<log->l_iclog_bufs; i++) { + for (i = 0; i < log->l_iclog_bufs; i++) { xfs_buf_free(iclog->ic_bp); next_iclog = iclog->ic_next; kmem_free(iclog); @@ -1933,7 +2025,7 @@ xlog_print_tic_res( for (i = 0; i < ticket->t_res_num; i++) { uint r_type = ticket->t_res_arr[i].r_type; - xfs_warn(mp, "region[%u]: %s - %u bytes\n", i, + xfs_warn(mp, "region[%u]: %s - %u bytes", i, ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? "bad-rtype" : res_type_str[r_type-1]), ticket->t_res_arr[i].r_len); @@ -1941,7 +2033,7 @@ xlog_print_tic_res( xfs_alert_tag(mp, XFS_PTAG_LOGRES, "xlog_write: reservation ran out. Need to up reservation"); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); } /* @@ -1963,6 +2055,10 @@ xlog_write_calc_vec_length( headers++; for (lv = log_vector; lv; lv = lv->lv_next) { + /* we don't write ordered log vectors */ + if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) + continue; + headers += lv->lv_niovecs; for (i = 0; i < lv->lv_niovecs; i++) { @@ -2040,7 +2136,7 @@ xlog_write_setup_ophdr( * Set up the parameters of the region copy into the log. This has * to handle region write split across multiple log buffers - this * state is kept external to this function so that this code can - * can be written in an obvious, self documenting manner. + * be written in an obvious, self documenting manner. */ static int xlog_write_setup_copy( @@ -2216,7 +2312,7 @@ xlog_write( index = 0; lv = log_vector; vecp = lv->lv_iovecp; - while (lv && index < lv->lv_niovecs) { + while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; @@ -2236,13 +2332,22 @@ xlog_write( * This loop writes out as many regions as can fit in the amount * of space which was allocated by xlog_state_get_iclog_space(). */ - while (lv && index < lv->lv_niovecs) { - struct xfs_log_iovec *reg = &vecp[index]; + while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { + struct xfs_log_iovec *reg; struct xlog_op_header *ophdr; int start_rec_copy; int copy_len; int copy_off; + bool ordered = false; + /* ordered log vectors have no regions to write */ + if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { + ASSERT(lv->lv_niovecs == 0); + ordered = true; + goto next_lv; + } + + reg = &vecp[index]; ASSERT(reg->i_len % sizeof(__int32_t) == 0); ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); @@ -2302,12 +2407,13 @@ xlog_write( break; if (++index == lv->lv_niovecs) { +next_lv: lv = lv->lv_next; index = 0; if (lv) vecp = lv->lv_iovecp; } - if (record_cnt == 0) { + if (record_cnt == 0 && ordered == false) { if (!lv) return 0; break; @@ -3377,24 +3483,17 @@ xfs_log_ticket_get( } /* - * Allocate and initialise a new log ticket. + * Figure out the total log space unit (in bytes) that would be + * required for a log ticket. */ -struct xlog_ticket * -xlog_ticket_alloc( - struct xlog *log, - int unit_bytes, - int cnt, - char client, - bool permanent, - xfs_km_flags_t alloc_flags) +int +xfs_log_calc_unit_res( + struct xfs_mount *mp, + int unit_bytes) { - struct xlog_ticket *tic; - uint num_headers; - int iclog_space; - - tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); - if (!tic) - return NULL; + struct xlog *log = mp->m_log; + int iclog_space; + uint num_headers; /* * Permanent reservations have up to 'cnt'-1 active log operations @@ -3469,23 +3568,46 @@ xlog_ticket_alloc( unit_bytes += log->l_iclog_hsize; /* for roundoff padding for transaction data and one for commit record */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && - log->l_mp->m_sb.sb_logsunit > 1) { + if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) { /* log su roundoff */ - unit_bytes += 2*log->l_mp->m_sb.sb_logsunit; + unit_bytes += 2 * mp->m_sb.sb_logsunit; } else { /* BB roundoff */ - unit_bytes += 2*BBSIZE; + unit_bytes += 2 * BBSIZE; } + return unit_bytes; +} + +/* + * Allocate and initialise a new log ticket. + */ +struct xlog_ticket * +xlog_ticket_alloc( + struct xlog *log, + int unit_bytes, + int cnt, + char client, + bool permanent, + xfs_km_flags_t alloc_flags) +{ + struct xlog_ticket *tic; + int unit_res; + + tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); + if (!tic) + return NULL; + + unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); + atomic_set(&tic->t_ref, 1); tic->t_task = current; INIT_LIST_HEAD(&tic->t_queue); - tic->t_unit_res = unit_bytes; - tic->t_curr_res = unit_bytes; + tic->t_unit_res = unit_res; + tic->t_curr_res = unit_res; tic->t_cnt = cnt; tic->t_ocnt = cnt; - tic->t_tid = random32(); + tic->t_tid = prandom_u32(); tic->t_clientid = client; tic->t_flags = XLOG_TIC_INITED; tic->t_trans_type = 0; @@ -3611,7 +3733,7 @@ xlog_verify_iclog( struct xlog *log, struct xlog_in_core *iclog, int count, - boolean_t syncing) + bool syncing) { xlog_op_header_t *ophead; xlog_in_core_t *icptr; @@ -3626,11 +3748,9 @@ xlog_verify_iclog( /* check validity of iclog pointers */ spin_lock(&log->l_icloglock); icptr = log->l_iclog; - for (i=0; i < log->l_iclog_bufs; i++) { - if (icptr == NULL) - xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); - icptr = icptr->ic_next; - } + for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next) + ASSERT(icptr); + if (icptr != log->l_iclog) xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__); spin_unlock(&log->l_icloglock); @@ -3659,7 +3779,7 @@ xlog_verify_iclog( /* clientid is only 1 byte */ field_offset = (__psint_t) ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); - if (syncing == B_FALSE || (field_offset & 0x1ff)) { + if (!syncing || (field_offset & 0x1ff)) { clientid = ophead->oh_clientid; } else { idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); @@ -3682,7 +3802,7 @@ xlog_verify_iclog( /* check length */ field_offset = (__psint_t) ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); - if (syncing == B_FALSE || (field_offset & 0x1ff)) { + if (!syncing || (field_offset & 0x1ff)) { op_len = be32_to_cpu(ophead->oh_len); } else { idx = BTOBBT((__psint_t)&ophead->oh_len - @@ -3832,11 +3952,14 @@ xfs_log_force_umount( retval = xlog_state_ioerror(log); spin_unlock(&log->l_icloglock); } + /* - * Wake up everybody waiting on xfs_log_force. - * Callback all log item committed functions as if the - * log writes were completed. + * Wake up everybody waiting on xfs_log_force. Wake the CIL push first + * as if the log writes were completed. The abort handling in the log + * item committed callback functions will do this again under lock to + * avoid races. */ + wake_up_all(&log->l_cilp->xc_commit_wait); xlog_state_do_callback(log, XFS_LI_ABORTED, NULL); #ifdef XFSERRORDEBUG diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 5caee96059d..84e0deb95ab 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -18,14 +18,81 @@ #ifndef __XFS_LOG_H__ #define __XFS_LOG_H__ -/* get lsn fields */ -#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) -#define BLOCK_LSN(lsn) ((uint)(lsn)) +struct xfs_log_vec { + struct xfs_log_vec *lv_next; /* next lv in build list */ + int lv_niovecs; /* number of iovecs in lv */ + struct xfs_log_iovec *lv_iovecp; /* iovec array */ + struct xfs_log_item *lv_item; /* owner */ + char *lv_buf; /* formatted buffer */ + int lv_bytes; /* accounted space in buffer */ + int lv_buf_len; /* aligned size of buffer */ + int lv_size; /* size of allocated lv */ +}; + +#define XFS_LOG_VEC_ORDERED (-1) + +static inline void * +xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type) +{ + struct xfs_log_iovec *vec = *vecp; + + if (vec) { + ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs); + vec++; + } else { + vec = &lv->lv_iovecp[0]; + } + + vec->i_type = type; + vec->i_addr = lv->lv_buf + lv->lv_buf_len; + + ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t))); + + *vecp = vec; + return vec->i_addr; +} + +/* + * We need to make sure the next buffer is naturally aligned for the biggest + * basic data type we put into it. We already accounted for this padding when + * sizing the buffer. + * + * However, this padding does not get written into the log, and hence we have to + * track the space used by the log vectors separately to prevent log space hangs + * due to inaccurate accounting (i.e. a leak) of the used log space through the + * CIL context ticket. + */ +static inline void +xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len) +{ + lv->lv_buf_len += round_up(len, sizeof(uint64_t)); + lv->lv_bytes += len; + vec->i_len = len; +} -/* this is used in a spot where we might otherwise double-endian-flip */ -#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0]) +static inline void * +xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type, void *data, int len) +{ + void *buf; + + buf = xlog_prepare_iovec(lv, vecp, type); + memcpy(buf, data, len); + xlog_finish_iovec(lv, *vecp, len); + return buf; +} + +/* + * Structure used to pass callback function and the function's argument + * to the log manager. + */ +typedef struct xfs_log_callback { + struct xfs_log_callback *cb_next; + void (*cb_func)(void *, int); + void *cb_arg; +} xfs_log_callback_t; -#ifdef __KERNEL__ /* * By comparing each component, we don't have to worry about extra * endian issues in treating two 32 bit numbers as one 64 bit number @@ -59,64 +126,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) */ #define XFS_LOG_SYNC 0x1 -#endif /* __KERNEL__ */ - - -/* Log Clients */ -#define XFS_TRANSACTION 0x69 -#define XFS_VOLUME 0x2 -#define XFS_LOG 0xaa - - -/* Region types for iovec's i_type */ -#define XLOG_REG_TYPE_BFORMAT 1 -#define XLOG_REG_TYPE_BCHUNK 2 -#define XLOG_REG_TYPE_EFI_FORMAT 3 -#define XLOG_REG_TYPE_EFD_FORMAT 4 -#define XLOG_REG_TYPE_IFORMAT 5 -#define XLOG_REG_TYPE_ICORE 6 -#define XLOG_REG_TYPE_IEXT 7 -#define XLOG_REG_TYPE_IBROOT 8 -#define XLOG_REG_TYPE_ILOCAL 9 -#define XLOG_REG_TYPE_IATTR_EXT 10 -#define XLOG_REG_TYPE_IATTR_BROOT 11 -#define XLOG_REG_TYPE_IATTR_LOCAL 12 -#define XLOG_REG_TYPE_QFORMAT 13 -#define XLOG_REG_TYPE_DQUOT 14 -#define XLOG_REG_TYPE_QUOTAOFF 15 -#define XLOG_REG_TYPE_LRHEADER 16 -#define XLOG_REG_TYPE_UNMOUNT 17 -#define XLOG_REG_TYPE_COMMIT 18 -#define XLOG_REG_TYPE_TRANSHDR 19 -#define XLOG_REG_TYPE_MAX 19 - -typedef struct xfs_log_iovec { - void *i_addr; /* beginning address of region */ - int i_len; /* length in bytes of region */ - uint i_type; /* type of region */ -} xfs_log_iovec_t; - -struct xfs_log_vec { - struct xfs_log_vec *lv_next; /* next lv in build list */ - int lv_niovecs; /* number of iovecs in lv */ - struct xfs_log_iovec *lv_iovecp; /* iovec array */ - struct xfs_log_item *lv_item; /* owner */ - char *lv_buf; /* formatted buffer */ - int lv_buf_len; /* size of formatted buffer */ -}; - -/* - * Structure used to pass callback function and the function's argument - * to the log manager. - */ -typedef struct xfs_log_callback { - struct xfs_log_callback *cb_next; - void (*cb_func)(void *, int); - void *cb_arg; -} xfs_log_callback_t; - - -#ifdef __KERNEL__ /* Log manager interfaces */ struct xfs_mount; struct xlog_in_core; @@ -124,11 +133,7 @@ struct xlog_ticket; struct xfs_log_item; struct xfs_item_ops; struct xfs_trans; - -void xfs_log_item_init(struct xfs_mount *mp, - struct xfs_log_item *item, - int type, - const struct xfs_item_ops *ops); +struct xfs_log_callback; xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, @@ -156,7 +161,7 @@ xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); int xfs_log_notify(struct xfs_mount *mp, struct xlog_in_core *iclog, - xfs_log_callback_t *callback_entry); + struct xfs_log_callback *callback_entry); int xfs_log_release_iclog(struct xfs_mount *mp, struct xlog_in_core *iclog); int xfs_log_reserve(struct xfs_mount *mp, @@ -177,7 +182,7 @@ void xlog_iodone(struct xfs_buf *); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); -int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, +void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, int flags); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); @@ -185,5 +190,4 @@ void xfs_log_work_queue(struct xfs_mount *mp); void xfs_log_worker(struct work_struct *work); void xfs_log_quiesce(struct xfs_mount *mp); -#endif #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ddc4529d07d..b3425b34e3d 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -17,11 +17,9 @@ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" -#include "xfs_log_priv.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" @@ -29,6 +27,10 @@ #include "xfs_alloc.h" #include "xfs_extent_busy.h" #include "xfs_discard.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" /* * Allocate a new ticket. Failing to get a new ticket makes it really hard to @@ -81,6 +83,53 @@ xlog_cil_init_post_recovery( } /* + * Prepare the log item for insertion into the CIL. Calculate the difference in + * log space and vectors it will consume, and if it is a new item pin it as + * well. + */ +STATIC void +xfs_cil_prepare_item( + struct xlog *log, + struct xfs_log_vec *lv, + struct xfs_log_vec *old_lv, + int *diff_len, + int *diff_iovecs) +{ + /* Account for the new LV being passed in */ + if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) { + *diff_len += lv->lv_bytes; + *diff_iovecs += lv->lv_niovecs; + } + + /* + * If there is no old LV, this is the first time we've seen the item in + * this CIL context and so we need to pin it. If we are replacing the + * old_lv, then remove the space it accounts for and free it. + */ + if (!old_lv) + lv->lv_item->li_ops->iop_pin(lv->lv_item); + else if (old_lv != lv) { + ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); + + *diff_len -= old_lv->lv_bytes; + *diff_iovecs -= old_lv->lv_niovecs; + kmem_free(old_lv); + } + + /* attach new log vector to log item */ + lv->lv_item->li_lv = lv; + + /* + * If this is the first time the item is being committed to the + * CIL, store the sequence number on the log item so we can + * tell in future commits whether this is the first checkpoint + * the item is being committed into. + */ + if (!lv->lv_item->li_seq) + lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence; +} + +/* * Format log item into a flat buffers * * For delayed logging, we need to hold a formatted buffer containing all the @@ -106,120 +155,119 @@ xlog_cil_init_post_recovery( * format the regions into the iclog as though they are being formatted * directly out of the objects themselves. */ -static struct xfs_log_vec * -xlog_cil_prepare_log_vecs( - struct xfs_trans *tp) +static void +xlog_cil_insert_format_items( + struct xlog *log, + struct xfs_trans *tp, + int *diff_len, + int *diff_iovecs) { struct xfs_log_item_desc *lidp; - struct xfs_log_vec *lv = NULL; - struct xfs_log_vec *ret_lv = NULL; /* Bail out if we didn't find a log item. */ if (list_empty(&tp->t_items)) { ASSERT(0); - return NULL; + return; } list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_vec *new_lv; - void *ptr; - int index; - int len = 0; - uint niovecs; + struct xfs_log_item *lip = lidp->lid_item; + struct xfs_log_vec *lv; + struct xfs_log_vec *old_lv; + int niovecs = 0; + int nbytes = 0; + int buf_size; + bool ordered = false; /* Skip items which aren't dirty in this transaction. */ if (!(lidp->lid_flags & XFS_LID_DIRTY)) continue; + /* get number of vecs and size of data to be stored */ + lip->li_ops->iop_size(lip, &niovecs, &nbytes); + /* Skip items that do not have any vectors for writing */ - niovecs = IOP_SIZE(lidp->lid_item); if (!niovecs) continue; - new_lv = kmem_zalloc(sizeof(*new_lv) + - niovecs * sizeof(struct xfs_log_iovec), - KM_SLEEP); + /* + * Ordered items need to be tracked but we do not wish to write + * them. We need a logvec to track the object, but we do not + * need an iovec or buffer to be allocated for copying data. + */ + if (niovecs == XFS_LOG_VEC_ORDERED) { + ordered = true; + niovecs = 0; + nbytes = 0; + } - /* The allocated iovec region lies beyond the log vector. */ - new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; - new_lv->lv_niovecs = niovecs; - new_lv->lv_item = lidp->lid_item; + /* + * We 64-bit align the length of each iovec so that the start + * of the next one is naturally aligned. We'll need to + * account for that slack space here. Then round nbytes up + * to 64-bit alignment so that the initial buffer alignment is + * easy to calculate and verify. + */ + nbytes += niovecs * sizeof(uint64_t); + nbytes = round_up(nbytes, sizeof(uint64_t)); - /* build the vector array and calculate it's length */ - IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp); - for (index = 0; index < new_lv->lv_niovecs; index++) - len += new_lv->lv_iovecp[index].i_len; + /* grab the old item if it exists for reservation accounting */ + old_lv = lip->li_lv; - new_lv->lv_buf_len = len; - new_lv->lv_buf = kmem_alloc(new_lv->lv_buf_len, - KM_SLEEP|KM_NOFS); - ptr = new_lv->lv_buf; + /* + * The data buffer needs to start 64-bit aligned, so round up + * that space to ensure we can align it appropriately and not + * overrun the buffer. + */ + buf_size = nbytes + + round_up((sizeof(struct xfs_log_vec) + + niovecs * sizeof(struct xfs_log_iovec)), + sizeof(uint64_t)); - for (index = 0; index < new_lv->lv_niovecs; index++) { - struct xfs_log_iovec *vec = &new_lv->lv_iovecp[index]; + /* compare to existing item size */ + if (lip->li_lv && buf_size <= lip->li_lv->lv_size) { + /* same or smaller, optimise common overwrite case */ + lv = lip->li_lv; + lv->lv_next = NULL; - memcpy(ptr, vec->i_addr, vec->i_len); - vec->i_addr = ptr; - ptr += vec->i_len; - } - ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len); + if (ordered) + goto insert; - if (!ret_lv) - ret_lv = new_lv; - else - lv->lv_next = new_lv; - lv = new_lv; - } + /* + * set the item up as though it is a new insertion so + * that the space reservation accounting is correct. + */ + *diff_iovecs -= lv->lv_niovecs; + *diff_len -= lv->lv_bytes; + } else { + /* allocate new data chunk */ + lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); + lv->lv_item = lip; + lv->lv_size = buf_size; + if (ordered) { + /* track as an ordered logvec */ + ASSERT(lip->li_lv == NULL); + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + goto insert; + } + lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; + } - return ret_lv; -} + /* Ensure the lv is set up according to ->iop_size */ + lv->lv_niovecs = niovecs; -/* - * Prepare the log item for insertion into the CIL. Calculate the difference in - * log space and vectors it will consume, and if it is a new item pin it as - * well. - */ -STATIC void -xfs_cil_prepare_item( - struct xlog *log, - struct xfs_log_vec *lv, - int *len, - int *diff_iovecs) -{ - struct xfs_log_vec *old = lv->lv_item->li_lv; - - if (old) { - /* existing lv on log item, space used is a delta */ - ASSERT(!list_empty(&lv->lv_item->li_cil)); - ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); - - *len += lv->lv_buf_len - old->lv_buf_len; - *diff_iovecs += lv->lv_niovecs - old->lv_niovecs; - kmem_free(old->lv_buf); - kmem_free(old); - } else { - /* new lv, must pin the log item */ - ASSERT(!lv->lv_item->li_lv); - ASSERT(list_empty(&lv->lv_item->li_cil)); - - *len += lv->lv_buf_len; - *diff_iovecs += lv->lv_niovecs; - IOP_PIN(lv->lv_item); + /* The allocated data region lies beyond the iovec region */ + lv->lv_buf_len = 0; + lv->lv_bytes = 0; + lv->lv_buf = (char *)lv + buf_size - nbytes; + ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); + lip->li_ops->iop_format(lip, lv); +insert: + ASSERT(lv->lv_buf_len <= nbytes); + xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); } - - /* attach new log vector to log item */ - lv->lv_item->li_lv = lv; - - /* - * If this is the first time the item is being committed to the - * CIL, store the sequence number on the log item so we can - * tell in future commits whether this is the first checkpoint - * the item is being committed into. - */ - if (!lv->lv_item->li_seq) - lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence; } /* @@ -232,47 +280,47 @@ xfs_cil_prepare_item( static void xlog_cil_insert_items( struct xlog *log, - struct xfs_log_vec *log_vector, - struct xlog_ticket *ticket) + struct xfs_trans *tp) { struct xfs_cil *cil = log->l_cilp; struct xfs_cil_ctx *ctx = cil->xc_ctx; - struct xfs_log_vec *lv; + struct xfs_log_item_desc *lidp; int len = 0; int diff_iovecs = 0; int iclog_space; - ASSERT(log_vector); + ASSERT(tp); /* - * Do all the accounting aggregation and switching of log vectors - * around in a separate loop to the insertion of items into the CIL. - * Then we can do a separate loop to update the CIL within a single - * lock/unlock pair. This reduces the number of round trips on the CIL - * lock from O(nr_logvectors) to O(1) and greatly reduces the overall - * hold time for the transaction commit. - * - * If this is the first time the item is being placed into the CIL in - * this context, pin it so it can't be written to disk until the CIL is - * flushed to the iclog and the iclog written to disk. - * * We can do this safely because the context can't checkpoint until we * are done so it doesn't matter exactly how we update the CIL. */ - for (lv = log_vector; lv; lv = lv->lv_next) - xfs_cil_prepare_item(log, lv, &len, &diff_iovecs); - - /* account for space used by new iovec headers */ - len += diff_iovecs * sizeof(xlog_op_header_t); + xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs); + /* + * Now (re-)position everything modified at the tail of the CIL. + * We do this here so we only need to take the CIL lock once during + * the transaction commit. + */ spin_lock(&cil->xc_cil_lock); + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; - /* move the items to the tail of the CIL */ - for (lv = log_vector; lv; lv = lv->lv_next) - list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil); + list_move_tail(&lip->li_cil, &cil->xc_cil); + } + /* account for space used by new iovec headers */ + len += diff_iovecs * sizeof(xlog_op_header_t); ctx->nvecs += diff_iovecs; + /* attach the transaction to the CIL if it has any busy extents */ + if (!list_empty(&tp->t_busy)) + list_splice_init(&tp->t_busy, &ctx->busy_extents); + /* * Now transfer enough transaction reservation to the context ticket * for the checkpoint. The context ticket is special - the unit @@ -281,10 +329,8 @@ xlog_cil_insert_items( * during the transaction commit. */ if (ctx->ticket->t_curr_res == 0) { - /* first commit in checkpoint, steal the header reservation */ - ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len); ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; - ticket->t_curr_res -= ctx->ticket->t_unit_res; + tp->t_ticket->t_curr_res -= ctx->ticket->t_unit_res; } /* do we need space for more log record headers? */ @@ -298,10 +344,10 @@ xlog_cil_insert_items( hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); ctx->ticket->t_unit_res += hdrs; ctx->ticket->t_curr_res += hdrs; - ticket->t_curr_res -= hdrs; - ASSERT(ticket->t_curr_res >= len); + tp->t_ticket->t_curr_res -= hdrs; + ASSERT(tp->t_ticket->t_curr_res >= len); } - ticket->t_curr_res -= len; + tp->t_ticket->t_curr_res -= len; ctx->space_used += len; spin_unlock(&cil->xc_cil_lock); @@ -315,7 +361,6 @@ xlog_cil_free_logvec( for (lv = log_vector; lv; ) { struct xfs_log_vec *next = lv->lv_next; - kmem_free(lv->lv_buf); kmem_free(lv); lv = next; } @@ -341,9 +386,17 @@ xlog_cil_committed( xfs_extent_busy_clear(mp, &ctx->busy_extents, (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); - spin_lock(&ctx->cil->xc_cil_lock); + /* + * If we are aborting the commit, wake up anyone waiting on the + * committing list. If we don't, then a shutdown we can leave processes + * waiting in xlog_cil_force_lsn() waiting on a sequence commit that + * will never happen because we aborted it. + */ + spin_lock(&ctx->cil->xc_push_lock); + if (abort) + wake_up_all(&ctx->cil->xc_commit_wait); list_del(&ctx->committing); - spin_unlock(&ctx->cil->xc_cil_lock); + spin_unlock(&ctx->cil->xc_push_lock); xlog_cil_free_logvec(ctx->lv_chain); @@ -381,9 +434,7 @@ xlog_cil_push( struct xfs_cil_ctx *new_ctx; struct xlog_in_core *commit_iclog; struct xlog_ticket *tic; - int num_lv; int num_iovecs; - int len; int error = 0; struct xfs_trans_header thdr; struct xfs_log_iovec lhdr; @@ -400,7 +451,7 @@ xlog_cil_push( down_write(&cil->xc_ctx_lock); ctx = cil->xc_ctx; - spin_lock(&cil->xc_cil_lock); + spin_lock(&cil->xc_push_lock); push_seq = cil->xc_push_seq; ASSERT(push_seq <= ctx->sequence); @@ -411,10 +462,10 @@ xlog_cil_push( */ if (list_empty(&cil->xc_cil)) { cil->xc_push_seq = 0; - spin_unlock(&cil->xc_cil_lock); + spin_unlock(&cil->xc_push_lock); goto out_skip; } - spin_unlock(&cil->xc_cil_lock); + spin_unlock(&cil->xc_push_lock); /* check for a previously pushed seqeunce */ @@ -428,12 +479,9 @@ xlog_cil_push( * side which is currently locked out by the flush lock. */ lv = NULL; - num_lv = 0; num_iovecs = 0; - len = 0; while (!list_empty(&cil->xc_cil)) { struct xfs_log_item *item; - int i; item = list_first_entry(&cil->xc_cil, struct xfs_log_item, li_cil); @@ -444,11 +492,7 @@ xlog_cil_push( lv->lv_next = item->li_lv; lv = item->li_lv; item->li_lv = NULL; - - num_lv++; num_iovecs += lv->lv_niovecs; - for (i = 0; i < lv->lv_niovecs; i++) - len += lv->lv_iovecp[i].i_len; } /* @@ -464,13 +508,6 @@ xlog_cil_push( cil->xc_ctx = new_ctx; /* - * mirror the new sequence into the cil structure so that we can do - * unlocked checks against the current sequence in log forces without - * risking deferencing a freed context pointer. - */ - cil->xc_current_sequence = new_ctx->sequence; - - /* * The switch is now done, so we can drop the context lock and move out * of a shared context. We can't just go straight to the commit record, * though - we need to synchronise with previous and future commits so @@ -488,10 +525,17 @@ xlog_cil_push( * Hence we need to add this context to the committing context list so * that higher sequences will wait for us to write out a commit record * before they do. + * + * xfs_log_force_lsn requires us to mirror the new sequence into the cil + * structure atomically with the addition of this sequence to the + * committing list. This also ensures that we can do unlocked checks + * against the current sequence in log forces without risking + * deferencing a freed context pointer. */ - spin_lock(&cil->xc_cil_lock); + spin_lock(&cil->xc_push_lock); + cil->xc_current_sequence = new_ctx->sequence; list_add(&ctx->committing, &cil->xc_committing); - spin_unlock(&cil->xc_cil_lock); + spin_unlock(&cil->xc_push_lock); up_write(&cil->xc_ctx_lock); /* @@ -526,11 +570,21 @@ xlog_cil_push( * order the commit records so replay will get them in the right order. */ restart: - spin_lock(&cil->xc_cil_lock); + spin_lock(&cil->xc_push_lock); list_for_each_entry(new_ctx, &cil->xc_committing, committing) { /* + * Avoid getting stuck in this loop because we were woken by the + * shutdown, but then went back to sleep once already in the + * shutdown state. + */ + if (XLOG_FORCED_SHUTDOWN(log)) { + spin_unlock(&cil->xc_push_lock); + goto out_abort_free_ticket; + } + + /* * Higher sequences will wait for this one so skip them. - * Don't wait for own own sequence, either. + * Don't wait for our own sequence, either. */ if (new_ctx->sequence >= ctx->sequence) continue; @@ -539,11 +593,11 @@ restart: * It is still being pushed! Wait for the push to * complete, then start again from the beginning. */ - xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); + xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); goto restart; } } - spin_unlock(&cil->xc_cil_lock); + spin_unlock(&cil->xc_push_lock); /* xfs_log_done always frees the ticket on error. */ commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); @@ -562,10 +616,10 @@ restart: * callbacks to the iclog we can assign the commit LSN to the context * and wake up anyone who is waiting for the commit to complete. */ - spin_lock(&cil->xc_cil_lock); + spin_lock(&cil->xc_push_lock); ctx->commit_lsn = commit_lsn; wake_up_all(&cil->xc_commit_wait); - spin_unlock(&cil->xc_cil_lock); + spin_unlock(&cil->xc_push_lock); /* release the hounds! */ return xfs_log_release_iclog(log->l_mp, commit_iclog); @@ -618,17 +672,23 @@ xlog_cil_push_background( if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) return; - spin_lock(&cil->xc_cil_lock); + spin_lock(&cil->xc_push_lock); if (cil->xc_push_seq < cil->xc_current_sequence) { cil->xc_push_seq = cil->xc_current_sequence; queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); } - spin_unlock(&cil->xc_cil_lock); + spin_unlock(&cil->xc_push_lock); } +/* + * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence + * number that is passed. When it returns, the work will be queued for + * @push_seq, but it won't be completed. The caller is expected to do any + * waiting for push_seq to complete if it is required. + */ static void -xlog_cil_push_foreground( +xlog_cil_push_now( struct xlog *log, xfs_lsn_t push_seq) { @@ -646,17 +706,29 @@ xlog_cil_push_foreground( * If the CIL is empty or we've already pushed the sequence then * there's no work we need to do. */ - spin_lock(&cil->xc_cil_lock); + spin_lock(&cil->xc_push_lock); if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) { - spin_unlock(&cil->xc_cil_lock); + spin_unlock(&cil->xc_push_lock); return; } cil->xc_push_seq = push_seq; - spin_unlock(&cil->xc_cil_lock); + queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); + spin_unlock(&cil->xc_push_lock); +} - /* do the push now */ - xlog_cil_push(log); +bool +xlog_cil_empty( + struct xlog *log) +{ + struct xfs_cil *cil = log->l_cilp; + bool empty = false; + + spin_lock(&cil->xc_push_lock); + if (list_empty(&cil->xc_cil)) + empty = true; + spin_unlock(&cil->xc_push_lock); + return empty; } /* @@ -668,15 +740,11 @@ xlog_cil_push_foreground( * transaction to the checkpoint context so we carry the busy extents through * to checkpoint completion, and then unlock all the items in the transaction. * - * For more specific information about the order of operations in - * xfs_log_commit_cil() please refer to the comments in - * xfs_trans_commit_iclog(). - * * Called with the context lock already held in read mode to lock out * background commit, returns without it held once background commits are * allowed again. */ -int +void xfs_log_commit_cil( struct xfs_mount *mp, struct xfs_trans *tp, @@ -684,42 +752,25 @@ xfs_log_commit_cil( int flags) { struct xlog *log = mp->m_log; + struct xfs_cil *cil = log->l_cilp; int log_flags = 0; - struct xfs_log_vec *log_vector; if (flags & XFS_TRANS_RELEASE_LOG_RES) log_flags = XFS_LOG_REL_PERM_RESERV; - /* - * Do all the hard work of formatting items (including memory - * allocation) outside the CIL context lock. This prevents stalling CIL - * pushes when we are low on memory and a transaction commit spends a - * lot of time in memory reclaim. - */ - log_vector = xlog_cil_prepare_log_vecs(tp); - if (!log_vector) - return ENOMEM; - /* lock out background commit */ - down_read(&log->l_cilp->xc_ctx_lock); - if (commit_lsn) - *commit_lsn = log->l_cilp->xc_ctx->sequence; + down_read(&cil->xc_ctx_lock); - xlog_cil_insert_items(log, log_vector, tp->t_ticket); + xlog_cil_insert_items(log, tp); /* check we didn't blow the reservation */ if (tp->t_ticket->t_curr_res < 0) - xlog_print_tic_res(log->l_mp, tp->t_ticket); + xlog_print_tic_res(mp, tp->t_ticket); - /* attach the transaction to the CIL if it has any busy extents */ - if (!list_empty(&tp->t_busy)) { - spin_lock(&log->l_cilp->xc_cil_lock); - list_splice_init(&tp->t_busy, - &log->l_cilp->xc_ctx->busy_extents); - spin_unlock(&log->l_cilp->xc_cil_lock); - } + tp->t_commit_lsn = cil->xc_ctx->sequence; + if (commit_lsn) + *commit_lsn = tp->t_commit_lsn; - tp->t_commit_lsn = *commit_lsn; xfs_log_done(mp, tp->t_ticket, NULL, log_flags); xfs_trans_unreserve_and_mod_sb(tp); @@ -734,12 +785,11 @@ xfs_log_commit_cil( * the log items. This affects (at least) processing of stale buffers, * inodes and EFIs. */ - xfs_trans_free_items(tp, *commit_lsn, 0); + xfs_trans_free_items(tp, tp->t_commit_lsn, 0); xlog_cil_push_background(log); - up_read(&log->l_cilp->xc_ctx_lock); - return 0; + up_read(&cil->xc_ctx_lock); } /* @@ -768,7 +818,8 @@ xlog_cil_force_lsn( * xlog_cil_push() handles racing pushes for the same sequence, * so no need to deal with it here. */ - xlog_cil_push_foreground(log, sequence); +restart: + xlog_cil_push_now(log, sequence); /* * See if we can find a previous sequence still committing. @@ -776,9 +827,15 @@ xlog_cil_force_lsn( * before allowing the force of push_seq to go ahead. Hence block * on commits for those as well. */ -restart: - spin_lock(&cil->xc_cil_lock); + spin_lock(&cil->xc_push_lock); list_for_each_entry(ctx, &cil->xc_committing, committing) { + /* + * Avoid getting stuck in this loop because we were woken by the + * shutdown, but then went back to sleep once already in the + * shutdown state. + */ + if (XLOG_FORCED_SHUTDOWN(log)) + goto out_shutdown; if (ctx->sequence > sequence) continue; if (!ctx->commit_lsn) { @@ -786,7 +843,7 @@ restart: * It is still being pushed! Wait for the push to * complete, then start again from the beginning. */ - xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); + xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); goto restart; } if (ctx->sequence != sequence) @@ -794,8 +851,39 @@ restart: /* found it! */ commit_lsn = ctx->commit_lsn; } - spin_unlock(&cil->xc_cil_lock); + + /* + * The call to xlog_cil_push_now() executes the push in the background. + * Hence by the time we have got here it our sequence may not have been + * pushed yet. This is true if the current sequence still matches the + * push sequence after the above wait loop and the CIL still contains + * dirty objects. + * + * When the push occurs, it will empty the CIL and atomically increment + * the currect sequence past the push sequence and move it into the + * committing list. Of course, if the CIL is clean at the time of the + * push, it won't have pushed the CIL at all, so in that case we should + * try the push for this sequence again from the start just in case. + */ + if (sequence == cil->xc_current_sequence && + !list_empty(&cil->xc_cil)) { + spin_unlock(&cil->xc_push_lock); + goto restart; + } + + spin_unlock(&cil->xc_push_lock); return commit_lsn; + + /* + * We detected a shutdown in progress. We need to trigger the log force + * to pass through it's iclog state machine error handling, even though + * we are already in a shutdown state. Hence we can't return + * NULLCOMMITLSN here as that has special meaning to log forces (i.e. + * LSN is already stable), so we return a zero LSN instead. + */ +out_shutdown: + spin_unlock(&cil->xc_push_lock); + return 0; } /* @@ -852,6 +940,7 @@ xlog_cil_init( INIT_LIST_HEAD(&cil->xc_cil); INIT_LIST_HEAD(&cil->xc_committing); spin_lock_init(&cil->xc_cil_lock); + spin_lock_init(&cil->xc_push_lock); init_rwsem(&cil->xc_ctx_lock); init_waitqueue_head(&cil->xc_commit_wait); diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h new file mode 100644 index 00000000000..f0969c77bdb --- /dev/null +++ b/fs/xfs/xfs_log_format.h @@ -0,0 +1,679 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_FORMAT_H__ +#define __XFS_LOG_FORMAT_H__ + +struct xfs_mount; +struct xfs_trans_res; + +/* + * On-disk Log Format definitions. + * + * This file contains all the on-disk format definitions used within the log. It + * includes the physical log structure itself, as well as all the log item + * format structures that are written into the log and intepreted by log + * recovery. We start with the physical log format definitions, and then work + * through all the log items definitions and everything they encode into the + * log. + */ +typedef __uint32_t xlog_tid_t; + +#define XLOG_MIN_ICLOGS 2 +#define XLOG_MAX_ICLOGS 8 +#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */ +#define XLOG_VERSION_1 1 +#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */ +#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2) +#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */ +#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */ +#define XLOG_MAX_RECORD_BSIZE (256*1024) +#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */ +#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ +#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ +#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ +#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \ + (log)->l_mp->m_sb.sb_logsunit) +#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit) + +#define XLOG_HEADER_SIZE 512 + +/* Minimum number of transactions that must fit in the log (defined by mkfs) */ +#define XFS_MIN_LOG_FACTOR 3 + +#define XLOG_REC_SHIFT(log) \ + BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ + XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) +#define XLOG_TOTAL_REC_SHIFT(log) \ + BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ + XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) + +/* get lsn fields */ +#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) +#define BLOCK_LSN(lsn) ((uint)(lsn)) + +/* this is used in a spot where we might otherwise double-endian-flip */ +#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0]) + +static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) +{ + return ((xfs_lsn_t)cycle << 32) | block; +} + +static inline uint xlog_get_cycle(char *ptr) +{ + if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) + return be32_to_cpu(*((__be32 *)ptr + 1)); + else + return be32_to_cpu(*(__be32 *)ptr); +} + +/* Log Clients */ +#define XFS_TRANSACTION 0x69 +#define XFS_VOLUME 0x2 +#define XFS_LOG 0xaa + +#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ + +/* Region types for iovec's i_type */ +#define XLOG_REG_TYPE_BFORMAT 1 +#define XLOG_REG_TYPE_BCHUNK 2 +#define XLOG_REG_TYPE_EFI_FORMAT 3 +#define XLOG_REG_TYPE_EFD_FORMAT 4 +#define XLOG_REG_TYPE_IFORMAT 5 +#define XLOG_REG_TYPE_ICORE 6 +#define XLOG_REG_TYPE_IEXT 7 +#define XLOG_REG_TYPE_IBROOT 8 +#define XLOG_REG_TYPE_ILOCAL 9 +#define XLOG_REG_TYPE_IATTR_EXT 10 +#define XLOG_REG_TYPE_IATTR_BROOT 11 +#define XLOG_REG_TYPE_IATTR_LOCAL 12 +#define XLOG_REG_TYPE_QFORMAT 13 +#define XLOG_REG_TYPE_DQUOT 14 +#define XLOG_REG_TYPE_QUOTAOFF 15 +#define XLOG_REG_TYPE_LRHEADER 16 +#define XLOG_REG_TYPE_UNMOUNT 17 +#define XLOG_REG_TYPE_COMMIT 18 +#define XLOG_REG_TYPE_TRANSHDR 19 +#define XLOG_REG_TYPE_ICREATE 20 +#define XLOG_REG_TYPE_MAX 20 + +/* + * Flags to log operation header + * + * The first write of a new transaction will be preceded with a start + * record, XLOG_START_TRANS. Once a transaction is committed, a commit + * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into + * the remainder of the current active in-core log, it is split up into + * multiple regions. Each partial region will be marked with a + * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS. + * + */ +#define XLOG_START_TRANS 0x01 /* Start a new transaction */ +#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */ +#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */ +#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */ +#define XLOG_END_TRANS 0x10 /* End a continued transaction */ +#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ + + +typedef struct xlog_op_header { + __be32 oh_tid; /* transaction id of operation : 4 b */ + __be32 oh_len; /* bytes in data region : 4 b */ + __u8 oh_clientid; /* who sent me this : 1 b */ + __u8 oh_flags; /* : 1 b */ + __u16 oh_res2; /* 32 bit align : 2 b */ +} xlog_op_header_t; + +/* valid values for h_fmt */ +#define XLOG_FMT_UNKNOWN 0 +#define XLOG_FMT_LINUX_LE 1 +#define XLOG_FMT_LINUX_BE 2 +#define XLOG_FMT_IRIX_BE 3 + +/* our fmt */ +#ifdef XFS_NATIVE_HOST +#define XLOG_FMT XLOG_FMT_LINUX_BE +#else +#define XLOG_FMT XLOG_FMT_LINUX_LE +#endif + +typedef struct xlog_rec_header { + __be32 h_magicno; /* log record (LR) identifier : 4 */ + __be32 h_cycle; /* write cycle of log : 4 */ + __be32 h_version; /* LR version : 4 */ + __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ + __be64 h_lsn; /* lsn of this LR : 8 */ + __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ + __le32 h_crc; /* crc of log record : 4 */ + __be32 h_prev_block; /* block number to previous LR : 4 */ + __be32 h_num_logops; /* number of log operations in this LR : 4 */ + __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; + /* new fields */ + __be32 h_fmt; /* format of log record : 4 */ + uuid_t h_fs_uuid; /* uuid of FS : 16 */ + __be32 h_size; /* iclog size : 4 */ +} xlog_rec_header_t; + +typedef struct xlog_rec_ext_header { + __be32 xh_cycle; /* write cycle of log : 4 */ + __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ +} xlog_rec_ext_header_t; + +/* + * Quite misnamed, because this union lays out the actual on-disk log buffer. + */ +typedef union xlog_in_core2 { + xlog_rec_header_t hic_header; + xlog_rec_ext_header_t hic_xheader; + char hic_sector[XLOG_HEADER_SIZE]; +} xlog_in_core_2_t; + +/* not an on-disk structure, but needed by log recovery in userspace */ +typedef struct xfs_log_iovec { + void *i_addr; /* beginning address of region */ + int i_len; /* length in bytes of region */ + uint i_type; /* type of region */ +} xfs_log_iovec_t; + + +/* + * Transaction Header definitions. + * + * This is the structure written in the log at the head of every transaction. It + * identifies the type and id of the transaction, and contains the number of + * items logged by the transaction so we know how many to expect during + * recovery. + * + * Do not change the below structure without redoing the code in + * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans(). + */ +typedef struct xfs_trans_header { + uint th_magic; /* magic number */ + uint th_type; /* transaction type */ + __int32_t th_tid; /* transaction id (unused) */ + uint th_num_items; /* num items logged by trans */ +} xfs_trans_header_t; + +#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ + +/* + * Log item types. + */ +#define XFS_LI_EFI 0x1236 +#define XFS_LI_EFD 0x1237 +#define XFS_LI_IUNLINK 0x1238 +#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */ +#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */ +#define XFS_LI_DQUOT 0x123d +#define XFS_LI_QUOTAOFF 0x123e +#define XFS_LI_ICREATE 0x123f + +#define XFS_LI_TYPE_DESC \ + { XFS_LI_EFI, "XFS_LI_EFI" }, \ + { XFS_LI_EFD, "XFS_LI_EFD" }, \ + { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \ + { XFS_LI_INODE, "XFS_LI_INODE" }, \ + { XFS_LI_BUF, "XFS_LI_BUF" }, \ + { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \ + { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \ + { XFS_LI_ICREATE, "XFS_LI_ICREATE" } + +/* + * Inode Log Item Format definitions. + * + * This is the structure used to lay out an inode log item in the + * log. The size of the inline data/extents/b-tree root to be logged + * (if any) is indicated in the ilf_dsize field. Changes to this structure + * must be added on to the end. + */ +typedef struct xfs_inode_log_format { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} xfs_inode_log_format_t; + +typedef struct xfs_inode_log_format_32 { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} __attribute__((packed)) xfs_inode_log_format_32_t; + +typedef struct xfs_inode_log_format_64 { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint32_t ilf_pad; /* pad for 64 bit boundary */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} xfs_inode_log_format_64_t; + +/* + * Flags for xfs_trans_log_inode flags field. + */ +#define XFS_ILOG_CORE 0x001 /* log standard inode fields */ +#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */ +#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */ +#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */ +#define XFS_ILOG_DEV 0x010 /* log the dev field */ +#define XFS_ILOG_UUID 0x020 /* log the uuid field */ +#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ +#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ +#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ +#define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */ +#define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */ + + +/* + * The timestamps are dirty, but not necessarily anything else in the inode + * core. Unlike the other fields above this one must never make it to disk + * in the ilf_fields of the inode_log_format, but is purely store in-memory in + * ili_fields in the inode_log_item. + */ +#define XFS_ILOG_TIMESTAMP 0x4000 + +#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ + XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ + XFS_ILOG_UUID | XFS_ILOG_ADATA | \ + XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \ + XFS_ILOG_DOWNER | XFS_ILOG_AOWNER) + +#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ + XFS_ILOG_DBROOT) + +#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ + XFS_ILOG_ABROOT) + +#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \ + XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ + XFS_ILOG_DEV | XFS_ILOG_UUID | \ + XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ + XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \ + XFS_ILOG_DOWNER | XFS_ILOG_AOWNER) + +static inline int xfs_ilog_fbroot(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); +} + +static inline int xfs_ilog_fext(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); +} + +static inline int xfs_ilog_fdata(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA); +} + +/* + * Incore version of the on-disk inode core structures. We log this directly + * into the journal in host CPU format (for better or worse) and as such + * directly mirrors the xfs_dinode structure as it must contain all the same + * information. + */ +typedef struct xfs_ictimestamp { + __int32_t t_sec; /* timestamp seconds */ + __int32_t t_nsec; /* timestamp nanoseconds */ +} xfs_ictimestamp_t; + +/* + * NOTE: This structure must be kept identical to struct xfs_dinode + * in xfs_dinode.h except for the endianness annotations. + */ +typedef struct xfs_icdinode { + __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + __uint16_t di_mode; /* mode and type of file */ + __int8_t di_version; /* inode version */ + __int8_t di_format; /* format of di_c data */ + __uint16_t di_onlink; /* old number of links to file */ + __uint32_t di_uid; /* owner's user id */ + __uint32_t di_gid; /* owner's group id */ + __uint32_t di_nlink; /* number of links to file */ + __uint16_t di_projid_lo; /* lower part of owner's project id */ + __uint16_t di_projid_hi; /* higher part of owner's project id */ + __uint8_t di_pad[6]; /* unused, zeroed space */ + __uint16_t di_flushiter; /* incremented on flush */ + xfs_ictimestamp_t di_atime; /* time last accessed */ + xfs_ictimestamp_t di_mtime; /* time last modified */ + xfs_ictimestamp_t di_ctime; /* time created/inode modified */ + xfs_fsize_t di_size; /* number of bytes in file */ + xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */ + xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ + xfs_extnum_t di_nextents; /* number of extents in data fork */ + xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ + __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ + __int8_t di_aformat; /* format of attr fork's data */ + __uint32_t di_dmevmask; /* DMIG event mask */ + __uint16_t di_dmstate; /* DMIG state info */ + __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ + __uint32_t di_gen; /* generation number */ + + /* di_next_unlinked is the only non-core field in the old dinode */ + xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */ + + /* start of the extended dinode, writable fields */ + __uint32_t di_crc; /* CRC of the inode */ + __uint64_t di_changecount; /* number of attribute changes */ + xfs_lsn_t di_lsn; /* flush sequence */ + __uint64_t di_flags2; /* more random flags */ + __uint8_t di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_ictimestamp_t di_crtime; /* time created */ + xfs_ino_t di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ +} xfs_icdinode_t; + +static inline uint xfs_icdinode_size(int version) +{ + if (version == 3) + return sizeof(struct xfs_icdinode); + return offsetof(struct xfs_icdinode, di_next_unlinked); +} + +/* + * Buffer Log Format defintions + * + * These are the physical dirty bitmap defintions for the log format structure. + */ +#define XFS_BLF_CHUNK 128 +#define XFS_BLF_SHIFT 7 +#define BIT_TO_WORD_SHIFT 5 +#define NBWORD (NBBY * sizeof(unsigned int)) + +/* + * This flag indicates that the buffer contains on disk inodes + * and requires special recovery handling. + */ +#define XFS_BLF_INODE_BUF (1<<0) + +/* + * This flag indicates that the buffer should not be replayed + * during recovery because its blocks are being freed. + */ +#define XFS_BLF_CANCEL (1<<1) + +/* + * This flag indicates that the buffer contains on disk + * user or group dquots and may require special recovery handling. + */ +#define XFS_BLF_UDQUOT_BUF (1<<2) +#define XFS_BLF_PDQUOT_BUF (1<<3) +#define XFS_BLF_GDQUOT_BUF (1<<4) + +/* + * This is the structure used to lay out a buf log item in the + * log. The data map describes which 128 byte chunks of the buffer + * have been logged. + */ +#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) + +typedef struct xfs_buf_log_format { + unsigned short blf_type; /* buf log item type indicator */ + unsigned short blf_size; /* size of this item */ + ushort blf_flags; /* misc state */ + ushort blf_len; /* number of blocks in this buf */ + __int64_t blf_blkno; /* starting blkno of this buf */ + unsigned int blf_map_size; /* used size of data bitmap in words */ + unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ +} xfs_buf_log_format_t; + +/* + * All buffers now need to tell recovery where the magic number + * is so that it can verify and calculate the CRCs on the buffer correctly + * once the changes have been replayed into the buffer. + * + * The type value is held in the upper 5 bits of the blf_flags field, which is + * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down. + */ +#define XFS_BLFT_BITS 5 +#define XFS_BLFT_SHIFT 11 +#define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT) + +enum xfs_blft { + XFS_BLFT_UNKNOWN_BUF = 0, + XFS_BLFT_UDQUOT_BUF, + XFS_BLFT_PDQUOT_BUF, + XFS_BLFT_GDQUOT_BUF, + XFS_BLFT_BTREE_BUF, + XFS_BLFT_AGF_BUF, + XFS_BLFT_AGFL_BUF, + XFS_BLFT_AGI_BUF, + XFS_BLFT_DINO_BUF, + XFS_BLFT_SYMLINK_BUF, + XFS_BLFT_DIR_BLOCK_BUF, + XFS_BLFT_DIR_DATA_BUF, + XFS_BLFT_DIR_FREE_BUF, + XFS_BLFT_DIR_LEAF1_BUF, + XFS_BLFT_DIR_LEAFN_BUF, + XFS_BLFT_DA_NODE_BUF, + XFS_BLFT_ATTR_LEAF_BUF, + XFS_BLFT_ATTR_RMT_BUF, + XFS_BLFT_SB_BUF, + XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS), +}; + +static inline void +xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type) +{ + ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF); + blf->blf_flags &= ~XFS_BLFT_MASK; + blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK); +} + +static inline __uint16_t +xfs_blft_from_flags(struct xfs_buf_log_format *blf) +{ + return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT; +} + +/* + * EFI/EFD log format definitions + */ +typedef struct xfs_extent { + xfs_dfsbno_t ext_start; + xfs_extlen_t ext_len; +} xfs_extent_t; + +/* + * Since an xfs_extent_t has types (start:64, len: 32) + * there are different alignments on 32 bit and 64 bit kernels. + * So we provide the different variants for use by a + * conversion routine. + */ +typedef struct xfs_extent_32 { + __uint64_t ext_start; + __uint32_t ext_len; +} __attribute__((packed)) xfs_extent_32_t; + +typedef struct xfs_extent_64 { + __uint64_t ext_start; + __uint32_t ext_len; + __uint32_t ext_pad; +} xfs_extent_64_t; + +/* + * This is the structure used to lay out an efi log item in the + * log. The efi_extents field is a variable size array whose + * size is given by efi_nextents. + */ +typedef struct xfs_efi_log_format { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_t efi_extents[1]; /* array of extents to free */ +} xfs_efi_log_format_t; + +typedef struct xfs_efi_log_format_32 { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_32_t efi_extents[1]; /* array of extents to free */ +} __attribute__((packed)) xfs_efi_log_format_32_t; + +typedef struct xfs_efi_log_format_64 { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_64_t efi_extents[1]; /* array of extents to free */ +} xfs_efi_log_format_64_t; + +/* + * This is the structure used to lay out an efd log item in the + * log. The efd_extents array is a variable size array whose + * size is given by efd_nextents; + */ +typedef struct xfs_efd_log_format { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_t efd_extents[1]; /* array of extents freed */ +} xfs_efd_log_format_t; + +typedef struct xfs_efd_log_format_32 { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_32_t efd_extents[1]; /* array of extents freed */ +} __attribute__((packed)) xfs_efd_log_format_32_t; + +typedef struct xfs_efd_log_format_64 { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_64_t efd_extents[1]; /* array of extents freed */ +} xfs_efd_log_format_64_t; + +/* + * Dquot Log format definitions. + * + * The first two fields must be the type and size fitting into + * 32 bits : log_recovery code assumes that. + */ +typedef struct xfs_dq_logformat { + __uint16_t qlf_type; /* dquot log item type */ + __uint16_t qlf_size; /* size of this item */ + xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ + __int64_t qlf_blkno; /* blkno of dquot buffer */ + __int32_t qlf_len; /* len of dquot buffer */ + __uint32_t qlf_boffset; /* off of dquot in buffer */ +} xfs_dq_logformat_t; + +/* + * log format struct for QUOTAOFF records. + * The first two fields must be the type and size fitting into + * 32 bits : log_recovery code assumes that. + * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer + * to the first and ensures that the first logitem is taken out of the AIL + * only when the last one is securely committed. + */ +typedef struct xfs_qoff_logformat { + unsigned short qf_type; /* quotaoff log item type */ + unsigned short qf_size; /* size of this item */ + unsigned int qf_flags; /* USR and/or GRP */ + char qf_pad[12]; /* padding for future */ +} xfs_qoff_logformat_t; + +/* + * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. + */ +#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */ +#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */ +#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */ +#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */ +#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */ +#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */ +#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ + +/* + * Conversion to and from the combined OQUOTA flag (if necessary) + * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk() + */ +#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */ +#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */ +#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */ +#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */ + +#define XFS_ALL_QUOTA_ACCT \ + (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) +#define XFS_ALL_QUOTA_ENFD \ + (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD) +#define XFS_ALL_QUOTA_CHKD \ + (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD) + +#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ + XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ + XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\ + XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\ + XFS_PQUOTA_CHKD) + +/* + * Inode create log item structure + * + * Log recovery assumes the first two entries are the type and size and they fit + * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so + * decoding can be done correctly. + */ +struct xfs_icreate_log { + __uint16_t icl_type; /* type of log format structure */ + __uint16_t icl_size; /* size of log format structure */ + __be32 icl_ag; /* ag being allocated in */ + __be32 icl_agbno; /* start block of inode range */ + __be32 icl_count; /* number of inodes to initialise */ + __be32 icl_isize; /* size of inodes */ + __be32 icl_length; /* length of extent to initialise */ + __be32 icl_gen; /* inode generation number to use */ +}; + +#endif /* __XFS_LOG_FORMAT_H__ */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 16d8d12ea3b..9bc403a9e54 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -22,53 +22,16 @@ struct xfs_buf; struct xlog; struct xlog_ticket; struct xfs_mount; +struct xfs_log_callback; /* - * Macros, structures, prototypes for internal log manager use. + * Flags for log structure */ - -#define XLOG_MIN_ICLOGS 2 -#define XLOG_MAX_ICLOGS 8 -#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */ -#define XLOG_VERSION_1 1 -#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */ -#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2) -#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */ -#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */ -#define XLOG_MAX_RECORD_BSIZE (256*1024) -#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */ -#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ -#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ -#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ -#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \ - (log)->l_mp->m_sb.sb_logsunit) -#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit) - -#define XLOG_HEADER_SIZE 512 - -#define XLOG_REC_SHIFT(log) \ - BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ - XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) -#define XLOG_TOTAL_REC_SHIFT(log) \ - BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ - XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) - -static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) -{ - return ((xfs_lsn_t)cycle << 32) | block; -} - -static inline uint xlog_get_cycle(char *ptr) -{ - if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) - return be32_to_cpu(*((__be32 *)ptr + 1)); - else - return be32_to_cpu(*(__be32 *)ptr); -} - -#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) - -#ifdef __KERNEL__ +#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ +#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ +#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being + shutdown */ +#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ /* * get client id from packed copy. @@ -101,28 +64,8 @@ static inline uint xlog_get_client_id(__be32 i) #define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ #define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ #define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ -#endif /* __KERNEL__ */ /* - * Flags to log operation header - * - * The first write of a new transaction will be preceded with a start - * record, XLOG_START_TRANS. Once a transaction is committed, a commit - * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into - * the remainder of the current active in-core log, it is split up into - * multiple regions. Each partial region will be marked with a - * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS. - * - */ -#define XLOG_START_TRANS 0x01 /* Start a new transaction */ -#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */ -#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */ -#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */ -#define XLOG_END_TRANS 0x10 /* End a continued transaction */ -#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ - -#ifdef __KERNEL__ -/* * Flags to log ticket */ #define XLOG_TIC_INITED 0x1 /* has been initialized */ @@ -132,22 +75,6 @@ static inline uint xlog_get_client_id(__be32 i) { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } -#endif /* __KERNEL__ */ - -#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ - -/* - * Flags for log structure - */ -#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ -#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ -#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being - shutdown */ -#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ - -typedef __uint32_t xlog_tid_t; - -#ifdef __KERNEL__ /* * Below are states for covering allocation transactions. * By covering, we mean changing the h_tail_lsn in the last on-disk @@ -223,7 +150,6 @@ typedef __uint32_t xlog_tid_t; #define XLOG_COVER_OPS 5 - /* Ticket reservation region accounting */ #define XLOG_TIC_LEN_MAX 15 @@ -258,64 +184,6 @@ typedef struct xlog_ticket { xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ } xlog_ticket_t; -#endif - - -typedef struct xlog_op_header { - __be32 oh_tid; /* transaction id of operation : 4 b */ - __be32 oh_len; /* bytes in data region : 4 b */ - __u8 oh_clientid; /* who sent me this : 1 b */ - __u8 oh_flags; /* : 1 b */ - __u16 oh_res2; /* 32 bit align : 2 b */ -} xlog_op_header_t; - - -/* valid values for h_fmt */ -#define XLOG_FMT_UNKNOWN 0 -#define XLOG_FMT_LINUX_LE 1 -#define XLOG_FMT_LINUX_BE 2 -#define XLOG_FMT_IRIX_BE 3 - -/* our fmt */ -#ifdef XFS_NATIVE_HOST -#define XLOG_FMT XLOG_FMT_LINUX_BE -#else -#define XLOG_FMT XLOG_FMT_LINUX_LE -#endif - -typedef struct xlog_rec_header { - __be32 h_magicno; /* log record (LR) identifier : 4 */ - __be32 h_cycle; /* write cycle of log : 4 */ - __be32 h_version; /* LR version : 4 */ - __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ - __be64 h_lsn; /* lsn of this LR : 8 */ - __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ - __le32 h_crc; /* crc of log record : 4 */ - __be32 h_prev_block; /* block number to previous LR : 4 */ - __be32 h_num_logops; /* number of log operations in this LR : 4 */ - __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; - /* new fields */ - __be32 h_fmt; /* format of log record : 4 */ - uuid_t h_fs_uuid; /* uuid of FS : 16 */ - __be32 h_size; /* iclog size : 4 */ -} xlog_rec_header_t; - -typedef struct xlog_rec_ext_header { - __be32 xh_cycle; /* write cycle of log : 4 */ - __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ -} xlog_rec_ext_header_t; - -#ifdef __KERNEL__ - -/* - * Quite misnamed, because this union lays out the actual on-disk log buffer. - */ -typedef union xlog_in_core2 { - xlog_rec_header_t hic_header; - xlog_rec_ext_header_t hic_xheader; - char hic_sector[XLOG_HEADER_SIZE]; -} xlog_in_core_2_t; - /* * - A log record header is 512 bytes. There is plenty of room to grow the * xlog_rec_header_t into the reserved space. @@ -360,8 +228,8 @@ typedef struct xlog_in_core { /* Callback structures need their own cacheline */ spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; - xfs_log_callback_t *ic_callback; - xfs_log_callback_t **ic_callback_tail; + struct xfs_log_callback *ic_callback; + struct xfs_log_callback **ic_callback_tail; /* reference counts need their own cacheline */ atomic_t ic_refcnt ____cacheline_aligned_in_smp; @@ -387,7 +255,7 @@ struct xfs_cil_ctx { int space_used; /* aggregate size of regions */ struct list_head busy_extents; /* busy extents in chkpt */ struct xfs_log_vec *lv_chain; /* logvecs being pushed */ - xfs_log_callback_t log_cb; /* completion callback hook. */ + struct xfs_log_callback log_cb; /* completion callback hook. */ struct list_head committing; /* ctx committing list */ }; @@ -411,14 +279,17 @@ struct xfs_cil { struct xlog *xc_log; struct list_head xc_cil; spinlock_t xc_cil_lock; + + struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp; struct xfs_cil_ctx *xc_ctx; - struct rw_semaphore xc_ctx_lock; + + spinlock_t xc_push_lock ____cacheline_aligned_in_smp; + xfs_lsn_t xc_push_seq; struct list_head xc_committing; wait_queue_head_t xc_commit_wait; xfs_lsn_t xc_current_sequence; struct work_struct xc_push_work; - xfs_lsn_t xc_push_seq; -}; +} ____cacheline_aligned_in_smp; /* * The amount of log space we allow the CIL to aggregate is difficult to size. @@ -468,7 +339,6 @@ struct xfs_cil { * threshold, yet give us plenty of space for aggregation on large logs. */ #define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) -#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4)) /* * ticket grant locks, queues and accounting have their own cachlines @@ -645,12 +515,10 @@ xlog_assign_grant_head(atomic64_t *head, int cycle, int space) /* * Committed Item List interfaces */ -int -xlog_cil_init(struct xlog *log); -void -xlog_cil_init_post_recovery(struct xlog *log); -void -xlog_cil_destroy(struct xlog *log); +int xlog_cil_init(struct xlog *log); +void xlog_cil_init_post_recovery(struct xlog *log); +void xlog_cil_destroy(struct xlog *log); +bool xlog_cil_empty(struct xlog *log); /* * CIL force routines @@ -687,6 +555,5 @@ static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) schedule(); remove_wait_queue(wq, &wait); } -#endif /* __KERNEL__ */ #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 96fcbb85ff8..981af0f6504 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -17,33 +17,36 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_error.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" #include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_alloc.h" -#include "xfs_ialloc.h" +#include "xfs_trans.h" +#include "xfs_log.h" #include "xfs_log_priv.h" -#include "xfs_buf_item.h" #include "xfs_log_recover.h" +#include "xfs_inode_item.h" #include "xfs_extfree_item.h" #include "xfs_trans_priv.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" #include "xfs_quota.h" -#include "xfs_utils.h" #include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_error.h" +#include "xfs_dir2.h" + +#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) STATIC int xlog_find_zeroed( @@ -190,7 +193,10 @@ xlog_bread_noalign( bp->b_io_length = nbblks; bp->b_error = 0; - xfsbdstrat(log->l_mp, bp); + if (XFS_FORCED_SHUTDOWN(log->l_mp)) + return XFS_ERROR(EIO); + + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) xfs_buf_ioerror_alert(bp, __func__); @@ -294,9 +300,9 @@ xlog_header_check_dump( xfs_mount_t *mp, xlog_rec_header_t *head) { - xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n", + xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d", __func__, &mp->m_sb.sb_uuid, XLOG_FMT); - xfs_debug(mp, " log : uuid = %pU, fmt = %d\n", + xfs_debug(mp, " log : uuid = %pU, fmt = %d", &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); } #else @@ -597,7 +603,7 @@ out: /* * Head is defined to be the point of the log where the next log write - * write could go. This means that incomplete LR writes at the end are + * could go. This means that incomplete LR writes at the end are * eliminated when calculating the head. We aren't guaranteed that previous * LR have complete transactions. We only know that a cycle number of * current cycle number -1 won't be present in the log if we start writing @@ -953,6 +959,7 @@ xlog_find_tail( } if (!found) { xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); + xlog_put_bp(bp); ASSERT(0); return XFS_ERROR(EIO); } @@ -1134,7 +1141,8 @@ xlog_find_zeroed( */ xfs_warn(log->l_mp, "Log inconsistent or not a log (last==0, first!=1)"); - return XFS_ERROR(EINVAL); + error = XFS_ERROR(EINVAL); + goto bp_err; } /* we have a partially zeroed log */ @@ -1442,9 +1450,8 @@ xlog_recover_find_tid( xlog_tid_t tid) { xlog_recover_t *trans; - struct hlist_node *n; - hlist_for_each_entry(trans, n, head, r_list) { + hlist_for_each_entry(trans, head, r_list) { if (trans->r_log_tid == tid) return trans; } @@ -1573,6 +1580,7 @@ xlog_recover_add_to_trans( "bad number of regions (%d) in inode log format", in_f->ilf_size); ASSERT(0); + kmem_free(ptr); return XFS_ERROR(EIO); } @@ -1591,10 +1599,53 @@ xlog_recover_add_to_trans( } /* - * Sort the log items in the transaction. Cancelled buffers need - * to be put first so they are processed before any items that might - * modify the buffers. If they are cancelled, then the modifications - * don't need to be replayed. + * Sort the log items in the transaction. + * + * The ordering constraints are defined by the inode allocation and unlink + * behaviour. The rules are: + * + * 1. Every item is only logged once in a given transaction. Hence it + * represents the last logged state of the item. Hence ordering is + * dependent on the order in which operations need to be performed so + * required initial conditions are always met. + * + * 2. Cancelled buffers are recorded in pass 1 in a separate table and + * there's nothing to replay from them so we can simply cull them + * from the transaction. However, we can't do that until after we've + * replayed all the other items because they may be dependent on the + * cancelled buffer and replaying the cancelled buffer can remove it + * form the cancelled buffer table. Hence they have tobe done last. + * + * 3. Inode allocation buffers must be replayed before inode items that + * read the buffer and replay changes into it. For filesystems using the + * ICREATE transactions, this means XFS_LI_ICREATE objects need to get + * treated the same as inode allocation buffers as they create and + * initialise the buffers directly. + * + * 4. Inode unlink buffers must be replayed after inode items are replayed. + * This ensures that inodes are completely flushed to the inode buffer + * in a "free" state before we remove the unlinked inode list pointer. + * + * Hence the ordering needs to be inode allocation buffers first, inode items + * second, inode unlink buffers third and cancelled buffers last. + * + * But there's a problem with that - we can't tell an inode allocation buffer + * apart from a regular buffer, so we can't separate them. We can, however, + * tell an inode unlink buffer from the others, and so we can separate them out + * from all the other buffers and move them to last. + * + * Hence, 4 lists, in order from head to tail: + * - buffer_list for all buffers except cancelled/inode unlink buffers + * - item_list for all non-buffer items + * - inode_buffer_list for inode unlink buffers + * - cancel_list for the cancelled buffers + * + * Note that we add objects to the tail of the lists so that first-to-last + * ordering is preserved within the lists. Adding objects to the head of the + * list means when we traverse from the head we walk them in last-to-first + * order. For cancelled buffers and inode unlink buffers this doesn't matter, + * but for all other items there may be specific ordering that we need to + * preserve. */ STATIC int xlog_recover_reorder_trans( @@ -1603,20 +1654,34 @@ xlog_recover_reorder_trans( int pass) { xlog_recover_item_t *item, *n; + int error = 0; LIST_HEAD(sort_list); + LIST_HEAD(cancel_list); + LIST_HEAD(buffer_list); + LIST_HEAD(inode_buffer_list); + LIST_HEAD(inode_list); list_splice_init(&trans->r_itemq, &sort_list); list_for_each_entry_safe(item, n, &sort_list, ri_list) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; switch (ITEM_TYPE(item)) { + case XFS_LI_ICREATE: + list_move_tail(&item->ri_list, &buffer_list); + break; case XFS_LI_BUF: - if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + if (buf_f->blf_flags & XFS_BLF_CANCEL) { trace_xfs_log_recover_item_reorder_head(log, trans, item, pass); - list_move(&item->ri_list, &trans->r_itemq); + list_move(&item->ri_list, &cancel_list); break; } + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + list_move(&item->ri_list, &inode_buffer_list); + break; + } + list_move_tail(&item->ri_list, &buffer_list); + break; case XFS_LI_INODE: case XFS_LI_DQUOT: case XFS_LI_QUOTAOFF: @@ -1624,18 +1689,34 @@ xlog_recover_reorder_trans( case XFS_LI_EFI: trace_xfs_log_recover_item_reorder_tail(log, trans, item, pass); - list_move_tail(&item->ri_list, &trans->r_itemq); + list_move_tail(&item->ri_list, &inode_list); break; default: xfs_warn(log->l_mp, "%s: unrecognized type of log operation", __func__); ASSERT(0); - return XFS_ERROR(EIO); + /* + * return the remaining items back to the transaction + * item list so they can be freed in caller. + */ + if (!list_empty(&sort_list)) + list_splice_init(&sort_list, &trans->r_itemq); + error = XFS_ERROR(EIO); + goto out; } } +out: ASSERT(list_empty(&sort_list)); - return 0; + if (!list_empty(&buffer_list)) + list_splice(&buffer_list, &trans->r_itemq); + if (!list_empty(&inode_list)) + list_splice_tail(&inode_list, &trans->r_itemq); + if (!list_empty(&inode_buffer_list)) + list_splice_tail(&inode_buffer_list, &trans->r_itemq); + if (!list_empty(&cancel_list)) + list_splice_tail(&cancel_list, &trans->r_itemq); + return error; } /* @@ -1693,19 +1774,11 @@ xlog_recover_buffer_pass1( /* * Check to see whether the buffer being recovered has a corresponding - * entry in the buffer cancel record table. If it does then return 1 - * so that it will be cancelled, otherwise return 0. If the buffer is - * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement - * the refcount on the entry in the table and remove it from the table - * if this is the last reference. - * - * We remove the cancel record from the table when we encounter its - * last occurrence in the log so that if the same buffer is re-used - * again after its last cancellation we actually replay the changes - * made at that point. + * entry in the buffer cancel record table. If it is, return the cancel + * buffer structure to the caller. */ -STATIC int -xlog_check_buffer_cancelled( +STATIC struct xfs_buf_cancel * +xlog_peek_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len, @@ -1714,22 +1787,16 @@ xlog_check_buffer_cancelled( struct list_head *bucket; struct xfs_buf_cancel *bcp; - if (log->l_buf_cancel_table == NULL) { - /* - * There is nothing in the table built in pass one, - * so this buffer must not be cancelled. - */ + if (!log->l_buf_cancel_table) { + /* empty table means no cancelled buffers in the log */ ASSERT(!(flags & XFS_BLF_CANCEL)); - return 0; + return NULL; } - /* - * Search for an entry in the cancel table that matches our buffer. - */ bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); list_for_each_entry(bcp, bucket, bc_list) { if (bcp->bc_blkno == blkno && bcp->bc_len == len) - goto found; + return bcp; } /* @@ -1737,9 +1804,32 @@ xlog_check_buffer_cancelled( * that the buffer is NOT cancelled. */ ASSERT(!(flags & XFS_BLF_CANCEL)); - return 0; + return NULL; +} + +/* + * If the buffer is being cancelled then return 1 so that it will be cancelled, + * otherwise return 0. If the buffer is actually a buffer cancel item + * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the + * table and remove it from the table if this is the last reference. + * + * We remove the cancel record from the table when we encounter its last + * occurrence in the log so that if the same buffer is re-used again after its + * last cancellation we actually replay the changes made at that point. + */ +STATIC int +xlog_check_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len, + ushort flags) +{ + struct xfs_buf_cancel *bcp; + + bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags); + if (!bcp) + return 0; -found: /* * We've go a match, so return 1 so that the recovery of this buffer * is cancelled. If this buffer is actually a buffer cancel log @@ -1787,6 +1877,13 @@ xlog_recover_do_inode_buffer( trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); + /* + * Post recovery validation only works properly on CRC enabled + * filesystems. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + bp->b_ops = &xfs_inode_buf_ops; + inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + @@ -1852,12 +1949,361 @@ xlog_recover_do_inode_buffer( buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, next_unlinked_offset); *buffer_nextp = *logged_nextp; + + /* + * If necessary, recalculate the CRC in the on-disk inode. We + * have to leave the inode in a consistent state for whoever + * reads it next.... + */ + xfs_dinode_calc_crc(mp, (struct xfs_dinode *) + xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); + } return 0; } /* + * V5 filesystems know the age of the buffer on disk being recovered. We can + * have newer objects on disk than we are replaying, and so for these cases we + * don't want to replay the current change as that will make the buffer contents + * temporarily invalid on disk. + * + * The magic number might not match the buffer type we are going to recover + * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence + * extract the LSN of the existing object in the buffer based on it's current + * magic number. If we don't recognise the magic number in the buffer, then + * return a LSN of -1 so that the caller knows it was an unrecognised block and + * so can recover the buffer. + * + * Note: we cannot rely solely on magic number matches to determine that the + * buffer has a valid LSN - we also need to verify that it belongs to this + * filesystem, so we need to extract the object's LSN and compare it to that + * which we read from the superblock. If the UUIDs don't match, then we've got a + * stale metadata block from an old filesystem instance that we need to recover + * over the top of. + */ +static xfs_lsn_t +xlog_recover_get_buf_lsn( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + __uint32_t magic32; + __uint16_t magic16; + __uint16_t magicda; + void *blk = bp->b_addr; + uuid_t *uuid; + xfs_lsn_t lsn = -1; + + /* v4 filesystems always recover immediately */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + goto recover_immediately; + + magic32 = be32_to_cpu(*(__be32 *)blk); + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); + uuid = &btb->bb_u.s.bb_uuid; + break; + } + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); + uuid = &btb->bb_u.l.bb_uuid; + break; + } + case XFS_AGF_MAGIC: + lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); + uuid = &((struct xfs_agf *)blk)->agf_uuid; + break; + case XFS_AGFL_MAGIC: + lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); + uuid = &((struct xfs_agfl *)blk)->agfl_uuid; + break; + case XFS_AGI_MAGIC: + lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); + uuid = &((struct xfs_agi *)blk)->agi_uuid; + break; + case XFS_SYMLINK_MAGIC: + lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); + uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; + break; + case XFS_DIR3_BLOCK_MAGIC: + case XFS_DIR3_DATA_MAGIC: + case XFS_DIR3_FREE_MAGIC: + lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); + uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; + break; + case XFS_ATTR3_RMT_MAGIC: + lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn); + uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid; + break; + case XFS_SB_MAGIC: + lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); + uuid = &((struct xfs_dsb *)blk)->sb_uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); + switch (magicda) { + case XFS_DIR3_LEAF1_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + case XFS_DA3_NODE_MAGIC: + lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); + uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + /* + * We do individual object checks on dquot and inode buffers as they + * have their own individual LSN records. Also, we could have a stale + * buffer here, so we have to at least recognise these buffer types. + * + * A notd complexity here is inode unlinked list processing - it logs + * the inode directly in the buffer, but we don't know which inodes have + * been modified, and there is no global buffer LSN. Hence we need to + * recover all inode buffer types immediately. This problem will be + * fixed by logical logging of the unlinked list modifications. + */ + magic16 = be16_to_cpu(*(__be16 *)blk); + switch (magic16) { + case XFS_DQUOT_MAGIC: + case XFS_DINODE_MAGIC: + goto recover_immediately; + default: + break; + } + + /* unknown buffer contents, recover immediately */ + +recover_immediately: + return (xfs_lsn_t)-1; + +} + +/* + * Validate the recovered buffer is of the correct type and attach the + * appropriate buffer operations to them for writeback. Magic numbers are in a + * few places: + * the first 16 bits of the buffer (inode buffer, dquot buffer), + * the first 32 bits of the buffer (most blocks), + * inside a struct xfs_da_blkinfo at the start of the buffer. + */ +static void +xlog_recover_validate_buf_type( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_buf_log_format_t *buf_f) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + __uint32_t magic32; + __uint16_t magic16; + __uint16_t magicda; + + magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); + magic16 = be16_to_cpu(*(__be16*)bp->b_addr); + magicda = be16_to_cpu(info->magic); + switch (xfs_blft_from_flags(buf_f)) { + case XFS_BLFT_BTREE_BUF: + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + bp->b_ops = &xfs_allocbt_buf_ops; + break; + case XFS_IBT_CRC_MAGIC: + case XFS_FIBT_CRC_MAGIC: + case XFS_IBT_MAGIC: + case XFS_FIBT_MAGIC: + bp->b_ops = &xfs_inobt_buf_ops; + break; + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: + bp->b_ops = &xfs_bmbt_buf_ops; + break; + default: + xfs_warn(mp, "Bad btree block magic!"); + ASSERT(0); + break; + } + break; + case XFS_BLFT_AGF_BUF: + if (magic32 != XFS_AGF_MAGIC) { + xfs_warn(mp, "Bad AGF block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agf_buf_ops; + break; + case XFS_BLFT_AGFL_BUF: + if (!xfs_sb_version_hascrc(&mp->m_sb)) + break; + if (magic32 != XFS_AGFL_MAGIC) { + xfs_warn(mp, "Bad AGFL block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agfl_buf_ops; + break; + case XFS_BLFT_AGI_BUF: + if (magic32 != XFS_AGI_MAGIC) { + xfs_warn(mp, "Bad AGI block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agi_buf_ops; + break; + case XFS_BLFT_UDQUOT_BUF: + case XFS_BLFT_PDQUOT_BUF: + case XFS_BLFT_GDQUOT_BUF: +#ifdef CONFIG_XFS_QUOTA + if (magic16 != XFS_DQUOT_MAGIC) { + xfs_warn(mp, "Bad DQUOT block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dquot_buf_ops; +#else + xfs_alert(mp, + "Trying to recover dquots without QUOTA support built in!"); + ASSERT(0); +#endif + break; + case XFS_BLFT_DINO_BUF: + /* + * we get here with inode allocation buffers, not buffers that + * track unlinked list changes. + */ + if (magic16 != XFS_DINODE_MAGIC) { + xfs_warn(mp, "Bad INODE block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_inode_buf_ops; + break; + case XFS_BLFT_SYMLINK_BUF: + if (magic32 != XFS_SYMLINK_MAGIC) { + xfs_warn(mp, "Bad symlink block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_symlink_buf_ops; + break; + case XFS_BLFT_DIR_BLOCK_BUF: + if (magic32 != XFS_DIR2_BLOCK_MAGIC && + magic32 != XFS_DIR3_BLOCK_MAGIC) { + xfs_warn(mp, "Bad dir block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_block_buf_ops; + break; + case XFS_BLFT_DIR_DATA_BUF: + if (magic32 != XFS_DIR2_DATA_MAGIC && + magic32 != XFS_DIR3_DATA_MAGIC) { + xfs_warn(mp, "Bad dir data magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_data_buf_ops; + break; + case XFS_BLFT_DIR_FREE_BUF: + if (magic32 != XFS_DIR2_FREE_MAGIC && + magic32 != XFS_DIR3_FREE_MAGIC) { + xfs_warn(mp, "Bad dir3 free magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_free_buf_ops; + break; + case XFS_BLFT_DIR_LEAF1_BUF: + if (magicda != XFS_DIR2_LEAF1_MAGIC && + magicda != XFS_DIR3_LEAF1_MAGIC) { + xfs_warn(mp, "Bad dir leaf1 magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + break; + case XFS_BLFT_DIR_LEAFN_BUF: + if (magicda != XFS_DIR2_LEAFN_MAGIC && + magicda != XFS_DIR3_LEAFN_MAGIC) { + xfs_warn(mp, "Bad dir leafn magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leafn_buf_ops; + break; + case XFS_BLFT_DA_NODE_BUF: + if (magicda != XFS_DA_NODE_MAGIC && + magicda != XFS_DA3_NODE_MAGIC) { + xfs_warn(mp, "Bad da node magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_da3_node_buf_ops; + break; + case XFS_BLFT_ATTR_LEAF_BUF: + if (magicda != XFS_ATTR_LEAF_MAGIC && + magicda != XFS_ATTR3_LEAF_MAGIC) { + xfs_warn(mp, "Bad attr leaf magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_leaf_buf_ops; + break; + case XFS_BLFT_ATTR_RMT_BUF: + if (!xfs_sb_version_hascrc(&mp->m_sb)) + break; + if (magic32 != XFS_ATTR3_RMT_MAGIC) { + xfs_warn(mp, "Bad attr remote magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_rmt_buf_ops; + break; + case XFS_BLFT_SB_BUF: + if (magic32 != XFS_SB_MAGIC) { + xfs_warn(mp, "Bad SB block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_sb_buf_ops; + break; + default: + xfs_warn(mp, "Unknown buffer type %d!", + xfs_blft_from_flags(buf_f)); + break; + } +} + +/* * Perform a 'normal' buffer recovery. Each logged region of the * buffer should be copied over the corresponding region in the * given buffer. The bitmap in the buf log format structure indicates @@ -1893,6 +2339,17 @@ xlog_recover_do_reg_buffer( ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); /* + * The dirty regions logged in the buffer, even though + * contiguous, may span multiple chunks. This is because the + * dirty region may span a physical page boundary in a buffer + * and hence be split into two separate vectors for writing into + * the log. Hence we need to trim nbits back to the length of + * the current region being copied out of the log. + */ + if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; + + /* * Do a sanity check if this is a dquot buffer. Just checking * the first dquot in the buffer should do. XXXThis is * probably a good thing to do for other buf types also. @@ -1911,7 +2368,7 @@ xlog_recover_do_reg_buffer( item->ri_buf[i].i_len, __func__); goto next; } - error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr, + error = xfs_dqcheck(mp, item->ri_buf[i].i_addr, -1, 0, XFS_QMOPT_DOWARN, "dquot_buf_recover"); if (error) @@ -1929,132 +2386,22 @@ xlog_recover_do_reg_buffer( /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); -} - -/* - * Do some primitive error checking on ondisk dquot data structures. - */ -int -xfs_qm_dqcheck( - struct xfs_mount *mp, - xfs_disk_dquot_t *ddq, - xfs_dqid_t id, - uint type, /* used only when IO_dorepair is true */ - uint flags, - char *str) -{ - xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; - int errs = 0; - - /* - * We can encounter an uninitialized dquot buffer for 2 reasons: - * 1. If we crash while deleting the quotainode(s), and those blks got - * used for user data. This is because we take the path of regular - * file deletion; however, the size field of quotainodes is never - * updated, so all the tricks that we play in itruncate_finish - * don't quite matter. - * - * 2. We don't play the quota buffers when there's a quotaoff logitem. - * But the allocation will be replayed so we'll end up with an - * uninitialized quota block. - * - * This is all fine; things are still consistent, and we haven't lost - * any quota information. Just don't complain about bad dquot blks. - */ - if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", - str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); - errs++; - } - if (ddq->d_version != XFS_DQUOT_VERSION) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", - str, id, ddq->d_version, XFS_DQUOT_VERSION); - errs++; - } - - if (ddq->d_flags != XFS_DQ_USER && - ddq->d_flags != XFS_DQ_PROJ && - ddq->d_flags != XFS_DQ_GROUP) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : XFS dquot ID 0x%x, unknown flags 0x%x", - str, id, ddq->d_flags); - errs++; - } - - if (id != -1 && id != be32_to_cpu(ddq->d_id)) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : ondisk-dquot 0x%p, ID mismatch: " - "0x%x expected, found id 0x%x", - str, ddq, id, be32_to_cpu(ddq->d_id)); - errs++; - } - - if (!errs && ddq->d_id) { - if (ddq->d_blk_softlimit && - be64_to_cpu(ddq->d_bcount) > - be64_to_cpu(ddq->d_blk_softlimit)) { - if (!ddq->d_btimer) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - if (ddq->d_ino_softlimit && - be64_to_cpu(ddq->d_icount) > - be64_to_cpu(ddq->d_ino_softlimit)) { - if (!ddq->d_itimer) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - if (ddq->d_rtb_softlimit && - be64_to_cpu(ddq->d_rtbcount) > - be64_to_cpu(ddq->d_rtb_softlimit)) { - if (!ddq->d_rtbtimer) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - } - - if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) - return errs; - - if (flags & XFS_QMOPT_DOWARN) - xfs_notice(mp, "Re-initializing dquot ID 0x%x", id); /* - * Typically, a repair is only requested by quotacheck. + * We can only do post recovery validation on items on CRC enabled + * fielsystems as we need to know when the buffer was written to be able + * to determine if we should have replayed the item. If we replay old + * metadata over a newer buffer, then it will enter a temporarily + * inconsistent state resulting in verification failures. Hence for now + * just avoid the verification stage for non-crc filesystems */ - ASSERT(id != -1); - ASSERT(flags & XFS_QMOPT_DQREPAIR); - memset(d, 0, sizeof(xfs_dqblk_t)); - - d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); - d->dd_diskdq.d_version = XFS_DQUOT_VERSION; - d->dd_diskdq.d_flags = type; - d->dd_diskdq.d_id = cpu_to_be32(id); - - return errs; + if (xfs_sb_version_hascrc(&mp->m_sb)) + xlog_recover_validate_buf_type(mp, bp, buf_f); } /* * Perform a dquot buffer recovery. - * Simple algorithm: if we have found a QUOTAOFF logitem of the same type + * Simple algorithm: if we have found a QUOTAOFF log item of the same type * (ie. USR or GRP), then just toss this buffer away; don't recover it. * Else, treat it as a regular buffer and do recovery. */ @@ -2113,20 +2460,22 @@ xlog_recover_do_dquot_buffer( * over the log during recovery. During the first we build a table of * those buffers which have been cancelled, and during the second we * only replay those buffers which do not have corresponding cancel - * records in the table. See xlog_recover_do_buffer_pass[1,2] above + * records in the table. See xlog_recover_buffer_pass[1,2] above * for more details on the implementation of the table of cancel records. */ STATIC int xlog_recover_buffer_pass2( struct xlog *log, struct list_head *buffer_list, - struct xlog_recover_item *item) + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; int error; uint buf_flags; + xfs_lsn_t lsn; /* * In this pass we only want to recover all the buffers which have @@ -2151,10 +2500,17 @@ xlog_recover_buffer_pass2( error = bp->b_error; if (error) { xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); - xfs_buf_relse(bp); - return error; + goto out_release; } + /* + * recover the buffer only if we get an LSN from it and it's less than + * the lsn of the transaction we are replaying. + */ + lsn = xlog_recover_get_buf_lsn(mp, bp); + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) + goto out_release; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); } else if (buf_f->blf_flags & @@ -2164,7 +2520,7 @@ xlog_recover_buffer_pass2( xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } if (error) - return XFS_ERROR(error); + goto out_release; /* * Perform delayed write on the buffer. Asynchronous writes will be @@ -2172,19 +2528,19 @@ xlog_recover_buffer_pass2( * * Also make sure that only inode buffers with good sizes stay in * the buffer cache. The kernel moves inodes in buffers of 1 block - * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode + * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode * buffers in the log can be a different size if the log was generated * by an older kernel using unclustered inode buffers or a newer kernel * running with a different inode cluster size. Regardless, if the - * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE) - * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep + * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size) + * for *our* value of mp->m_inode_cluster_size, then we need to keep * the buffer out of the buffer cache so that the buffer won't * overlap with future reads of those inodes. */ if (XFS_DINODE_MAGIC == be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, - (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { + (__uint32_t)log->l_mp->m_inode_cluster_size))) { xfs_buf_stale(bp); error = xfs_bwrite(bp); } else { @@ -2193,15 +2549,93 @@ xlog_recover_buffer_pass2( xfs_buf_delwri_queue(bp, buffer_list); } +out_release: xfs_buf_relse(bp); return error; } +/* + * Inode fork owner changes + * + * If we have been told that we have to reparent the inode fork, it's because an + * extent swap operation on a CRC enabled filesystem has been done and we are + * replaying it. We need to walk the BMBT of the appropriate fork and change the + * owners of it. + * + * The complexity here is that we don't have an inode context to work with, so + * after we've replayed the inode we need to instantiate one. This is where the + * fun begins. + * + * We are in the middle of log recovery, so we can't run transactions. That + * means we cannot use cache coherent inode instantiation via xfs_iget(), as + * that will result in the corresponding iput() running the inode through + * xfs_inactive(). If we've just replayed an inode core that changes the link + * count to zero (i.e. it's been unlinked), then xfs_inactive() will run + * transactions (bad!). + * + * So, to avoid this, we instantiate an inode directly from the inode core we've + * just recovered. We have the buffer still locked, and all we really need to + * instantiate is the inode core and the forks being modified. We can do this + * manually, then run the inode btree owner change, and then tear down the + * xfs_inode without having to run any transactions at all. + * + * Also, because we don't have a transaction context available here but need to + * gather all the buffers we modify for writeback so we pass the buffer_list + * instead for the operation to use. + */ + +STATIC int +xfs_recover_inode_owner_change( + struct xfs_mount *mp, + struct xfs_dinode *dip, + struct xfs_inode_log_format *in_f, + struct list_head *buffer_list) +{ + struct xfs_inode *ip; + int error; + + ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); + + ip = xfs_inode_alloc(mp, in_f->ilf_ino); + if (!ip) + return ENOMEM; + + /* instantiate the inode */ + xfs_dinode_from_disk(&ip->i_d, dip); + ASSERT(ip->i_d.di_version >= 3); + + error = xfs_iformat_fork(ip, dip); + if (error) + goto out_free_ip; + + + if (in_f->ilf_fields & XFS_ILOG_DOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + + if (in_f->ilf_fields & XFS_ILOG_AOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + +out_free_ip: + xfs_inode_free(ip); + return error; +} + STATIC int xlog_recover_inode_pass2( struct xlog *log, struct list_head *buffer_list, - struct xlog_recover_item *item) + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) { xfs_inode_log_format_t *in_f; xfs_mount_t *mp = log->l_mp; @@ -2214,6 +2648,7 @@ xlog_recover_inode_pass2( int attr_index; uint fields; xfs_icdinode_t *dicp; + uint isize; int need_free = 0; if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { @@ -2239,7 +2674,7 @@ xlog_recover_inode_pass2( trace_xfs_log_recover_inode_recover(log, in_f); bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, - NULL); + &xfs_inode_buf_ops); if (!bp) { error = ENOMEM; goto error; @@ -2247,8 +2682,7 @@ xlog_recover_inode_pass2( error = bp->b_error; if (error) { xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); - xfs_buf_relse(bp); - goto error; + goto out_release; } ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); @@ -2258,29 +2692,52 @@ xlog_recover_inode_pass2( * like an inode! */ if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", __func__, dip, bp, in_f->ilf_ino); XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; - goto error; + goto out_release; } dicp = item->ri_buf[1].i_addr; if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", __func__, item, in_f->ilf_ino); XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; - goto error; + goto out_release; + } + + /* + * If the inode has an LSN in it, recover the inode only if it's less + * than the lsn of the transaction we are replaying. Note: we still + * need to replay an owner change even though the inode is more recent + * than the transaction as there is no guarantee that all the btree + * blocks are more recent than this transaction, too. + */ + if (dip->di_version >= 3) { + xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_owner_change; + } } - /* Skip replay when the on disk inode is newer than the log one */ - if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + /* + * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes + * are transactional and if ordering is necessary we can determine that + * more accurately by the LSN field in the V3 inode core. Don't trust + * the inode versions we might be changing them here - use the + * superblock flag to determine whether we need to look at di_flushiter + * to skip replay when the on disk inode is newer than the log one + */ + if (!xfs_sb_version_hascrc(&mp->m_sb) && + dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { /* * Deal with the wrap case, DI_MAX_FLUSH is less * than smaller numbers @@ -2289,12 +2746,12 @@ xlog_recover_inode_pass2( dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { /* do nothing */ } else { - xfs_buf_relse(bp); trace_xfs_log_recover_inode_skip(log, in_f); error = 0; - goto error; + goto out_release; } } + /* Take the opportunity to reset the flush iteration count */ dicp->di_flushiter = 0; @@ -2303,13 +2760,12 @@ xlog_recover_inode_pass2( (dicp->di_format != XFS_DINODE_FMT_BTREE)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad regular inode log record, rec ptr 0x%p, " "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", __func__, item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; - goto error; + goto out_release; } } else if (unlikely(S_ISDIR(dicp->di_mode))) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && @@ -2317,19 +2773,17 @@ xlog_recover_inode_pass2( (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad dir inode log record, rec ptr 0x%p, " "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", __func__, item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; - goto error; + goto out_release; } } if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", @@ -2337,38 +2791,37 @@ xlog_recover_inode_pass2( dicp->di_nextents + dicp->di_anextents, dicp->di_nblocks); error = EFSCORRUPTED; - goto error; + goto out_release; } if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); error = EFSCORRUPTED; - goto error; + goto out_release; } - if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { + isize = xfs_icdinode_size(dicp->di_version); + if (unlikely(item->ri_buf[1].i_len > isize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record length %d, rec ptr 0x%p", __func__, item->ri_buf[1].i_len, item); error = EFSCORRUPTED; - goto error; + goto out_release; } /* The core is in in-core format */ - xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr); + xfs_dinode_to_disk(dip, dicp); /* the rest is in on-disk format */ - if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { - memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode), - item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode), - item->ri_buf[1].i_len - sizeof(struct xfs_icdinode)); + if (item->ri_buf[1].i_len > isize) { + memcpy((char *)dip + isize, + item->ri_buf[1].i_addr + isize, + item->ri_buf[1].i_len - isize); } fields = in_f->ilf_fields; @@ -2384,7 +2837,7 @@ xlog_recover_inode_pass2( } if (in_f->ilf_size == 2) - goto write_inode_buffer; + goto out_owner_change; len = item->ri_buf[2].i_len; src = item->ri_buf[2].i_addr; ASSERT(in_f->ilf_size <= 4); @@ -2445,16 +2898,23 @@ xlog_recover_inode_pass2( default: xfs_warn(log->l_mp, "%s: Invalid flag", __func__); ASSERT(0); - xfs_buf_relse(bp); error = EIO; - goto error; + goto out_release; } } -write_inode_buffer: +out_owner_change: + if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) + error = xfs_recover_inode_owner_change(mp, dip, in_f, + buffer_list); + /* re-generate the checksum. */ + xfs_dinode_calc_crc(log->l_mp, dip); + ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; xfs_buf_delwri_queue(bp, buffer_list); + +out_release: xfs_buf_relse(bp); error: if (need_free) @@ -2496,7 +2956,8 @@ STATIC int xlog_recover_dquot_pass2( struct xlog *log, struct list_head *buffer_list, - struct xlog_recover_item *item) + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) { xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; @@ -2543,7 +3004,7 @@ xlog_recover_dquot_pass2( */ dq_f = item->ri_buf[0].i_addr; ASSERT(dq_f); - error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, "xlog_recover_dquot_pass2 (log copy)"); if (error) return XFS_ERROR(EIO); @@ -2563,22 +3024,40 @@ xlog_recover_dquot_pass2( * was among a chunk of dquots created earlier, and we did some * minimal initialization then. */ - error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + error = xfs_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, "xlog_recover_dquot_pass2"); if (error) { xfs_buf_relse(bp); return XFS_ERROR(EIO); } + /* + * If the dquot has an LSN in it, recover the dquot only if it's less + * than the lsn of the transaction we are replaying. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; + xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + goto out_release; + } + } + memcpy(ddq, recddq, item->ri_buf[1].i_len); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; xfs_buf_delwri_queue(bp, buffer_list); - xfs_buf_relse(bp); - return (0); +out_release: + xfs_buf_relse(bp); + return 0; } /* @@ -2668,13 +3147,100 @@ xlog_recover_efd_pass2( } lip = xfs_trans_ail_cursor_next(ailp, &cur); } - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); return 0; } /* + * This routine is called when an inode create format structure is found in a + * committed transaction in the log. It's purpose is to initialise the inodes + * being allocated on disk. This requires us to get inode cluster buffers that + * match the range to be intialised, stamped with inode templates and written + * by delayed write so that subsequent modifications will hit the cached buffer + * and only need writing out at the end of recovery. + */ +STATIC int +xlog_recover_do_icreate_pass2( + struct xlog *log, + struct list_head *buffer_list, + xlog_recover_item_t *item) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_icreate_log *icl; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + unsigned int count; + unsigned int isize; + xfs_agblock_t length; + + icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; + if (icl->icl_type != XFS_LI_ICREATE) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); + return EINVAL; + } + + if (icl->icl_size != 1) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); + return EINVAL; + } + + agno = be32_to_cpu(icl->icl_ag); + if (agno >= mp->m_sb.sb_agcount) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); + return EINVAL; + } + agbno = be32_to_cpu(icl->icl_agbno); + if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); + return EINVAL; + } + isize = be32_to_cpu(icl->icl_isize); + if (isize != mp->m_sb.sb_inodesize) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); + return EINVAL; + } + count = be32_to_cpu(icl->icl_count); + if (!count) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); + return EINVAL; + } + length = be32_to_cpu(icl->icl_length); + if (!length || length >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); + return EINVAL; + } + + /* existing allocation is fixed value */ + ASSERT(count == mp->m_ialloc_inos); + ASSERT(length == mp->m_ialloc_blks); + if (count != mp->m_ialloc_inos || + length != mp->m_ialloc_blks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); + return EINVAL; + } + + /* + * Inode buffers can be freed. Do not replay the inode initialisation as + * we could be overwriting something written after this inode buffer was + * cancelled. + * + * XXX: we need to iterate all buffers and only init those that are not + * cancelled. I think that a more fine grained factoring of + * xfs_ialloc_inode_init may be appropriate here to enable this to be + * done easily. + */ + if (xlog_check_buffer_cancelled(log, + XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) + return 0; + + xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length, + be32_to_cpu(icl->icl_gen)); + return 0; +} + +/* * Free up any resources allocated by the transaction * * Remember that EFIs, EFDs, and IUNLINKs are handled later. @@ -2699,6 +3265,106 @@ xlog_recover_free_trans( kmem_free(trans); } +STATIC void +xlog_recover_buffer_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_mount *mp = log->l_mp; + + if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len, buf_f->blf_flags)) { + return; + } + + xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno, + buf_f->blf_len, NULL); +} + +STATIC void +xlog_recover_inode_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_inode_log_format ilf_buf; + struct xfs_inode_log_format *ilfp; + struct xfs_mount *mp = log->l_mp; + int error; + + if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { + ilfp = item->ri_buf[0].i_addr; + } else { + ilfp = &ilf_buf; + memset(ilfp, 0, sizeof(*ilfp)); + error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp); + if (error) + return; + } + + if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0)) + return; + + xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno, + ilfp->ilf_len, &xfs_inode_buf_ra_ops); +} + +STATIC void +xlog_recover_dquot_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_disk_dquot *recddq; + struct xfs_dq_logformat *dq_f; + uint type; + + + if (mp->m_qflags == 0) + return; + + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) + return; + if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) + return; + + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + ASSERT(type); + if (log->l_quotaoffs_flag & type) + return; + + dq_f = item->ri_buf[0].i_addr; + ASSERT(dq_f); + ASSERT(dq_f->qlf_len == 1); + + xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL); +} + +STATIC void +xlog_recover_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + xlog_recover_buffer_ra_pass2(log, item); + break; + case XFS_LI_INODE: + xlog_recover_inode_ra_pass2(log, item); + break; + case XFS_LI_DQUOT: + xlog_recover_dquot_ra_pass2(log, item); + break; + case XFS_LI_EFI: + case XFS_LI_EFD: + case XFS_LI_QUOTAOFF: + default: + break; + } +} + STATIC int xlog_recover_commit_pass1( struct xlog *log, @@ -2716,6 +3382,7 @@ xlog_recover_commit_pass1( case XFS_LI_EFI: case XFS_LI_EFD: case XFS_LI_DQUOT: + case XFS_LI_ICREATE: /* nothing to do in pass 1 */ return 0; default: @@ -2737,15 +3404,20 @@ xlog_recover_commit_pass2( switch (ITEM_TYPE(item)) { case XFS_LI_BUF: - return xlog_recover_buffer_pass2(log, buffer_list, item); + return xlog_recover_buffer_pass2(log, buffer_list, item, + trans->r_lsn); case XFS_LI_INODE: - return xlog_recover_inode_pass2(log, buffer_list, item); + return xlog_recover_inode_pass2(log, buffer_list, item, + trans->r_lsn); case XFS_LI_EFI: return xlog_recover_efi_pass2(log, item, trans->r_lsn); case XFS_LI_EFD: return xlog_recover_efd_pass2(log, item); case XFS_LI_DQUOT: - return xlog_recover_dquot_pass2(log, buffer_list, item); + return xlog_recover_dquot_pass2(log, buffer_list, item, + trans->r_lsn); + case XFS_LI_ICREATE: + return xlog_recover_do_icreate_pass2(log, buffer_list, item); case XFS_LI_QUOTAOFF: /* nothing to do in pass2 */ return 0; @@ -2757,6 +3429,26 @@ xlog_recover_commit_pass2( } } +STATIC int +xlog_recover_items_pass2( + struct xlog *log, + struct xlog_recover *trans, + struct list_head *buffer_list, + struct list_head *item_list) +{ + struct xlog_recover_item *item; + int error = 0; + + list_for_each_entry(item, item_list, ri_list) { + error = xlog_recover_commit_pass2(log, trans, + buffer_list, item); + if (error) + return error; + } + + return error; +} + /* * Perform the transaction. * @@ -2769,9 +3461,16 @@ xlog_recover_commit_trans( struct xlog_recover *trans, int pass) { - int error = 0, error2; - xlog_recover_item_t *item; - LIST_HEAD (buffer_list); + int error = 0; + int error2; + int items_queued = 0; + struct xlog_recover_item *item; + struct xlog_recover_item *next; + LIST_HEAD (buffer_list); + LIST_HEAD (ra_list); + LIST_HEAD (done_list); + + #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 hlist_del(&trans->r_list); @@ -2779,14 +3478,22 @@ xlog_recover_commit_trans( if (error) return error; - list_for_each_entry(item, &trans->r_itemq, ri_list) { + list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) { switch (pass) { case XLOG_RECOVER_PASS1: error = xlog_recover_commit_pass1(log, trans, item); break; case XLOG_RECOVER_PASS2: - error = xlog_recover_commit_pass2(log, trans, - &buffer_list, item); + xlog_recover_ra_pass2(log, item); + list_move_tail(&item->ri_list, &ra_list); + items_queued++; + if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { + error = xlog_recover_items_pass2(log, trans, + &buffer_list, &ra_list); + list_splice_tail_init(&ra_list, &done_list); + items_queued = 0; + } + break; default: ASSERT(0); @@ -2796,17 +3503,26 @@ xlog_recover_commit_trans( goto out; } +out: + if (!list_empty(&ra_list)) { + if (!error) + error = xlog_recover_items_pass2(log, trans, + &buffer_list, &ra_list); + list_splice_tail_init(&ra_list, &done_list); + } + + if (!list_empty(&done_list)) + list_splice_init(&done_list, &trans->r_itemq); + xlog_recover_free_trans(trans); -out: error2 = xfs_buf_delwri_submit(&buffer_list); return error ? error : error2; } STATIC int xlog_recover_unmount_trans( - struct xlog *log, - struct xlog_recover *trans) + struct xlog *log) { /* Do nothing now */ xfs_warn(log->l_mp, "%s: Unmount LR", __func__); @@ -2880,7 +3596,7 @@ xlog_recover_process_data( trans, pass); break; case XLOG_UNMOUNT_TRANS: - error = xlog_recover_unmount_trans(log, trans); + error = xlog_recover_unmount_trans(log); break; case XLOG_WAS_CONT_TRANS: error = xlog_recover_add_to_cont_trans(log, @@ -2905,8 +3621,10 @@ xlog_recover_process_data( error = XFS_ERROR(EIO); break; } - if (error) + if (error) { + xlog_recover_free_trans(trans); return error; + } } dp += be32_to_cpu(ohead->oh_len); num_logops--; @@ -2949,13 +3667,14 @@ xlog_recover_process_efi( * This will pull the EFI from the AIL and * free the memory associated with it. */ + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); xfs_efi_release(efip, efip->efi_format.efi_nextents); return XFS_ERROR(EIO); } } tp = xfs_trans_alloc(mp, 0); - error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) goto abort_error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); @@ -3039,7 +3758,7 @@ xlog_recover_process_efis( lip = xfs_trans_ail_cursor_next(ailp, &cur); } out: - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); return error; } @@ -3061,8 +3780,7 @@ xlog_recover_clear_agi_bucket( int error; tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); - error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), - 0, 0, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0); if (error) goto out_abort; @@ -3239,7 +3957,7 @@ xlog_unpack_data_crc( if (crc != rhead->h_crc) { if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { xfs_alert(log->l_mp, - "log record CRC mismatch: found 0x%x, expected 0x%x.\n", + "log record CRC mismatch: found 0x%x, expected 0x%x.", le32_to_cpu(rhead->h_crc), le32_to_cpu(crc)); xfs_hex_dump(dp, 32); @@ -3694,7 +4412,13 @@ xlog_do_recover( XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); bp->b_ops = &xfs_sb_buf_ops; - xfsbdstrat(log->l_mp, bp); + + if (XFS_FORCED_SHUTDOWN(log->l_mp)) { + xfs_buf_relse(bp); + return XFS_ERROR(EIO); + } + + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) { xfs_buf_ioerror_alert(bp, __func__); @@ -3752,6 +4476,25 @@ xlog_recover( return error; } + /* + * Version 5 superblock log feature mask validation. We know the + * log is dirty so check if there are any unknown log features + * in what we need to recover. If there are unknown features + * (e.g. unsupported transactions, then simply reject the + * attempt at recovery before touching anything. + */ + if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb, + XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) { + xfs_warn(log->l_mp, +"Superblock has unknown incompatible log features (0x%x) enabled.\n" +"The log can not be fully and/or safely recovered by this kernel.\n" +"Please recover the log on a kernel that supports the unknown features.", + (log->l_mp->m_sb.sb_features_log_incompat & + XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); + return EINVAL; + } + xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", log->l_mp->m_logname ? log->l_mp->m_logname : "internal"); diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/xfs_log_rlimit.c new file mode 100644 index 00000000000..ee7e0e80246 --- /dev/null +++ b/fs/xfs/xfs_log_rlimit.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2013 Jie Liu. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_ag.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_trans_space.h" +#include "xfs_inode.h" +#include "xfs_da_btree.h" +#include "xfs_attr_leaf.h" +#include "xfs_bmap_btree.h" + +/* + * Calculate the maximum length in bytes that would be required for a local + * attribute value as large attributes out of line are not logged. + */ +STATIC int +xfs_log_calc_max_attrsetm_res( + struct xfs_mount *mp) +{ + int size; + int nblks; + + size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) - + MAXNAMELEN - 1; + nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); + nblks += XFS_B_TO_FSB(mp, size); + nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK); + + return M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * nblks; +} + +/* + * Iterate over the log space reservation table to figure out and return + * the maximum one in terms of the pre-calculated values which were done + * at mount time. + */ +STATIC void +xfs_log_get_max_trans_res( + struct xfs_mount *mp, + struct xfs_trans_res *max_resp) +{ + struct xfs_trans_res *resp; + struct xfs_trans_res *end_resp; + int log_space = 0; + int attr_space; + + attr_space = xfs_log_calc_max_attrsetm_res(mp); + + resp = (struct xfs_trans_res *)M_RES(mp); + end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1); + for (; resp < end_resp; resp++) { + int tmp = resp->tr_logcount > 1 ? + resp->tr_logres * resp->tr_logcount : + resp->tr_logres; + if (log_space < tmp) { + log_space = tmp; + *max_resp = *resp; /* struct copy */ + } + } + + if (attr_space > log_space) { + *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */ + max_resp->tr_logres = attr_space; + } +} + +/* + * Calculate the minimum valid log size for the given superblock configuration. + * Used to calculate the minimum log size at mkfs time, and to determine if + * the log is large enough or not at mount time. Returns the minimum size in + * filesystem block size units. + */ +int +xfs_log_calc_minimum_size( + struct xfs_mount *mp) +{ + struct xfs_trans_res tres = {0}; + int max_logres; + int min_logblks = 0; + int lsunit = 0; + + xfs_log_get_max_trans_res(mp, &tres); + + max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres); + if (tres.tr_logcount > 1) + max_logres *= tres.tr_logcount; + + if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) + lsunit = BTOBB(mp->m_sb.sb_logsunit); + + /* + * Two factors should be taken into account for calculating the minimum + * log space. + * 1) The fundamental limitation is that no single transaction can be + * larger than half size of the log. + * + * From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR + * define, which is set to 3. That means we can definitely fit + * maximally sized 2 transactions in the log. We'll use this same + * value here. + * + * 2) If the lsunit option is specified, a transaction requires 2 LSU + * for the reservation because there are two log writes that can + * require padding - the transaction data and the commit record which + * are written separately and both can require padding to the LSU. + * Consider that we can have an active CIL reservation holding 2*LSU, + * but the CIL is not over a push threshold, in this case, if we + * don't have enough log space for at one new transaction, which + * includes another 2*LSU in the reservation, we will run into dead + * loop situation in log space grant procedure. i.e. + * xlog_grant_head_wait(). + * + * Hence the log size needs to be able to contain two maximally sized + * and padded transactions, which is (2 * (2 * LSU + maxlres)). + * + * Also, the log size should be a multiple of the log stripe unit, round + * it up to lsunit boundary if lsunit is specified. + */ + if (lsunit) { + min_logblks = roundup_64(BTOBB(max_logres), lsunit) + + 2 * lsunit; + } else + min_logblks = BTOBB(max_logres) + 2 * BBSIZE; + min_logblks *= XFS_MIN_LOG_FACTOR; + + return XFS_BB_TO_FSB(mp, min_logblks); +} diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 331cd9f83a7..63ca2f0420b 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -17,9 +17,8 @@ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" @@ -93,6 +92,14 @@ xfs_alert_tag( } void +asswarn(char *expr, char *file, int line) +{ + xfs_warn(NULL, "Assertion failed: %s, file: %s, line: %d", + expr, file, line); + WARN_ON(1); +} + +void assfail(char *expr, char *file, int line) { xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d", diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 56dc0c17f16..85401155750 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -30,7 +30,34 @@ void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) } #endif +#define xfs_printk_ratelimited(func, dev, fmt, ...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + func(dev, fmt, ##__VA_ARGS__); \ +} while (0) + +#define xfs_emerg_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__) +#define xfs_alert_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_alert, dev, fmt, ##__VA_ARGS__) +#define xfs_crit_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_crit, dev, fmt, ##__VA_ARGS__) +#define xfs_err_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_err, dev, fmt, ##__VA_ARGS__) +#define xfs_warn_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_warn, dev, fmt, ##__VA_ARGS__) +#define xfs_notice_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_notice, dev, fmt, ##__VA_ARGS__) +#define xfs_info_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_info, dev, fmt, ##__VA_ARGS__) +#define xfs_debug_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) + extern void assfail(char *expr, char *f, int l); +extern void asswarn(char *expr, char *f, int l); extern void xfs_hex_dump(void *p, int length); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index da508463ff1..3507cd0ec40 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -17,32 +17,31 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" #include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" #include "xfs_inode.h" -#include "xfs_btree.h" +#include "xfs_dir2.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_rtalloc.h" #include "xfs_bmap.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_fsops.h" -#include "xfs_utils.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_dinode.h" #ifdef HAVE_PERCPU_SB @@ -57,61 +56,6 @@ STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); #define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) #endif -static const struct { - short offset; - short type; /* 0 = integer - * 1 = binary / string (no translation) - */ -} xfs_sb_info[] = { - { offsetof(xfs_sb_t, sb_magicnum), 0 }, - { offsetof(xfs_sb_t, sb_blocksize), 0 }, - { offsetof(xfs_sb_t, sb_dblocks), 0 }, - { offsetof(xfs_sb_t, sb_rblocks), 0 }, - { offsetof(xfs_sb_t, sb_rextents), 0 }, - { offsetof(xfs_sb_t, sb_uuid), 1 }, - { offsetof(xfs_sb_t, sb_logstart), 0 }, - { offsetof(xfs_sb_t, sb_rootino), 0 }, - { offsetof(xfs_sb_t, sb_rbmino), 0 }, - { offsetof(xfs_sb_t, sb_rsumino), 0 }, - { offsetof(xfs_sb_t, sb_rextsize), 0 }, - { offsetof(xfs_sb_t, sb_agblocks), 0 }, - { offsetof(xfs_sb_t, sb_agcount), 0 }, - { offsetof(xfs_sb_t, sb_rbmblocks), 0 }, - { offsetof(xfs_sb_t, sb_logblocks), 0 }, - { offsetof(xfs_sb_t, sb_versionnum), 0 }, - { offsetof(xfs_sb_t, sb_sectsize), 0 }, - { offsetof(xfs_sb_t, sb_inodesize), 0 }, - { offsetof(xfs_sb_t, sb_inopblock), 0 }, - { offsetof(xfs_sb_t, sb_fname[0]), 1 }, - { offsetof(xfs_sb_t, sb_blocklog), 0 }, - { offsetof(xfs_sb_t, sb_sectlog), 0 }, - { offsetof(xfs_sb_t, sb_inodelog), 0 }, - { offsetof(xfs_sb_t, sb_inopblog), 0 }, - { offsetof(xfs_sb_t, sb_agblklog), 0 }, - { offsetof(xfs_sb_t, sb_rextslog), 0 }, - { offsetof(xfs_sb_t, sb_inprogress), 0 }, - { offsetof(xfs_sb_t, sb_imax_pct), 0 }, - { offsetof(xfs_sb_t, sb_icount), 0 }, - { offsetof(xfs_sb_t, sb_ifree), 0 }, - { offsetof(xfs_sb_t, sb_fdblocks), 0 }, - { offsetof(xfs_sb_t, sb_frextents), 0 }, - { offsetof(xfs_sb_t, sb_uquotino), 0 }, - { offsetof(xfs_sb_t, sb_gquotino), 0 }, - { offsetof(xfs_sb_t, sb_qflags), 0 }, - { offsetof(xfs_sb_t, sb_flags), 0 }, - { offsetof(xfs_sb_t, sb_shared_vn), 0 }, - { offsetof(xfs_sb_t, sb_inoalignmt), 0 }, - { offsetof(xfs_sb_t, sb_unit), 0 }, - { offsetof(xfs_sb_t, sb_width), 0 }, - { offsetof(xfs_sb_t, sb_dirblklog), 0 }, - { offsetof(xfs_sb_t, sb_logsectlog), 0 }, - { offsetof(xfs_sb_t, sb_logsectsize),0 }, - { offsetof(xfs_sb_t, sb_logsunit), 0 }, - { offsetof(xfs_sb_t, sb_features2), 0 }, - { offsetof(xfs_sb_t, sb_bad_features2), 0 }, - { sizeof(xfs_sb_t), 0 } -}; - static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; static uuid_t *xfs_uuid_table; @@ -187,64 +131,6 @@ xfs_uuid_unmount( } -/* - * Reference counting access wrappers to the perag structures. - * Because we never free per-ag structures, the only thing we - * have to protect against changes is the tree structure itself. - */ -struct xfs_perag * -xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - int ref = 0; - - rcu_read_lock(); - pag = radix_tree_lookup(&mp->m_perag_tree, agno); - if (pag) { - ASSERT(atomic_read(&pag->pag_ref) >= 0); - ref = atomic_inc_return(&pag->pag_ref); - } - rcu_read_unlock(); - trace_xfs_perag_get(mp, agno, ref, _RET_IP_); - return pag; -} - -/* - * search from @first to find the next perag with the given tag set. - */ -struct xfs_perag * -xfs_perag_get_tag( - struct xfs_mount *mp, - xfs_agnumber_t first, - int tag) -{ - struct xfs_perag *pag; - int found; - int ref; - - rcu_read_lock(); - found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, - (void **)&pag, first, 1, tag); - if (found <= 0) { - rcu_read_unlock(); - return NULL; - } - ref = atomic_inc_return(&pag->pag_ref); - rcu_read_unlock(); - trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); - return pag; -} - -void -xfs_perag_put(struct xfs_perag *pag) -{ - int ref; - - ASSERT(atomic_read(&pag->pag_ref) > 0); - ref = atomic_dec_return(&pag->pag_ref); - trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); -} - STATIC void __xfs_free_perag( struct rcu_head *head) @@ -297,131 +183,6 @@ xfs_sb_validate_fsb_count( return 0; } -/* - * Check the validity of the SB found. - */ -STATIC int -xfs_mount_validate_sb( - xfs_mount_t *mp, - xfs_sb_t *sbp, - bool check_inprogress) -{ - - /* - * If the log device and data device have the - * same device number, the log is internal. - * Consequently, the sb_logstart should be non-zero. If - * we have a zero sb_logstart in this case, we may be trying to mount - * a volume filesystem in a non-volume manner. - */ - if (sbp->sb_magicnum != XFS_SB_MAGIC) { - xfs_warn(mp, "bad magic number"); - return XFS_ERROR(EWRONGFS); - } - - if (!xfs_sb_good_version(sbp)) { - xfs_warn(mp, "bad version"); - return XFS_ERROR(EWRONGFS); - } - - if (unlikely( - sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { - xfs_warn(mp, - "filesystem is marked as having an external log; " - "specify logdev on the mount command line."); - return XFS_ERROR(EINVAL); - } - - if (unlikely( - sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { - xfs_warn(mp, - "filesystem is marked as having an internal log; " - "do not specify logdev on the mount command line."); - return XFS_ERROR(EINVAL); - } - - /* - * More sanity checking. Most of these were stolen directly from - * xfs_repair. - */ - if (unlikely( - sbp->sb_agcount <= 0 || - sbp->sb_sectsize < XFS_MIN_SECTORSIZE || - sbp->sb_sectsize > XFS_MAX_SECTORSIZE || - sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG || - sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG || - sbp->sb_sectsize != (1 << sbp->sb_sectlog) || - sbp->sb_blocksize < XFS_MIN_BLOCKSIZE || - sbp->sb_blocksize > XFS_MAX_BLOCKSIZE || - sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || - sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || - sbp->sb_blocksize != (1 << sbp->sb_blocklog) || - sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || - sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || - sbp->sb_inodelog < XFS_DINODE_MIN_LOG || - sbp->sb_inodelog > XFS_DINODE_MAX_LOG || - sbp->sb_inodesize != (1 << sbp->sb_inodelog) || - (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || - (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || - (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || - (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) || - sbp->sb_dblocks == 0 || - sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || - sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { - XFS_CORRUPTION_ERROR("SB sanity check failed", - XFS_ERRLEVEL_LOW, mp, sbp); - return XFS_ERROR(EFSCORRUPTED); - } - - /* - * Until this is fixed only page-sized or smaller data blocks work. - */ - if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { - xfs_warn(mp, - "File system with blocksize %d bytes. " - "Only pagesize (%ld) or less will currently work.", - sbp->sb_blocksize, PAGE_SIZE); - return XFS_ERROR(ENOSYS); - } - - /* - * Currently only very few inode sizes are supported. - */ - switch (sbp->sb_inodesize) { - case 256: - case 512: - case 1024: - case 2048: - break; - default: - xfs_warn(mp, "inode size of %d bytes not supported", - sbp->sb_inodesize); - return XFS_ERROR(ENOSYS); - } - - if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || - xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { - xfs_warn(mp, - "file system too large to be mounted on this system."); - return XFS_ERROR(EFBIG); - } - - if (check_inprogress && sbp->sb_inprogress) { - xfs_warn(mp, "Offline file system operation in progress!"); - return XFS_ERROR(EFSCORRUPTED); - } - - /* - * Version 1 directory format has never worked on Linux. - */ - if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { - xfs_warn(mp, "file system using version 1 directory format"); - return XFS_ERROR(ENOSYS); - } - - return 0; -} - int xfs_initialize_perag( xfs_mount_t *mp, @@ -506,206 +267,44 @@ out_unwind: return error; } -void -xfs_sb_from_disk( - struct xfs_sb *to, - xfs_dsb_t *from) -{ - to->sb_magicnum = be32_to_cpu(from->sb_magicnum); - to->sb_blocksize = be32_to_cpu(from->sb_blocksize); - to->sb_dblocks = be64_to_cpu(from->sb_dblocks); - to->sb_rblocks = be64_to_cpu(from->sb_rblocks); - to->sb_rextents = be64_to_cpu(from->sb_rextents); - memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); - to->sb_logstart = be64_to_cpu(from->sb_logstart); - to->sb_rootino = be64_to_cpu(from->sb_rootino); - to->sb_rbmino = be64_to_cpu(from->sb_rbmino); - to->sb_rsumino = be64_to_cpu(from->sb_rsumino); - to->sb_rextsize = be32_to_cpu(from->sb_rextsize); - to->sb_agblocks = be32_to_cpu(from->sb_agblocks); - to->sb_agcount = be32_to_cpu(from->sb_agcount); - to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks); - to->sb_logblocks = be32_to_cpu(from->sb_logblocks); - to->sb_versionnum = be16_to_cpu(from->sb_versionnum); - to->sb_sectsize = be16_to_cpu(from->sb_sectsize); - to->sb_inodesize = be16_to_cpu(from->sb_inodesize); - to->sb_inopblock = be16_to_cpu(from->sb_inopblock); - memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); - to->sb_blocklog = from->sb_blocklog; - to->sb_sectlog = from->sb_sectlog; - to->sb_inodelog = from->sb_inodelog; - to->sb_inopblog = from->sb_inopblog; - to->sb_agblklog = from->sb_agblklog; - to->sb_rextslog = from->sb_rextslog; - to->sb_inprogress = from->sb_inprogress; - to->sb_imax_pct = from->sb_imax_pct; - to->sb_icount = be64_to_cpu(from->sb_icount); - to->sb_ifree = be64_to_cpu(from->sb_ifree); - to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks); - to->sb_frextents = be64_to_cpu(from->sb_frextents); - to->sb_uquotino = be64_to_cpu(from->sb_uquotino); - to->sb_gquotino = be64_to_cpu(from->sb_gquotino); - to->sb_qflags = be16_to_cpu(from->sb_qflags); - to->sb_flags = from->sb_flags; - to->sb_shared_vn = from->sb_shared_vn; - to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt); - to->sb_unit = be32_to_cpu(from->sb_unit); - to->sb_width = be32_to_cpu(from->sb_width); - to->sb_dirblklog = from->sb_dirblklog; - to->sb_logsectlog = from->sb_logsectlog; - to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize); - to->sb_logsunit = be32_to_cpu(from->sb_logsunit); - to->sb_features2 = be32_to_cpu(from->sb_features2); - to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2); -} - -/* - * Copy in core superblock to ondisk one. - * - * The fields argument is mask of superblock fields to copy. - */ -void -xfs_sb_to_disk( - xfs_dsb_t *to, - xfs_sb_t *from, - __int64_t fields) -{ - xfs_caddr_t to_ptr = (xfs_caddr_t)to; - xfs_caddr_t from_ptr = (xfs_caddr_t)from; - xfs_sb_field_t f; - int first; - int size; - - ASSERT(fields); - if (!fields) - return; - - while (fields) { - f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); - first = xfs_sb_info[f].offset; - size = xfs_sb_info[f + 1].offset - first; - - ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1); - - if (size == 1 || xfs_sb_info[f].type == 1) { - memcpy(to_ptr + first, from_ptr + first, size); - } else { - switch (size) { - case 2: - *(__be16 *)(to_ptr + first) = - cpu_to_be16(*(__u16 *)(from_ptr + first)); - break; - case 4: - *(__be32 *)(to_ptr + first) = - cpu_to_be32(*(__u32 *)(from_ptr + first)); - break; - case 8: - *(__be64 *)(to_ptr + first) = - cpu_to_be64(*(__u64 *)(from_ptr + first)); - break; - default: - ASSERT(0); - } - } - - fields &= ~(1LL << f); - } -} - -static void -xfs_sb_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_sb sb; - int error; - - xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); - - /* - * Only check the in progress field for the primary superblock as - * mkfs.xfs doesn't clear it from secondary superblocks. - */ - error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); - if (error) - xfs_buf_ioerror(bp, error); -} - -static void -xfs_sb_read_verify( - struct xfs_buf *bp) -{ - xfs_sb_verify(bp); -} - -/* - * We may be probed for a filesystem match, so we may not want to emit - * messages when the superblock buffer is not actually an XFS superblock. - * If we find an XFS superblock, the run a normal, noisy mount because we are - * really going to mount it and want to know about errors. - */ -static void -xfs_sb_quiet_read_verify( - struct xfs_buf *bp) -{ - struct xfs_sb sb; - - xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); - - if (sb.sb_magicnum == XFS_SB_MAGIC) { - /* XFS filesystem, verify noisily! */ - xfs_sb_read_verify(bp); - return; - } - /* quietly fail */ - xfs_buf_ioerror(bp, EFSCORRUPTED); -} - -static void -xfs_sb_write_verify( - struct xfs_buf *bp) -{ - xfs_sb_verify(bp); -} - -const struct xfs_buf_ops xfs_sb_buf_ops = { - .verify_read = xfs_sb_read_verify, - .verify_write = xfs_sb_write_verify, -}; - -static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { - .verify_read = xfs_sb_quiet_read_verify, - .verify_write = xfs_sb_write_verify, -}; - /* * xfs_readsb * * Does the initial read of the superblock. */ int -xfs_readsb(xfs_mount_t *mp, int flags) +xfs_readsb( + struct xfs_mount *mp, + int flags) { unsigned int sector_size; - xfs_buf_t *bp; + struct xfs_buf *bp; + struct xfs_sb *sbp = &mp->m_sb; int error; int loud = !(flags & XFS_MFSI_QUIET); + const struct xfs_buf_ops *buf_ops; ASSERT(mp->m_sb_bp == NULL); ASSERT(mp->m_ddev_targp != NULL); /* + * For the initial read, we must guess at the sector + * size based on the block device. It's enough to + * get the sb_sectsize out of the superblock and + * then reread with the proper length. + * We don't verify it yet, because it may not be complete. + */ + sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); + buf_ops = NULL; + + /* * Allocate a (locked) buffer to hold the superblock. * This will be kept around at all times to optimize * access to the superblock. */ - sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); - reread: bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0, - loud ? &xfs_sb_buf_ops - : &xfs_sb_quiet_buf_ops); + BTOBB(sector_size), 0, buf_ops); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); @@ -714,39 +313,58 @@ reread: if (bp->b_error) { error = bp->b_error; if (loud) - xfs_warn(mp, "SB validate failed"); + xfs_warn(mp, "SB validate failed with error %d.", error); + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; goto release_buf; } /* * Initialize the mount structure from the superblock. */ - xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); + xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); + xfs_sb_quota_from_disk(sbp); + + /* + * If we haven't validated the superblock, do so now before we try + * to check the sector size and reread the superblock appropriately. + */ + if (sbp->sb_magicnum != XFS_SB_MAGIC) { + if (loud) + xfs_warn(mp, "Invalid superblock magic number"); + error = EINVAL; + goto release_buf; + } /* * We must be able to do sector-sized and sector-aligned IO. */ - if (sector_size > mp->m_sb.sb_sectsize) { + if (sector_size > sbp->sb_sectsize) { if (loud) xfs_warn(mp, "device supports %u byte sectors (not %u)", - sector_size, mp->m_sb.sb_sectsize); + sector_size, sbp->sb_sectsize); error = ENOSYS; goto release_buf; } - /* - * If device sector size is smaller than the superblock size, - * re-read the superblock so the buffer is correctly sized. - */ - if (sector_size < mp->m_sb.sb_sectsize) { + if (buf_ops == NULL) { + /* + * Re-read the superblock so the buffer is correctly sized, + * and properly verified. + */ xfs_buf_relse(bp); - sector_size = mp->m_sb.sb_sectsize; + sector_size = sbp->sb_sectsize; + buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops; goto reread; } /* Initialize per-cpu counters */ xfs_icsb_reinit_counters(mp); + /* no need to be quiet anymore, so reset the buf ops */ + bp->b_ops = &xfs_sb_buf_ops; + mp->m_sb_bp = bp; xfs_buf_unlock(bp); return 0; @@ -756,107 +374,6 @@ release_buf: return error; } - -/* - * xfs_mount_common - * - * Mount initialization code establishing various mount - * fields from the superblock associated with the given - * mount structure - */ -STATIC void -xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp) -{ - mp->m_agfrotor = mp->m_agirotor = 0; - spin_lock_init(&mp->m_agirotor_lock); - mp->m_maxagi = mp->m_sb.sb_agcount; - mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; - mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; - mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; - mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; - mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; - mp->m_blockmask = sbp->sb_blocksize - 1; - mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; - mp->m_blockwmask = mp->m_blockwsize - 1; - - mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); - mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); - mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; - mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2; - - mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); - mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0); - mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2; - mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2; - - mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1); - mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0); - mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; - mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; - - mp->m_bsize = XFS_FSB_TO_BB(mp, 1); - mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, - sbp->sb_inopblock); - mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; -} - -/* - * xfs_initialize_perag_data - * - * Read in each per-ag structure so we can count up the number of - * allocated inodes, free inodes and used filesystem blocks as this - * information is no longer persistent in the superblock. Once we have - * this information, write it into the in-core superblock structure. - */ -STATIC int -xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount) -{ - xfs_agnumber_t index; - xfs_perag_t *pag; - xfs_sb_t *sbp = &mp->m_sb; - uint64_t ifree = 0; - uint64_t ialloc = 0; - uint64_t bfree = 0; - uint64_t bfreelst = 0; - uint64_t btree = 0; - int error; - - for (index = 0; index < agcount; index++) { - /* - * read the agf, then the agi. This gets us - * all the information we need and populates the - * per-ag structures for us. - */ - error = xfs_alloc_pagf_init(mp, NULL, index, 0); - if (error) - return error; - - error = xfs_ialloc_pagi_init(mp, NULL, index); - if (error) - return error; - pag = xfs_perag_get(mp, index); - ifree += pag->pagi_freecount; - ialloc += pag->pagi_count; - bfree += pag->pagf_freeblks; - bfreelst += pag->pagf_flcount; - btree += pag->pagf_btreeblks; - xfs_perag_put(pag); - } - /* - * Overwrite incore superblock counters with just-read data - */ - spin_lock(&mp->m_sb_lock); - sbp->sb_ifree = ifree; - sbp->sb_icount = ialloc; - sbp->sb_fdblocks = bfree + bfreelst + btree; - spin_unlock(&mp->m_sb_lock); - - /* Fixup the per-cpu counters as well. */ - xfs_icsb_reinit_counters(mp); - - return 0; -} - /* * Update alignment values based on mount options and sb values */ @@ -872,42 +389,27 @@ xfs_update_alignment(xfs_mount_t *mp) */ if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || (BBTOB(mp->m_swidth) & mp->m_blockmask)) { - if (mp->m_flags & XFS_MOUNT_RETERR) { - xfs_warn(mp, "alignment check failed: " - "(sunit/swidth vs. blocksize)"); - return XFS_ERROR(EINVAL); - } - mp->m_dalign = mp->m_swidth = 0; + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. blocksize(%d)", + sbp->sb_blocksize); + return XFS_ERROR(EINVAL); } else { /* * Convert the stripe unit and width to FSBs. */ mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { - if (mp->m_flags & XFS_MOUNT_RETERR) { - xfs_warn(mp, "alignment check failed: " - "(sunit/swidth vs. ag size)"); - return XFS_ERROR(EINVAL); - } xfs_warn(mp, - "stripe alignment turned off: sunit(%d)/swidth(%d) " - "incompatible with agsize(%d)", - mp->m_dalign, mp->m_swidth, - sbp->sb_agblocks); - - mp->m_dalign = 0; - mp->m_swidth = 0; + "alignment check failed: sunit/swidth vs. agsize(%d)", + sbp->sb_agblocks); + return XFS_ERROR(EINVAL); } else if (mp->m_dalign) { mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); } else { - if (mp->m_flags & XFS_MOUNT_RETERR) { - xfs_warn(mp, "alignment check failed: " - "sunit(%d) less than bsize(%d)", - mp->m_dalign, - mp->m_blockmask +1); - return XFS_ERROR(EINVAL); - } - mp->m_swidth = 0; + xfs_warn(mp, + "alignment check failed: sunit(%d) less than bsize(%d)", + mp->m_dalign, sbp->sb_blocksize); + return XFS_ERROR(EINVAL); } } @@ -924,6 +426,10 @@ xfs_update_alignment(xfs_mount_t *mp) sbp->sb_width = mp->m_swidth; mp->m_update_flags |= XFS_SB_WIDTH; } + } else { + xfs_warn(mp, + "cannot change alignment: superblock does not support data alignment"); + return XFS_ERROR(EINVAL); } } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && xfs_sb_version_hasdalign(&mp->m_sb)) { @@ -1039,7 +545,7 @@ xfs_set_inoalignment(xfs_mount_t *mp) } /* - * Check that the data (and log if separate) are an ok size. + * Check that the data (and log if separate) is an ok size. */ STATIC int xfs_check_sizes(xfs_mount_t *mp) @@ -1109,8 +615,7 @@ xfs_mount_reset_sbqflags( return 0; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); - error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0); if (error) { xfs_trans_cancel(tp, 0); xfs_alert(mp, "%s: Superblock update failed!", __func__); @@ -1160,7 +665,7 @@ xfs_mountfs( uint quotaflags = 0; int error = 0; - xfs_mount_common(mp, sbp); + xfs_sb_mount_common(mp, sbp); /* * Check for a mismatched features2 values. Older kernels @@ -1203,6 +708,12 @@ xfs_mountfs( mp->m_update_flags |= XFS_SB_VERSIONNUM; } + /* always use v2 inodes by default now */ + if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) { + mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT; + mp->m_update_flags |= XFS_SB_VERSIONNUM; + } + /* * Check if sb_agblocks is aligned at stripe boundary * If sb_agblocks is NOT aligned turn off m_dalign since @@ -1236,8 +747,20 @@ xfs_mountfs( * Set the inode cluster size. * This may still be overridden by the file system * block size if it is larger than the chosen cluster size. + * + * For v5 filesystems, scale the cluster size with the inode size to + * keep a constant ratio of inode per cluster buffer, but only if mkfs + * has set the inode alignment value appropriately for larger cluster + * sizes. */ mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int new_size = mp->m_inode_cluster_size; + + new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; + if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) + mp->m_inode_cluster_size = new_size; + } /* * Set inode alignment fields @@ -1245,7 +768,7 @@ xfs_mountfs( xfs_set_inoalignment(mp); /* - * Check that the data (and log if separate) are an ok size. + * Check that the data (and log if separate) is an ok size. */ error = xfs_check_sizes(mp); if (error) @@ -1268,12 +791,11 @@ xfs_mountfs( mp->m_dmevmask = 0; /* not persistent; set after each mount */ - xfs_dir_mount(mp); - - /* - * Initialize the attribute manager's entries. - */ - mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100; + error = xfs_da_mount(mp); + if (error) { + xfs_warn(mp, "Failed dir/attr init: %d", error); + goto out_remove_uuid; + } /* * Initialize the precomputed transaction reservations values. @@ -1288,7 +810,7 @@ xfs_mountfs( error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); if (error) { xfs_warn(mp, "Failed per-ag init: %d", error); - goto out_remove_uuid; + goto out_free_dir; } if (!sbp->sb_logblocks) { @@ -1463,6 +985,8 @@ xfs_mountfs( xfs_wait_buftarg(mp->m_ddev_targp); out_free_perag: xfs_free_perag(mp); + out_free_dir: + xfs_da_unmount(mp); out_remove_uuid: xfs_uuid_unmount(mp); out: @@ -1540,6 +1064,7 @@ xfs_unmountfs( "Freespace may not be correct on next mount."); xfs_log_unmount(mp); + xfs_da_unmount(mp); xfs_uuid_unmount(mp); #if defined(DEBUG) @@ -1583,8 +1108,7 @@ xfs_log_sbcount(xfs_mount_t *mp) return 0; tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); - error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); if (error) { xfs_trans_cancel(tp, 0); return error; @@ -1597,48 +1121,7 @@ xfs_log_sbcount(xfs_mount_t *mp) } /* - * xfs_mod_sb() can be used to copy arbitrary changes to the - * in-core superblock into the superblock buffer to be logged. - * It does not provide the higher level of locking that is - * needed to protect the in-core superblock from concurrent - * access. - */ -void -xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) -{ - xfs_buf_t *bp; - int first; - int last; - xfs_mount_t *mp; - xfs_sb_field_t f; - - ASSERT(fields); - if (!fields) - return; - mp = tp->t_mountp; - bp = xfs_trans_getsb(tp, mp, 0); - first = sizeof(xfs_sb_t); - last = 0; - - /* translate/copy */ - - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields); - - /* find modified range */ - f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields); - ASSERT((1LL << f) & XFS_SB_MOD_BITS); - last = xfs_sb_info[f + 1].offset - 1; - - f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); - ASSERT((1LL << f) & XFS_SB_MOD_BITS); - first = xfs_sb_info[f].offset; - - xfs_trans_log_buf(tp, bp, first, last); -} - - -/* - * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply + * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply * a delta to a specified field in the in-core superblock. Simply * switch on the field indicated and apply the delta to that field. * Fields are not allowed to dip below zero, so if the delta would @@ -1945,8 +1428,7 @@ xfs_mount_log_sb( XFS_SB_VERSIONNUM)); tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); - error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); if (error) { xfs_trans_cancel(tp, 0); return error; @@ -2104,12 +1586,6 @@ xfs_icsb_init_counters( if (mp->m_sb_cnts == NULL) return -ENOMEM; -#ifdef CONFIG_HOTPLUG_CPU - mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify; - mp->m_icsb_notifier.priority = 0; - register_hotcpu_notifier(&mp->m_icsb_notifier); -#endif /* CONFIG_HOTPLUG_CPU */ - for_each_online_cpu(i) { cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); @@ -2122,6 +1598,13 @@ xfs_icsb_init_counters( * initial balance kicks us off correctly */ mp->m_icsb_counters = -1; + +#ifdef CONFIG_HOTPLUG_CPU + mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify; + mp->m_icsb_notifier.priority = 0; + register_hotcpu_notifier(&mp->m_icsb_notifier); +#endif /* CONFIG_HOTPLUG_CPU */ + return 0; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index bab8314507e..7295a0b7c34 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -18,38 +18,7 @@ #ifndef __XFS_MOUNT_H__ #define __XFS_MOUNT_H__ -typedef struct xfs_trans_reservations { - uint tr_write; /* extent alloc trans */ - uint tr_itruncate; /* truncate trans */ - uint tr_rename; /* rename trans */ - uint tr_link; /* link trans */ - uint tr_remove; /* unlink trans */ - uint tr_symlink; /* symlink trans */ - uint tr_create; /* create trans */ - uint tr_mkdir; /* mkdir trans */ - uint tr_ifree; /* inode free trans */ - uint tr_ichange; /* inode update trans */ - uint tr_growdata; /* fs data section grow trans */ - uint tr_swrite; /* sync write inode trans */ - uint tr_addafork; /* cvt inode to attributed trans */ - uint tr_writeid; /* write setuid/setgid file */ - uint tr_attrinval; /* attr fork buffer invalidation */ - uint tr_attrset; /* set/create an attribute */ - uint tr_attrrm; /* remove an attribute */ - uint tr_clearagi; /* clear bad agi unlinked ino bucket */ - uint tr_growrtalloc; /* grow realtime allocations */ - uint tr_growrtzero; /* grow realtime zeroing */ - uint tr_growrtfree; /* grow realtime freeing */ -} xfs_trans_reservations_t; - -#ifndef __KERNEL__ - -#define xfs_daddr_to_agno(mp,d) \ - ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks)) -#define xfs_daddr_to_agbno(mp,d) \ - ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks)) - -#else /* __KERNEL__ */ +#ifdef __KERNEL__ struct xlog; struct xfs_inode; @@ -57,6 +26,8 @@ struct xfs_mru_cache; struct xfs_nameops; struct xfs_ail; struct xfs_quotainfo; +struct xfs_dir_ops; +struct xfs_da_geometry; #ifdef HAVE_PERCPU_SB @@ -126,6 +97,8 @@ typedef struct xfs_mount { uint m_readio_blocks; /* min read size blocks */ uint m_writeio_log; /* min write size log bytes */ uint m_writeio_blocks; /* min write size blocks */ + struct xfs_da_geometry *m_dir_geo; /* directory block geometry */ + struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */ struct xlog *m_log; /* log specific stuff */ int m_logbufs; /* number of log buffers */ int m_logbsize; /* size of each log buffer */ @@ -142,7 +115,7 @@ typedef struct xfs_mount { __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ __uint8_t m_agno_log; /* log #ag's */ __uint8_t m_agino_log; /* #bits for agino in inum */ - __uint16_t m_inode_cluster_size;/* min inode buf size */ + uint m_inode_cluster_size;/* min inode buf size */ uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ uint m_blockwmask; /* blockwsize-1 */ @@ -161,13 +134,11 @@ typedef struct xfs_mount { int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ __uint64_t m_flags; /* global mount flags */ - uint m_dir_node_ents; /* #entries in a dir danode */ - uint m_attr_node_ents; /* #entries in attr danode */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ int m_inoalign_mask;/* mask sb_inoalignmt if used */ uint m_qflags; /* quota status flags */ - xfs_trans_reservations_t m_reservations;/* precomputed res values */ + struct xfs_trans_resv m_resv; /* precomputed res values */ __uint64_t m_maxicount; /* maximum inode count */ __uint64_t m_resblks; /* total reserved blocks */ __uint64_t m_resblks_avail;/* available reserved blocks */ @@ -175,18 +146,11 @@ typedef struct xfs_mount { int m_dalign; /* stripe unit */ int m_swidth; /* stripe width */ int m_sinoalign; /* stripe unit inode alignment */ - int m_attr_magicpct;/* 37% of the blocksize */ - int m_dir_magicpct; /* 37% of the dir blocksize */ __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ - int m_dirblksize; /* directory block sz--bytes */ - int m_dirblkfsbs; /* directory block sz--fsbs */ - xfs_dablk_t m_dirdatablk; /* blockno of dir data v2 */ - xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */ - xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */ + const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */ + const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */ uint m_chsize; /* size of next field */ - struct xfs_chash *m_chash; /* fs private inode per-cluster - * hash table */ atomic_t m_active_trans; /* number trans frozen */ #ifdef HAVE_PERCPU_SB xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ @@ -200,7 +164,6 @@ typedef struct xfs_mount { trimming */ __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ - struct shrinker m_inode_shrink; /* inode reclaim shrinker */ int64_t m_low_space[XFS_LOWSP_MAX]; /* low free space thresholds */ @@ -223,8 +186,6 @@ typedef struct xfs_mount { operations, typically for disk errors in metadata */ #define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */ -#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to - user */ #define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment allocations */ #define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ @@ -328,14 +289,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) } /* - * perag get/put wrappers for ref counting - */ -struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); -struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, - int tag); -void xfs_perag_put(struct xfs_perag *pag); - -/* * Per-cpu superblock locking functions */ #ifdef HAVE_PERCPU_SB @@ -364,9 +317,63 @@ typedef struct xfs_mod_sb { int64_t msb_delta; /* Change to make to specified field */ } xfs_mod_sb_t; +/* + * Per-ag incore structure, copies of information in agf and agi, to improve the + * performance of allocation group selection. This is defined for the kernel + * only, and hence is defined here instead of in xfs_ag.h. You need the struct + * xfs_mount to be defined to look up a xfs_perag anyway (via mp->m_perag_tree), + * so this doesn't introduce any strange header file dependencies. + */ +typedef struct xfs_perag { + struct xfs_mount *pag_mount; /* owner filesystem */ + xfs_agnumber_t pag_agno; /* AG this structure belongs to */ + atomic_t pag_ref; /* perag reference count */ + char pagf_init; /* this agf's entry is initialized */ + char pagi_init; /* this agi's entry is initialized */ + char pagf_metadata; /* the agf is preferred to be metadata */ + char pagi_inodeok; /* The agi is ok for inodes */ + __uint8_t pagf_levels[XFS_BTNUM_AGF]; + /* # of levels in bno & cnt btree */ + __uint32_t pagf_flcount; /* count of blocks in freelist */ + xfs_extlen_t pagf_freeblks; /* total free blocks */ + xfs_extlen_t pagf_longest; /* longest free space */ + __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ + xfs_agino_t pagi_freecount; /* number of free inodes */ + xfs_agino_t pagi_count; /* number of allocated inodes */ + + /* + * Inode allocation search lookup optimisation. + * If the pagino matches, the search for new inodes + * doesn't need to search the near ones again straight away + */ + xfs_agino_t pagl_pagino; + xfs_agino_t pagl_leftrec; + xfs_agino_t pagl_rightrec; + spinlock_t pagb_lock; /* lock for pagb_tree */ + struct rb_root pagb_tree; /* ordered tree of busy extents */ + + atomic_t pagf_fstrms; /* # of filestreams active in this AG */ + + spinlock_t pag_ici_lock; /* incore inode cache lock */ + struct radix_tree_root pag_ici_root; /* incore inode cache root */ + int pag_ici_reclaimable; /* reclaimable inodes */ + struct mutex pag_ici_reclaim_lock; /* serialisation point */ + unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ + + /* buffer cache index */ + spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ + struct rb_root pag_buf_tree; /* ordered tree of active buffers */ + + /* for rcu-safe freeing */ + struct rcu_head rcu_head; + int pagb_count; /* pagb slots in use */ +} xfs_perag_t; + extern int xfs_log_sbcount(xfs_mount_t *); extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); +extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, + xfs_agnumber_t *maxagi); extern void xfs_unmountfs(xfs_mount_t *); extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); @@ -385,12 +392,4 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); #endif /* __KERNEL__ */ -extern void xfs_mod_sb(struct xfs_trans *, __int64_t); -extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, - xfs_agnumber_t *); -extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); -extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); - -extern const struct xfs_buf_ops xfs_sb_buf_ops; - #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 4aff5639573..f99b4933dc2 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -100,14 +100,20 @@ * likely result in a loop in one of the lists. That's a sure-fire recipe for * an infinite loop in the code. */ -typedef struct xfs_mru_cache_elem -{ - struct list_head list_node; - unsigned long key; - void *value; -} xfs_mru_cache_elem_t; +struct xfs_mru_cache { + struct radix_tree_root store; /* Core storage data structure. */ + struct list_head *lists; /* Array of lists, one per grp. */ + struct list_head reap_list; /* Elements overdue for reaping. */ + spinlock_t lock; /* Lock to protect this struct. */ + unsigned int grp_count; /* Number of discrete groups. */ + unsigned int grp_time; /* Time period spanned by grps. */ + unsigned int lru_grp; /* Group containing time zero. */ + unsigned long time_zero; /* Time first element was added. */ + xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */ + struct delayed_work work; /* Workqueue data for reaping. */ + unsigned int queued; /* work has been queued */ +}; -static kmem_zone_t *xfs_mru_elem_zone; static struct workqueue_struct *xfs_mru_reap_wq; /* @@ -129,12 +135,12 @@ static struct workqueue_struct *xfs_mru_reap_wq; */ STATIC unsigned long _xfs_mru_cache_migrate( - xfs_mru_cache_t *mru, - unsigned long now) + struct xfs_mru_cache *mru, + unsigned long now) { - unsigned int grp; - unsigned int migrated = 0; - struct list_head *lru_list; + unsigned int grp; + unsigned int migrated = 0; + struct list_head *lru_list; /* Nothing to do if the data store is empty. */ if (!mru->time_zero) @@ -193,11 +199,11 @@ _xfs_mru_cache_migrate( */ STATIC void _xfs_mru_cache_list_insert( - xfs_mru_cache_t *mru, - xfs_mru_cache_elem_t *elem) + struct xfs_mru_cache *mru, + struct xfs_mru_cache_elem *elem) { - unsigned int grp = 0; - unsigned long now = jiffies; + unsigned int grp = 0; + unsigned long now = jiffies; /* * If the data store is empty, initialise time zero, leave grp set to @@ -231,10 +237,10 @@ _xfs_mru_cache_list_insert( */ STATIC void _xfs_mru_cache_clear_reap_list( - xfs_mru_cache_t *mru) __releases(mru->lock) __acquires(mru->lock) - + struct xfs_mru_cache *mru) + __releases(mru->lock) __acquires(mru->lock) { - xfs_mru_cache_elem_t *elem, *next; + struct xfs_mru_cache_elem *elem, *next; struct list_head tmp; INIT_LIST_HEAD(&tmp); @@ -252,15 +258,8 @@ _xfs_mru_cache_clear_reap_list( spin_unlock(&mru->lock); list_for_each_entry_safe(elem, next, &tmp, list_node) { - - /* Remove the element from the reap list. */ list_del_init(&elem->list_node); - - /* Call the client's free function with the key and value pointer. */ - mru->free_func(elem->key, elem->value); - - /* Free the element structure. */ - kmem_zone_free(xfs_mru_elem_zone, elem); + mru->free_func(elem); } spin_lock(&mru->lock); @@ -277,7 +276,8 @@ STATIC void _xfs_mru_cache_reap( struct work_struct *work) { - xfs_mru_cache_t *mru = container_of(work, xfs_mru_cache_t, work.work); + struct xfs_mru_cache *mru = + container_of(work, struct xfs_mru_cache, work.work); unsigned long now, next; ASSERT(mru && mru->lists); @@ -304,28 +304,16 @@ _xfs_mru_cache_reap( int xfs_mru_cache_init(void) { - xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t), - "xfs_mru_cache_elem"); - if (!xfs_mru_elem_zone) - goto out; - xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1); if (!xfs_mru_reap_wq) - goto out_destroy_mru_elem_zone; - + return -ENOMEM; return 0; - - out_destroy_mru_elem_zone: - kmem_zone_destroy(xfs_mru_elem_zone); - out: - return -ENOMEM; } void xfs_mru_cache_uninit(void) { destroy_workqueue(xfs_mru_reap_wq); - kmem_zone_destroy(xfs_mru_elem_zone); } /* @@ -336,14 +324,14 @@ xfs_mru_cache_uninit(void) */ int xfs_mru_cache_create( - xfs_mru_cache_t **mrup, + struct xfs_mru_cache **mrup, unsigned int lifetime_ms, unsigned int grp_count, xfs_mru_cache_free_func_t free_func) { - xfs_mru_cache_t *mru = NULL; - int err = 0, grp; - unsigned int grp_time; + struct xfs_mru_cache *mru = NULL; + int err = 0, grp; + unsigned int grp_time; if (mrup) *mrup = NULL; @@ -400,7 +388,7 @@ exit: */ static void xfs_mru_cache_flush( - xfs_mru_cache_t *mru) + struct xfs_mru_cache *mru) { if (!mru || !mru->lists) return; @@ -420,7 +408,7 @@ xfs_mru_cache_flush( void xfs_mru_cache_destroy( - xfs_mru_cache_t *mru) + struct xfs_mru_cache *mru) { if (!mru || !mru->lists) return; @@ -438,38 +426,30 @@ xfs_mru_cache_destroy( */ int xfs_mru_cache_insert( - xfs_mru_cache_t *mru, - unsigned long key, - void *value) + struct xfs_mru_cache *mru, + unsigned long key, + struct xfs_mru_cache_elem *elem) { - xfs_mru_cache_elem_t *elem; + int error; ASSERT(mru && mru->lists); if (!mru || !mru->lists) return EINVAL; - elem = kmem_zone_zalloc(xfs_mru_elem_zone, KM_SLEEP); - if (!elem) + if (radix_tree_preload(GFP_KERNEL)) return ENOMEM; - if (radix_tree_preload(GFP_KERNEL)) { - kmem_zone_free(xfs_mru_elem_zone, elem); - return ENOMEM; - } - INIT_LIST_HEAD(&elem->list_node); elem->key = key; - elem->value = value; spin_lock(&mru->lock); - - radix_tree_insert(&mru->store, key, elem); + error = -radix_tree_insert(&mru->store, key, elem); radix_tree_preload_end(); - _xfs_mru_cache_list_insert(mru, elem); - + if (!error) + _xfs_mru_cache_list_insert(mru, elem); spin_unlock(&mru->lock); - return 0; + return error; } /* @@ -478,13 +458,12 @@ xfs_mru_cache_insert( * the client data pointer for the removed element is returned, otherwise this * function will return a NULL pointer. */ -void * +struct xfs_mru_cache_elem * xfs_mru_cache_remove( - xfs_mru_cache_t *mru, - unsigned long key) + struct xfs_mru_cache *mru, + unsigned long key) { - xfs_mru_cache_elem_t *elem; - void *value = NULL; + struct xfs_mru_cache_elem *elem; ASSERT(mru && mru->lists); if (!mru || !mru->lists) @@ -492,17 +471,11 @@ xfs_mru_cache_remove( spin_lock(&mru->lock); elem = radix_tree_delete(&mru->store, key); - if (elem) { - value = elem->value; + if (elem) list_del(&elem->list_node); - } - spin_unlock(&mru->lock); - if (elem) - kmem_zone_free(xfs_mru_elem_zone, elem); - - return value; + return elem; } /* @@ -511,13 +484,14 @@ xfs_mru_cache_remove( */ void xfs_mru_cache_delete( - xfs_mru_cache_t *mru, - unsigned long key) + struct xfs_mru_cache *mru, + unsigned long key) { - void *value = xfs_mru_cache_remove(mru, key); + struct xfs_mru_cache_elem *elem; - if (value) - mru->free_func(key, value); + elem = xfs_mru_cache_remove(mru, key); + if (elem) + mru->free_func(elem); } /* @@ -540,12 +514,12 @@ xfs_mru_cache_delete( * status, we need to help it get it right by annotating the path that does * not release the lock. */ -void * +struct xfs_mru_cache_elem * xfs_mru_cache_lookup( - xfs_mru_cache_t *mru, - unsigned long key) + struct xfs_mru_cache *mru, + unsigned long key) { - xfs_mru_cache_elem_t *elem; + struct xfs_mru_cache_elem *elem; ASSERT(mru && mru->lists); if (!mru || !mru->lists) @@ -560,7 +534,7 @@ xfs_mru_cache_lookup( } else spin_unlock(&mru->lock); - return elem ? elem->value : NULL; + return elem; } /* @@ -570,7 +544,8 @@ xfs_mru_cache_lookup( */ void xfs_mru_cache_done( - xfs_mru_cache_t *mru) __releases(mru->lock) + struct xfs_mru_cache *mru) + __releases(mru->lock) { spin_unlock(&mru->lock); } diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h index 36dd3ec8b4e..fb5245ba5ff 100644 --- a/fs/xfs/xfs_mru_cache.h +++ b/fs/xfs/xfs_mru_cache.h @@ -18,24 +18,15 @@ #ifndef __XFS_MRU_CACHE_H__ #define __XFS_MRU_CACHE_H__ +struct xfs_mru_cache; -/* Function pointer type for callback to free a client's data pointer. */ -typedef void (*xfs_mru_cache_free_func_t)(unsigned long, void*); +struct xfs_mru_cache_elem { + struct list_head list_node; + unsigned long key; +}; -typedef struct xfs_mru_cache -{ - struct radix_tree_root store; /* Core storage data structure. */ - struct list_head *lists; /* Array of lists, one per grp. */ - struct list_head reap_list; /* Elements overdue for reaping. */ - spinlock_t lock; /* Lock to protect this struct. */ - unsigned int grp_count; /* Number of discrete groups. */ - unsigned int grp_time; /* Time period spanned by grps. */ - unsigned int lru_grp; /* Group containing time zero. */ - unsigned long time_zero; /* Time first element was added. */ - xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */ - struct delayed_work work; /* Workqueue data for reaping. */ - unsigned int queued; /* work has been queued */ -} xfs_mru_cache_t; +/* Function pointer type for callback to free a client's data pointer. */ +typedef void (*xfs_mru_cache_free_func_t)(struct xfs_mru_cache_elem *elem); int xfs_mru_cache_init(void); void xfs_mru_cache_uninit(void); @@ -44,10 +35,12 @@ int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, xfs_mru_cache_free_func_t free_func); void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, - void *value); -void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); + struct xfs_mru_cache_elem *elem); +struct xfs_mru_cache_elem * +xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); -void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); +struct xfs_mru_cache_elem * +xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); void xfs_mru_cache_done(struct xfs_mru_cache *mru); #endif /* __XFS_MRU_CACHE_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 60eff476315..6d26759c779 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -17,30 +17,28 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_ialloc.h" #include "xfs_itable.h" -#include "xfs_rtalloc.h" +#include "xfs_quota.h" #include "xfs_error.h" #include "xfs_bmap.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans.h" #include "xfs_trans_space.h" -#include "xfs_utils.h" #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_cksum.h" +#include "xfs_dinode.h" /* * The global quota manager. There is only one of these for the entire @@ -50,8 +48,9 @@ */ STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); -STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *); + +STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp); /* * We use the batch lookup interface to iterate over the dquots as it * currently is the only interface into the radix tree code that allows @@ -69,7 +68,7 @@ xfs_qm_dquot_walk( void *data) { struct xfs_quotainfo *qi = mp->m_quotainfo; - struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); uint32_t next_index; int last_error = 0; int skipped; @@ -135,7 +134,6 @@ xfs_qm_dqpurge( { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; - struct xfs_dquot *gdqp = NULL; xfs_dqlock(dqp); if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { @@ -143,16 +141,6 @@ xfs_qm_dqpurge( return EAGAIN; } - /* - * If this quota has a group hint attached, prepare for releasing it - * now. - */ - gdqp = dqp->q_gdquot; - if (gdqp) { - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - dqp->dq_flags |= XFS_DQ_FREEING; xfs_dqflock(dqp); @@ -188,7 +176,7 @@ xfs_qm_dqpurge( xfs_dqfunlock(dqp); xfs_dqunlock(dqp); - radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), + radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), be32_to_cpu(dqp->q_core.d_id)); qi->qi_dquots--; @@ -196,17 +184,11 @@ xfs_qm_dqpurge( * We move dquots to the freelist as soon as their reference count * hits zero, so it really should be on the freelist here. */ - mutex_lock(&qi->qi_lru_lock); ASSERT(!list_empty(&dqp->q_lru)); - list_del_init(&dqp->q_lru); - qi->qi_lru_count--; + list_lru_del(&qi->qi_lru, &dqp->q_lru); XFS_STATS_DEC(xs_qm_dquot_unused); - mutex_unlock(&qi->qi_lru_lock); xfs_qm_dqdestroy(dqp); - - if (gdqp) - xfs_qm_dqput(gdqp); return 0; } @@ -298,8 +280,10 @@ xfs_qm_mount_quotas( */ if (!XFS_IS_UQUOTA_ON(mp)) mp->m_qflags &= ~XFS_UQUOTA_CHKD; - if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) - mp->m_qflags &= ~XFS_OQUOTA_CHKD; + if (!XFS_IS_GQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_GQUOTA_CHKD; + if (!XFS_IS_PQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_PQUOTA_CHKD; write_changes: /* @@ -361,6 +345,10 @@ xfs_qm_unmount_quotas( IRELE(mp->m_quotainfo->qi_gquotaip); mp->m_quotainfo->qi_gquotaip = NULL; } + if (mp->m_quotainfo->qi_pquotaip) { + IRELE(mp->m_quotainfo->qi_pquotaip); + mp->m_quotainfo->qi_pquotaip = NULL; + } } } @@ -370,7 +358,6 @@ xfs_qm_dqattach_one( xfs_dqid_t id, uint type, uint doalloc, - xfs_dquot_t *udqhint, /* hint */ xfs_dquot_t **IO_idqpp) { xfs_dquot_t *dqp; @@ -380,9 +367,9 @@ xfs_qm_dqattach_one( error = 0; /* - * See if we already have it in the inode itself. IO_idqpp is - * &i_udquot or &i_gdquot. This made the code look weird, but - * made the logic a lot simpler. + * See if we already have it in the inode itself. IO_idqpp is &i_udquot + * or &i_gdquot. This made the code look weird, but made the logic a lot + * simpler. */ dqp = *IO_idqpp; if (dqp) { @@ -391,46 +378,10 @@ xfs_qm_dqattach_one( } /* - * udqhint is the i_udquot field in inode, and is non-NULL only - * when the type arg is group/project. Its purpose is to save a - * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside - * the user dquot. - */ - if (udqhint) { - ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ); - xfs_dqlock(udqhint); - - /* - * No need to take dqlock to look at the id. - * - * The ID can't change until it gets reclaimed, and it won't - * be reclaimed as long as we have a ref from inode and we - * hold the ilock. - */ - dqp = udqhint->q_gdquot; - if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) { - ASSERT(*IO_idqpp == NULL); - - *IO_idqpp = xfs_qm_dqhold(dqp); - xfs_dqunlock(udqhint); - return 0; - } - - /* - * We can't hold a dquot lock when we call the dqget code. - * We'll deadlock in no time, because of (not conforming to) - * lock ordering - the inodelock comes before any dquot lock, - * and we may drop and reacquire the ilock in xfs_qm_dqget(). - */ - xfs_dqunlock(udqhint); - } - - /* - * Find the dquot from somewhere. This bumps the - * reference count of dquot and returns it locked. - * This can return ENOENT if dquot didn't exist on - * disk and we didn't ask it to allocate; - * ESRCH if quotas got turned off suddenly. + * Find the dquot from somewhere. This bumps the reference count of + * dquot and returns it locked. This can return ENOENT if dquot didn't + * exist on disk and we didn't ask it to allocate; ESRCH if quotas got + * turned off suddenly. */ error = xfs_qm_dqget(ip->i_mount, ip, id, type, doalloc | XFS_QMOPT_DOWARN, &dqp); @@ -448,34 +399,6 @@ xfs_qm_dqattach_one( return 0; } - -/* - * Given a udquot and gdquot, attach a ptr to the group dquot in the - * udquot as a hint for future lookups. - */ -STATIC void -xfs_qm_dqattach_grouphint( - xfs_dquot_t *udq, - xfs_dquot_t *gdq) -{ - xfs_dquot_t *tmp; - - xfs_dqlock(udq); - - tmp = udq->q_gdquot; - if (tmp) { - if (tmp == gdq) - goto done; - - udq->q_gdquot = NULL; - xfs_qm_dqrele(tmp); - } - - udq->q_gdquot = xfs_qm_dqhold(gdq); -done: - xfs_dqunlock(udq); -} - static bool xfs_qm_need_dqattach( struct xfs_inode *ip) @@ -488,8 +411,7 @@ xfs_qm_need_dqattach( return false; if (!XFS_NOT_DQATTACHED(mp, ip)) return false; - if (ip->i_ino == mp->m_sb.sb_uquotino || - ip->i_ino == mp->m_sb.sb_gquotino) + if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) return false; return true; } @@ -507,7 +429,6 @@ xfs_qm_dqattach_locked( uint flags) { xfs_mount_t *mp = ip->i_mount; - uint nquotas = 0; int error = 0; if (!xfs_qm_need_dqattach(ip)) @@ -515,62 +436,39 @@ xfs_qm_dqattach_locked( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (XFS_IS_UQUOTA_ON(mp)) { + if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, flags & XFS_QMOPT_DQALLOC, - NULL, &ip->i_udquot); + &ip->i_udquot); if (error) goto done; - nquotas++; + ASSERT(ip->i_udquot); } - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (XFS_IS_OQUOTA_ON(mp)) { - error = XFS_IS_GQUOTA_ON(mp) ? - xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, + if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { + error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, flags & XFS_QMOPT_DQALLOC, - ip->i_udquot, &ip->i_gdquot) : - xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, + &ip->i_gdquot); + if (error) + goto done; + ASSERT(ip->i_gdquot); + } + + if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { + error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, flags & XFS_QMOPT_DQALLOC, - ip->i_udquot, &ip->i_gdquot); - /* - * Don't worry about the udquot that we may have - * attached above. It'll get detached, if not already. - */ + &ip->i_pdquot); if (error) goto done; - nquotas++; + ASSERT(ip->i_pdquot); } +done: /* - * Attach this group quota to the user quota as a hint. - * This WON'T, in general, result in a thrash. + * Don't worry about the dquots that we may have attached before any + * error - they'll get detached later if it has not already been done. */ - if (nquotas == 2) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(ip->i_udquot); - ASSERT(ip->i_gdquot); - - /* - * We do not have i_udquot locked at this point, but this check - * is OK since we don't depend on the i_gdquot to be accurate - * 100% all the time. It is just a hint, and this will - * succeed in general. - */ - if (ip->i_udquot->q_gdquot != ip->i_gdquot) - xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); - } - - done: -#ifdef DEBUG - if (!error) { - if (XFS_IS_UQUOTA_ON(mp)) - ASSERT(ip->i_udquot); - if (XFS_IS_OQUOTA_ON(mp)) - ASSERT(ip->i_gdquot); - } ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); -#endif return error; } @@ -600,13 +498,12 @@ void xfs_qm_dqdetach( xfs_inode_t *ip) { - if (!(ip->i_udquot || ip->i_gdquot)) + if (!(ip->i_udquot || ip->i_gdquot || ip->i_pdquot)) return; trace_xfs_dquot_dqdetach(ip); - ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino); - ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino); + ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino)); if (ip->i_udquot) { xfs_qm_dqrele(ip->i_udquot); ip->i_udquot = NULL; @@ -615,6 +512,147 @@ xfs_qm_dqdetach( xfs_qm_dqrele(ip->i_gdquot); ip->i_gdquot = NULL; } + if (ip->i_pdquot) { + xfs_qm_dqrele(ip->i_pdquot); + ip->i_pdquot = NULL; + } +} + +struct xfs_qm_isolate { + struct list_head buffers; + struct list_head dispose; +}; + +static enum lru_status +xfs_qm_dquot_isolate( + struct list_head *item, + spinlock_t *lru_lock, + void *arg) +{ + struct xfs_dquot *dqp = container_of(item, + struct xfs_dquot, q_lru); + struct xfs_qm_isolate *isol = arg; + + if (!xfs_dqlock_nowait(dqp)) + goto out_miss_busy; + + /* + * This dquot has acquired a reference in the meantime remove it from + * the freelist and try again. + */ + if (dqp->q_nrefs) { + xfs_dqunlock(dqp); + XFS_STATS_INC(xs_qm_dqwants); + + trace_xfs_dqreclaim_want(dqp); + list_del_init(&dqp->q_lru); + XFS_STATS_DEC(xs_qm_dquot_unused); + return LRU_REMOVED; + } + + /* + * If the dquot is dirty, flush it. If it's already being flushed, just + * skip it so there is time for the IO to complete before we try to + * reclaim it again on the next LRU pass. + */ + if (!xfs_dqflock_nowait(dqp)) { + xfs_dqunlock(dqp); + goto out_miss_busy; + } + + if (XFS_DQ_IS_DIRTY(dqp)) { + struct xfs_buf *bp = NULL; + int error; + + trace_xfs_dqreclaim_dirty(dqp); + + /* we have to drop the LRU lock to flush the dquot */ + spin_unlock(lru_lock); + + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(dqp->q_mount, "%s: dquot %p flush failed", + __func__, dqp); + goto out_unlock_dirty; + } + + xfs_buf_delwri_queue(bp, &isol->buffers); + xfs_buf_relse(bp); + goto out_unlock_dirty; + } + xfs_dqfunlock(dqp); + + /* + * Prevent lookups now that we are past the point of no return. + */ + dqp->dq_flags |= XFS_DQ_FREEING; + xfs_dqunlock(dqp); + + ASSERT(dqp->q_nrefs == 0); + list_move_tail(&dqp->q_lru, &isol->dispose); + XFS_STATS_DEC(xs_qm_dquot_unused); + trace_xfs_dqreclaim_done(dqp); + XFS_STATS_INC(xs_qm_dqreclaims); + return LRU_REMOVED; + +out_miss_busy: + trace_xfs_dqreclaim_busy(dqp); + XFS_STATS_INC(xs_qm_dqreclaim_misses); + return LRU_SKIP; + +out_unlock_dirty: + trace_xfs_dqreclaim_busy(dqp); + XFS_STATS_INC(xs_qm_dqreclaim_misses); + xfs_dqunlock(dqp); + spin_lock(lru_lock); + return LRU_RETRY; +} + +static unsigned long +xfs_qm_shrink_scan( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_quotainfo *qi = container_of(shrink, + struct xfs_quotainfo, qi_shrinker); + struct xfs_qm_isolate isol; + unsigned long freed; + int error; + unsigned long nr_to_scan = sc->nr_to_scan; + + if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) + return 0; + + INIT_LIST_HEAD(&isol.buffers); + INIT_LIST_HEAD(&isol.dispose); + + freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol, + &nr_to_scan); + + error = xfs_buf_delwri_submit(&isol.buffers); + if (error) + xfs_warn(NULL, "%s: dquot reclaim failed", __func__); + + while (!list_empty(&isol.dispose)) { + struct xfs_dquot *dqp; + + dqp = list_first_entry(&isol.dispose, struct xfs_dquot, q_lru); + list_del_init(&dqp->q_lru); + xfs_qm_dqfree_one(dqp); + } + + return freed; +} + +static unsigned long +xfs_qm_shrink_count( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_quotainfo *qi = container_of(shrink, + struct xfs_quotainfo, qi_shrinker); + + return list_lru_count_node(&qi->qi_lru, sc->nid); } /* @@ -633,32 +671,29 @@ xfs_qm_init_quotainfo( qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); + error = -list_lru_init(&qinf->qi_lru); + if (error) + goto out_free_qinf; + /* * See if quotainodes are setup, and if not, allocate them, * and change the superblock accordingly. */ - if ((error = xfs_qm_init_quotainos(mp))) { - kmem_free(qinf); - mp->m_quotainfo = NULL; - return error; - } + error = xfs_qm_init_quotainos(mp); + if (error) + goto out_free_lru; INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); + INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS); mutex_init(&qinf->qi_tree_lock); - INIT_LIST_HEAD(&qinf->qi_lru_list); - qinf->qi_lru_count = 0; - mutex_init(&qinf->qi_lru_lock); - /* mutex used to serialize quotaoffs */ mutex_init(&qinf->qi_quotaofflock); /* Precalc some constants */ qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); - ASSERT(qinf->qi_dqchunklen); - qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen); - do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t)); + qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen); mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); @@ -705,7 +740,7 @@ xfs_qm_init_quotainfo( qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); - + xfs_qm_dqdestroy(dqp); } else { qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; @@ -716,10 +751,19 @@ xfs_qm_init_quotainfo( qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; } - qinf->qi_shrinker.shrink = xfs_qm_shake; + qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; + qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; qinf->qi_shrinker.seeks = DEFAULT_SEEKS; + qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; register_shrinker(&qinf->qi_shrinker); return 0; + +out_free_lru: + list_lru_destroy(&qinf->qi_lru); +out_free_qinf: + kmem_free(qinf); + mp->m_quotainfo = NULL; + return error; } @@ -738,6 +782,7 @@ xfs_qm_destroy_quotainfo( ASSERT(qi != NULL); unregister_shrinker(&qi->qi_shrinker); + list_lru_destroy(&qi->qi_lru); if (qi->qi_uquotaip) { IRELE(qi->qi_uquotaip); @@ -747,6 +792,10 @@ xfs_qm_destroy_quotainfo( IRELE(qi->qi_gquotaip); qi->qi_gquotaip = NULL; } + if (qi->qi_pquotaip) { + IRELE(qi->qi_pquotaip); + qi->qi_pquotaip = NULL; + } mutex_destroy(&qi->qi_quotaofflock); kmem_free(qi); mp->m_quotainfo = NULL; @@ -767,21 +816,52 @@ xfs_qm_qino_alloc( int error; int committed; + *ip = NULL; + /* + * With superblock that doesn't have separate pquotino, we + * share an inode between gquota and pquota. If the on-disk + * superblock has GQUOTA and the filesystem is now mounted + * with PQUOTA, just use sb_gquotino for sb_pquotino and + * vice-versa. + */ + if (!xfs_sb_version_has_pquotino(&mp->m_sb) && + (flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) { + xfs_ino_t ino = NULLFSINO; + + if ((flags & XFS_QMOPT_PQUOTA) && + (mp->m_sb.sb_gquotino != NULLFSINO)) { + ino = mp->m_sb.sb_gquotino; + ASSERT(mp->m_sb.sb_pquotino == NULLFSINO); + } else if ((flags & XFS_QMOPT_GQUOTA) && + (mp->m_sb.sb_pquotino != NULLFSINO)) { + ino = mp->m_sb.sb_pquotino; + ASSERT(mp->m_sb.sb_gquotino == NULLFSINO); + } + if (ino != NULLFSINO) { + error = xfs_iget(mp, NULL, ino, 0, 0, ip); + if (error) + return error; + mp->m_sb.sb_gquotino = NULLFSINO; + mp->m_sb.sb_pquotino = NULLFSINO; + } + } + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); - if ((error = xfs_trans_reserve(tp, - XFS_QM_QINOCREATE_SPACE_RES(mp), - XFS_CREATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_CREATE_LOG_COUNT))) { + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create, + XFS_QM_QINOCREATE_SPACE_RES(mp), 0); + if (error) { xfs_trans_cancel(tp, 0); return error; } - error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed); - if (error) { - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT); - return error; + if (!*ip) { + error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, + &committed); + if (error) { + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT); + return error; + } } /* @@ -793,21 +873,25 @@ xfs_qm_qino_alloc( if (flags & XFS_QMOPT_SBVERSION) { ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | - XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == - (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | - XFS_SB_GQUOTINO | XFS_SB_QFLAGS)); + XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | XFS_SB_QFLAGS)) == + (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | + XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | + XFS_SB_QFLAGS)); xfs_sb_version_addquota(&mp->m_sb); mp->m_sb.sb_uquotino = NULLFSINO; mp->m_sb.sb_gquotino = NULLFSINO; + mp->m_sb.sb_pquotino = NULLFSINO; - /* qflags will get updated _after_ quotacheck */ - mp->m_sb.sb_qflags = 0; + /* qflags will get updated fully _after_ quotacheck */ + mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT; } if (flags & XFS_QMOPT_UQUOTA) mp->m_sb.sb_uquotino = (*ip)->i_ino; - else + else if (flags & XFS_QMOPT_GQUOTA) mp->m_sb.sb_gquotino = (*ip)->i_ino; + else + mp->m_sb.sb_pquotino = (*ip)->i_ino; spin_unlock(&mp->m_sb_lock); xfs_mod_sb(tp, sbfields); @@ -826,7 +910,7 @@ xfs_qm_reset_dqcounts( xfs_dqid_t id, uint type) { - xfs_disk_dquot_t *ddq; + struct xfs_dqblk *dqb; int j; trace_xfs_reset_dqcounts(bp, _RET_IP_); @@ -840,15 +924,19 @@ xfs_qm_reset_dqcounts( do_div(j, sizeof(xfs_dqblk_t)); ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif - ddq = bp->b_addr; + dqb = bp->b_addr; for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { + struct xfs_disk_dquot *ddq; + + ddq = (struct xfs_disk_dquot *)&dqb[j]; + /* * Do a sanity check, and if needed, repair the dqblk. Don't * output any warnings because it's perfectly possible to - * find uninitialised dquot blks. See comment in xfs_qm_dqcheck. + * find uninitialised dquot blks. See comment in xfs_dqcheck. */ - (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR, - "xfs_quotacheck"); + xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR, + "xfs_quotacheck"); ddq->d_bcount = 0; ddq->d_icount = 0; ddq->d_rtbcount = 0; @@ -858,7 +946,12 @@ xfs_qm_reset_dqcounts( ddq->d_bwarns = 0; ddq->d_iwarns = 0; ddq->d_rtbwarns = 0; - ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)&dqb[j], + sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } } } @@ -894,15 +987,29 @@ xfs_qm_dqiter_bufs( XFS_FSB_TO_DADDR(mp, bno), mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); + + /* + * CRC and validation errors will return a EFSCORRUPTED here. If + * this occurs, re-read without CRC validation so that we can + * repair the damage via xfs_qm_reset_dqcounts(). This process + * will leave a trace in the log indicating corruption has + * been detected. + */ + if (error == EFSCORRUPTED) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, bno), + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + NULL); + } + if (error) break; xfs_qm_reset_dqcounts(mp, bp, firstid, type); xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); - /* - * goto the next block. - */ + + /* goto the next block. */ bno++; firstid += mp->m_quotainfo->qi_dqperchunk; } @@ -944,16 +1051,18 @@ xfs_qm_dqiterate( lblkno = 0; maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); do { + uint lock_mode; + nmaps = XFS_DQITER_MAP_SIZE; /* * We aren't changing the inode itself. Just changing * some of its data. No new blocks are added here, and * the inode is never added to the transaction. */ - xfs_ilock(qip, XFS_ILOCK_SHARED); + lock_mode = xfs_ilock_data_map_shared(qip); error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno, map, &nmaps, 0); - xfs_iunlock(qip, XFS_ILOCK_SHARED); + xfs_iunlock(qip, lock_mode); if (error) break; @@ -1057,7 +1166,7 @@ xfs_qm_quotacheck_dqadjust( * There are no timers for the default values set in the root dquot. */ if (dqp->q_core.d_id) { - xfs_qm_adjust_dqlimits(mp, &dqp->q_core); + xfs_qm_adjust_dqlimits(mp, dqp); xfs_qm_adjust_dqtimers(mp, &dqp->q_core); } @@ -1115,7 +1224,7 @@ xfs_qm_dqusage_adjust( * rootino must have its resources accounted for, not so with the quota * inodes. */ - if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { + if (xfs_is_quota_inode(&mp->m_sb, ino)) { *res = BULKSTAT_RV_NOTHING; return XFS_ERROR(EINVAL); } @@ -1225,19 +1334,21 @@ int xfs_qm_quotacheck( xfs_mount_t *mp) { - int done, count, error, error2; - xfs_ino_t lastino; - size_t structsz; - xfs_inode_t *uip, *gip; - uint flags; - LIST_HEAD (buffer_list); + int done, count, error, error2; + xfs_ino_t lastino; + size_t structsz; + uint flags; + LIST_HEAD (buffer_list); + struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip; + struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip; + struct xfs_inode *pip = mp->m_quotainfo->qi_pquotaip; count = INT_MAX; structsz = 1; lastino = 0; flags = 0; - ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip); + ASSERT(uip || gip || pip); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); xfs_notice(mp, "Quotacheck needed: Please wait."); @@ -1247,7 +1358,6 @@ xfs_qm_quotacheck( * their counters to zero. We need a clean slate. * We don't log our changes till later. */ - uip = mp->m_quotainfo->qi_uquotaip; if (uip) { error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA, &buffer_list); @@ -1256,14 +1366,20 @@ xfs_qm_quotacheck( flags |= XFS_UQUOTA_CHKD; } - gip = mp->m_quotainfo->qi_gquotaip; if (gip) { - error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? - XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA, + error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA, + &buffer_list); + if (error) + goto error_return; + flags |= XFS_GQUOTA_CHKD; + } + + if (pip) { + error = xfs_qm_dqiterate(mp, pip, XFS_QMOPT_PQUOTA, &buffer_list); if (error) goto error_return; - flags |= XFS_OQUOTA_CHKD; + flags |= XFS_PQUOTA_CHKD; } do { @@ -1358,15 +1474,14 @@ STATIC int xfs_qm_init_quotainos( xfs_mount_t *mp) { - xfs_inode_t *uip, *gip; - int error; - __int64_t sbflags; - uint flags; + struct xfs_inode *uip = NULL; + struct xfs_inode *gip = NULL; + struct xfs_inode *pip = NULL; + int error; + __int64_t sbflags = 0; + uint flags = 0; ASSERT(mp->m_quotainfo); - uip = gip = NULL; - sbflags = 0; - flags = 0; /* * Get the uquota and gquota inodes @@ -1375,57 +1490,80 @@ xfs_qm_init_quotainos( if (XFS_IS_UQUOTA_ON(mp) && mp->m_sb.sb_uquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_uquotino > 0); - if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, - 0, 0, &uip))) + error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, + 0, 0, &uip); + if (error) return XFS_ERROR(error); } - if (XFS_IS_OQUOTA_ON(mp) && + if (XFS_IS_GQUOTA_ON(mp) && mp->m_sb.sb_gquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_gquotino > 0); - if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, - 0, 0, &gip))) { - if (uip) - IRELE(uip); - return XFS_ERROR(error); - } + error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, + 0, 0, &gip); + if (error) + goto error_rele; + } + if (XFS_IS_PQUOTA_ON(mp) && + mp->m_sb.sb_pquotino != NULLFSINO) { + ASSERT(mp->m_sb.sb_pquotino > 0); + error = xfs_iget(mp, NULL, mp->m_sb.sb_pquotino, + 0, 0, &pip); + if (error) + goto error_rele; } } else { flags |= XFS_QMOPT_SBVERSION; sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | - XFS_SB_GQUOTINO | XFS_SB_QFLAGS); + XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | + XFS_SB_QFLAGS); } /* - * Create the two inodes, if they don't exist already. The changes + * Create the three inodes, if they don't exist already. The changes * made above will get added to a transaction and logged in one of * the qino_alloc calls below. If the device is readonly, * temporarily switch to read-write to do this. */ if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { - if ((error = xfs_qm_qino_alloc(mp, &uip, + error = xfs_qm_qino_alloc(mp, &uip, sbflags | XFS_SB_UQUOTINO, - flags | XFS_QMOPT_UQUOTA))) - return XFS_ERROR(error); + flags | XFS_QMOPT_UQUOTA); + if (error) + goto error_rele; flags &= ~XFS_QMOPT_SBVERSION; } - if (XFS_IS_OQUOTA_ON(mp) && gip == NULL) { - flags |= (XFS_IS_GQUOTA_ON(mp) ? - XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); + if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) { error = xfs_qm_qino_alloc(mp, &gip, - sbflags | XFS_SB_GQUOTINO, flags); - if (error) { - if (uip) - IRELE(uip); + sbflags | XFS_SB_GQUOTINO, + flags | XFS_QMOPT_GQUOTA); + if (error) + goto error_rele; - return XFS_ERROR(error); - } + flags &= ~XFS_QMOPT_SBVERSION; + } + if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) { + error = xfs_qm_qino_alloc(mp, &pip, + sbflags | XFS_SB_PQUOTINO, + flags | XFS_QMOPT_PQUOTA); + if (error) + goto error_rele; } mp->m_quotainfo->qi_uquotaip = uip; mp->m_quotainfo->qi_gquotaip = gip; + mp->m_quotainfo->qi_pquotaip = pip; return 0; + +error_rele: + if (uip) + IRELE(uip); + if (gip) + IRELE(gip); + if (pip) + IRELE(pip); + return XFS_ERROR(error); } STATIC void @@ -1436,7 +1574,7 @@ xfs_qm_dqfree_one( struct xfs_quotainfo *qi = mp->m_quotainfo; mutex_lock(&qi->qi_tree_lock); - radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), + radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), be32_to_cpu(dqp->q_core.d_id)); qi->qi_dquots--; @@ -1445,132 +1583,6 @@ xfs_qm_dqfree_one( xfs_qm_dqdestroy(dqp); } -STATIC void -xfs_qm_dqreclaim_one( - struct xfs_dquot *dqp, - struct list_head *buffer_list, - struct list_head *dispose_list) -{ - struct xfs_mount *mp = dqp->q_mount; - struct xfs_quotainfo *qi = mp->m_quotainfo; - int error; - - if (!xfs_dqlock_nowait(dqp)) - goto out_move_tail; - - /* - * This dquot has acquired a reference in the meantime remove it from - * the freelist and try again. - */ - if (dqp->q_nrefs) { - xfs_dqunlock(dqp); - - trace_xfs_dqreclaim_want(dqp); - XFS_STATS_INC(xs_qm_dqwants); - - list_del_init(&dqp->q_lru); - qi->qi_lru_count--; - XFS_STATS_DEC(xs_qm_dquot_unused); - return; - } - - /* - * Try to grab the flush lock. If this dquot is in the process of - * getting flushed to disk, we don't want to reclaim it. - */ - if (!xfs_dqflock_nowait(dqp)) - goto out_unlock_move_tail; - - if (XFS_DQ_IS_DIRTY(dqp)) { - struct xfs_buf *bp = NULL; - - trace_xfs_dqreclaim_dirty(dqp); - - error = xfs_qm_dqflush(dqp, &bp); - if (error) { - xfs_warn(mp, "%s: dquot %p flush failed", - __func__, dqp); - goto out_unlock_move_tail; - } - - xfs_buf_delwri_queue(bp, buffer_list); - xfs_buf_relse(bp); - /* - * Give the dquot another try on the freelist, as the - * flushing will take some time. - */ - goto out_unlock_move_tail; - } - xfs_dqfunlock(dqp); - - /* - * Prevent lookups now that we are past the point of no return. - */ - dqp->dq_flags |= XFS_DQ_FREEING; - xfs_dqunlock(dqp); - - ASSERT(dqp->q_nrefs == 0); - list_move_tail(&dqp->q_lru, dispose_list); - qi->qi_lru_count--; - XFS_STATS_DEC(xs_qm_dquot_unused); - - trace_xfs_dqreclaim_done(dqp); - XFS_STATS_INC(xs_qm_dqreclaims); - return; - - /* - * Move the dquot to the tail of the list so that we don't spin on it. - */ -out_unlock_move_tail: - xfs_dqunlock(dqp); -out_move_tail: - list_move_tail(&dqp->q_lru, &qi->qi_lru_list); - trace_xfs_dqreclaim_busy(dqp); - XFS_STATS_INC(xs_qm_dqreclaim_misses); -} - -STATIC int -xfs_qm_shake( - struct shrinker *shrink, - struct shrink_control *sc) -{ - struct xfs_quotainfo *qi = - container_of(shrink, struct xfs_quotainfo, qi_shrinker); - int nr_to_scan = sc->nr_to_scan; - LIST_HEAD (buffer_list); - LIST_HEAD (dispose_list); - struct xfs_dquot *dqp; - int error; - - if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) - return 0; - if (!nr_to_scan) - goto out; - - mutex_lock(&qi->qi_lru_lock); - while (!list_empty(&qi->qi_lru_list)) { - if (nr_to_scan-- <= 0) - break; - dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot, - q_lru); - xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list); - } - mutex_unlock(&qi->qi_lru_lock); - - error = xfs_buf_delwri_submit(&buffer_list); - if (error) - xfs_warn(NULL, "%s: dquot reclaim failed", __func__); - - while (!list_empty(&dispose_list)) { - dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru); - list_del_init(&dqp->q_lru); - xfs_qm_dqfree_one(dqp); - } - -out: - return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure; -} - /* * Start a transaction and write the incore superblock changes to * disk. flags parameter indicates which fields have changed. @@ -1584,10 +1596,8 @@ xfs_qm_write_sb_changes( int error; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); - if ((error = xfs_trans_reserve(tp, 0, - mp->m_sb.sb_sectsize + 128, 0, - 0, - XFS_DEFAULT_LOG_COUNT))) { + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0); + if (error) { xfs_trans_cancel(tp, 0); return error; } @@ -1615,15 +1625,18 @@ xfs_qm_write_sb_changes( int xfs_qm_vop_dqalloc( struct xfs_inode *ip, - uid_t uid, - gid_t gid, + xfs_dqid_t uid, + xfs_dqid_t gid, prid_t prid, uint flags, struct xfs_dquot **O_udqpp, - struct xfs_dquot **O_gdqpp) + struct xfs_dquot **O_gdqpp, + struct xfs_dquot **O_pdqpp) { struct xfs_mount *mp = ip->i_mount; - struct xfs_dquot *uq, *gq; + struct xfs_dquot *uq = NULL; + struct xfs_dquot *gq = NULL; + struct xfs_dquot *pq = NULL; int error; uint lockflags; @@ -1648,7 +1661,6 @@ xfs_qm_vop_dqalloc( } } - uq = gq = NULL; if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { if (ip->i_d.di_uid != uid) { /* @@ -1661,11 +1673,12 @@ xfs_qm_vop_dqalloc( * holding ilock. */ xfs_iunlock(ip, lockflags); - if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid, + error = xfs_qm_dqget(mp, NULL, uid, XFS_DQ_USER, XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, - &uq))) { + &uq); + if (error) { ASSERT(error != ENOENT); return error; } @@ -1687,15 +1700,14 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { if (ip->i_d.di_gid != gid) { xfs_iunlock(ip, lockflags); - if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid, + error = xfs_qm_dqget(mp, NULL, gid, XFS_DQ_GROUP, XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, - &gq))) { - if (uq) - xfs_qm_dqrele(uq); + &gq); + if (error) { ASSERT(error != ENOENT); - return error; + goto error_rele; } xfs_dqunlock(gq); lockflags = XFS_ILOCK_SHARED; @@ -1704,25 +1716,25 @@ xfs_qm_vop_dqalloc( ASSERT(ip->i_gdquot); gq = xfs_qm_dqhold(ip->i_gdquot); } - } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { + } + if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { if (xfs_get_projid(ip) != prid) { xfs_iunlock(ip, lockflags); - if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, + error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, XFS_DQ_PROJ, XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, - &gq))) { - if (uq) - xfs_qm_dqrele(uq); + &pq); + if (error) { ASSERT(error != ENOENT); - return (error); + goto error_rele; } - xfs_dqunlock(gq); + xfs_dqunlock(pq); lockflags = XFS_ILOCK_SHARED; xfs_ilock(ip, lockflags); } else { - ASSERT(ip->i_gdquot); - gq = xfs_qm_dqhold(ip->i_gdquot); + ASSERT(ip->i_pdquot); + pq = xfs_qm_dqhold(ip->i_pdquot); } } if (uq) @@ -1737,7 +1749,18 @@ xfs_qm_vop_dqalloc( *O_gdqpp = gq; else if (gq) xfs_qm_dqrele(gq); + if (O_pdqpp) + *O_pdqpp = pq; + else if (pq) + xfs_qm_dqrele(pq); return 0; + +error_rele: + if (gq) + xfs_qm_dqrele(gq); + if (uq) + xfs_qm_dqrele(uq); + return error; } /* @@ -1785,29 +1808,34 @@ xfs_qm_vop_chown( */ int xfs_qm_vop_chown_reserve( - xfs_trans_t *tp, - xfs_inode_t *ip, - xfs_dquot_t *udqp, - xfs_dquot_t *gdqp, - uint flags) + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, + uint flags) { - xfs_mount_t *mp = ip->i_mount; - uint delblks, blkflags, prjflags = 0; - xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; - int error; + struct xfs_mount *mp = ip->i_mount; + uint delblks, blkflags, prjflags = 0; + struct xfs_dquot *udq_unres = NULL; + struct xfs_dquot *gdq_unres = NULL; + struct xfs_dquot *pdq_unres = NULL; + struct xfs_dquot *udq_delblks = NULL; + struct xfs_dquot *gdq_delblks = NULL; + struct xfs_dquot *pdq_delblks = NULL; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); delblks = ip->i_delayed_blks; - delblksudq = delblksgdq = unresudq = unresgdq = NULL; blkflags = XFS_IS_REALTIME_INODE(ip) ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; if (XFS_IS_UQUOTA_ON(mp) && udqp && - ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) { - delblksudq = udqp; + ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) { + udq_delblks = udqp; /* * If there are delayed allocation blocks, then we have to * unreserve those from the old dquot, and add them to the @@ -1815,29 +1843,34 @@ xfs_qm_vop_chown_reserve( */ if (delblks) { ASSERT(ip->i_udquot); - unresudq = ip->i_udquot; + udq_unres = ip->i_udquot; } } - if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { - if (XFS_IS_PQUOTA_ON(ip->i_mount) && - xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id)) - prjflags = XFS_QMOPT_ENOSPC; - - if (prjflags || - (XFS_IS_GQUOTA_ON(ip->i_mount) && - ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) { - delblksgdq = gdqp; - if (delblks) { - ASSERT(ip->i_gdquot); - unresgdq = ip->i_gdquot; - } + if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && + ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) { + gdq_delblks = gdqp; + if (delblks) { + ASSERT(ip->i_gdquot); + gdq_unres = ip->i_gdquot; } } - if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, - delblksudq, delblksgdq, ip->i_d.di_nblocks, 1, - flags | blkflags | prjflags))) - return (error); + if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && + xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) { + prjflags = XFS_QMOPT_ENOSPC; + pdq_delblks = pdqp; + if (delblks) { + ASSERT(ip->i_pdquot); + pdq_unres = ip->i_pdquot; + } + } + + error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, + udq_delblks, gdq_delblks, pdq_delblks, + ip->i_d.di_nblocks, 1, + flags | blkflags | prjflags); + if (error) + return error; /* * Do the delayed blks reservations/unreservations now. Since, these @@ -1849,15 +1882,17 @@ xfs_qm_vop_chown_reserve( /* * Do the reservations first. Unreservation can't fail. */ - ASSERT(delblksudq || delblksgdq); - ASSERT(unresudq || unresgdq); - if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, - delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0, - flags | blkflags | prjflags))) - return (error); + ASSERT(udq_delblks || gdq_delblks || pdq_delblks); + ASSERT(udq_unres || gdq_unres || pdq_unres); + error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, + udq_delblks, gdq_delblks, pdq_delblks, + (xfs_qcnt_t)delblks, 0, + flags | blkflags | prjflags); + if (error) + return error; xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, - unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0, - blkflags); + udq_unres, gdq_unres, pdq_unres, + -((xfs_qcnt_t)delblks), 0, blkflags); } return (0); @@ -1896,7 +1931,8 @@ xfs_qm_vop_create_dqattach( struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_dquot *udqp, - struct xfs_dquot *gdqp) + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp) { struct xfs_mount *mp = tp->t_mountp; @@ -1906,23 +1942,25 @@ xfs_qm_vop_create_dqattach( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - if (udqp) { + if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); - ASSERT(XFS_IS_UQUOTA_ON(mp)); ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } - if (gdqp) { + if (gdqp && XFS_IS_GQUOTA_ON(mp)) { ASSERT(ip->i_gdquot == NULL); - ASSERT(XFS_IS_OQUOTA_ON(mp)); - ASSERT((XFS_IS_GQUOTA_ON(mp) ? - ip->i_d.di_gid : xfs_get_projid(ip)) == - be32_to_cpu(gdqp->q_core.d_id)); - + ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); ip->i_gdquot = xfs_qm_dqhold(gdqp); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } + if (pdqp && XFS_IS_PQUOTA_ON(mp)) { + ASSERT(ip->i_pdquot == NULL); + ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id)); + + ip->i_pdquot = xfs_qm_dqhold(pdqp); + xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1); + } } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 44b858b79d7..797fd463627 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -20,13 +20,29 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" -#include "xfs_quota_priv.h" struct xfs_inode; extern struct kmem_zone *xfs_qm_dqtrxzone; /* + * Number of bmaps that we ask from bmapi when doing a quotacheck. + * We make this restriction to keep the memory usage to a minimum. + */ +#define XFS_DQITER_MAP_SIZE 10 + +#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ + !dqp->q_core.d_blk_hardlimit && \ + !dqp->q_core.d_blk_softlimit && \ + !dqp->q_core.d_rtb_hardlimit && \ + !dqp->q_core.d_rtb_softlimit && \ + !dqp->q_core.d_ino_hardlimit && \ + !dqp->q_core.d_ino_softlimit && \ + !dqp->q_core.d_bcount && \ + !dqp->q_core.d_rtbcount && \ + !dqp->q_core.d_icount) + +/* * This defines the unit of allocation of dquots. * Currently, it is just one file system block, and a 4K blk contains 30 * (136 * 30 = 4080) dquots. It's probably not worth trying to make @@ -44,12 +60,12 @@ extern struct kmem_zone *xfs_qm_dqtrxzone; typedef struct xfs_quotainfo { struct radix_tree_root qi_uquota_tree; struct radix_tree_root qi_gquota_tree; + struct radix_tree_root qi_pquota_tree; struct mutex qi_tree_lock; - xfs_inode_t *qi_uquotaip; /* user quota inode */ - xfs_inode_t *qi_gquotaip; /* group quota inode */ - struct list_head qi_lru_list; - struct mutex qi_lru_lock; - int qi_lru_count; + struct xfs_inode *qi_uquotaip; /* user quota inode */ + struct xfs_inode *qi_gquotaip; /* group quota inode */ + struct xfs_inode *qi_pquotaip; /* project quota inode */ + struct list_lru qi_lru; int qi_dquots; time_t qi_btimelimit; /* limit for blks timer */ time_t qi_itimelimit; /* limit for inodes timer */ @@ -69,28 +85,64 @@ typedef struct xfs_quotainfo { struct shrinker qi_shrinker; } xfs_quotainfo_t; -#define XFS_DQUOT_TREE(qi, type) \ - ((type & XFS_DQ_USER) ? \ - &((qi)->qi_uquota_tree) : \ - &((qi)->qi_gquota_tree)) +static inline struct radix_tree_root * +xfs_dquot_tree( + struct xfs_quotainfo *qi, + int type) +{ + switch (type) { + case XFS_DQ_USER: + return &qi->qi_uquota_tree; + case XFS_DQ_GROUP: + return &qi->qi_gquota_tree; + case XFS_DQ_PROJ: + return &qi->qi_pquota_tree; + default: + ASSERT(0); + } + return NULL; +} +static inline struct xfs_inode * +xfs_dq_to_quota_inode(struct xfs_dquot *dqp) +{ + switch (dqp->dq_flags & XFS_DQ_ALLTYPES) { + case XFS_DQ_USER: + return dqp->q_mount->m_quotainfo->qi_uquotaip; + case XFS_DQ_GROUP: + return dqp->q_mount->m_quotainfo->qi_gquotaip; + case XFS_DQ_PROJ: + return dqp->q_mount->m_quotainfo->qi_pquotaip; + default: + ASSERT(0); + } + return NULL; +} -extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); -extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, - xfs_dquot_t *, xfs_dquot_t *, long, long, uint); -extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *); -extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *); +extern void xfs_trans_mod_dquot(struct xfs_trans *, + struct xfs_dquot *, uint, long); +extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, + struct xfs_mount *, struct xfs_dquot *, + struct xfs_dquot *, struct xfs_dquot *, + long, long, uint); +extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *); +extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *); /* - * We keep the usr and grp dquots separately so that locking will be easier - * to do at commit time. All transactions that we know of at this point + * We keep the usr, grp, and prj dquots separately so that locking will be + * easier to do at commit time. All transactions that we know of at this point * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value. */ +enum { + XFS_QM_TRANS_USR = 0, + XFS_QM_TRANS_GRP, + XFS_QM_TRANS_PRJ, + XFS_QM_TRANS_DQTYPES +}; #define XFS_QM_TRANS_MAXDQS 2 -typedef struct xfs_dquot_acct { - xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS]; - xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS]; -} xfs_dquot_acct_t; +struct xfs_dquot_acct { + struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS]; +}; /* * Users are allowed to have a usage exceeding their softlimit for @@ -104,22 +156,25 @@ typedef struct xfs_dquot_acct { #define XFS_QM_IWARNLIMIT 5 #define XFS_QM_RTBWARNLIMIT 5 -extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); -extern int xfs_qm_quotacheck(xfs_mount_t *); -extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); +extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); +extern int xfs_qm_quotacheck(struct xfs_mount *); +extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t); /* dquot stuff */ -extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint); -extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); +extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint); +extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint); /* quota ops */ -extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint); -extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint, - fs_disk_quota_t *); -extern int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint, - fs_disk_quota_t *); -extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); -extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); -extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); +extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); +extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, + uint, struct fs_disk_quota *); +extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, + struct fs_disk_quota *); +extern int xfs_qm_scall_getqstat(struct xfs_mount *, + struct fs_quota_stat *); +extern int xfs_qm_scall_getqstatv(struct xfs_mount *, + struct fs_quota_statv *); +extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); +extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); #endif /* __XFS_QM_H__ */ diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 6b39115bf14..e9be63abd8d 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -17,21 +17,16 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" #include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" #include "xfs_inode.h" -#include "xfs_itable.h" -#include "xfs_bmap.h" -#include "xfs_rtalloc.h" #include "xfs_error.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" +#include "xfs_trans.h" #include "xfs_qm.h" @@ -112,16 +107,16 @@ xfs_qm_newmount( if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) || (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) || - (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) || - (!pquotaondisk && XFS_IS_PQUOTA_ON(mp)) || (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || - (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) && + (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) || + (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) || + (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) && xfs_dev_is_read_only(mp, "changing quota state")) { xfs_warn(mp, "please mount with%s%s%s%s.", (!quotaondisk ? "out quota" : ""), (uquotaondisk ? " usrquota" : ""), - (pquotaondisk ? " prjquota" : ""), - (gquotaondisk ? " grpquota" : "")); + (gquotaondisk ? " grpquota" : ""), + (pquotaondisk ? " prjquota" : "")); return XFS_ERROR(EPERM); } @@ -146,7 +141,7 @@ xfs_qm_newmount( * inode goes inactive and wants to free blocks, * or via xfs_log_mount_finish. */ - *needquotamount = B_TRUE; + *needquotamount = true; *quotaflags = mp->m_qflags; mp->m_qflags = 0; } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 8a59f854655..bbc813caba4 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -20,24 +20,18 @@ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" #include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_itable.h" -#include "xfs_bmap.h" -#include "xfs_rtalloc.h" +#include "xfs_trans.h" #include "xfs_error.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" -#include "xfs_utils.h" +#include "xfs_quota.h" #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -117,11 +111,12 @@ xfs_qm_scall_quotaoff( } if (flags & XFS_GQUOTA_ACCT) { dqtype |= XFS_QMOPT_GQUOTA; - flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); + flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD); inactivate_flags |= XFS_GQUOTA_ACTIVE; - } else if (flags & XFS_PQUOTA_ACCT) { + } + if (flags & XFS_PQUOTA_ACCT) { dqtype |= XFS_QMOPT_PQUOTA; - flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); + flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD); inactivate_flags |= XFS_PQUOTA_ACTIVE; } @@ -198,10 +193,9 @@ xfs_qm_scall_quotaoff( } /* - * If quotas is completely disabled, close shop. + * If all quotas are completely turned off, close shop. */ - if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || - ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) { + if (mp->m_qflags == 0) { mutex_unlock(&q->qi_quotaofflock); xfs_qm_destroy_quotainfo(mp); return (0); @@ -214,10 +208,14 @@ xfs_qm_scall_quotaoff( IRELE(q->qi_uquotaip); q->qi_uquotaip = NULL; } - if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) { + if ((dqtype & XFS_QMOPT_GQUOTA) && q->qi_gquotaip) { IRELE(q->qi_gquotaip); q->qi_gquotaip = NULL; } + if ((dqtype & XFS_QMOPT_PQUOTA) && q->qi_pquotaip) { + IRELE(q->qi_pquotaip); + q->qi_pquotaip = NULL; + } out_unlock: mutex_unlock(&q->qi_quotaofflock); @@ -243,9 +241,7 @@ xfs_qm_scall_trunc_qfile( xfs_ilock(ip, XFS_IOLOCK_EXCL); tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); - error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { xfs_trans_cancel(tp, 0); xfs_iunlock(ip, XFS_IOLOCK_EXCL); @@ -282,20 +278,29 @@ xfs_qm_scall_trunc_qfiles( xfs_mount_t *mp, uint flags) { - int error = 0, error2 = 0; + int error = EINVAL; - if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { - xfs_debug(mp, "%s: flags=%x m_qflags=%x\n", + if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 || + (flags & ~XFS_DQ_ALLTYPES)) { + xfs_debug(mp, "%s: flags=%x m_qflags=%x", __func__, flags, mp->m_qflags); return XFS_ERROR(EINVAL); } - if (flags & XFS_DQ_USER) + if (flags & XFS_DQ_USER) { error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); - if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) - error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); + if (error) + return error; + } + if (flags & XFS_DQ_GROUP) { + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); + if (error) + return error; + } + if (flags & XFS_DQ_PROJ) + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino); - return error ? error : error2; + return error; } /* @@ -321,7 +326,7 @@ xfs_qm_scall_quotaon( sbflags = 0; if (flags == 0) { - xfs_debug(mp, "%s: zero flags, m_qflags=%x\n", + xfs_debug(mp, "%s: zero flags, m_qflags=%x", __func__, mp->m_qflags); return XFS_ERROR(EINVAL); } @@ -335,16 +340,16 @@ xfs_qm_scall_quotaon( * quota acct on ondisk without m_qflags' knowing. */ if (((flags & XFS_UQUOTA_ACCT) == 0 && - (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && - (flags & XFS_UQUOTA_ENFD)) - || + (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && + (flags & XFS_UQUOTA_ENFD)) || + ((flags & XFS_GQUOTA_ACCT) == 0 && + (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && + (flags & XFS_GQUOTA_ENFD)) || ((flags & XFS_PQUOTA_ACCT) == 0 && - (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && - (flags & XFS_GQUOTA_ACCT) == 0 && - (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && - (flags & XFS_OQUOTA_ENFD))) { + (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && + (flags & XFS_PQUOTA_ENFD))) { xfs_debug(mp, - "%s: Can't enforce without acct, flags=%x sbflags=%x\n", + "%s: Can't enforce without acct, flags=%x sbflags=%x", __func__, flags, mp->m_sb.sb_qflags); return XFS_ERROR(EINVAL); } @@ -400,6 +405,7 @@ xfs_qm_scall_quotaon( /* * Return quota status information, such as uquota-off, enforcements, etc. + * for Q_XGETQSTAT command. */ int xfs_qm_scall_getqstat( @@ -407,11 +413,13 @@ xfs_qm_scall_getqstat( struct fs_quota_stat *out) { struct xfs_quotainfo *q = mp->m_quotainfo; - struct xfs_inode *uip, *gip; - boolean_t tempuqip, tempgqip; + struct xfs_inode *uip = NULL; + struct xfs_inode *gip = NULL; + struct xfs_inode *pip = NULL; + bool tempuqip = false; + bool tempgqip = false; + bool temppqip = false; - uip = gip = NULL; - tempuqip = tempgqip = B_FALSE; memset(out, 0, sizeof(fs_quota_stat_t)); out->qs_version = FS_QSTAT_VERSION; @@ -420,26 +428,121 @@ xfs_qm_scall_getqstat( out->qs_gquota.qfs_ino = NULLFSINO; return (0); } + + out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & + (XFS_ALL_QUOTA_ACCT| + XFS_ALL_QUOTA_ENFD)); + if (q) { + uip = q->qi_uquotaip; + gip = q->qi_gquotaip; + pip = q->qi_pquotaip; + } + if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, + 0, 0, &uip) == 0) + tempuqip = true; + } + if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, + 0, 0, &gip) == 0) + tempgqip = true; + } + /* + * Q_XGETQSTAT doesn't have room for both group and project quotas. + * So, allow the project quota values to be copied out only if + * there is no group quota information available. + */ + if (!gip) { + if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino, + 0, 0, &pip) == 0) + temppqip = true; + } + } else + pip = NULL; + if (uip) { + out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; + out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; + out->qs_uquota.qfs_nextents = uip->i_d.di_nextents; + if (tempuqip) + IRELE(uip); + } + + if (gip) { + out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; + out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; + out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; + if (tempgqip) + IRELE(gip); + } + if (pip) { + out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; + out->qs_gquota.qfs_nblks = pip->i_d.di_nblocks; + out->qs_gquota.qfs_nextents = pip->i_d.di_nextents; + if (temppqip) + IRELE(pip); + } + if (q) { + out->qs_incoredqs = q->qi_dquots; + out->qs_btimelimit = q->qi_btimelimit; + out->qs_itimelimit = q->qi_itimelimit; + out->qs_rtbtimelimit = q->qi_rtbtimelimit; + out->qs_bwarnlimit = q->qi_bwarnlimit; + out->qs_iwarnlimit = q->qi_iwarnlimit; + } + return 0; +} + +/* + * Return quota status information, such as uquota-off, enforcements, etc. + * for Q_XGETQSTATV command, to support separate project quota field. + */ +int +xfs_qm_scall_getqstatv( + struct xfs_mount *mp, + struct fs_quota_statv *out) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_inode *uip = NULL; + struct xfs_inode *gip = NULL; + struct xfs_inode *pip = NULL; + bool tempuqip = false; + bool tempgqip = false; + bool temppqip = false; + + if (!xfs_sb_version_hasquota(&mp->m_sb)) { + out->qs_uquota.qfs_ino = NULLFSINO; + out->qs_gquota.qfs_ino = NULLFSINO; + out->qs_pquota.qfs_ino = NULLFSINO; + return (0); + } + out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & (XFS_ALL_QUOTA_ACCT| XFS_ALL_QUOTA_ENFD)); - out->qs_pad = 0; out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; + out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino; if (q) { uip = q->qi_uquotaip; gip = q->qi_gquotaip; + pip = q->qi_pquotaip; } if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &uip) == 0) - tempuqip = B_TRUE; + tempuqip = true; } if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &gip) == 0) - tempgqip = B_TRUE; + tempgqip = true; + } + if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino, + 0, 0, &pip) == 0) + temppqip = true; } if (uip) { out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; @@ -447,12 +550,19 @@ xfs_qm_scall_getqstat( if (tempuqip) IRELE(uip); } + if (gip) { out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; if (tempgqip) IRELE(gip); } + if (pip) { + out->qs_pquota.qfs_nblks = pip->i_d.di_nblocks; + out->qs_pquota.qfs_nextents = pip->i_d.di_nextents; + if (temppqip) + IRELE(pip); + } if (q) { out->qs_incoredqs = q->qi_dquots; out->qs_btimelimit = q->qi_btimelimit; @@ -472,15 +582,15 @@ xfs_qm_scall_getqstat( */ int xfs_qm_scall_setqlim( - xfs_mount_t *mp, + struct xfs_mount *mp, xfs_dqid_t id, uint type, fs_disk_quota_t *newlim) { struct xfs_quotainfo *q = mp->m_quotainfo; - xfs_disk_dquot_t *ddq; - xfs_dquot_t *dqp; - xfs_trans_t *tp; + struct xfs_disk_dquot *ddq; + struct xfs_dquot *dqp; + struct xfs_trans *tp; int error; xfs_qcnt_t hard, soft; @@ -489,30 +599,35 @@ xfs_qm_scall_setqlim( if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) return 0; - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); - if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, - 0, 0, XFS_DEFAULT_LOG_COUNT))) { - xfs_trans_cancel(tp, 0); - return (error); - } - /* * We don't want to race with a quotaoff so take the quotaoff lock. - * (We don't hold an inode lock, so there's nothing else to stop - * a quotaoff from happening). (XXXThis doesn't currently happen - * because we take the vfslock before calling xfs_qm_sysent). + * We don't hold an inode lock, so there's nothing else to stop + * a quotaoff from happening. */ mutex_lock(&q->qi_quotaofflock); /* - * Get the dquot (locked), and join it to the transaction. - * Allocate the dquot if this doesn't exist. + * Get the dquot (locked) before we start, as we need to do a + * transaction to allocate it if it doesn't exist. Once we have the + * dquot, unlock it so we can start the next transaction safely. We hold + * a reference to the dquot, so it's safe to do this unlock/lock without + * it being reclaimed in the mean time. */ - if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp); + if (error) { ASSERT(error != ENOENT); goto out_unlock; } + xfs_dqunlock(dqp); + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_rele; + } + + xfs_dqlock(dqp); xfs_trans_dqjoin(tp, dqp); ddq = &dqp->q_core; @@ -528,12 +643,13 @@ xfs_qm_scall_setqlim( if (hard == 0 || hard >= soft) { ddq->d_blk_hardlimit = cpu_to_be64(hard); ddq->d_blk_softlimit = cpu_to_be64(soft); + xfs_dquot_set_prealloc_limits(dqp); if (id == 0) { q->qi_bhardlimit = hard; q->qi_bsoftlimit = soft; } } else { - xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft); + xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft); } hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : @@ -549,7 +665,7 @@ xfs_qm_scall_setqlim( q->qi_rtbsoftlimit = soft; } } else { - xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft); + xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft); } hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? @@ -566,7 +682,7 @@ xfs_qm_scall_setqlim( q->qi_isoftlimit = soft; } } else { - xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft); + xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft); } /* @@ -619,9 +735,10 @@ xfs_qm_scall_setqlim( xfs_trans_log_dquot(tp, dqp); error = xfs_trans_commit(tp, 0); - xfs_qm_dqrele(dqp); - out_unlock: +out_rele: + xfs_qm_dqrele(dqp); +out_unlock: mutex_unlock(&q->qi_quotaofflock); return error; } @@ -638,8 +755,8 @@ xfs_qm_log_quotaoff_end( tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); - if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2, - 0, 0, XFS_DEFAULT_LOG_COUNT))) { + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0); + if (error) { xfs_trans_cancel(tp, 0); return (error); } @@ -671,14 +788,9 @@ xfs_qm_log_quotaoff( uint oldsbqflag=0; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); - if ((error = xfs_trans_reserve(tp, 0, - sizeof(xfs_qoff_logitem_t) * 2 + - mp->m_sb.sb_sectsize + 128, - 0, - 0, - XFS_DEFAULT_LOG_COUNT))) { + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); + if (error) goto error0; - } qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); xfs_trans_log_quotaoff_item(tp, qoffi); @@ -771,9 +883,12 @@ xfs_qm_scall_getquota( * gets turned off. No need to confuse the user level code, * so return zeroes in that case. */ - if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) || - (!XFS_IS_OQUOTA_ENFORCED(mp) && - (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) { + if ((!XFS_IS_UQUOTA_ENFORCED(mp) && + dqp->q_core.d_flags == XFS_DQ_USER) || + (!XFS_IS_GQUOTA_ENFORCED(mp) && + dqp->q_core.d_flags == XFS_DQ_GROUP) || + (!XFS_IS_PQUOTA_ENFORCED(mp) && + dqp->q_core.d_flags == XFS_DQ_PROJ)) { dst->d_btimer = 0; dst->d_itimer = 0; dst->d_rtbtimer = 0; @@ -781,8 +896,8 @@ xfs_qm_scall_getquota( #ifdef DEBUG if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) || - (XFS_IS_OQUOTA_ENFORCED(mp) && - (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && + (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) || + (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) && dst->d_id != 0) { if ((dst->d_bcount > dst->d_blk_softlimit) && (dst->d_blk_softlimit > 0)) { @@ -828,16 +943,16 @@ xfs_qm_export_flags( uflags = 0; if (flags & XFS_UQUOTA_ACCT) uflags |= FS_QUOTA_UDQ_ACCT; - if (flags & XFS_PQUOTA_ACCT) - uflags |= FS_QUOTA_PDQ_ACCT; if (flags & XFS_GQUOTA_ACCT) uflags |= FS_QUOTA_GDQ_ACCT; + if (flags & XFS_PQUOTA_ACCT) + uflags |= FS_QUOTA_PDQ_ACCT; if (flags & XFS_UQUOTA_ENFD) uflags |= FS_QUOTA_UDQ_ENFD; - if (flags & (XFS_OQUOTA_ENFD)) { - uflags |= (flags & XFS_GQUOTA_ACCT) ? - FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD; - } + if (flags & XFS_GQUOTA_ENFD) + uflags |= FS_QUOTA_GDQ_ENFD; + if (flags & XFS_PQUOTA_ENFD) + uflags |= FS_QUOTA_PDQ_ENFD; return (uflags); } @@ -845,15 +960,16 @@ xfs_qm_export_flags( STATIC int xfs_dqrele_inode( struct xfs_inode *ip, - struct xfs_perag *pag, int flags, void *args) { /* skip quota inodes */ if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || - ip == ip->i_mount->m_quotainfo->qi_gquotaip) { + ip == ip->i_mount->m_quotainfo->qi_gquotaip || + ip == ip->i_mount->m_quotainfo->qi_pquotaip) { ASSERT(ip->i_udquot == NULL); ASSERT(ip->i_gdquot == NULL); + ASSERT(ip->i_pdquot == NULL); return 0; } @@ -862,10 +978,14 @@ xfs_dqrele_inode( xfs_qm_dqrele(ip->i_udquot); ip->i_udquot = NULL; } - if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) { + if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) { xfs_qm_dqrele(ip->i_gdquot); ip->i_gdquot = NULL; } + if ((flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) { + xfs_qm_dqrele(ip->i_pdquot); + ip->i_pdquot = NULL; + } xfs_iunlock(ip, XFS_ILOCK_EXCL); return 0; } diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index b50ec5b95d5..5376dd406ba 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -18,229 +18,14 @@ #ifndef __XFS_QUOTA_H__ #define __XFS_QUOTA_H__ -struct xfs_trans; - -/* - * The ondisk form of a dquot structure. - */ -#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ -#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */ - -/* - * uid_t and gid_t are hard-coded to 32 bits in the inode. - * Hence, an 'id' in a dquot is 32 bits.. - */ -typedef __uint32_t xfs_dqid_t; - -/* - * Even though users may not have quota limits occupying all 64-bits, - * they may need 64-bit accounting. Hence, 64-bit quota-counters, - * and quota-limits. This is a waste in the common case, but hey ... - */ -typedef __uint64_t xfs_qcnt_t; -typedef __uint16_t xfs_qwarncnt_t; - -/* - * This is the main portion of the on-disk representation of quota - * information for a user. This is the q_core of the xfs_dquot_t that - * is kept in kernel memory. We pad this with some more expansion room - * to construct the on disk structure. - */ -typedef struct xfs_disk_dquot { - __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ - __u8 d_version; /* dquot version */ - __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ - __be32 d_id; /* user,project,group id */ - __be64 d_blk_hardlimit;/* absolute limit on disk blks */ - __be64 d_blk_softlimit;/* preferred limit on disk blks */ - __be64 d_ino_hardlimit;/* maximum # allocated inodes */ - __be64 d_ino_softlimit;/* preferred inode limit */ - __be64 d_bcount; /* disk blocks owned by the user */ - __be64 d_icount; /* inodes owned by the user */ - __be32 d_itimer; /* zero if within inode limits if not, - this is when we refuse service */ - __be32 d_btimer; /* similar to above; for disk blocks */ - __be16 d_iwarns; /* warnings issued wrt num inodes */ - __be16 d_bwarns; /* warnings issued wrt disk blocks */ - __be32 d_pad0; /* 64 bit align */ - __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */ - __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */ - __be64 d_rtbcount; /* realtime blocks owned */ - __be32 d_rtbtimer; /* similar to above; for RT disk blocks */ - __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */ - __be16 d_pad; -} xfs_disk_dquot_t; - -/* - * This is what goes on disk. This is separated from the xfs_disk_dquot because - * carrying the unnecessary padding would be a waste of memory. - */ -typedef struct xfs_dqblk { - xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */ - char dd_fill[32]; /* filling for posterity */ -} xfs_dqblk_t; +#include "xfs_quota_defs.h" /* - * flags for q_flags field in the dquot. + * Kernel only quota definitions and functions */ -#define XFS_DQ_USER 0x0001 /* a user quota */ -#define XFS_DQ_PROJ 0x0002 /* project quota */ -#define XFS_DQ_GROUP 0x0004 /* a group quota */ -#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ -#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */ - -#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) - -#define XFS_DQ_FLAGS \ - { XFS_DQ_USER, "USER" }, \ - { XFS_DQ_PROJ, "PROJ" }, \ - { XFS_DQ_GROUP, "GROUP" }, \ - { XFS_DQ_DIRTY, "DIRTY" }, \ - { XFS_DQ_FREEING, "FREEING" } - -/* - * In the worst case, when both user and group quotas are on, - * we can have a max of three dquots changing in a single transaction. - */ -#define XFS_DQUOT_LOGRES(mp) (sizeof(xfs_disk_dquot_t) * 3) - - -/* - * These are the structures used to lay out dquots and quotaoff - * records on the log. Quite similar to those of inodes. - */ - -/* - * log format struct for dquots. - * The first two fields must be the type and size fitting into - * 32 bits : log_recovery code assumes that. - */ -typedef struct xfs_dq_logformat { - __uint16_t qlf_type; /* dquot log item type */ - __uint16_t qlf_size; /* size of this item */ - xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ - __int64_t qlf_blkno; /* blkno of dquot buffer */ - __int32_t qlf_len; /* len of dquot buffer */ - __uint32_t qlf_boffset; /* off of dquot in buffer */ -} xfs_dq_logformat_t; - -/* - * log format struct for QUOTAOFF records. - * The first two fields must be the type and size fitting into - * 32 bits : log_recovery code assumes that. - * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer - * to the first and ensures that the first logitem is taken out of the AIL - * only when the last one is securely committed. - */ -typedef struct xfs_qoff_logformat { - unsigned short qf_type; /* quotaoff log item type */ - unsigned short qf_size; /* size of this item */ - unsigned int qf_flags; /* USR and/or GRP */ - char qf_pad[12]; /* padding for future */ -} xfs_qoff_logformat_t; - -/* - * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. - */ -#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */ -#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */ -#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */ -#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */ -#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */ -#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */ -#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ - -/* - * Quota Accounting/Enforcement flags - */ -#define XFS_ALL_QUOTA_ACCT \ - (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) -#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD) -#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD) - -#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) -#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) -#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) -#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) -#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) -#define XFS_IS_OQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_OQUOTA_ENFD) - -/* - * Incore only flags for quotaoff - these bits get cleared when quota(s) - * are in the process of getting turned off. These flags are in m_qflags but - * never in sb_qflags. - */ -#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */ -#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */ -#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */ -#define XFS_ALL_QUOTA_ACTIVE \ - (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE) - -/* - * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees - * quota will be not be switched off as long as that inode lock is held. - */ -#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ - XFS_GQUOTA_ACTIVE | \ - XFS_PQUOTA_ACTIVE)) -#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \ - XFS_PQUOTA_ACTIVE)) -#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) -#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) -#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE) - -/* - * Flags to tell various functions what to do. Not all of these are meaningful - * to a single function. None of these XFS_QMOPT_* flags are meant to have - * persistent values (ie. their values can and will change between versions) - */ -#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ -#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ -#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ -#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ -#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ -#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ -#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ -#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ -#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ - -/* - * flags to xfs_trans_mod_dquot to indicate which field needs to be - * modified. - */ -#define XFS_QMOPT_RES_REGBLKS 0x0010000 -#define XFS_QMOPT_RES_RTBLKS 0x0020000 -#define XFS_QMOPT_BCOUNT 0x0040000 -#define XFS_QMOPT_ICOUNT 0x0080000 -#define XFS_QMOPT_RTBCOUNT 0x0100000 -#define XFS_QMOPT_DELBCOUNT 0x0200000 -#define XFS_QMOPT_DELRTBCOUNT 0x0400000 -#define XFS_QMOPT_RES_INOS 0x0800000 - -/* - * flags for dqalloc. - */ -#define XFS_QMOPT_INHERIT 0x1000000 - -/* - * flags to xfs_trans_mod_dquot. - */ -#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS -#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS -#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS -#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT -#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT -#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT -#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT -#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT - - -#define XFS_QMOPT_QUOTALL \ - (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) -#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) +struct xfs_trans; -#ifdef __KERNEL__ /* * This check is done typically without holding the inode lock; * that may seem racy, but it is harmless in the context that it is used. @@ -250,34 +35,18 @@ typedef struct xfs_qoff_logformat { * we didn't have the inode locked, the appropriate dquot(s) will be * attached atomically. */ -#define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\ - (ip)->i_udquot == NULL) || \ - (XFS_IS_OQUOTA_ON(mp) && \ - (ip)->i_gdquot == NULL)) +#define XFS_NOT_DQATTACHED(mp, ip) \ + ((XFS_IS_UQUOTA_ON(mp) && (ip)->i_udquot == NULL) || \ + (XFS_IS_GQUOTA_ON(mp) && (ip)->i_gdquot == NULL) || \ + (XFS_IS_PQUOTA_ON(mp) && (ip)->i_pdquot == NULL)) #define XFS_QM_NEED_QUOTACHECK(mp) \ ((XFS_IS_UQUOTA_ON(mp) && \ (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \ (XFS_IS_GQUOTA_ON(mp) && \ - ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ - (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \ + (mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0) || \ (XFS_IS_PQUOTA_ON(mp) && \ - ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ - (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT)))) - -#define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ - XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ - XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD) - -#define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ - XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ - XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD) - -#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ - XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ - XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\ - XFS_GQUOTA_ACCT) - + (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0)) /* * The structure kept inside the xfs_trans_t keep track of dquot changes @@ -309,17 +78,19 @@ extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *, struct xfs_inode *, long, long, uint); extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, struct xfs_mount *, struct xfs_dquot *, - struct xfs_dquot *, long, long, uint); + struct xfs_dquot *, struct xfs_dquot *, long, long, uint); -extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint, - struct xfs_dquot **, struct xfs_dquot **); +extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t, + prid_t, uint, struct xfs_dquot **, struct xfs_dquot **, + struct xfs_dquot **); extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, - struct xfs_dquot *, struct xfs_dquot *); + struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *); extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **); extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *, struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *); extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *, - struct xfs_dquot *, struct xfs_dquot *, uint); + struct xfs_dquot *, struct xfs_dquot *, + struct xfs_dquot *, uint); extern int xfs_qm_dqattach(struct xfs_inode *, uint); extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint); extern void xfs_qm_dqdetach(struct xfs_inode *); @@ -332,11 +103,13 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *); #else static inline int -xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, - uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp) +xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid, + prid_t prid, uint flags, struct xfs_dquot **udqp, + struct xfs_dquot **gdqp, struct xfs_dquot **pdqp) { *udqp = NULL; *gdqp = NULL; + *pdqp = NULL; return 0; } #define xfs_trans_dup_dqinfo(tp, tp2) @@ -351,14 +124,15 @@ static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, } static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, struct xfs_mount *mp, struct xfs_dquot *udqp, - struct xfs_dquot *gdqp, long nblks, long nions, uint flags) + struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, + long nblks, long nions, uint flags) { return 0; } -#define xfs_qm_vop_create_dqattach(tp, ip, u, g) +#define xfs_qm_vop_create_dqattach(tp, ip, u, g, p) #define xfs_qm_vop_rename_dqattach(it) (0) #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) -#define xfs_qm_vop_chown_reserve(tp, ip, u, g, fl) (0) +#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0) #define xfs_qm_dqattach(ip, fl) (0) #define xfs_qm_dqattach_locked(ip, fl) (0) #define xfs_qm_dqdetach(ip) @@ -372,13 +146,10 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags) -#define xfs_trans_reserve_quota(tp, mp, ud, gd, nb, ni, f) \ - xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \ +#define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \ + xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \ f | XFS_QMOPT_RES_REGBLKS) -extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *, - xfs_dqid_t, uint, uint, char *); extern int xfs_mount_reset_sbqflags(struct xfs_mount *); -#endif /* __KERNEL__ */ #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/xfs_quota_defs.h new file mode 100644 index 00000000000..137e2093707 --- /dev/null +++ b/fs/xfs/xfs_quota_defs.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_QUOTA_DEFS_H__ +#define __XFS_QUOTA_DEFS_H__ + +/* + * Quota definitions shared between user and kernel source trees. + */ + +/* + * Even though users may not have quota limits occupying all 64-bits, + * they may need 64-bit accounting. Hence, 64-bit quota-counters, + * and quota-limits. This is a waste in the common case, but hey ... + */ +typedef __uint64_t xfs_qcnt_t; +typedef __uint16_t xfs_qwarncnt_t; + +/* + * flags for q_flags field in the dquot. + */ +#define XFS_DQ_USER 0x0001 /* a user quota */ +#define XFS_DQ_PROJ 0x0002 /* project quota */ +#define XFS_DQ_GROUP 0x0004 /* a group quota */ +#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ +#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */ + +#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) + +#define XFS_DQ_FLAGS \ + { XFS_DQ_USER, "USER" }, \ + { XFS_DQ_PROJ, "PROJ" }, \ + { XFS_DQ_GROUP, "GROUP" }, \ + { XFS_DQ_DIRTY, "DIRTY" }, \ + { XFS_DQ_FREEING, "FREEING" } + +/* + * We have the possibility of all three quota types being active at once, and + * hence free space modification requires modification of all three current + * dquots in a single transaction. For this case we need to have a reservation + * of at least 3 dquots. + * + * However, a chmod operation can change both UID and GID in a single + * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be + * modified. Hence for this case we need to reserve space for at least 4 dquots. + * + * And in the worst case, there's a rename operation that can be modifying up to + * 4 inodes with dquots attached to them. In reality, the only inodes that can + * have their dquots modified are the source and destination directory inodes + * due to directory name creation and removal. That can require space allocation + * and/or freeing on both directory inodes, and hence all three dquots on each + * inode can be modified. And if the directories are world writeable, all the + * dquots can be unique and so 6 dquots can be modified.... + * + * And, of course, we also need to take into account the dquot log format item + * used to describe each dquot. + */ +#define XFS_DQUOT_LOGRES(mp) \ + ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6) + +#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) +#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) +#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) +#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) +#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) +#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD) +#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD) + +/* + * Incore only flags for quotaoff - these bits get cleared when quota(s) + * are in the process of getting turned off. These flags are in m_qflags but + * never in sb_qflags. + */ +#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */ +#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */ +#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */ +#define XFS_ALL_QUOTA_ACTIVE \ + (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE) + +/* + * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees + * quota will be not be switched off as long as that inode lock is held. + */ +#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ + XFS_GQUOTA_ACTIVE | \ + XFS_PQUOTA_ACTIVE)) +#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \ + XFS_PQUOTA_ACTIVE)) +#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) +#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) +#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE) + +/* + * Flags to tell various functions what to do. Not all of these are meaningful + * to a single function. None of these XFS_QMOPT_* flags are meant to have + * persistent values (ie. their values can and will change between versions) + */ +#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ +#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ +#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ +#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ +#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ +#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ +#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ +#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ +#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ + +/* + * flags to xfs_trans_mod_dquot to indicate which field needs to be + * modified. + */ +#define XFS_QMOPT_RES_REGBLKS 0x0010000 +#define XFS_QMOPT_RES_RTBLKS 0x0020000 +#define XFS_QMOPT_BCOUNT 0x0040000 +#define XFS_QMOPT_ICOUNT 0x0080000 +#define XFS_QMOPT_RTBCOUNT 0x0100000 +#define XFS_QMOPT_DELBCOUNT 0x0200000 +#define XFS_QMOPT_DELRTBCOUNT 0x0400000 +#define XFS_QMOPT_RES_INOS 0x0800000 + +/* + * flags for dqalloc. + */ +#define XFS_QMOPT_INHERIT 0x1000000 + +/* + * flags to xfs_trans_mod_dquot. + */ +#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS +#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS +#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS +#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT +#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT +#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT +#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT +#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT + + +#define XFS_QMOPT_QUOTALL \ + (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) +#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) + +extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq, + xfs_dqid_t id, uint type, uint flags, char *str); +extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); + +#endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h deleted file mode 100644 index 6d86219d93d..00000000000 --- a/fs/xfs/xfs_quota_priv.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_QUOTA_PRIV_H__ -#define __XFS_QUOTA_PRIV_H__ - -/* - * Number of bmaps that we ask from bmapi when doing a quotacheck. - * We make this restriction to keep the memory usage to a minimum. - */ -#define XFS_DQITER_MAP_SIZE 10 - -#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ - !dqp->q_core.d_blk_hardlimit && \ - !dqp->q_core.d_blk_softlimit && \ - !dqp->q_core.d_rtb_hardlimit && \ - !dqp->q_core.d_rtb_softlimit && \ - !dqp->q_core.d_ino_hardlimit && \ - !dqp->q_core.d_ino_softlimit && \ - !dqp->q_core.d_bcount && \ - !dqp->q_core.d_rtbcount && \ - !dqp->q_core.d_icount) - -#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ - (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ - (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) - -#endif /* __XFS_QUOTA_PRIV_H__ */ diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 71926d63052..2ad1b9822e9 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -16,14 +16,15 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_inode.h" #include "xfs_quota.h" #include "xfs_trans.h" -#include "xfs_bmap_btree.h" -#include "xfs_inode.h" #include "xfs_qm.h" #include <linux/quota.h> @@ -54,6 +55,18 @@ xfs_fs_get_xstate( } STATIC int +xfs_fs_get_xstatev( + struct super_block *sb, + struct fs_quota_statv *fqs) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + return -xfs_qm_scall_getqstatv(mp, fqs); +} + +STATIC int xfs_fs_set_xstate( struct super_block *sb, unsigned int uflags, @@ -75,8 +88,10 @@ xfs_fs_set_xstate( flags |= XFS_GQUOTA_ACCT; if (uflags & FS_QUOTA_UDQ_ENFD) flags |= XFS_UQUOTA_ENFD; - if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD)) - flags |= XFS_OQUOTA_ENFD; + if (uflags & FS_QUOTA_GDQ_ENFD) + flags |= XFS_GQUOTA_ENFD; + if (uflags & FS_QUOTA_PDQ_ENFD) + flags |= XFS_PQUOTA_ENFD; switch (op) { case Q_XQUOTAON: @@ -85,16 +100,36 @@ xfs_fs_set_xstate( if (!XFS_IS_QUOTA_ON(mp)) return -EINVAL; return -xfs_qm_scall_quotaoff(mp, flags); - case Q_XQUOTARM: - if (XFS_IS_QUOTA_ON(mp)) - return -EINVAL; - return -xfs_qm_scall_trunc_qfiles(mp, flags); } return -EINVAL; } STATIC int +xfs_fs_rm_xquota( + struct super_block *sb, + unsigned int uflags) +{ + struct xfs_mount *mp = XFS_M(sb); + unsigned int flags = 0; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + + if (XFS_IS_QUOTA_ON(mp)) + return -EINVAL; + + if (uflags & FS_USER_QUOTA) + flags |= XFS_DQ_USER; + if (uflags & FS_GROUP_QUOTA) + flags |= XFS_DQ_GROUP; + if (uflags & FS_USER_QUOTA) + flags |= XFS_DQ_PROJ; + + return -xfs_qm_scall_trunc_qfiles(mp, flags); +} + +STATIC int xfs_fs_get_dqblk( struct super_block *sb, struct kqid qid, @@ -131,8 +166,10 @@ xfs_fs_set_dqblk( } const struct quotactl_ops xfs_quotactl_operations = { + .get_xstatev = xfs_fs_get_xstatev, .get_xstate = xfs_fs_get_xstate, .set_xstate = xfs_fs_set_xstate, + .rm_xquota = xfs_fs_rm_xquota, .get_dqblk = xfs_fs_get_dqblk, .set_dqblk = xfs_fs_set_dqblk, }; diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c deleted file mode 100644 index 30ff5f401d2..00000000000 --- a/fs/xfs/xfs_rename.c +++ /dev/null @@ -1,346 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_mount.h" -#include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_bmap.h" -#include "xfs_error.h" -#include "xfs_quota.h" -#include "xfs_utils.h" -#include "xfs_trans_space.h" -#include "xfs_vnodeops.h" -#include "xfs_trace.h" - - -/* - * Enter all inodes for a rename transaction into a sorted array. - */ -STATIC void -xfs_sort_for_rename( - xfs_inode_t *dp1, /* in: old (source) directory inode */ - xfs_inode_t *dp2, /* in: new (target) directory inode */ - xfs_inode_t *ip1, /* in: inode of old entry */ - xfs_inode_t *ip2, /* in: inode of new entry, if it - already exists, NULL otherwise. */ - xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ - int *num_inodes) /* out: number of inodes in array */ -{ - xfs_inode_t *temp; - int i, j; - - /* - * i_tab contains a list of pointers to inodes. We initialize - * the table here & we'll sort it. We will then use it to - * order the acquisition of the inode locks. - * - * Note that the table may contain duplicates. e.g., dp1 == dp2. - */ - i_tab[0] = dp1; - i_tab[1] = dp2; - i_tab[2] = ip1; - if (ip2) { - *num_inodes = 4; - i_tab[3] = ip2; - } else { - *num_inodes = 3; - i_tab[3] = NULL; - } - - /* - * Sort the elements via bubble sort. (Remember, there are at - * most 4 elements to sort, so this is adequate.) - */ - for (i = 0; i < *num_inodes; i++) { - for (j = 1; j < *num_inodes; j++) { - if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { - temp = i_tab[j]; - i_tab[j] = i_tab[j-1]; - i_tab[j-1] = temp; - } - } - } -} - -/* - * xfs_rename - */ -int -xfs_rename( - xfs_inode_t *src_dp, - struct xfs_name *src_name, - xfs_inode_t *src_ip, - xfs_inode_t *target_dp, - struct xfs_name *target_name, - xfs_inode_t *target_ip) -{ - xfs_trans_t *tp = NULL; - xfs_mount_t *mp = src_dp->i_mount; - int new_parent; /* moving to a new dir */ - int src_is_directory; /* src_name is a directory */ - int error; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - int cancel_flags; - int committed; - xfs_inode_t *inodes[4]; - int spaceres; - int num_inodes; - - trace_xfs_rename(src_dp, target_dp, src_name, target_name); - - new_parent = (src_dp != target_dp); - src_is_directory = S_ISDIR(src_ip->i_d.di_mode); - - xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, - inodes, &num_inodes); - - xfs_bmap_init(&free_list, &first_block); - tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); - error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT); - if (error == ENOSPC) { - spaceres = 0; - error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT); - } - if (error) { - xfs_trans_cancel(tp, 0); - goto std_return; - } - - /* - * Attach the dquots to the inodes - */ - error = xfs_qm_vop_rename_dqattach(inodes); - if (error) { - xfs_trans_cancel(tp, cancel_flags); - goto std_return; - } - - /* - * Lock all the participating inodes. Depending upon whether - * the target_name exists in the target directory, and - * whether the target directory is the same as the source - * directory, we can lock from 2 to 4 inodes. - */ - xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); - - /* - * Join all the inodes to the transaction. From this point on, - * we can rely on either trans_commit or trans_cancel to unlock - * them. - */ - xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); - if (new_parent) - xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); - if (target_ip) - xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); - - /* - * If we are using project inheritance, we only allow renames - * into our tree when the project IDs are the same; else the - * tree quota mechanism would be circumvented. - */ - if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { - error = XFS_ERROR(EXDEV); - goto error_return; - } - - /* - * Set up the target. - */ - if (target_ip == NULL) { - /* - * If there's no space reservation, check the entry will - * fit before actually inserting it. - */ - error = xfs_dir_canenter(tp, target_dp, target_name, spaceres); - if (error) - goto error_return; - /* - * If target does not exist and the rename crosses - * directories, adjust the target directory link count - * to account for the ".." reference from the new entry. - */ - error = xfs_dir_createname(tp, target_dp, target_name, - src_ip->i_ino, &first_block, - &free_list, spaceres); - if (error == ENOSPC) - goto error_return; - if (error) - goto abort_return; - - xfs_trans_ichgtime(tp, target_dp, - XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - if (new_parent && src_is_directory) { - error = xfs_bumplink(tp, target_dp); - if (error) - goto abort_return; - } - } else { /* target_ip != NULL */ - /* - * If target exists and it's a directory, check that both - * target and source are directories and that target can be - * destroyed, or that neither is a directory. - */ - if (S_ISDIR(target_ip->i_d.di_mode)) { - /* - * Make sure target dir is empty. - */ - if (!(xfs_dir_isempty(target_ip)) || - (target_ip->i_d.di_nlink > 2)) { - error = XFS_ERROR(EEXIST); - goto error_return; - } - } - - /* - * Link the source inode under the target name. - * If the source inode is a directory and we are moving - * it across directories, its ".." entry will be - * inconsistent until we replace that down below. - * - * In case there is already an entry with the same - * name at the destination directory, remove it first. - */ - error = xfs_dir_replace(tp, target_dp, target_name, - src_ip->i_ino, - &first_block, &free_list, spaceres); - if (error) - goto abort_return; - - xfs_trans_ichgtime(tp, target_dp, - XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - /* - * Decrement the link count on the target since the target - * dir no longer points to it. - */ - error = xfs_droplink(tp, target_ip); - if (error) - goto abort_return; - - if (src_is_directory) { - /* - * Drop the link from the old "." entry. - */ - error = xfs_droplink(tp, target_ip); - if (error) - goto abort_return; - } - } /* target_ip != NULL */ - - /* - * Remove the source. - */ - if (new_parent && src_is_directory) { - /* - * Rewrite the ".." entry to point to the new - * directory. - */ - error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, - target_dp->i_ino, - &first_block, &free_list, spaceres); - ASSERT(error != EEXIST); - if (error) - goto abort_return; - } - - /* - * We always want to hit the ctime on the source inode. - * - * This isn't strictly required by the standards since the source - * inode isn't really being changed, but old unix file systems did - * it and some incremental backup programs won't work without it. - */ - xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); - - /* - * Adjust the link count on src_dp. This is necessary when - * renaming a directory, either within one parent when - * the target existed, or across two parent directories. - */ - if (src_is_directory && (new_parent || target_ip != NULL)) { - - /* - * Decrement link count on src_directory since the - * entry that's moved no longer points to it. - */ - error = xfs_droplink(tp, src_dp); - if (error) - goto abort_return; - } - - error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, - &first_block, &free_list, spaceres); - if (error) - goto abort_return; - - xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); - if (new_parent) - xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); - - /* - * If this is a synchronous mount, make sure that the - * rename transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT)); - goto std_return; - } - - /* - * trans_commit will unlock src_ip, target_ip & decrement - * the vnode references. - */ - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - - abort_return: - cancel_flags |= XFS_TRANS_ABORT; - error_return: - xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, cancel_flags); - std_return: - return error; -} diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 98dc670d3ee..ec5ca65c621 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -17,173 +17,260 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_alloc.h" #include "xfs_bmap.h" -#include "xfs_rtalloc.h" -#include "xfs_fsops.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc.h" #include "xfs_error.h" -#include "xfs_inode_item.h" +#include "xfs_trans.h" #include "xfs_trans_space.h" -#include "xfs_utils.h" #include "xfs_trace.h" #include "xfs_buf.h" #include "xfs_icache.h" +#include "xfs_dinode.h" +#include "xfs_rtalloc.h" /* - * Prototypes for internal functions. + * Read and return the summary information for a given extent size, + * bitmap block combination. + * Keeps track of a current summary block, so we don't keep reading + * it from the buffer cache. */ +STATIC int /* error */ +xfs_rtget_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_suminfo_t *sum) /* out: summary info for this block */ +{ + xfs_buf_t *bp; /* buffer for summary block */ + int error; /* error value */ + xfs_fsblock_t sb; /* summary fsblock */ + int so; /* index into the summary file */ + xfs_suminfo_t *sp; /* pointer to returned data */ + /* + * Compute entry number in the summary file. + */ + so = XFS_SUMOFFS(mp, log, bbno); + /* + * Compute the block number in the summary file. + */ + sb = XFS_SUMOFFSTOBLOCK(mp, so); + /* + * If we have an old buffer, and the block number matches, use that. + */ + if (rbpp && *rbpp && *rsb == sb) + bp = *rbpp; + /* + * Otherwise we have to get the buffer. + */ + else { + /* + * If there was an old one, get rid of it first. + */ + if (rbpp && *rbpp) + xfs_trans_brelse(tp, *rbpp); + error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); + if (error) { + return error; + } + /* + * Remember this buffer and block for the next call. + */ + if (rbpp) { + *rbpp = bp; + *rsb = sb; + } + } + /* + * Point to the summary information & copy it out. + */ + sp = XFS_SUMPTR(mp, bp, so); + *sum = *sp; + /* + * Drop the buffer if we're not asked to remember it. + */ + if (!rbpp) + xfs_trans_brelse(tp, bp); + return 0; +} -STATIC int xfs_rtallocate_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, - xfs_extlen_t, xfs_buf_t **, xfs_fsblock_t *); -STATIC int xfs_rtany_summary(xfs_mount_t *, xfs_trans_t *, int, int, - xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, int *); -STATIC int xfs_rtcheck_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, - xfs_extlen_t, int, xfs_rtblock_t *, int *); -STATIC int xfs_rtfind_back(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, - xfs_rtblock_t, xfs_rtblock_t *); -STATIC int xfs_rtfind_forw(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, - xfs_rtblock_t, xfs_rtblock_t *); -STATIC int xfs_rtget_summary( xfs_mount_t *, xfs_trans_t *, int, - xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, xfs_suminfo_t *); -STATIC int xfs_rtmodify_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, - xfs_extlen_t, int); -STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int, - xfs_rtblock_t, int, xfs_buf_t **, xfs_fsblock_t *); - -/* - * Internal functions. - */ /* - * Allocate space to the bitmap or summary file, and zero it, for growfs. + * Return whether there are any free extents in the size range given + * by low and high, for the bitmap block bbno. */ STATIC int /* error */ -xfs_growfs_rt_alloc( - xfs_mount_t *mp, /* file system mount point */ - xfs_extlen_t oblocks, /* old count of blocks */ - xfs_extlen_t nblocks, /* new count of blocks */ - xfs_inode_t *ip) /* inode (bitmap/summary) */ +xfs_rtany_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int low, /* low log2 extent size */ + int high, /* high log2 extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + int *stat) /* out: any good extents here? */ { - xfs_fileoff_t bno; /* block number in file */ - xfs_buf_t *bp; /* temporary buffer for zeroing */ - int committed; /* transaction committed flag */ - xfs_daddr_t d; /* disk block address */ - int error; /* error return value */ - xfs_fsblock_t firstblock; /* first block allocated in xaction */ - xfs_bmap_free_t flist; /* list of freed blocks */ - xfs_fsblock_t fsbno; /* filesystem block for bno */ - xfs_bmbt_irec_t map; /* block map output */ - int nmap; /* number of block maps */ - int resblks; /* space reservation */ + int error; /* error value */ + int log; /* loop counter, log2 of ext. size */ + xfs_suminfo_t sum; /* summary data */ /* - * Allocate space to the file, as necessary. + * Loop over logs of extent sizes. Order is irrelevant. */ - while (oblocks < nblocks) { - int cancelflags = 0; - xfs_trans_t *tp; - - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); - resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); + for (log = low; log <= high; log++) { /* - * Reserve space & log for one extent added to the file. + * Get one summary datum. */ - if ((error = xfs_trans_reserve(tp, resblks, - XFS_GROWRTALLOC_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_DEFAULT_PERM_LOG_COUNT))) - goto error_cancel; - cancelflags = XFS_TRANS_RELEASE_LOG_RES; + error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum); + if (error) { + return error; + } /* - * Lock the inode. + * If there are any, return success. */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + if (sum) { + *stat = 1; + return 0; + } + } + /* + * Found nothing, return failure. + */ + *stat = 0; + return 0; +} - xfs_bmap_init(&flist, &firstblock); - /* - * Allocate blocks to the bitmap file. - */ - nmap = 1; - cancelflags |= XFS_TRANS_ABORT; - error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, - XFS_BMAPI_METADATA, &firstblock, - resblks, &map, &nmap, &flist); - if (!error && nmap < 1) - error = XFS_ERROR(ENOSPC); - if (error) - goto error_cancel; - /* - * Free any blocks freed up in the transaction, then commit. - */ - error = xfs_bmap_finish(&tp, &flist, &committed); - if (error) - goto error_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - if (error) - goto error; - /* - * Now we need to clear the allocated blocks. - * Do this one block per transaction, to keep it simple. - */ - cancelflags = 0; - for (bno = map.br_startoff, fsbno = map.br_startblock; - bno < map.br_startoff + map.br_blockcount; - bno++, fsbno++) { - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO); - /* - * Reserve log for one block zeroing. - */ - if ((error = xfs_trans_reserve(tp, 0, - XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0))) - goto error_cancel; - /* - * Lock the bitmap inode. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - /* - * Get a buffer for the block. - */ - d = XFS_FSB_TO_DADDR(mp, fsbno); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize, 0); - if (bp == NULL) { - error = XFS_ERROR(EIO); -error_cancel: - xfs_trans_cancel(tp, cancelflags); - goto error; - } - memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); - xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); - /* - * Commit the transaction. - */ - error = xfs_trans_commit(tp, 0); + +/* + * Copy and transform the summary file, given the old and new + * parameters in the mount structures. + */ +STATIC int /* error */ +xfs_rtcopy_summary( + xfs_mount_t *omp, /* old file system mount point */ + xfs_mount_t *nmp, /* new file system mount point */ + xfs_trans_t *tp) /* transaction pointer */ +{ + xfs_rtblock_t bbno; /* bitmap block number */ + xfs_buf_t *bp; /* summary buffer */ + int error; /* error return value */ + int log; /* summary level number (log length) */ + xfs_suminfo_t sum; /* summary data */ + xfs_fsblock_t sumbno; /* summary block number */ + + bp = NULL; + for (log = omp->m_rsumlevels - 1; log >= 0; log--) { + for (bbno = omp->m_sb.sb_rbmblocks - 1; + (xfs_srtblock_t)bbno >= 0; + bbno--) { + error = xfs_rtget_summary(omp, tp, log, bbno, &bp, + &sumbno, &sum); if (error) - goto error; + return error; + if (sum == 0) + continue; + error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum, + &bp, &sumbno); + if (error) + return error; + error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum, + &bp, &sumbno); + if (error) + return error; + ASSERT(sum > 0); } - /* - * Go on to the next extent, if any. - */ - oblocks = map.br_startoff + map.br_blockcount; } return 0; +} +/* + * Mark an extent specified by start and len allocated. + * Updates all the summary information as well as the bitmap. + */ +STATIC int /* error */ +xfs_rtallocate_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* start block to allocate */ + xfs_extlen_t len, /* length to allocate */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_rtblock_t end; /* end of the allocated extent */ + int error; /* error value */ + xfs_rtblock_t postblock = 0; /* first block allocated > end */ + xfs_rtblock_t preblock = 0; /* first block allocated < start */ -error: + end = start + len - 1; + /* + * Assume we're allocating out of the middle of a free extent. + * We need to find the beginning and end of the extent so we can + * properly update the summary. + */ + error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + if (error) { + return error; + } + /* + * Find the next allocated block (end of free extent). + */ + error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, + &postblock); + if (error) { + return error; + } + /* + * Decrement the summary information corresponding to the entire + * (old) free extent. + */ + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + if (error) { + return error; + } + /* + * If there are blocks not being allocated at the front of the + * old extent, add summary data for them to be free. + */ + if (preblock < start) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(start - preblock), + XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * If there are blocks not being allocated at the end of the + * old extent, add summary data for them to be free. + */ + if (postblock > end) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock - end), + XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * Modify the bitmap to mark this extent allocated. + */ + error = xfs_rtmodify_range(mp, tp, start, len, 0); return error; } @@ -722,1112 +809,126 @@ xfs_rtallocate_extent_size( } /* - * Mark an extent specified by start and len allocated. - * Updates all the summary information as well as the bitmap. + * Allocate space to the bitmap or summary file, and zero it, for growfs. */ STATIC int /* error */ -xfs_rtallocate_range( +xfs_growfs_rt_alloc( xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* start block to allocate */ - xfs_extlen_t len, /* length to allocate */ - xfs_buf_t **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ + xfs_extlen_t oblocks, /* old count of blocks */ + xfs_extlen_t nblocks, /* new count of blocks */ + xfs_inode_t *ip) /* inode (bitmap/summary) */ { - xfs_rtblock_t end; /* end of the allocated extent */ - int error; /* error value */ - xfs_rtblock_t postblock; /* first block allocated > end */ - xfs_rtblock_t preblock; /* first block allocated < start */ + xfs_fileoff_t bno; /* block number in file */ + xfs_buf_t *bp; /* temporary buffer for zeroing */ + int committed; /* transaction committed flag */ + xfs_daddr_t d; /* disk block address */ + int error; /* error return value */ + xfs_fsblock_t firstblock; /* first block allocated in xaction */ + xfs_bmap_free_t flist; /* list of freed blocks */ + xfs_fsblock_t fsbno; /* filesystem block for bno */ + xfs_bmbt_irec_t map; /* block map output */ + int nmap; /* number of block maps */ + int resblks; /* space reservation */ - end = start + len - 1; - /* - * Assume we're allocating out of the middle of a free extent. - * We need to find the beginning and end of the extent so we can - * properly update the summary. - */ - error = xfs_rtfind_back(mp, tp, start, 0, &preblock); - if (error) { - return error; - } - /* - * Find the next allocated block (end of free extent). - */ - error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, - &postblock); - if (error) { - return error; - } - /* - * Decrement the summary information corresponding to the entire - * (old) free extent. - */ - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock + 1 - preblock), - XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); - if (error) { - return error; - } - /* - * If there are blocks not being allocated at the front of the - * old extent, add summary data for them to be free. - */ - if (preblock < start) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(start - preblock), - XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); - if (error) { - return error; - } - } /* - * If there are blocks not being allocated at the end of the - * old extent, add summary data for them to be free. - */ - if (postblock > end) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock - end), - XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb); - if (error) { - return error; - } - } - /* - * Modify the bitmap to mark this extent allocated. + * Allocate space to the file, as necessary. */ - error = xfs_rtmodify_range(mp, tp, start, len, 0); - return error; -} - -/* - * Return whether there are any free extents in the size range given - * by low and high, for the bitmap block bbno. - */ -STATIC int /* error */ -xfs_rtany_summary( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - int low, /* low log2 extent size */ - int high, /* high log2 extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - xfs_buf_t **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - int *stat) /* out: any good extents here? */ -{ - int error; /* error value */ - int log; /* loop counter, log2 of ext. size */ - xfs_suminfo_t sum; /* summary data */ + while (oblocks < nblocks) { + int cancelflags = 0; + xfs_trans_t *tp; - /* - * Loop over logs of extent sizes. Order is irrelevant. - */ - for (log = low; log <= high; log++) { + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); + resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); /* - * Get one summary datum. + * Reserve space & log for one extent added to the file. */ - error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum); - if (error) { - return error; - } + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc, + resblks, 0); + if (error) + goto error_cancel; + cancelflags = XFS_TRANS_RELEASE_LOG_RES; /* - * If there are any, return success. + * Lock the inode. */ - if (sum) { - *stat = 1; - return 0; - } - } - /* - * Found nothing, return failure. - */ - *stat = 0; - return 0; -} - -/* - * Get a buffer for the bitmap or summary file block specified. - * The buffer is returned read and locked. - */ -STATIC int /* error */ -xfs_rtbuf_get( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t block, /* block number in bitmap or summary */ - int issum, /* is summary not bitmap */ - xfs_buf_t **bpp) /* output: buffer for the block */ -{ - xfs_buf_t *bp; /* block buffer, result */ - xfs_inode_t *ip; /* bitmap or summary inode */ - xfs_bmbt_irec_t map; - int nmap = 1; - int error; /* error value */ - - ip = issum ? mp->m_rsumip : mp->m_rbmip; - - error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK); - if (error) - return error; - - ASSERT(map.br_startblock != NULLFSBLOCK); - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, map.br_startblock), - mp->m_bsize, 0, &bp, NULL); - if (error) - return error; - ASSERT(!xfs_buf_geterror(bp)); - *bpp = bp; - return 0; -} - -#ifdef DEBUG -/* - * Check that the given extent (block range) is allocated already. - */ -STATIC int /* error */ -xfs_rtcheck_alloc_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number of extent */ - xfs_extlen_t len, /* length of extent */ - int *stat) /* out: 1 for allocated, 0 for not */ -{ - xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */ - - return xfs_rtcheck_range(mp, tp, bno, len, 0, &new, stat); -} -#endif - -/* - * Check that the given range is either all allocated (val = 0) or - * all free (val = 1). - */ -STATIC int /* error */ -xfs_rtcheck_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block number of extent */ - xfs_extlen_t len, /* length of extent */ - int val, /* 1 for free, 0 for allocated */ - xfs_rtblock_t *new, /* out: first block not matching */ - int *stat) /* out: 1 for matches, 0 for not */ -{ - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtblock_t i; /* current bit number rel. to start */ - xfs_rtblock_t lastbit; /* last useful bit in word */ - xfs_rtword_t mask; /* mask of relevant bits for value */ - xfs_rtword_t wdiff; /* difference from wanted value */ - int word; /* word number in the buffer */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - /* - * Compute starting bitmap block number - */ - block = XFS_BITTOBLOCK(mp, start); - /* - * Read the bitmap block. - */ - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { - return error; - } - bufp = bp->b_addr; - /* - * Compute the starting word's address, and starting bit. - */ - word = XFS_BITTOWORD(mp, start); - b = &bufp[word]; - bit = (int)(start & (XFS_NBWORD - 1)); - /* - * 0 (allocated) => all zero's; 1 (free) => all one's. - */ - val = -val; - /* - * If not starting on a word boundary, deal with the first - * (partial) word. - */ - if (bit) { - /* - * Compute first bit not examined. - */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); - /* - * Mask of relevant bits. - */ - mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; - /* - * Compute difference between actual and desired value. - */ - if ((wdiff = (*b ^ val) & mask)) { - /* - * Different, compute first wrong bit and return. - */ - xfs_trans_brelse(tp, bp); - i = XFS_RTLOBIT(wdiff) - bit; - *new = start + i; - *stat = 0; - return 0; - } - i = lastbit - bit; - /* - * Go on to next block if that's where the next word is - * and we need the next word. - */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { - /* - * If done with this block, get the next one. - */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { - return error; - } - b = bufp = bp->b_addr; - word = 0; - } else { - /* - * Go on to the next word in the buffer. - */ - b++; - } - } else { - /* - * Starting on a word boundary, no partial word. - */ - i = 0; - } - /* - * Loop over whole words in buffers. When we use up one buffer - * we move on to the next one. - */ - while (len - i >= XFS_NBWORD) { - /* - * Compute difference between actual and desired value. - */ - if ((wdiff = *b ^ val)) { - /* - * Different, compute first wrong bit and return. - */ - xfs_trans_brelse(tp, bp); - i += XFS_RTLOBIT(wdiff); - *new = start + i; - *stat = 0; - return 0; - } - i += XFS_NBWORD; + xfs_bmap_init(&flist, &firstblock); /* - * Go on to next block if that's where the next word is - * and we need the next word. + * Allocate blocks to the bitmap file. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { - /* - * If done with this block, get the next one. - */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { - return error; - } - b = bufp = bp->b_addr; - word = 0; - } else { - /* - * Go on to the next word in the buffer. - */ - b++; - } - } - /* - * If not ending on a word boundary, deal with the last - * (partial) word. - */ - if ((lastbit = len - i)) { + nmap = 1; + cancelflags |= XFS_TRANS_ABORT; + error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, + XFS_BMAPI_METADATA, &firstblock, + resblks, &map, &nmap, &flist); + if (!error && nmap < 1) + error = XFS_ERROR(ENOSPC); + if (error) + goto error_cancel; /* - * Mask of relevant bits. + * Free any blocks freed up in the transaction, then commit. */ - mask = ((xfs_rtword_t)1 << lastbit) - 1; + error = xfs_bmap_finish(&tp, &flist, &committed); + if (error) + goto error_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error; /* - * Compute difference between actual and desired value. + * Now we need to clear the allocated blocks. + * Do this one block per transaction, to keep it simple. */ - if ((wdiff = (*b ^ val) & mask)) { + cancelflags = 0; + for (bno = map.br_startoff, fsbno = map.br_startblock; + bno < map.br_startoff + map.br_blockcount; + bno++, fsbno++) { + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO); /* - * Different, compute first wrong bit and return. + * Reserve log for one block zeroing. */ - xfs_trans_brelse(tp, bp); - i += XFS_RTLOBIT(wdiff); - *new = start + i; - *stat = 0; - return 0; - } else - i = len; - } - /* - * Successful, return. - */ - xfs_trans_brelse(tp, bp); - *new = start + i; - *stat = 1; - return 0; -} - -/* - * Copy and transform the summary file, given the old and new - * parameters in the mount structures. - */ -STATIC int /* error */ -xfs_rtcopy_summary( - xfs_mount_t *omp, /* old file system mount point */ - xfs_mount_t *nmp, /* new file system mount point */ - xfs_trans_t *tp) /* transaction pointer */ -{ - xfs_rtblock_t bbno; /* bitmap block number */ - xfs_buf_t *bp; /* summary buffer */ - int error; /* error return value */ - int log; /* summary level number (log length) */ - xfs_suminfo_t sum; /* summary data */ - xfs_fsblock_t sumbno; /* summary block number */ - - bp = NULL; - for (log = omp->m_rsumlevels - 1; log >= 0; log--) { - for (bbno = omp->m_sb.sb_rbmblocks - 1; - (xfs_srtblock_t)bbno >= 0; - bbno--) { - error = xfs_rtget_summary(omp, tp, log, bbno, &bp, - &sumbno, &sum); - if (error) - return error; - if (sum == 0) - continue; - error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum, - &bp, &sumbno); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero, + 0, 0); if (error) - return error; - error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum, - &bp, &sumbno); - if (error) - return error; - ASSERT(sum > 0); - } - } - return 0; -} - -/* - * Searching backward from start to limit, find the first block whose - * allocated/free state is different from start's. - */ -STATIC int /* error */ -xfs_rtfind_back( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to look at */ - xfs_rtblock_t limit, /* last block to look at */ - xfs_rtblock_t *rtblock) /* out: start block found */ -{ - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtblock_t firstbit; /* first useful bit in the word */ - xfs_rtblock_t i; /* current bit number rel. to start */ - xfs_rtblock_t len; /* length of inspected area */ - xfs_rtword_t mask; /* mask of relevant bits for value */ - xfs_rtword_t want; /* mask for "good" values */ - xfs_rtword_t wdiff; /* difference from wanted value */ - int word; /* word number in the buffer */ - - /* - * Compute and read in starting bitmap block for starting block. - */ - block = XFS_BITTOBLOCK(mp, start); - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { - return error; - } - bufp = bp->b_addr; - /* - * Get the first word's index & point to it. - */ - word = XFS_BITTOWORD(mp, start); - b = &bufp[word]; - bit = (int)(start & (XFS_NBWORD - 1)); - len = start - limit + 1; - /* - * Compute match value, based on the bit at start: if 1 (free) - * then all-ones, else all-zeroes. - */ - want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; - /* - * If the starting position is not word-aligned, deal with the - * partial word. - */ - if (bit < XFS_NBWORD - 1) { - /* - * Calculate first (leftmost) bit number to look at, - * and mask for all the relevant bits in this word. - */ - firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0); - mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) << - firstbit; - /* - * Calculate the difference between the value there - * and what we're looking for. - */ - if ((wdiff = (*b ^ want) & mask)) { - /* - * Different. Mark where we are and return. - */ - xfs_trans_brelse(tp, bp); - i = bit - XFS_RTHIBIT(wdiff); - *rtblock = start - i + 1; - return 0; - } - i = bit - firstbit + 1; - /* - * Go on to previous block if that's where the previous word is - * and we need the previous word. - */ - if (--word == -1 && i < len) { - /* - * If done with this block, get the previous one. - */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); - if (error) { - return error; - } - bufp = bp->b_addr; - word = XFS_BLOCKWMASK(mp); - b = &bufp[word]; - } else { - /* - * Go on to the previous word in the buffer. - */ - b--; - } - } else { - /* - * Starting on a word boundary, no partial word. - */ - i = 0; - } - /* - * Loop over whole words in buffers. When we use up one buffer - * we move on to the previous one. - */ - while (len - i >= XFS_NBWORD) { - /* - * Compute difference between actual and desired value. - */ - if ((wdiff = *b ^ want)) { - /* - * Different, mark where we are and return. - */ - xfs_trans_brelse(tp, bp); - i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); - *rtblock = start - i + 1; - return 0; - } - i += XFS_NBWORD; - /* - * Go on to previous block if that's where the previous word is - * and we need the previous word. - */ - if (--word == -1 && i < len) { - /* - * If done with this block, get the previous one. - */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); - if (error) { - return error; - } - bufp = bp->b_addr; - word = XFS_BLOCKWMASK(mp); - b = &bufp[word]; - } else { - /* - * Go on to the previous word in the buffer. - */ - b--; - } - } - /* - * If not ending on a word boundary, deal with the last - * (partial) word. - */ - if (len - i) { - /* - * Calculate first (leftmost) bit number to look at, - * and mask for all the relevant bits in this word. - */ - firstbit = XFS_NBWORD - (len - i); - mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit; - /* - * Compute difference between actual and desired value. - */ - if ((wdiff = (*b ^ want) & mask)) { - /* - * Different, mark where we are and return. - */ - xfs_trans_brelse(tp, bp); - i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); - *rtblock = start - i + 1; - return 0; - } else - i = len; - } - /* - * No match, return that we scanned the whole area. - */ - xfs_trans_brelse(tp, bp); - *rtblock = start - i + 1; - return 0; -} - -/* - * Searching forward from start to limit, find the first block whose - * allocated/free state is different from start's. - */ -STATIC int /* error */ -xfs_rtfind_forw( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to look at */ - xfs_rtblock_t limit, /* last block to look at */ - xfs_rtblock_t *rtblock) /* out: start block found */ -{ - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtblock_t i; /* current bit number rel. to start */ - xfs_rtblock_t lastbit; /* last useful bit in the word */ - xfs_rtblock_t len; /* length of inspected area */ - xfs_rtword_t mask; /* mask of relevant bits for value */ - xfs_rtword_t want; /* mask for "good" values */ - xfs_rtword_t wdiff; /* difference from wanted value */ - int word; /* word number in the buffer */ - - /* - * Compute and read in starting bitmap block for starting block. - */ - block = XFS_BITTOBLOCK(mp, start); - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { - return error; - } - bufp = bp->b_addr; - /* - * Get the first word's index & point to it. - */ - word = XFS_BITTOWORD(mp, start); - b = &bufp[word]; - bit = (int)(start & (XFS_NBWORD - 1)); - len = limit - start + 1; - /* - * Compute match value, based on the bit at start: if 1 (free) - * then all-ones, else all-zeroes. - */ - want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; - /* - * If the starting position is not word-aligned, deal with the - * partial word. - */ - if (bit) { - /* - * Calculate last (rightmost) bit number to look at, - * and mask for all the relevant bits in this word. - */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); - mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; - /* - * Calculate the difference between the value there - * and what we're looking for. - */ - if ((wdiff = (*b ^ want) & mask)) { - /* - * Different. Mark where we are and return. - */ - xfs_trans_brelse(tp, bp); - i = XFS_RTLOBIT(wdiff) - bit; - *rtblock = start + i - 1; - return 0; - } - i = lastbit - bit; - /* - * Go on to next block if that's where the next word is - * and we need the next word. - */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { - /* - * If done with this block, get the previous one. - */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { - return error; - } - b = bufp = bp->b_addr; - word = 0; - } else { - /* - * Go on to the previous word in the buffer. - */ - b++; - } - } else { - /* - * Starting on a word boundary, no partial word. - */ - i = 0; - } - /* - * Loop over whole words in buffers. When we use up one buffer - * we move on to the next one. - */ - while (len - i >= XFS_NBWORD) { - /* - * Compute difference between actual and desired value. - */ - if ((wdiff = *b ^ want)) { + goto error_cancel; /* - * Different, mark where we are and return. + * Lock the bitmap inode. */ - xfs_trans_brelse(tp, bp); - i += XFS_RTLOBIT(wdiff); - *rtblock = start + i - 1; - return 0; - } - i += XFS_NBWORD; - /* - * Go on to next block if that's where the next word is - * and we need the next word. - */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* - * If done with this block, get the next one. + * Get a buffer for the block. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { - return error; + d = XFS_FSB_TO_DADDR(mp, fsbno); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize, 0); + if (bp == NULL) { + error = XFS_ERROR(EIO); +error_cancel: + xfs_trans_cancel(tp, cancelflags); + goto error; } - b = bufp = bp->b_addr; - word = 0; - } else { + memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); + xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); /* - * Go on to the next word in the buffer. + * Commit the transaction. */ - b++; + error = xfs_trans_commit(tp, 0); + if (error) + goto error; } - } - /* - * If not ending on a word boundary, deal with the last - * (partial) word. - */ - if ((lastbit = len - i)) { /* - * Calculate mask for all the relevant bits in this word. - */ - mask = ((xfs_rtword_t)1 << lastbit) - 1; - /* - * Compute difference between actual and desired value. + * Go on to the next extent, if any. */ - if ((wdiff = (*b ^ want) & mask)) { - /* - * Different, mark where we are and return. - */ - xfs_trans_brelse(tp, bp); - i += XFS_RTLOBIT(wdiff); - *rtblock = start + i - 1; - return 0; - } else - i = len; + oblocks = map.br_startoff + map.br_blockcount; } - /* - * No match, return that we scanned the whole area. - */ - xfs_trans_brelse(tp, bp); - *rtblock = start + i - 1; return 0; -} -/* - * Mark an extent specified by start and len freed. - * Updates all the summary information as well as the bitmap. - */ -STATIC int /* error */ -xfs_rtfree_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to free */ - xfs_extlen_t len, /* length to free */ - xfs_buf_t **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ -{ - xfs_rtblock_t end; /* end of the freed extent */ - int error; /* error value */ - xfs_rtblock_t postblock; /* first block freed > end */ - xfs_rtblock_t preblock; /* first block freed < start */ - - end = start + len - 1; - /* - * Modify the bitmap to mark this extent freed. - */ - error = xfs_rtmodify_range(mp, tp, start, len, 1); - if (error) { - return error; - } - /* - * Assume we're freeing out of the middle of an allocated extent. - * We need to find the beginning and end of the extent so we can - * properly update the summary. - */ - error = xfs_rtfind_back(mp, tp, start, 0, &preblock); - if (error) { - return error; - } - /* - * Find the next allocated block (end of allocated extent). - */ - error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, - &postblock); - if (error) - return error; - /* - * If there are blocks not being freed at the front of the - * old extent, add summary data for them to be allocated. - */ - if (preblock < start) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(start - preblock), - XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); - if (error) { - return error; - } - } - /* - * If there are blocks not being freed at the end of the - * old extent, add summary data for them to be allocated. - */ - if (postblock > end) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock - end), - XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb); - if (error) { - return error; - } - } - /* - * Increment the summary information corresponding to the entire - * (new) free extent. - */ - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock + 1 - preblock), - XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); +error: return error; } /* - * Read and return the summary information for a given extent size, - * bitmap block combination. - * Keeps track of a current summary block, so we don't keep reading - * it from the buffer cache. - */ -STATIC int /* error */ -xfs_rtget_summary( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - int log, /* log2 of extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - xfs_buf_t **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_suminfo_t *sum) /* out: summary info for this block */ -{ - xfs_buf_t *bp; /* buffer for summary block */ - int error; /* error value */ - xfs_fsblock_t sb; /* summary fsblock */ - int so; /* index into the summary file */ - xfs_suminfo_t *sp; /* pointer to returned data */ - - /* - * Compute entry number in the summary file. - */ - so = XFS_SUMOFFS(mp, log, bbno); - /* - * Compute the block number in the summary file. - */ - sb = XFS_SUMOFFSTOBLOCK(mp, so); - /* - * If we have an old buffer, and the block number matches, use that. - */ - if (rbpp && *rbpp && *rsb == sb) - bp = *rbpp; - /* - * Otherwise we have to get the buffer. - */ - else { - /* - * If there was an old one, get rid of it first. - */ - if (rbpp && *rbpp) - xfs_trans_brelse(tp, *rbpp); - error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); - if (error) { - return error; - } - /* - * Remember this buffer and block for the next call. - */ - if (rbpp) { - *rbpp = bp; - *rsb = sb; - } - } - /* - * Point to the summary information & copy it out. - */ - sp = XFS_SUMPTR(mp, bp, so); - *sum = *sp; - /* - * Drop the buffer if we're not asked to remember it. - */ - if (!rbpp) - xfs_trans_brelse(tp, bp); - return 0; -} - -/* - * Set the given range of bitmap bits to the given value. - * Do whatever I/O and logging is required. - */ -STATIC int /* error */ -xfs_rtmodify_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to modify */ - xfs_extlen_t len, /* length of extent to modify */ - int val) /* 1 for free, 0 for allocated */ -{ - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtword_t *first; /* first used word in the buffer */ - int i; /* current bit number rel. to start */ - int lastbit; /* last useful bit in word */ - xfs_rtword_t mask; /* mask o frelevant bits for value */ - int word; /* word number in the buffer */ - - /* - * Compute starting bitmap block number. - */ - block = XFS_BITTOBLOCK(mp, start); - /* - * Read the bitmap block, and point to its data. - */ - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { - return error; - } - bufp = bp->b_addr; - /* - * Compute the starting word's address, and starting bit. - */ - word = XFS_BITTOWORD(mp, start); - first = b = &bufp[word]; - bit = (int)(start & (XFS_NBWORD - 1)); - /* - * 0 (allocated) => all zeroes; 1 (free) => all ones. - */ - val = -val; - /* - * If not starting on a word boundary, deal with the first - * (partial) word. - */ - if (bit) { - /* - * Compute first bit not changed and mask of relevant bits. - */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); - mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; - /* - * Set/clear the active bits. - */ - if (val) - *b |= mask; - else - *b &= ~mask; - i = lastbit - bit; - /* - * Go on to the next block if that's where the next word is - * and we need the next word. - */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { - /* - * Log the changed part of this block. - * Get the next one. - */ - xfs_trans_log_buf(tp, bp, - (uint)((char *)first - (char *)bufp), - (uint)((char *)b - (char *)bufp)); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { - return error; - } - first = b = bufp = bp->b_addr; - word = 0; - } else { - /* - * Go on to the next word in the buffer - */ - b++; - } - } else { - /* - * Starting on a word boundary, no partial word. - */ - i = 0; - } - /* - * Loop over whole words in buffers. When we use up one buffer - * we move on to the next one. - */ - while (len - i >= XFS_NBWORD) { - /* - * Set the word value correctly. - */ - *b = val; - i += XFS_NBWORD; - /* - * Go on to the next block if that's where the next word is - * and we need the next word. - */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { - /* - * Log the changed part of this block. - * Get the next one. - */ - xfs_trans_log_buf(tp, bp, - (uint)((char *)first - (char *)bufp), - (uint)((char *)b - (char *)bufp)); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { - return error; - } - first = b = bufp = bp->b_addr; - word = 0; - } else { - /* - * Go on to the next word in the buffer - */ - b++; - } - } - /* - * If not ending on a word boundary, deal with the last - * (partial) word. - */ - if ((lastbit = len - i)) { - /* - * Compute a mask of relevant bits. - */ - bit = 0; - mask = ((xfs_rtword_t)1 << lastbit) - 1; - /* - * Set/clear the active bits. - */ - if (val) - *b |= mask; - else - *b &= ~mask; - b++; - } - /* - * Log any remaining changed bytes. - */ - if (b > first) - xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp), - (uint)((char *)b - (char *)bufp - 1)); - return 0; -} - -/* - * Read and modify the summary information for a given extent size, - * bitmap block combination. - * Keeps track of a current summary block, so we don't keep reading - * it from the buffer cache. - */ -STATIC int /* error */ -xfs_rtmodify_summary( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - int log, /* log2 of extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - int delta, /* change to make to summary info */ - xfs_buf_t **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ -{ - xfs_buf_t *bp; /* buffer for the summary block */ - int error; /* error value */ - xfs_fsblock_t sb; /* summary fsblock */ - int so; /* index into the summary file */ - xfs_suminfo_t *sp; /* pointer to returned data */ - - /* - * Compute entry number in the summary file. - */ - so = XFS_SUMOFFS(mp, log, bbno); - /* - * Compute the block number in the summary file. - */ - sb = XFS_SUMOFFSTOBLOCK(mp, so); - /* - * If we have an old buffer, and the block number matches, use that. - */ - if (rbpp && *rbpp && *rsb == sb) - bp = *rbpp; - /* - * Otherwise we have to get the buffer. - */ - else { - /* - * If there was an old one, get rid of it first. - */ - if (rbpp && *rbpp) - xfs_trans_brelse(tp, *rbpp); - error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); - if (error) { - return error; - } - /* - * Remember this buffer and block for the next call. - */ - if (rbpp) { - *rbpp = bp; - *rsb = sb; - } - } - /* - * Point to the summary information, modify and log it. - */ - sp = XFS_SUMPTR(mp, bp, so); - *sp += delta; - xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr), - (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1)); - return 0; -} - -/* * Visible (exported) functions. */ @@ -1958,8 +1059,9 @@ xfs_growfs_rt( * Start a transaction, get the log reservation. */ tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE); - if ((error = xfs_trans_reserve(tp, 0, - XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0))) + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree, + 0, 0); + if (error) goto error_cancel; /* * Lock out other callers by grabbing the bitmap inode lock. @@ -2129,66 +1231,6 @@ xfs_rtallocate_extent( } /* - * Free an extent in the realtime subvolume. Length is expressed in - * realtime extents, as is the block number. - */ -int /* error */ -xfs_rtfree_extent( - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to free */ - xfs_extlen_t len) /* length of extent freed */ -{ - int error; /* error value */ - xfs_mount_t *mp; /* file system mount structure */ - xfs_fsblock_t sb; /* summary file block number */ - xfs_buf_t *sumbp; /* summary file block buffer */ - - mp = tp->t_mountp; - - ASSERT(mp->m_rbmip->i_itemp != NULL); - ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); - -#if defined(__KERNEL__) && defined(DEBUG) - /* - * Check to see that this whole range is currently allocated. - */ - { - int stat; /* result from checking range */ - - error = xfs_rtcheck_alloc_range(mp, tp, bno, len, &stat); - if (error) { - return error; - } - ASSERT(stat); - } -#endif - sumbp = NULL; - /* - * Free the range of realtime blocks. - */ - error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb); - if (error) { - return error; - } - /* - * Mark more blocks free in the superblock. - */ - xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len); - /* - * If we've now freed all the blocks, reset the file sequence - * number to 0. - */ - if (tp->t_frextents_delta + mp->m_sb.sb_frextents == - mp->m_sb.sb_rextents) { - if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) - mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; - *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0; - xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); - } - return 0; -} - -/* * Initialize realtime fields in the mount structure. */ int /* error */ diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index f7f3a359c1c..752b63d1030 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -18,58 +18,11 @@ #ifndef __XFS_RTALLOC_H__ #define __XFS_RTALLOC_H__ +/* kernel only definitions and functions */ + struct xfs_mount; struct xfs_trans; -/* Min and max rt extent sizes, specified in bytes */ -#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */ -#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */ -#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ - -/* - * Constants for bit manipulations. - */ -#define XFS_NBBYLOG 3 /* log2(NBBY) */ -#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */ -#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG) -#define XFS_NBWORD (1 << XFS_NBWORDLOG) -#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1) - -#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize) -#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask) -#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize) -#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask) - -/* - * Summary and bit manipulation macros. - */ -#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb))) -#define XFS_SUMOFFSTOBLOCK(mp,s) \ - (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) -#define XFS_SUMPTR(mp,bp,so) \ - ((xfs_suminfo_t *)((bp)->b_addr + \ - (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) - -#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) -#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log) -#define XFS_BITTOWORD(mp,bi) \ - ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp))) - -#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) -#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) - -#define XFS_RTLOBIT(w) xfs_lowbit32(w) -#define XFS_RTHIBIT(w) xfs_highbit32(w) - -#if XFS_BIG_BLKNOS -#define XFS_RTBLOCKLOG(b) xfs_highbit64(b) -#else -#define XFS_RTBLOCKLOG(b) xfs_highbit32(b) -#endif - - -#ifdef __KERNEL__ - #ifdef CONFIG_XFS_RT /* * Function prototypes for exported functions. @@ -142,6 +95,30 @@ xfs_growfs_rt( struct xfs_mount *mp, /* file system mount structure */ xfs_growfs_rt_t *in); /* user supplied growfs struct */ +/* + * From xfs_rtbitmap.c + */ +int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t block, int issum, struct xfs_buf **bpp); +int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_extlen_t len, int val, + xfs_rtblock_t *new, int *stat); +int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_rtblock_t limit, + xfs_rtblock_t *rtblock); +int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_rtblock_t limit, + xfs_rtblock_t *rtblock); +int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_extlen_t len, int val); +int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, + xfs_rtblock_t bbno, int delta, xfs_buf_t **rbpp, + xfs_fsblock_t *rsb); +int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_extlen_t len, + struct xfs_buf **rbpp, xfs_fsblock_t *rsb); + + #else # define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb) (ENOSYS) # define xfs_rtfree_extent(t,b,l) (ENOSYS) @@ -161,6 +138,4 @@ xfs_rtmount_init( # define xfs_rtunmount_inodes(m) #endif /* CONFIG_XFS_RT */ -#endif /* __KERNEL__ */ - #endif /* __XFS_RTALLOC_H__ */ diff --git a/fs/xfs/xfs_rtbitmap.c b/fs/xfs/xfs_rtbitmap.c new file mode 100644 index 00000000000..f4dd697cac0 --- /dev/null +++ b/fs/xfs/xfs_rtbitmap.c @@ -0,0 +1,973 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_buf.h" +#include "xfs_icache.h" +#include "xfs_dinode.h" +#include "xfs_rtalloc.h" + + +/* + * Realtime allocator bitmap functions shared with userspace. + */ + +/* + * Get a buffer for the bitmap or summary file block specified. + * The buffer is returned read and locked. + */ +int +xfs_rtbuf_get( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t block, /* block number in bitmap or summary */ + int issum, /* is summary not bitmap */ + xfs_buf_t **bpp) /* output: buffer for the block */ +{ + xfs_buf_t *bp; /* block buffer, result */ + xfs_inode_t *ip; /* bitmap or summary inode */ + xfs_bmbt_irec_t map; + int nmap = 1; + int error; /* error value */ + + ip = issum ? mp->m_rsumip : mp->m_rbmip; + + error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK); + if (error) + return error; + + ASSERT(map.br_startblock != NULLFSBLOCK); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map.br_startblock), + mp->m_bsize, 0, &bp, NULL); + if (error) + return error; + *bpp = bp; + return 0; +} + +/* + * Searching backward from start to limit, find the first block whose + * allocated/free state is different from start's. + */ +int +xfs_rtfind_back( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to look at */ + xfs_rtblock_t limit, /* last block to look at */ + xfs_rtblock_t *rtblock) /* out: start block found */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t firstbit; /* first useful bit in the word */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute and read in starting bitmap block for starting block. + */ + block = XFS_BITTOBLOCK(mp, start); + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Get the first word's index & point to it. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + len = start - limit + 1; + /* + * Compute match value, based on the bit at start: if 1 (free) + * then all-ones, else all-zeroes. + */ + want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + /* + * If the starting position is not word-aligned, deal with the + * partial word. + */ + if (bit < XFS_NBWORD - 1) { + /* + * Calculate first (leftmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0); + mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) << + firstbit; + /* + * Calculate the difference between the value there + * and what we're looking for. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different. Mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i = bit - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } + i = bit - firstbit + 1; + /* + * Go on to previous block if that's where the previous word is + * and we need the previous word. + */ + if (--word == -1 && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + word = XFS_BLOCKWMASK(mp); + b = &bufp[word]; + } else { + /* + * Go on to the previous word in the buffer. + */ + b--; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the previous one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ want)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to previous block if that's where the previous word is + * and we need the previous word. + */ + if (--word == -1 && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + word = XFS_BLOCKWMASK(mp); + b = &bufp[word]; + } else { + /* + * Go on to the previous word in the buffer. + */ + b--; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if (len - i) { + /* + * Calculate first (leftmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + firstbit = XFS_NBWORD - (len - i); + mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } else + i = len; + } + /* + * No match, return that we scanned the whole area. + */ + xfs_trans_brelse(tp, bp); + *rtblock = start - i + 1; + return 0; +} + +/* + * Searching forward from start to limit, find the first block whose + * allocated/free state is different from start's. + */ +int +xfs_rtfind_forw( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to look at */ + xfs_rtblock_t limit, /* last block to look at */ + xfs_rtblock_t *rtblock) /* out: start block found */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t lastbit; /* last useful bit in the word */ + xfs_rtblock_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute and read in starting bitmap block for starting block. + */ + block = XFS_BITTOBLOCK(mp, start); + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Get the first word's index & point to it. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + len = limit - start + 1; + /* + * Compute match value, based on the bit at start: if 1 (free) + * then all-ones, else all-zeroes. + */ + want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + /* + * If the starting position is not word-aligned, deal with the + * partial word. + */ + if (bit) { + /* + * Calculate last (rightmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Calculate the difference between the value there + * and what we're looking for. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different. Mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i = XFS_RTLOBIT(wdiff) - bit; + *rtblock = start + i - 1; + return 0; + } + i = lastbit - bit; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the previous word in the buffer. + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ want)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *rtblock = start + i - 1; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Calculate mask for all the relevant bits in this word. + */ + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *rtblock = start + i - 1; + return 0; + } else + i = len; + } + /* + * No match, return that we scanned the whole area. + */ + xfs_trans_brelse(tp, bp); + *rtblock = start + i - 1; + return 0; +} + +/* + * Read and modify the summary information for a given extent size, + * bitmap block combination. + * Keeps track of a current summary block, so we don't keep reading + * it from the buffer cache. + */ +int +xfs_rtmodify_summary( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + int delta, /* change to make to summary info */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_buf_t *bp; /* buffer for the summary block */ + int error; /* error value */ + xfs_fsblock_t sb; /* summary fsblock */ + int so; /* index into the summary file */ + xfs_suminfo_t *sp; /* pointer to returned data */ + + /* + * Compute entry number in the summary file. + */ + so = XFS_SUMOFFS(mp, log, bbno); + /* + * Compute the block number in the summary file. + */ + sb = XFS_SUMOFFSTOBLOCK(mp, so); + /* + * If we have an old buffer, and the block number matches, use that. + */ + if (rbpp && *rbpp && *rsb == sb) + bp = *rbpp; + /* + * Otherwise we have to get the buffer. + */ + else { + /* + * If there was an old one, get rid of it first. + */ + if (rbpp && *rbpp) + xfs_trans_brelse(tp, *rbpp); + error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); + if (error) { + return error; + } + /* + * Remember this buffer and block for the next call. + */ + if (rbpp) { + *rbpp = bp; + *rsb = sb; + } + } + /* + * Point to the summary information, modify and log it. + */ + sp = XFS_SUMPTR(mp, bp, so); + *sp += delta; + xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr), + (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1)); + return 0; +} + +/* + * Set the given range of bitmap bits to the given value. + * Do whatever I/O and logging is required. + */ +int +xfs_rtmodify_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to modify */ + xfs_extlen_t len, /* length of extent to modify */ + int val) /* 1 for free, 0 for allocated */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtword_t *first; /* first used word in the buffer */ + int i; /* current bit number rel. to start */ + int lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask o frelevant bits for value */ + int word; /* word number in the buffer */ + + /* + * Compute starting bitmap block number. + */ + block = XFS_BITTOBLOCK(mp, start); + /* + * Read the bitmap block, and point to its data. + */ + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Compute the starting word's address, and starting bit. + */ + word = XFS_BITTOWORD(mp, start); + first = b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + /* + * 0 (allocated) => all zeroes; 1 (free) => all ones. + */ + val = -val; + /* + * If not starting on a word boundary, deal with the first + * (partial) word. + */ + if (bit) { + /* + * Compute first bit not changed and mask of relevant bits. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Set/clear the active bits. + */ + if (val) + *b |= mask; + else + *b &= ~mask; + i = lastbit - bit; + /* + * Go on to the next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * Log the changed part of this block. + * Get the next one. + */ + xfs_trans_log_buf(tp, bp, + (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp)); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + first = b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Set the word value correctly. + */ + *b = val; + i += XFS_NBWORD; + /* + * Go on to the next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * Log the changed part of this block. + * Get the next one. + */ + xfs_trans_log_buf(tp, bp, + (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp)); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + first = b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Compute a mask of relevant bits. + */ + bit = 0; + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Set/clear the active bits. + */ + if (val) + *b |= mask; + else + *b &= ~mask; + b++; + } + /* + * Log any remaining changed bytes. + */ + if (b > first) + xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp - 1)); + return 0; +} + +/* + * Mark an extent specified by start and len freed. + * Updates all the summary information as well as the bitmap. + */ +int +xfs_rtfree_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to free */ + xfs_extlen_t len, /* length to free */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_rtblock_t end; /* end of the freed extent */ + int error; /* error value */ + xfs_rtblock_t postblock; /* first block freed > end */ + xfs_rtblock_t preblock; /* first block freed < start */ + + end = start + len - 1; + /* + * Modify the bitmap to mark this extent freed. + */ + error = xfs_rtmodify_range(mp, tp, start, len, 1); + if (error) { + return error; + } + /* + * Assume we're freeing out of the middle of an allocated extent. + * We need to find the beginning and end of the extent so we can + * properly update the summary. + */ + error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + if (error) { + return error; + } + /* + * Find the next allocated block (end of allocated extent). + */ + error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, + &postblock); + if (error) + return error; + /* + * If there are blocks not being freed at the front of the + * old extent, add summary data for them to be allocated. + */ + if (preblock < start) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(start - preblock), + XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * If there are blocks not being freed at the end of the + * old extent, add summary data for them to be allocated. + */ + if (postblock > end) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock - end), + XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * Increment the summary information corresponding to the entire + * (new) free extent. + */ + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); + return error; +} + +/* + * Check that the given range is either all allocated (val = 0) or + * all free (val = 1). + */ +int +xfs_rtcheck_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block number of extent */ + xfs_extlen_t len, /* length of extent */ + int val, /* 1 for free, 0 for allocated */ + xfs_rtblock_t *new, /* out: first block not matching */ + int *stat) /* out: 1 for matches, 0 for not */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute starting bitmap block number + */ + block = XFS_BITTOBLOCK(mp, start); + /* + * Read the bitmap block. + */ + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Compute the starting word's address, and starting bit. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + /* + * 0 (allocated) => all zero's; 1 (free) => all one's. + */ + val = -val; + /* + * If not starting on a word boundary, deal with the first + * (partial) word. + */ + if (bit) { + /* + * Compute first bit not examined. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + /* + * Mask of relevant bits. + */ + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ val) & mask)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i = XFS_RTLOBIT(wdiff) - bit; + *new = start + i; + *stat = 0; + return 0; + } + i = lastbit - bit; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ val)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *new = start + i; + *stat = 0; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Mask of relevant bits. + */ + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ val) & mask)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *new = start + i; + *stat = 0; + return 0; + } else + i = len; + } + /* + * Successful, return. + */ + xfs_trans_brelse(tp, bp); + *new = start + i; + *stat = 1; + return 0; +} + +#ifdef DEBUG +/* + * Check that the given extent (block range) is allocated already. + */ +STATIC int /* error */ +xfs_rtcheck_alloc_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number of extent */ + xfs_extlen_t len) /* length of extent */ +{ + xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */ + int stat; + int error; + + error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat); + if (error) + return error; + ASSERT(stat); + return 0; +} +#else +#define xfs_rtcheck_alloc_range(m,t,b,l) (0) +#endif +/* + * Free an extent in the realtime subvolume. Length is expressed in + * realtime extents, as is the block number. + */ +int /* error */ +xfs_rtfree_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to free */ + xfs_extlen_t len) /* length of extent freed */ +{ + int error; /* error value */ + xfs_mount_t *mp; /* file system mount structure */ + xfs_fsblock_t sb; /* summary file block number */ + xfs_buf_t *sumbp = NULL; /* summary file block buffer */ + + mp = tp->t_mountp; + + ASSERT(mp->m_rbmip->i_itemp != NULL); + ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + + error = xfs_rtcheck_alloc_range(mp, tp, bno, len); + if (error) + return error; + + /* + * Free the range of realtime blocks. + */ + error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb); + if (error) { + return error; + } + /* + * Mark more blocks free in the superblock. + */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len); + /* + * If we've now freed all the blocks, reset the file sequence + * number to 0. + */ + if (tp->t_frextents_delta + mp->m_sb.sb_frextents == + mp->m_sb.sb_rextents) { + if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) + mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; + *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0; + xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + } + return 0; +} + diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c new file mode 100644 index 00000000000..7703fa6770f --- /dev/null +++ b/fs/xfs/xfs_sb.c @@ -0,0 +1,836 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_dinode.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" + +/* + * Physical superblock buffer manipulations. Shared with libxfs in userspace. + */ + +static const struct { + short offset; + short type; /* 0 = integer + * 1 = binary / string (no translation) + */ +} xfs_sb_info[] = { + { offsetof(xfs_sb_t, sb_magicnum), 0 }, + { offsetof(xfs_sb_t, sb_blocksize), 0 }, + { offsetof(xfs_sb_t, sb_dblocks), 0 }, + { offsetof(xfs_sb_t, sb_rblocks), 0 }, + { offsetof(xfs_sb_t, sb_rextents), 0 }, + { offsetof(xfs_sb_t, sb_uuid), 1 }, + { offsetof(xfs_sb_t, sb_logstart), 0 }, + { offsetof(xfs_sb_t, sb_rootino), 0 }, + { offsetof(xfs_sb_t, sb_rbmino), 0 }, + { offsetof(xfs_sb_t, sb_rsumino), 0 }, + { offsetof(xfs_sb_t, sb_rextsize), 0 }, + { offsetof(xfs_sb_t, sb_agblocks), 0 }, + { offsetof(xfs_sb_t, sb_agcount), 0 }, + { offsetof(xfs_sb_t, sb_rbmblocks), 0 }, + { offsetof(xfs_sb_t, sb_logblocks), 0 }, + { offsetof(xfs_sb_t, sb_versionnum), 0 }, + { offsetof(xfs_sb_t, sb_sectsize), 0 }, + { offsetof(xfs_sb_t, sb_inodesize), 0 }, + { offsetof(xfs_sb_t, sb_inopblock), 0 }, + { offsetof(xfs_sb_t, sb_fname[0]), 1 }, + { offsetof(xfs_sb_t, sb_blocklog), 0 }, + { offsetof(xfs_sb_t, sb_sectlog), 0 }, + { offsetof(xfs_sb_t, sb_inodelog), 0 }, + { offsetof(xfs_sb_t, sb_inopblog), 0 }, + { offsetof(xfs_sb_t, sb_agblklog), 0 }, + { offsetof(xfs_sb_t, sb_rextslog), 0 }, + { offsetof(xfs_sb_t, sb_inprogress), 0 }, + { offsetof(xfs_sb_t, sb_imax_pct), 0 }, + { offsetof(xfs_sb_t, sb_icount), 0 }, + { offsetof(xfs_sb_t, sb_ifree), 0 }, + { offsetof(xfs_sb_t, sb_fdblocks), 0 }, + { offsetof(xfs_sb_t, sb_frextents), 0 }, + { offsetof(xfs_sb_t, sb_uquotino), 0 }, + { offsetof(xfs_sb_t, sb_gquotino), 0 }, + { offsetof(xfs_sb_t, sb_qflags), 0 }, + { offsetof(xfs_sb_t, sb_flags), 0 }, + { offsetof(xfs_sb_t, sb_shared_vn), 0 }, + { offsetof(xfs_sb_t, sb_inoalignmt), 0 }, + { offsetof(xfs_sb_t, sb_unit), 0 }, + { offsetof(xfs_sb_t, sb_width), 0 }, + { offsetof(xfs_sb_t, sb_dirblklog), 0 }, + { offsetof(xfs_sb_t, sb_logsectlog), 0 }, + { offsetof(xfs_sb_t, sb_logsectsize), 0 }, + { offsetof(xfs_sb_t, sb_logsunit), 0 }, + { offsetof(xfs_sb_t, sb_features2), 0 }, + { offsetof(xfs_sb_t, sb_bad_features2), 0 }, + { offsetof(xfs_sb_t, sb_features_compat), 0 }, + { offsetof(xfs_sb_t, sb_features_ro_compat), 0 }, + { offsetof(xfs_sb_t, sb_features_incompat), 0 }, + { offsetof(xfs_sb_t, sb_features_log_incompat), 0 }, + { offsetof(xfs_sb_t, sb_crc), 0 }, + { offsetof(xfs_sb_t, sb_pad), 0 }, + { offsetof(xfs_sb_t, sb_pquotino), 0 }, + { offsetof(xfs_sb_t, sb_lsn), 0 }, + { sizeof(xfs_sb_t), 0 } +}; + +/* + * Reference counting access wrappers to the perag structures. + * Because we never free per-ag structures, the only thing we + * have to protect against changes is the tree structure itself. + */ +struct xfs_perag * +xfs_perag_get( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ref = 0; + + rcu_read_lock(); + pag = radix_tree_lookup(&mp->m_perag_tree, agno); + if (pag) { + ASSERT(atomic_read(&pag->pag_ref) >= 0); + ref = atomic_inc_return(&pag->pag_ref); + } + rcu_read_unlock(); + trace_xfs_perag_get(mp, agno, ref, _RET_IP_); + return pag; +} + +/* + * search from @first to find the next perag with the given tag set. + */ +struct xfs_perag * +xfs_perag_get_tag( + struct xfs_mount *mp, + xfs_agnumber_t first, + int tag) +{ + struct xfs_perag *pag; + int found; + int ref; + + rcu_read_lock(); + found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, + (void **)&pag, first, 1, tag); + if (found <= 0) { + rcu_read_unlock(); + return NULL; + } + ref = atomic_inc_return(&pag->pag_ref); + rcu_read_unlock(); + trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); + return pag; +} + +void +xfs_perag_put( + struct xfs_perag *pag) +{ + int ref; + + ASSERT(atomic_read(&pag->pag_ref) > 0); + ref = atomic_dec_return(&pag->pag_ref); + trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); +} + +/* + * Check the validity of the SB found. + */ +STATIC int +xfs_mount_validate_sb( + xfs_mount_t *mp, + xfs_sb_t *sbp, + bool check_inprogress, + bool check_version) +{ + + /* + * If the log device and data device have the + * same device number, the log is internal. + * Consequently, the sb_logstart should be non-zero. If + * we have a zero sb_logstart in this case, we may be trying to mount + * a volume filesystem in a non-volume manner. + */ + if (sbp->sb_magicnum != XFS_SB_MAGIC) { + xfs_warn(mp, "bad magic number"); + return XFS_ERROR(EWRONGFS); + } + + + if (!xfs_sb_good_version(sbp)) { + xfs_warn(mp, "bad version"); + return XFS_ERROR(EWRONGFS); + } + + /* + * Version 5 superblock feature mask validation. Reject combinations the + * kernel cannot support up front before checking anything else. For + * write validation, we don't need to check feature masks. + */ + if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { + if (xfs_sb_has_compat_feature(sbp, + XFS_SB_FEAT_COMPAT_UNKNOWN)) { + xfs_warn(mp, +"Superblock has unknown compatible features (0x%x) enabled.\n" +"Using a more recent kernel is recommended.", + (sbp->sb_features_compat & + XFS_SB_FEAT_COMPAT_UNKNOWN)); + } + + if (xfs_sb_has_ro_compat_feature(sbp, + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + xfs_alert(mp, +"Superblock has unknown read-only compatible features (0x%x) enabled.", + (sbp->sb_features_ro_compat & + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_warn(mp, +"Attempted to mount read-only compatible filesystem read-write.\n" +"Filesystem can only be safely mounted read only."); + return XFS_ERROR(EINVAL); + } + } + if (xfs_sb_has_incompat_feature(sbp, + XFS_SB_FEAT_INCOMPAT_UNKNOWN)) { + xfs_warn(mp, +"Superblock has unknown incompatible features (0x%x) enabled.\n" +"Filesystem can not be safely mounted by this kernel.", + (sbp->sb_features_incompat & + XFS_SB_FEAT_INCOMPAT_UNKNOWN)); + return XFS_ERROR(EINVAL); + } + } + + if (xfs_sb_version_has_pquotino(sbp)) { + if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) { + xfs_notice(mp, + "Version 5 of Super block has XFS_OQUOTA bits."); + return XFS_ERROR(EFSCORRUPTED); + } + } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | + XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { + xfs_notice(mp, +"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits."); + return XFS_ERROR(EFSCORRUPTED); + } + + if (unlikely( + sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { + xfs_warn(mp, + "filesystem is marked as having an external log; " + "specify logdev on the mount command line."); + return XFS_ERROR(EINVAL); + } + + if (unlikely( + sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { + xfs_warn(mp, + "filesystem is marked as having an internal log; " + "do not specify logdev on the mount command line."); + return XFS_ERROR(EINVAL); + } + + /* + * More sanity checking. Most of these were stolen directly from + * xfs_repair. + */ + if (unlikely( + sbp->sb_agcount <= 0 || + sbp->sb_sectsize < XFS_MIN_SECTORSIZE || + sbp->sb_sectsize > XFS_MAX_SECTORSIZE || + sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG || + sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG || + sbp->sb_sectsize != (1 << sbp->sb_sectlog) || + sbp->sb_blocksize < XFS_MIN_BLOCKSIZE || + sbp->sb_blocksize > XFS_MAX_BLOCKSIZE || + sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || + sbp->sb_blocksize != (1 << sbp->sb_blocklog) || + sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || + sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || + sbp->sb_inodelog < XFS_DINODE_MIN_LOG || + sbp->sb_inodelog > XFS_DINODE_MAX_LOG || + sbp->sb_inodesize != (1 << sbp->sb_inodelog) || + sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) || + (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || + (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || + (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || + (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) || + sbp->sb_dblocks == 0 || + sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || + sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) || + sbp->sb_shared_vn != 0)) { + xfs_notice(mp, "SB sanity check failed"); + return XFS_ERROR(EFSCORRUPTED); + } + + /* + * Until this is fixed only page-sized or smaller data blocks work. + */ + if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { + xfs_warn(mp, + "File system with blocksize %d bytes. " + "Only pagesize (%ld) or less will currently work.", + sbp->sb_blocksize, PAGE_SIZE); + return XFS_ERROR(ENOSYS); + } + + /* + * Currently only very few inode sizes are supported. + */ + switch (sbp->sb_inodesize) { + case 256: + case 512: + case 1024: + case 2048: + break; + default: + xfs_warn(mp, "inode size of %d bytes not supported", + sbp->sb_inodesize); + return XFS_ERROR(ENOSYS); + } + + if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || + xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { + xfs_warn(mp, + "file system too large to be mounted on this system."); + return XFS_ERROR(EFBIG); + } + + if (check_inprogress && sbp->sb_inprogress) { + xfs_warn(mp, "Offline file system operation in progress!"); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +void +xfs_sb_quota_from_disk(struct xfs_sb *sbp) +{ + /* + * older mkfs doesn't initialize quota inodes to NULLFSINO. This + * leads to in-core values having two different values for a quota + * inode to be invalid: 0 and NULLFSINO. Change it to a single value + * NULLFSINO. + * + * Note that this change affect only the in-core values. These + * values are not written back to disk unless any quota information + * is written to the disk. Even in that case, sb_pquotino field is + * not written to disk unless the superblock supports pquotino. + */ + if (sbp->sb_uquotino == 0) + sbp->sb_uquotino = NULLFSINO; + if (sbp->sb_gquotino == 0) + sbp->sb_gquotino = NULLFSINO; + if (sbp->sb_pquotino == 0) + sbp->sb_pquotino = NULLFSINO; + + /* + * We need to do these manipilations only if we are working + * with an older version of on-disk superblock. + */ + if (xfs_sb_version_has_pquotino(sbp)) + return; + + if (sbp->sb_qflags & XFS_OQUOTA_ENFD) + sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? + XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD; + if (sbp->sb_qflags & XFS_OQUOTA_CHKD) + sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? + XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD; + sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD); + + if (sbp->sb_qflags & XFS_PQUOTA_ACCT) { + /* + * In older version of superblock, on-disk superblock only + * has sb_gquotino, and in-core superblock has both sb_gquotino + * and sb_pquotino. But, only one of them is supported at any + * point of time. So, if PQUOTA is set in disk superblock, + * copy over sb_gquotino to sb_pquotino. + */ + sbp->sb_pquotino = sbp->sb_gquotino; + sbp->sb_gquotino = NULLFSINO; + } +} + +void +xfs_sb_from_disk( + struct xfs_sb *to, + xfs_dsb_t *from) +{ + to->sb_magicnum = be32_to_cpu(from->sb_magicnum); + to->sb_blocksize = be32_to_cpu(from->sb_blocksize); + to->sb_dblocks = be64_to_cpu(from->sb_dblocks); + to->sb_rblocks = be64_to_cpu(from->sb_rblocks); + to->sb_rextents = be64_to_cpu(from->sb_rextents); + memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); + to->sb_logstart = be64_to_cpu(from->sb_logstart); + to->sb_rootino = be64_to_cpu(from->sb_rootino); + to->sb_rbmino = be64_to_cpu(from->sb_rbmino); + to->sb_rsumino = be64_to_cpu(from->sb_rsumino); + to->sb_rextsize = be32_to_cpu(from->sb_rextsize); + to->sb_agblocks = be32_to_cpu(from->sb_agblocks); + to->sb_agcount = be32_to_cpu(from->sb_agcount); + to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks); + to->sb_logblocks = be32_to_cpu(from->sb_logblocks); + to->sb_versionnum = be16_to_cpu(from->sb_versionnum); + to->sb_sectsize = be16_to_cpu(from->sb_sectsize); + to->sb_inodesize = be16_to_cpu(from->sb_inodesize); + to->sb_inopblock = be16_to_cpu(from->sb_inopblock); + memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); + to->sb_blocklog = from->sb_blocklog; + to->sb_sectlog = from->sb_sectlog; + to->sb_inodelog = from->sb_inodelog; + to->sb_inopblog = from->sb_inopblog; + to->sb_agblklog = from->sb_agblklog; + to->sb_rextslog = from->sb_rextslog; + to->sb_inprogress = from->sb_inprogress; + to->sb_imax_pct = from->sb_imax_pct; + to->sb_icount = be64_to_cpu(from->sb_icount); + to->sb_ifree = be64_to_cpu(from->sb_ifree); + to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks); + to->sb_frextents = be64_to_cpu(from->sb_frextents); + to->sb_uquotino = be64_to_cpu(from->sb_uquotino); + to->sb_gquotino = be64_to_cpu(from->sb_gquotino); + to->sb_qflags = be16_to_cpu(from->sb_qflags); + to->sb_flags = from->sb_flags; + to->sb_shared_vn = from->sb_shared_vn; + to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt); + to->sb_unit = be32_to_cpu(from->sb_unit); + to->sb_width = be32_to_cpu(from->sb_width); + to->sb_dirblklog = from->sb_dirblklog; + to->sb_logsectlog = from->sb_logsectlog; + to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize); + to->sb_logsunit = be32_to_cpu(from->sb_logsunit); + to->sb_features2 = be32_to_cpu(from->sb_features2); + to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2); + to->sb_features_compat = be32_to_cpu(from->sb_features_compat); + to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat); + to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat); + to->sb_features_log_incompat = + be32_to_cpu(from->sb_features_log_incompat); + to->sb_pad = 0; + to->sb_pquotino = be64_to_cpu(from->sb_pquotino); + to->sb_lsn = be64_to_cpu(from->sb_lsn); +} + +static inline void +xfs_sb_quota_to_disk( + xfs_dsb_t *to, + xfs_sb_t *from, + __int64_t *fields) +{ + __uint16_t qflags = from->sb_qflags; + + /* + * We need to do these manipilations only if we are working + * with an older version of on-disk superblock. + */ + if (xfs_sb_version_has_pquotino(from)) + return; + + if (*fields & XFS_SB_QFLAGS) { + /* + * The in-core version of sb_qflags do not have + * XFS_OQUOTA_* flags, whereas the on-disk version + * does. So, convert incore XFS_{PG}QUOTA_* flags + * to on-disk XFS_OQUOTA_* flags. + */ + qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD | + XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD); + + if (from->sb_qflags & + (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD)) + qflags |= XFS_OQUOTA_ENFD; + if (from->sb_qflags & + (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) + qflags |= XFS_OQUOTA_CHKD; + to->sb_qflags = cpu_to_be16(qflags); + *fields &= ~XFS_SB_QFLAGS; + } + + /* + * GQUOTINO and PQUOTINO cannot be used together in versions of + * superblock that do not have pquotino. from->sb_flags tells us which + * quota is active and should be copied to disk. If neither are active, + * make sure we write NULLFSINO to the sb_gquotino field as a quota + * inode value of "0" is invalid when the XFS_SB_VERSION_QUOTA feature + * bit is set. + * + * Note that we don't need to handle the sb_uquotino or sb_pquotino here + * as they do not require any translation. Hence the main sb field loop + * will write them appropriately from the in-core superblock. + */ + if ((*fields & XFS_SB_GQUOTINO) && + (from->sb_qflags & XFS_GQUOTA_ACCT)) + to->sb_gquotino = cpu_to_be64(from->sb_gquotino); + else if ((*fields & XFS_SB_PQUOTINO) && + (from->sb_qflags & XFS_PQUOTA_ACCT)) + to->sb_gquotino = cpu_to_be64(from->sb_pquotino); + else { + /* + * We can't rely on just the fields being logged to tell us + * that it is safe to write NULLFSINO - we should only do that + * if quotas are not actually enabled. Hence only write + * NULLFSINO if both in-core quota inodes are NULL. + */ + if (from->sb_gquotino == NULLFSINO && + from->sb_pquotino == NULLFSINO) + to->sb_gquotino = cpu_to_be64(NULLFSINO); + } + + *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO); +} + +/* + * Copy in core superblock to ondisk one. + * + * The fields argument is mask of superblock fields to copy. + */ +void +xfs_sb_to_disk( + xfs_dsb_t *to, + xfs_sb_t *from, + __int64_t fields) +{ + xfs_caddr_t to_ptr = (xfs_caddr_t)to; + xfs_caddr_t from_ptr = (xfs_caddr_t)from; + xfs_sb_field_t f; + int first; + int size; + + ASSERT(fields); + if (!fields) + return; + + xfs_sb_quota_to_disk(to, from, &fields); + while (fields) { + f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); + first = xfs_sb_info[f].offset; + size = xfs_sb_info[f + 1].offset - first; + + ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1); + + if (size == 1 || xfs_sb_info[f].type == 1) { + memcpy(to_ptr + first, from_ptr + first, size); + } else { + switch (size) { + case 2: + *(__be16 *)(to_ptr + first) = + cpu_to_be16(*(__u16 *)(from_ptr + first)); + break; + case 4: + *(__be32 *)(to_ptr + first) = + cpu_to_be32(*(__u32 *)(from_ptr + first)); + break; + case 8: + *(__be64 *)(to_ptr + first) = + cpu_to_be64(*(__u64 *)(from_ptr + first)); + break; + default: + ASSERT(0); + } + } + + fields &= ~(1LL << f); + } +} + +static int +xfs_sb_verify( + struct xfs_buf *bp, + bool check_version) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_sb sb; + + xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); + + /* + * Only check the in progress field for the primary superblock as + * mkfs.xfs doesn't clear it from secondary superblocks. + */ + return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR, + check_version); +} + +/* + * If the superblock has the CRC feature bit set or the CRC field is non-null, + * check that the CRC is valid. We check the CRC field is non-null because a + * single bit error could clear the feature bit and unused parts of the + * superblock are supposed to be zero. Hence a non-null crc field indicates that + * we've potentially lost a feature bit and we should check it anyway. + * + * However, past bugs (i.e. in growfs) left non-zeroed regions beyond the + * last field in V4 secondary superblocks. So for secondary superblocks, + * we are more forgiving, and ignore CRC failures if the primary doesn't + * indicate that the fs version is V5. + */ +static void +xfs_sb_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + int error; + + /* + * open code the version check to avoid needing to convert the entire + * superblock from disk order just to check the version number + */ + if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) && + (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) == + XFS_SB_VERSION_5) || + dsb->sb_crc != 0)) { + + if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) { + /* Only fail bad secondaries on a known V5 filesystem */ + if (bp->b_bn == XFS_SB_DADDR || + xfs_sb_version_hascrc(&mp->m_sb)) { + error = EFSBADCRC; + goto out_error; + } + } + } + error = xfs_sb_verify(bp, true); + +out_error: + if (error) { + xfs_buf_ioerror(bp, error); + if (error == EFSCORRUPTED || error == EFSBADCRC) + xfs_verifier_error(bp); + } +} + +/* + * We may be probed for a filesystem match, so we may not want to emit + * messages when the superblock buffer is not actually an XFS superblock. + * If we find an XFS superblock, then run a normal, noisy mount because we are + * really going to mount it and want to know about errors. + */ +static void +xfs_sb_quiet_read_verify( + struct xfs_buf *bp) +{ + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + + if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { + /* XFS filesystem, verify noisily! */ + xfs_sb_read_verify(bp); + return; + } + /* quietly fail */ + xfs_buf_ioerror(bp, EWRONGFS); +} + +static void +xfs_sb_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + int error; + + error = xfs_sb_verify(bp, false); + if (error) { + xfs_buf_ioerror(bp, error); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF); +} + +const struct xfs_buf_ops xfs_sb_buf_ops = { + .verify_read = xfs_sb_read_verify, + .verify_write = xfs_sb_write_verify, +}; + +const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { + .verify_read = xfs_sb_quiet_read_verify, + .verify_write = xfs_sb_write_verify, +}; + +/* + * xfs_mount_common + * + * Mount initialization code establishing various mount + * fields from the superblock associated with the given + * mount structure + */ +void +xfs_sb_mount_common( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + mp->m_agfrotor = mp->m_agirotor = 0; + spin_lock_init(&mp->m_agirotor_lock); + mp->m_maxagi = mp->m_sb.sb_agcount; + mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; + mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; + mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; + mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; + mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; + mp->m_blockmask = sbp->sb_blocksize - 1; + mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; + mp->m_blockwmask = mp->m_blockwsize - 1; + + mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; + mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2; + + mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2; + mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2; + + mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; + mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; + + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); + mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, + sbp->sb_inopblock); + mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; +} + +/* + * xfs_initialize_perag_data + * + * Read in each per-ag structure so we can count up the number of + * allocated inodes, free inodes and used filesystem blocks as this + * information is no longer persistent in the superblock. Once we have + * this information, write it into the in-core superblock structure. + */ +int +xfs_initialize_perag_data( + struct xfs_mount *mp, + xfs_agnumber_t agcount) +{ + xfs_agnumber_t index; + xfs_perag_t *pag; + xfs_sb_t *sbp = &mp->m_sb; + uint64_t ifree = 0; + uint64_t ialloc = 0; + uint64_t bfree = 0; + uint64_t bfreelst = 0; + uint64_t btree = 0; + int error; + + for (index = 0; index < agcount; index++) { + /* + * read the agf, then the agi. This gets us + * all the information we need and populates the + * per-ag structures for us. + */ + error = xfs_alloc_pagf_init(mp, NULL, index, 0); + if (error) + return error; + + error = xfs_ialloc_pagi_init(mp, NULL, index); + if (error) + return error; + pag = xfs_perag_get(mp, index); + ifree += pag->pagi_freecount; + ialloc += pag->pagi_count; + bfree += pag->pagf_freeblks; + bfreelst += pag->pagf_flcount; + btree += pag->pagf_btreeblks; + xfs_perag_put(pag); + } + /* + * Overwrite incore superblock counters with just-read data + */ + spin_lock(&mp->m_sb_lock); + sbp->sb_ifree = ifree; + sbp->sb_icount = ialloc; + sbp->sb_fdblocks = bfree + bfreelst + btree; + spin_unlock(&mp->m_sb_lock); + + /* Fixup the per-cpu counters as well. */ + xfs_icsb_reinit_counters(mp); + + return 0; +} + +/* + * xfs_mod_sb() can be used to copy arbitrary changes to the + * in-core superblock into the superblock buffer to be logged. + * It does not provide the higher level of locking that is + * needed to protect the in-core superblock from concurrent + * access. + */ +void +xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) +{ + xfs_buf_t *bp; + int first; + int last; + xfs_mount_t *mp; + xfs_sb_field_t f; + + ASSERT(fields); + if (!fields) + return; + mp = tp->t_mountp; + bp = xfs_trans_getsb(tp, mp, 0); + first = sizeof(xfs_sb_t); + last = 0; + + /* translate/copy */ + + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields); + + /* find modified range */ + f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields); + ASSERT((1LL << f) & XFS_SB_MOD_BITS); + last = xfs_sb_info[f + 1].offset - 1; + + f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); + ASSERT((1LL << f) & XFS_SB_MOD_BITS); + first = xfs_sb_info[f].offset; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); + xfs_trans_log_buf(tp, bp, first, last); +} diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index a05b45175fb..c43c2d609a2 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -26,16 +26,16 @@ struct xfs_buf; struct xfs_mount; +struct xfs_trans; #define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */ #define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */ #define XFS_SB_VERSION_2 2 /* 6.2 - attributes */ #define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */ #define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */ +#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */ #define XFS_SB_VERSION_NUMBITS 0x000f #define XFS_SB_VERSION_ALLFBITS 0xfff0 -#define XFS_SB_VERSION_SASHFBITS 0xf000 -#define XFS_SB_VERSION_REALFBITS 0x0ff0 #define XFS_SB_VERSION_ATTRBIT 0x0010 #define XFS_SB_VERSION_NLINKBIT 0x0020 #define XFS_SB_VERSION_QUOTABIT 0x0040 @@ -48,24 +48,15 @@ struct xfs_mount; #define XFS_SB_VERSION_DIRV2BIT 0x2000 #define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */ #define XFS_SB_VERSION_MOREBITSBIT 0x8000 -#define XFS_SB_VERSION_OKSASHFBITS \ - (XFS_SB_VERSION_EXTFLGBIT | \ - XFS_SB_VERSION_DIRV2BIT | \ - XFS_SB_VERSION_BORGBIT) -#define XFS_SB_VERSION_OKREALFBITS \ - (XFS_SB_VERSION_ATTRBIT | \ - XFS_SB_VERSION_NLINKBIT | \ - XFS_SB_VERSION_QUOTABIT | \ - XFS_SB_VERSION_ALIGNBIT | \ - XFS_SB_VERSION_DALIGNBIT | \ - XFS_SB_VERSION_SHAREDBIT | \ - XFS_SB_VERSION_LOGV2BIT | \ - XFS_SB_VERSION_SECTORBIT | \ - XFS_SB_VERSION_MOREBITSBIT) -#define XFS_SB_VERSION_OKREALBITS \ - (XFS_SB_VERSION_NUMBITS | \ - XFS_SB_VERSION_OKREALFBITS | \ - XFS_SB_VERSION_OKSASHFBITS) + +/* + * Supported feature bit list is just all bits in the versionnum field because + * we've used them all up and understand them all. Except, of course, for the + * shared superblock bit, which nobody knows what it does and so is unsupported. + */ +#define XFS_SB_VERSION_OKBITS \ + ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \ + ~XFS_SB_VERSION_SHAREDBIT) /* * There are two words to hold XFS "feature" bits: the original @@ -74,7 +65,6 @@ struct xfs_mount; * * These defines represent bits in sb_features2. */ -#define XFS_SB_VERSION2_REALFBITS 0x00ffffff /* Mask: features */ #define XFS_SB_VERSION2_RESERVED1BIT 0x00000001 #define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ #define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 @@ -82,16 +72,13 @@ struct xfs_mount; #define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ #define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ #define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ +#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */ -#define XFS_SB_VERSION2_OKREALFBITS \ +#define XFS_SB_VERSION2_OKBITS \ (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ XFS_SB_VERSION2_ATTR2BIT | \ - XFS_SB_VERSION2_PROJID32BIT) -#define XFS_SB_VERSION2_OKSASHFBITS \ - (0) -#define XFS_SB_VERSION2_OKREALBITS \ - (XFS_SB_VERSION2_OKREALFBITS | \ - XFS_SB_VERSION2_OKSASHFBITS ) + XFS_SB_VERSION2_PROJID32BIT | \ + XFS_SB_VERSION2_FTYPE) /* * Superblock - in core version. Must match the ondisk version below. @@ -161,9 +148,25 @@ typedef struct xfs_sb { */ __uint32_t sb_bad_features2; + /* version 5 superblock fields start here */ + + /* feature masks */ + __uint32_t sb_features_compat; + __uint32_t sb_features_ro_compat; + __uint32_t sb_features_incompat; + __uint32_t sb_features_log_incompat; + + __uint32_t sb_crc; /* superblock crc */ + __uint32_t sb_pad; + + xfs_ino_t sb_pquotino; /* project quota inode */ + xfs_lsn_t sb_lsn; /* last write sequence */ + /* must be padded to 64 bit alignment */ } xfs_sb_t; +#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc) + /* * Superblock - on disk version. Must match the in core version above. * Must be padded to 64 bit alignment. @@ -229,7 +232,21 @@ typedef struct xfs_dsb { * for features2 bits. Easiest just to mark it bad and not use * it for anything else. */ - __be32 sb_bad_features2; + __be32 sb_bad_features2; + + /* version 5 superblock fields start here */ + + /* feature masks */ + __be32 sb_features_compat; + __be32 sb_features_ro_compat; + __be32 sb_features_incompat; + __be32 sb_features_log_incompat; + + __le32 sb_crc; /* superblock crc */ + __be32 sb_pad; + + __be64 sb_pquotino; /* project quota inode */ + __be64 sb_lsn; /* last write sequence */ /* must be padded to 64 bit alignment */ } xfs_dsb_t; @@ -250,7 +267,10 @@ typedef enum { XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN, XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG, XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT, - XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, + XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT, + XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT, + XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD, + XFS_SBS_PQUOTINO, XFS_SBS_LSN, XFS_SBS_FIELDCOUNT } xfs_sb_field_t; @@ -276,6 +296,12 @@ typedef enum { #define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) #define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2) #define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2) +#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT) +#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT) +#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT) +#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT) +#define XFS_SB_CRC XFS_SB_MVAL(CRC) +#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO) #define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) #define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) #define XFS_SB_MOD_BITS \ @@ -283,7 +309,9 @@ typedef enum { XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \ - XFS_SB_BAD_FEATURES2) + XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \ + XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \ + XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO) /* @@ -300,220 +328,248 @@ typedef enum { #define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) -static inline int xfs_sb_good_version(xfs_sb_t *sbp) +/* + * The first XFS version we support is a v4 superblock with V2 directories. + */ +static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp) { - /* We always support version 1-3 */ - if (sbp->sb_versionnum >= XFS_SB_VERSION_1 && - sbp->sb_versionnum <= XFS_SB_VERSION_3) - return 1; - - /* We support version 4 if all feature bits are supported */ - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) { - if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || - ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && - (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) - return 0; - -#ifdef __KERNEL__ - if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN) - return 0; -#else - if ((sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) && - sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN) - return 0; -#endif - - return 1; - } - - return 0; + if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) + return false; + + /* check for unknown features in the fs */ + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) + return false; + + return true; +} + +static inline bool xfs_sb_good_version(struct xfs_sb *sbp) +{ + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) + return true; + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + return xfs_sb_good_v4_features(sbp); + return false; } /* * Detect a mismatched features2 field. Older kernels read/wrote * this into the wrong slot, so to be safe we keep them in sync. */ -static inline int xfs_sb_has_mismatched_features2(xfs_sb_t *sbp) +static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp) { - return (sbp->sb_bad_features2 != sbp->sb_features2); + return sbp->sb_bad_features2 != sbp->sb_features2; } -static inline unsigned xfs_sb_version_tonew(unsigned v) +static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp) { - if (v == XFS_SB_VERSION_1) - return XFS_SB_VERSION_4; - - if (v == XFS_SB_VERSION_2) - return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT; - - return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT | - XFS_SB_VERSION_NLINKBIT; + return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT); } -static inline unsigned xfs_sb_version_toold(unsigned v) +static inline void xfs_sb_version_addattr(struct xfs_sb *sbp) { - if (v & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) - return 0; - if (v & XFS_SB_VERSION_NLINKBIT) - return XFS_SB_VERSION_3; - if (v & XFS_SB_VERSION_ATTRBIT) - return XFS_SB_VERSION_2; - return XFS_SB_VERSION_1; + sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; } -static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp) { - return sbp->sb_versionnum == XFS_SB_VERSION_2 || - sbp->sb_versionnum == XFS_SB_VERSION_3 || - (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT)); + return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); } -static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) +static inline void xfs_sb_version_addquota(struct xfs_sb *sbp) { - if (sbp->sb_versionnum == XFS_SB_VERSION_1) - sbp->sb_versionnum = XFS_SB_VERSION_2; - else if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) - sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; - else - sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT; + sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; } -static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp) { - return sbp->sb_versionnum == XFS_SB_VERSION_3 || - (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT)); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)); } -static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp) { - if (sbp->sb_versionnum <= XFS_SB_VERSION_2) - sbp->sb_versionnum = XFS_SB_VERSION_3; - else - sbp->sb_versionnum |= XFS_SB_VERSION_NLINKBIT; + return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); } -static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp) +static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); } -static inline void xfs_sb_version_addquota(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp) { - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) - sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; - else - sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) | - XFS_SB_VERSION_QUOTABIT; + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); } -static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT); + return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); } -static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); + return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); } -static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); } -static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp) +/* + * sb_features2 bit version macros. + */ +static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); } -static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)); } -static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp) +static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); + sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; + sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; + sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT; } -static inline int xfs_sb_version_hassector(xfs_sb_t *sbp) +static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); + sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; + sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; + if (!sbp->sb_features2) + sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; } -static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)); } -static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) +static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); + sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; + sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT; + sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT; } /* - * sb_features2 bit version macros. + * Extended v5 superblock feature masks. These are to be used for new v5 + * superblock features only. + * + * Compat features are new features that old kernels will not notice or affect + * and so can mount read-write without issues. * - * For example, for a bit defined as XFS_SB_VERSION2_FUNBIT, has a macro: + * RO-Compat (read only) are features that old kernels can read but will break + * if they write. Hence only read-only mounts of such filesystems are allowed on + * kernels that don't support the feature bit. * - * SB_VERSION_HASFUNBIT(xfs_sb_t *sbp) - * ((xfs_sb_version_hasmorebits(sbp) && - * ((sbp)->sb_features2 & XFS_SB_VERSION2_FUNBIT) + * InCompat features are features which old kernels will not understand and so + * must not mount. + * + * Log-InCompat features are for changes to log formats or new transactions that + * can't be replayed on older kernels. The fields are set when the filesystem is + * mounted, and a clean unmount clears the fields. */ +#define XFS_SB_FEAT_COMPAT_ALL 0 +#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL +static inline bool +xfs_sb_has_compat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_compat & feature) != 0; +} -static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp) +#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ +#define XFS_SB_FEAT_RO_COMPAT_ALL \ + (XFS_SB_FEAT_RO_COMPAT_FINOBT) +#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL +static inline bool +xfs_sb_has_ro_compat_feature( + struct xfs_sb *sbp, + __uint32_t feature) { - return xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT); + return (sbp->sb_features_ro_compat & feature) != 0; } -static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp) +#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ +#define XFS_SB_FEAT_INCOMPAT_ALL \ + (XFS_SB_FEAT_INCOMPAT_FTYPE) + +#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL +static inline bool +xfs_sb_has_incompat_feature( + struct xfs_sb *sbp, + __uint32_t feature) { - return xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT); + return (sbp->sb_features_incompat & feature) != 0; } -static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) +#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL +static inline bool +xfs_sb_has_incompat_log_feature( + struct xfs_sb *sbp, + __uint32_t feature) { - sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; - sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; + return (sbp->sb_features_log_incompat & feature) != 0; } -static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) +/* + * V5 superblock specific feature checks + */ +static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp) { - sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; - if (!sbp->sb_features2) - sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + +static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } -static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) +static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp) { - return xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); } -static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp) +static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) { - return (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT)); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); } /* * end of superblock version macros */ +static inline bool +xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) +{ + return (ino == sbp->sb_uquotino || + ino == sbp->sb_gquotino || + ino == sbp->sb_pquotino); +} + #define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ #define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) #define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) @@ -546,4 +602,20 @@ static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp) #define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) #define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask) +/* + * perag get/put wrappers for ref counting + */ +extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t); +extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t, + int tag); +extern void xfs_perag_put(struct xfs_perag *pag); +extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); + +extern void xfs_sb_calc_crc(struct xfs_buf *); +extern void xfs_mod_sb(struct xfs_trans *, __int64_t); +extern void xfs_sb_mount_common(struct xfs_mount *, struct xfs_sb *); +extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); +extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); +extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); + #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/xfs_shared.h new file mode 100644 index 00000000000..82404da2ca6 --- /dev/null +++ b/fs/xfs/xfs_shared.h @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SHARED_H__ +#define __XFS_SHARED_H__ + +/* + * Definitions shared between kernel and userspace that don't fit into any other + * header file that is shared with userspace. + */ +struct xfs_ifork; +struct xfs_buf; +struct xfs_buf_ops; +struct xfs_mount; +struct xfs_trans; +struct xfs_inode; + +/* + * Buffer verifier operations are widely used, including userspace tools + */ +extern const struct xfs_buf_ops xfs_agf_buf_ops; +extern const struct xfs_buf_ops xfs_agi_buf_ops; +extern const struct xfs_buf_ops xfs_agf_buf_ops; +extern const struct xfs_buf_ops xfs_agfl_buf_ops; +extern const struct xfs_buf_ops xfs_allocbt_buf_ops; +extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; +extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; +extern const struct xfs_buf_ops xfs_bmbt_buf_ops; +extern const struct xfs_buf_ops xfs_da3_node_buf_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ops; +extern const struct xfs_buf_ops xfs_symlink_buf_ops; +extern const struct xfs_buf_ops xfs_agi_buf_ops; +extern const struct xfs_buf_ops xfs_inobt_buf_ops; +extern const struct xfs_buf_ops xfs_inode_buf_ops; +extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ops; +extern const struct xfs_buf_ops xfs_sb_buf_ops; +extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; +extern const struct xfs_buf_ops xfs_symlink_buf_ops; + +/* + * Transaction types. Used to distinguish types of buffers. These never reach + * the log. + */ +#define XFS_TRANS_SETATTR_NOT_SIZE 1 +#define XFS_TRANS_SETATTR_SIZE 2 +#define XFS_TRANS_INACTIVE 3 +#define XFS_TRANS_CREATE 4 +#define XFS_TRANS_CREATE_TRUNC 5 +#define XFS_TRANS_TRUNCATE_FILE 6 +#define XFS_TRANS_REMOVE 7 +#define XFS_TRANS_LINK 8 +#define XFS_TRANS_RENAME 9 +#define XFS_TRANS_MKDIR 10 +#define XFS_TRANS_RMDIR 11 +#define XFS_TRANS_SYMLINK 12 +#define XFS_TRANS_SET_DMATTRS 13 +#define XFS_TRANS_GROWFS 14 +#define XFS_TRANS_STRAT_WRITE 15 +#define XFS_TRANS_DIOSTRAT 16 +/* 17 was XFS_TRANS_WRITE_SYNC */ +#define XFS_TRANS_WRITEID 18 +#define XFS_TRANS_ADDAFORK 19 +#define XFS_TRANS_ATTRINVAL 20 +#define XFS_TRANS_ATRUNCATE 21 +#define XFS_TRANS_ATTR_SET 22 +#define XFS_TRANS_ATTR_RM 23 +#define XFS_TRANS_ATTR_FLAG 24 +#define XFS_TRANS_CLEAR_AGI_BUCKET 25 +#define XFS_TRANS_QM_SBCHANGE 26 +/* + * Dummy entries since we use the transaction type to index into the + * trans_type[] in xlog_recover_print_trans_head() + */ +#define XFS_TRANS_DUMMY1 27 +#define XFS_TRANS_DUMMY2 28 +#define XFS_TRANS_QM_QUOTAOFF 29 +#define XFS_TRANS_QM_DQALLOC 30 +#define XFS_TRANS_QM_SETQLIM 31 +#define XFS_TRANS_QM_DQCLUSTER 32 +#define XFS_TRANS_QM_QINOCREATE 33 +#define XFS_TRANS_QM_QUOTAOFF_END 34 +#define XFS_TRANS_SB_UNIT 35 +#define XFS_TRANS_FSYNC_TS 36 +#define XFS_TRANS_GROWFSRT_ALLOC 37 +#define XFS_TRANS_GROWFSRT_ZERO 38 +#define XFS_TRANS_GROWFSRT_FREE 39 +#define XFS_TRANS_SWAPEXT 40 +#define XFS_TRANS_SB_COUNT 41 +#define XFS_TRANS_CHECKPOINT 42 +#define XFS_TRANS_ICREATE 43 +#define XFS_TRANS_CREATE_TMPFILE 44 +#define XFS_TRANS_TYPE_MAX 44 +/* new transaction types need to be reflected in xfs_logprint(8) */ + +#define XFS_TRANS_TYPES \ + { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \ + { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ + { XFS_TRANS_INACTIVE, "INACTIVE" }, \ + { XFS_TRANS_CREATE, "CREATE" }, \ + { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \ + { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ + { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ + { XFS_TRANS_REMOVE, "REMOVE" }, \ + { XFS_TRANS_LINK, "LINK" }, \ + { XFS_TRANS_RENAME, "RENAME" }, \ + { XFS_TRANS_MKDIR, "MKDIR" }, \ + { XFS_TRANS_RMDIR, "RMDIR" }, \ + { XFS_TRANS_SYMLINK, "SYMLINK" }, \ + { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \ + { XFS_TRANS_GROWFS, "GROWFS" }, \ + { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \ + { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \ + { XFS_TRANS_WRITEID, "WRITEID" }, \ + { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \ + { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \ + { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \ + { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \ + { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ + { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ + { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ + { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \ + { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ + { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ + { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ + { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ + { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ + { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ + { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \ + { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ + { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ + { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ + { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ + { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ + { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ + { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ + { XFS_TRANS_DUMMY1, "DUMMY1" }, \ + { XFS_TRANS_DUMMY2, "DUMMY2" }, \ + { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } + +/* + * This structure is used to track log items associated with + * a transaction. It points to the log item and keeps some + * flags to track the state of the log item. It also tracks + * the amount of space needed to log the item it describes + * once we get to commit processing (see xfs_trans_commit()). + */ +struct xfs_log_item_desc { + struct xfs_log_item *lid_item; + struct list_head lid_trans; + unsigned char lid_flags; +}; + +#define XFS_LID_DIRTY 0x1 + +/* log size calculation functions */ +int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); +int xfs_log_calc_minimum_size(struct xfs_mount *); + + +/* + * Values for t_flags. + */ +#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ +#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ +#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ +#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ +#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ +#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ +#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer + count in superblock */ +/* + * Values for call flags parameter. + */ +#define XFS_TRANS_RELEASE_LOG_RES 0x4 +#define XFS_TRANS_ABORT 0x8 + +/* + * Field values for xfs_trans_mod_sb. + */ +#define XFS_TRANS_SB_ICOUNT 0x00000001 +#define XFS_TRANS_SB_IFREE 0x00000002 +#define XFS_TRANS_SB_FDBLOCKS 0x00000004 +#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008 +#define XFS_TRANS_SB_FREXTENTS 0x00000010 +#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020 +#define XFS_TRANS_SB_DBLOCKS 0x00000040 +#define XFS_TRANS_SB_AGCOUNT 0x00000080 +#define XFS_TRANS_SB_IMAXPCT 0x00000100 +#define XFS_TRANS_SB_REXTSIZE 0x00000200 +#define XFS_TRANS_SB_RBMBLOCKS 0x00000400 +#define XFS_TRANS_SB_RBLOCKS 0x00000800 +#define XFS_TRANS_SB_REXTENTS 0x00001000 +#define XFS_TRANS_SB_REXTSLOG 0x00002000 + +/* + * Here we centralize the specification of XFS meta-data buffer reference count + * values. This determines how hard the buffer cache tries to hold onto the + * buffer. + */ +#define XFS_AGF_REF 4 +#define XFS_AGI_REF 4 +#define XFS_AGFL_REF 3 +#define XFS_INO_BTREE_REF 3 +#define XFS_ALLOC_BTREE_REF 2 +#define XFS_BMAP_BTREE_REF 2 +#define XFS_DIR_BTREE_REF 2 +#define XFS_INO_REF 2 +#define XFS_ATTR_BTREE_REF 1 +#define XFS_DQUOT_REF 1 + +/* + * Flags for xfs_trans_ichgtime(). + */ +#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ +#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ +#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ + + +/* + * Symlink decoding/encoding functions + */ +int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); +int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, + uint32_t size, struct xfs_buf *bp); +bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, + uint32_t size, struct xfs_buf *bp); +void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_inode *ip, struct xfs_ifork *ifp); + +#endif /* __XFS_SHARED_H__ */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index ce372b7d564..f2240383d4b 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -59,6 +59,7 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v) { "abtc2", XFSSTAT_END_ABTC_V2 }, { "bmbt2", XFSSTAT_END_BMBT_V2 }, { "ibt2", XFSSTAT_END_IBT_V2 }, + { "fibt2", XFSSTAT_END_FIBT_V2 }, /* we print both series of quota information together */ { "qm", XFSSTAT_END_QM }, }; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index c03ad38ceae..c8f238b8299 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -183,7 +183,23 @@ struct xfsstats { __uint32_t xs_ibt_2_alloc; __uint32_t xs_ibt_2_free; __uint32_t xs_ibt_2_moves; -#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_IBT_V2+6) +#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2+15) + __uint32_t xs_fibt_2_lookup; + __uint32_t xs_fibt_2_compare; + __uint32_t xs_fibt_2_insrec; + __uint32_t xs_fibt_2_delrec; + __uint32_t xs_fibt_2_newroot; + __uint32_t xs_fibt_2_killroot; + __uint32_t xs_fibt_2_increment; + __uint32_t xs_fibt_2_decrement; + __uint32_t xs_fibt_2_lshift; + __uint32_t xs_fibt_2_rshift; + __uint32_t xs_fibt_2_split; + __uint32_t xs_fibt_2_join; + __uint32_t xs_fibt_2_alloc; + __uint32_t xs_fibt_2_free; + __uint32_t xs_fibt_2_moves; +#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_FIBT_V2+6) __uint32_t xs_qm_dqreclaims; __uint32_t xs_qm_dqreclaim_misses; __uint32_t xs_qm_dquot_dups; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ab8839b2627..8f0333b3f7a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -17,40 +17,36 @@ */ #include "xfs.h" -#include "xfs_log.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_bmap.h" -#include "xfs_rtalloc.h" +#include "xfs_alloc.h" #include "xfs_error.h" -#include "xfs_itable.h" #include "xfs_fsops.h" -#include "xfs_attr.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" -#include "xfs_utils.h" -#include "xfs_vnodeops.h" +#include "xfs_log.h" #include "xfs_log_priv.h" -#include "xfs_trans_priv.h" -#include "xfs_filestream.h" #include "xfs_da_btree.h" +#include "xfs_dir2.h" #include "xfs_extfree_item.h" #include "xfs_mru_cache.h" #include "xfs_inode_item.h" #include "xfs_icache.h" #include "xfs_trace.h" +#include "xfs_icreate_item.h" +#include "xfs_dinode.h" +#include "xfs_filestream.h" +#include "xfs_quota.h" #include <linux/namei.h> #include <linux/init.h> @@ -139,9 +135,9 @@ static const match_table_t tokens = { STATIC unsigned long -suffix_strtoul(char *s, char **endp, unsigned int base) +suffix_kstrtoint(char *s, unsigned int base, int *res) { - int last, shift_left_factor = 0; + int last, shift_left_factor = 0, _res; char *value = s; last = strlen(value) - 1; @@ -158,7 +154,10 @@ suffix_strtoul(char *s, char **endp, unsigned int base) value[last] = '\0'; } - return simple_strtoul((const char *)s, endp, base) << shift_left_factor; + if (kstrtoint(s, base, &_res)) + return -EINVAL; + *res = _res << shift_left_factor; + return 0; } /* @@ -174,7 +173,7 @@ xfs_parseargs( char *options) { struct super_block *sb = mp->m_super; - char *this_char, *value, *eov; + char *this_char, *value; int dsunit = 0; int dswidth = 0; int iosize = 0; @@ -230,14 +229,16 @@ xfs_parseargs( this_char); return EINVAL; } - mp->m_logbufs = simple_strtoul(value, &eov, 10); + if (kstrtoint(value, 10, &mp->m_logbufs)) + return EINVAL; } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); return EINVAL; } - mp->m_logbsize = suffix_strtoul(value, &eov, 10); + if (suffix_kstrtoint(value, 10, &mp->m_logbsize)) + return EINVAL; } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", @@ -266,7 +267,8 @@ xfs_parseargs( this_char); return EINVAL; } - iosize = simple_strtoul(value, &eov, 10); + if (kstrtoint(value, 10, &iosize)) + return EINVAL; iosizelog = ffs(iosize) - 1; } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { if (!value || !*value) { @@ -274,7 +276,8 @@ xfs_parseargs( this_char); return EINVAL; } - iosize = suffix_strtoul(value, &eov, 10); + if (suffix_kstrtoint(value, 10, &iosize)) + return EINVAL; iosizelog = ffs(iosize) - 1; } else if (!strcmp(this_char, MNTOPT_GRPID) || !strcmp(this_char, MNTOPT_BSDGROUPS)) { @@ -296,14 +299,16 @@ xfs_parseargs( this_char); return EINVAL; } - dsunit = simple_strtoul(value, &eov, 10); + if (kstrtoint(value, 10, &dsunit)) + return EINVAL; } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); return EINVAL; } - dswidth = simple_strtoul(value, &eov, 10); + if (kstrtoint(value, 10, &dswidth)) + return EINVAL; } else if (!strcmp(this_char, MNTOPT_32BITINODE)) { mp->m_flags |= XFS_MOUNT_SMALL_INUMS; } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { @@ -350,17 +355,17 @@ xfs_parseargs( } else if (!strcmp(this_char, MNTOPT_PQUOTA) || !strcmp(this_char, MNTOPT_PRJQUOTA)) { mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | - XFS_OQUOTA_ENFD); + XFS_PQUOTA_ENFD); } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); - mp->m_qflags &= ~XFS_OQUOTA_ENFD; + mp->m_qflags &= ~XFS_PQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_GQUOTA) || !strcmp(this_char, MNTOPT_GRPQUOTA)) { mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | - XFS_OQUOTA_ENFD); + XFS_GQUOTA_ENFD); } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); - mp->m_qflags &= ~XFS_OQUOTA_ENFD; + mp->m_qflags &= ~XFS_GQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { xfs_warn(mp, "delaylog is the default now, option is deprecated."); @@ -411,12 +416,6 @@ xfs_parseargs( } #endif - if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && - (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) { - xfs_warn(mp, "cannot mount with both project and group quota"); - return EINVAL; - } - if ((dsunit && !dswidth) || (!dsunit && dswidth)) { xfs_warn(mp, "sunit and swidth must be specified together"); return EINVAL; @@ -430,20 +429,15 @@ xfs_parseargs( } done: - if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) { + if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) { /* * At this point the superblock has not been read * in, therefore we do not know the block size. * Before the mount call ends we will convert * these to FSBs. */ - if (dsunit) { - mp->m_dalign = dsunit; - mp->m_flags |= XFS_MOUNT_RETERR; - } - - if (dswidth) - mp->m_swidth = dswidth; + mp->m_dalign = dsunit; + mp->m_swidth = dswidth; } if (mp->m_logbufs != -1 && @@ -551,15 +545,14 @@ xfs_showargs( else if (mp->m_qflags & XFS_UQUOTA_ACCT) seq_puts(m, "," MNTOPT_UQUOTANOENF); - /* Either project or group quotas can be active, not both */ - if (mp->m_qflags & XFS_PQUOTA_ACCT) { - if (mp->m_qflags & XFS_OQUOTA_ENFD) + if (mp->m_qflags & XFS_PQUOTA_ENFD) seq_puts(m, "," MNTOPT_PRJQUOTA); else seq_puts(m, "," MNTOPT_PQUOTANOENF); - } else if (mp->m_qflags & XFS_GQUOTA_ACCT) { - if (mp->m_qflags & XFS_OQUOTA_ENFD) + } + if (mp->m_qflags & XFS_GQUOTA_ACCT) { + if (mp->m_qflags & XFS_GQUOTA_ENFD) seq_puts(m, "," MNTOPT_GRPQUOTA); else seq_puts(m, "," MNTOPT_GQUOTANOENF); @@ -772,20 +765,18 @@ xfs_open_devices( * Setup xfs_mount buffer target pointers */ error = ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname); + mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); if (!mp->m_ddev_targp) goto out_close_rtdev; if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1, - mp->m_fsname); + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } if (logdev && logdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1, - mp->m_fsname); + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { @@ -818,8 +809,7 @@ xfs_setup_devices( { int error; - error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize, - mp->m_sb.sb_sectsize); + error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); if (error) return error; @@ -829,14 +819,12 @@ xfs_setup_devices( if (xfs_sb_version_hassector(&mp->m_sb)) log_sector_size = mp->m_sb.sb_logsectsize; error = xfs_setsize_buftarg(mp->m_logdev_targp, - mp->m_sb.sb_blocksize, log_sector_size); if (error) return error; } if (mp->m_rtdev_targp) { error = xfs_setsize_buftarg(mp->m_rtdev_targp, - mp->m_sb.sb_blocksize, mp->m_sb.sb_sectsize); if (error) return error; @@ -865,17 +853,17 @@ xfs_init_mount_workqueues( goto out_destroy_unwritten; mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", - WQ_NON_REENTRANT, 0, mp->m_fsname); + 0, 0, mp->m_fsname); if (!mp->m_reclaim_workqueue) goto out_destroy_cil; mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", - WQ_NON_REENTRANT, 0, mp->m_fsname); + 0, 0, mp->m_fsname); if (!mp->m_log_workqueue) goto out_destroy_reclaim; mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", - WQ_NON_REENTRANT, 0, mp->m_fsname); + 0, 0, mp->m_fsname); if (!mp->m_eofblocks_workqueue) goto out_destroy_log; @@ -948,10 +936,6 @@ xfs_fs_destroy_inode( XFS_STATS_INC(vn_reclaim); - /* bad inode, get out here ASAP */ - if (is_bad_inode(inode)) - goto out_reclaim; - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); /* @@ -967,7 +951,6 @@ xfs_fs_destroy_inode( * this more efficiently than we can here, so simply let background * reclaim tear down all inodes. */ -out_reclaim: xfs_inode_set_reclaim_tag(ip); } @@ -1008,7 +991,7 @@ xfs_fs_evict_inode( trace_xfs_evict_inode(ip); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); XFS_STATS_INC(vn_rele); XFS_STATS_INC(vn_remove); @@ -1127,8 +1110,8 @@ xfs_fs_statfs( spin_unlock(&mp->m_sb_lock); if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) == - (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) + ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == + (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD)) xfs_qm_statvfs(ip, statp); return 0; } @@ -1167,7 +1150,7 @@ xfs_restore_resvblks(struct xfs_mount *mp) * Note: xfs_log_quiesce() stops background log work - the callers must ensure * it is started again when appropriate. */ -void +static void xfs_quiesce_attr( struct xfs_mount *mp) { @@ -1209,6 +1192,7 @@ xfs_fs_remount( char *p; int error; + sync_filesystem(sb); while ((p = strsep(&options, ",")) != NULL) { int token; @@ -1248,7 +1232,7 @@ xfs_fs_remount( */ #if 0 xfs_info(mp, - "mount option \"%s\" not supported for remount\n", p); + "mount option \"%s\" not supported for remount", p); return -EINVAL; #else break; @@ -1364,6 +1348,17 @@ xfs_finish_flags( } /* + * V5 filesystems always use attr2 format for attributes. + */ + if (xfs_sb_version_hascrc(&mp->m_sb) && + (mp->m_flags & XFS_MOUNT_NOATTR2)) { + xfs_warn(mp, +"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.", + MNTOPT_NOATTR2, MNTOPT_ATTR2); + return XFS_ERROR(EINVAL); + } + + /* * mkfs'ed attr2 will turn on attr2 mount unless explicitly * told by noattr2 to turn it off */ @@ -1380,6 +1375,14 @@ xfs_finish_flags( return XFS_ERROR(EROFS); } + if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && + (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE)) && + !xfs_sb_version_has_pquotino(&mp->m_sb)) { + xfs_warn(mp, + "Super block does not support project and group quota together"); + return XFS_ERROR(EINVAL); + } + return 0; } @@ -1425,11 +1428,11 @@ xfs_fs_fill_super( if (error) goto out_free_fsname; - error = xfs_init_mount_workqueues(mp); + error = -xfs_init_mount_workqueues(mp); if (error) goto out_close_devices; - error = xfs_icsb_init_counters(mp); + error = -xfs_icsb_init_counters(mp); if (error) goto out_destroy_workqueues; @@ -1461,6 +1464,10 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); + /* version 5 superblocks support inode version counters. */ + if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) + sb->s_flags |= MS_I_VERSION; + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -1470,10 +1477,6 @@ xfs_fs_fill_super( error = ENOENT; goto out_unmount; } - if (is_bad_inode(root)) { - error = EINVAL; - goto out_unmount; - } sb->s_root = d_make_root(root); if (!sb->s_root) { error = ENOMEM; @@ -1514,19 +1517,21 @@ xfs_fs_mount( return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super); } -static int +static long xfs_fs_nr_cached_objects( - struct super_block *sb) + struct super_block *sb, + int nid) { return xfs_reclaim_inodes_count(XFS_M(sb)); } -static void +static long xfs_fs_free_cached_objects( struct super_block *sb, - int nr_to_scan) + long nr_to_scan, + int nid) { - xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan); + return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan); } static const struct super_operations xfs_super_operations = { @@ -1552,6 +1557,7 @@ static struct file_system_type xfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("xfs"); STATIC int __init xfs_init_zones(void) @@ -1634,9 +1640,15 @@ xfs_init_zones(void) KM_ZONE_SPREAD, NULL); if (!xfs_ili_zone) goto out_destroy_inode_zone; + xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item), + "xfs_icr"); + if (!xfs_icreate_zone) + goto out_destroy_ili_zone; return 0; + out_destroy_ili_zone: + kmem_zone_destroy(xfs_ili_zone); out_destroy_inode_zone: kmem_zone_destroy(xfs_inode_zone); out_destroy_efi_zone: @@ -1675,6 +1687,7 @@ xfs_destroy_zones(void) * destroy caches. */ rcu_barrier(); + kmem_zone_destroy(xfs_icreate_zone); kmem_zone_destroy(xfs_ili_zone); kmem_zone_destroy(xfs_inode_zone); kmem_zone_destroy(xfs_efi_zone); @@ -1736,13 +1749,9 @@ init_xfs_fs(void) if (error) goto out_destroy_wq; - error = xfs_filestream_init(); - if (error) - goto out_mru_cache_uninit; - error = xfs_buf_init(); if (error) - goto out_filestream_uninit; + goto out_mru_cache_uninit; error = xfs_init_procfs(); if (error) @@ -1769,8 +1778,6 @@ init_xfs_fs(void) xfs_cleanup_procfs(); out_buf_terminate: xfs_buf_terminate(); - out_filestream_uninit: - xfs_filestream_uninit(); out_mru_cache_uninit: xfs_mru_cache_uninit(); out_destroy_wq: @@ -1789,7 +1796,6 @@ exit_xfs_fs(void) xfs_sysctl_unregister(); xfs_cleanup_procfs(); xfs_buf_terminate(); - xfs_filestream_uninit(); xfs_mru_cache_uninit(); xfs_destroy_workqueues(); xfs_destroy_zones(); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c new file mode 100644 index 00000000000..d69363c833e --- /dev/null +++ b/fs/xfs/xfs_symlink.c @@ -0,0 +1,599 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2012-2013 Red Hat, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_symlink.h" +#include "xfs_trans.h" +#include "xfs_log.h" +#include "xfs_dinode.h" + +/* ----- Kernel only functions below ----- */ +STATIC int +xfs_readlink_bmap( + struct xfs_inode *ip, + char *link) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_buf *bp; + xfs_daddr_t d; + char *cur_chunk; + int pathlen = ip->i_d.di_size; + int nmaps = XFS_SYMLINK_MAPS; + int byte_cnt; + int n; + int error = 0; + int fsblocks = 0; + int offset; + + fsblocks = xfs_symlink_blocks(mp, pathlen); + error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); + if (error) + goto out; + + offset = 0; + for (n = 0; n < nmaps; n++) { + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, + &xfs_symlink_buf_ops); + if (!bp) + return XFS_ERROR(ENOMEM); + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; + goto out; + } + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + if (pathlen < byte_cnt) + byte_cnt = pathlen; + + cur_chunk = bp->b_addr; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_symlink_hdr_ok(ip->i_ino, offset, + byte_cnt, bp)) { + error = EFSCORRUPTED; + xfs_alert(mp, +"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)", + offset, byte_cnt, ip->i_ino); + xfs_buf_relse(bp); + goto out; + + } + + cur_chunk += sizeof(struct xfs_dsymlink_hdr); + } + + memcpy(link + offset, bp->b_addr, byte_cnt); + + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_buf_relse(bp); + } + ASSERT(pathlen == 0); + + link[ip->i_d.di_size] = '\0'; + error = 0; + + out: + return error; +} + +int +xfs_readlink( + struct xfs_inode *ip, + char *link) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fsize_t pathlen; + int error = 0; + + trace_xfs_readlink(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + + pathlen = ip->i_d.di_size; + if (!pathlen) + goto out; + + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", + __func__, (unsigned long long) ip->i_ino, + (long long) pathlen); + ASSERT(0); + error = XFS_ERROR(EFSCORRUPTED); + goto out; + } + + + if (ip->i_df.if_flags & XFS_IFINLINE) { + memcpy(link, ip->i_df.if_u1.if_data, pathlen); + link[pathlen] = '\0'; + } else { + error = xfs_readlink_bmap(ip, link); + } + + out: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + +int +xfs_symlink( + struct xfs_inode *dp, + struct xfs_name *link_name, + const char *target_path, + umode_t mode, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_trans *tp = NULL; + struct xfs_inode *ip = NULL; + int error = 0; + int pathlen; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + bool unlock_dp_on_error = false; + uint cancel_flags; + int committed; + xfs_fileoff_t first_fsb; + xfs_filblks_t fs_blocks; + int nmaps; + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + xfs_daddr_t d; + const char *cur_chunk; + int byte_cnt; + int n; + xfs_buf_t *bp; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + uint resblks; + + *ipp = NULL; + + trace_xfs_symlink(dp, link_name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + /* + * Check component lengths of the target path name. + */ + pathlen = strlen(target_path); + if (pathlen >= MAXPATHLEN) /* total string too long */ + return XFS_ERROR(ENAMETOOLONG); + + udqp = gdqp = NULL; + prid = xfs_get_initial_prid(dp); + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, + xfs_kuid_to_uid(current_fsuid()), + xfs_kgid_to_gid(current_fsgid()), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); + if (error) + goto std_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + /* + * The symlink will fit into the inode data fork? + * There can't be any attributes so we get the whole variable part. + */ + if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version)) + fs_blocks = 0; + else + fs_blocks = xfs_symlink_blocks(mp, pathlen); + resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0); + if (error == ENOSPC && fs_blocks == 0) { + resblks = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0); + } + if (error) { + cancel_flags = 0; + goto error_return; + } + + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + unlock_dp_on_error = true; + + /* + * Check whether the directory allows new symlinks or not. + */ + if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { + error = XFS_ERROR(EPERM); + goto error_return; + } + + /* + * Reserve disk quota : blocks and inode. + */ + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, + pdqp, resblks, 1, 0); + if (error) + goto error_return; + + /* + * Check for ability to enter directory entry, if no space reserved. + */ + error = xfs_dir_canenter(tp, dp, link_name, resblks); + if (error) + goto error_return; + /* + * Initialize the bmap freelist prior to calling either + * bmapi or the directory create code. + */ + xfs_bmap_init(&free_list, &first_block); + + /* + * Allocate an inode for the symlink. + */ + error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, + prid, resblks > 0, &ip, NULL); + if (error) { + if (error == ENOSPC) + goto error_return; + goto error1; + } + + /* + * An error after we've joined dp to the transaction will result in the + * transaction cancel unlocking dp so don't do it explicitly in the + * error path. + */ + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + unlock_dp_on_error = false; + + /* + * Also attach the dquot(s) to it, if applicable. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + + if (resblks) + resblks -= XFS_IALLOC_SPACE_RES(mp); + /* + * If the symlink will fit into the inode, write it inline. + */ + if (pathlen <= XFS_IFORK_DSIZE(ip)) { + xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); + memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); + ip->i_d.di_size = pathlen; + + /* + * The inode was initially created in extent format. + */ + ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); + ip->i_df.if_flags |= XFS_IFINLINE; + + ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); + + } else { + int offset; + + first_fsb = 0; + nmaps = XFS_SYMLINK_MAPS; + + error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, + XFS_BMAPI_METADATA, &first_block, resblks, + mval, &nmaps, &free_list); + if (error) + goto error2; + + if (resblks) + resblks -= fs_blocks; + ip->i_d.di_size = pathlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + cur_chunk = target_path; + offset = 0; + for (n = 0; n < nmaps; n++) { + char *buf; + + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + BTOBB(byte_cnt), 0); + if (!bp) { + error = ENOMEM; + goto error2; + } + bp->b_ops = &xfs_symlink_buf_ops; + + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + byte_cnt = min(byte_cnt, pathlen); + + buf = bp->b_addr; + buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, + byte_cnt, bp); + + memcpy(buf, cur_chunk, byte_cnt); + + cur_chunk += byte_cnt; + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); + xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - + (char *)bp->b_addr); + } + ASSERT(pathlen == 0); + } + + /* + * Create the directory entry for the symlink. + */ + error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, + &first_block, &free_list, resblks); + if (error) + goto error2; + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + /* + * If this is a synchronous mount, make sure that the + * symlink transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error2; + } + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + *ipp = ip; + return 0; + + error2: + IRELE(ip); + error1: + xfs_bmap_cancel(&free_list); + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_trans_cancel(tp, cancel_flags); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + if (unlock_dp_on_error) + xfs_iunlock(dp, XFS_ILOCK_EXCL); + std_return: + return error; +} + +/* + * Free a symlink that has blocks associated with it. + */ +STATIC int +xfs_inactive_symlink_rmt( + struct xfs_inode *ip) +{ + xfs_buf_t *bp; + int committed; + int done; + int error; + xfs_fsblock_t first_block; + xfs_bmap_free_t free_list; + int i; + xfs_mount_t *mp; + xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS]; + int nmaps; + int size; + xfs_trans_t *tp; + + mp = ip->i_mount; + ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS); + /* + * We're freeing a symlink that has some + * blocks allocated to it. Free the + * blocks here. We know that we've got + * either 1 or 2 extents and that we can + * free them all in one bunmapi call. + */ + ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Lock the inode, fix the size, and join it to the transaction. + * Hold it so in the normal path, we still have it locked for + * the second transaction. In the error paths we need it + * held so the cancel won't rele it, see below. + */ + size = (int)ip->i_d.di_size; + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Find the block(s) so we can inval and unmap them. + */ + done = 0; + xfs_bmap_init(&free_list, &first_block); + nmaps = ARRAY_SIZE(mval); + error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size), + mval, &nmaps, 0); + if (error) + goto error_trans_cancel; + /* + * Invalidate the block(s). No validation is done. + */ + for (i = 0; i < nmaps; i++) { + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), + XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); + if (!bp) { + error = ENOMEM; + goto error_bmap_cancel; + } + xfs_trans_binval(tp, bp); + } + /* + * Unmap the dead block(s) to the free_list. + */ + error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, + &first_block, &free_list, &done); + if (error) + goto error_bmap_cancel; + ASSERT(done); + /* + * Commit the first transaction. This logs the EFI and the inode. + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto error_bmap_cancel; + /* + * The transaction must have been committed, since there were + * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. + * The new tp has the extent freeing and EFDs. + */ + ASSERT(committed); + /* + * The first xact was committed, so add the inode to the new one. + * Mark it dirty so it will be logged and moved forward in the log as + * part of every commit. + */ + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Commit the transaction containing extent freeing and EFDs. + */ + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + goto error_unlock; + } + + /* + * Remove the memory for extent descriptions (just bookkeeping). + */ + if (ip->i_df.if_bytes) + xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK); + ASSERT(ip->i_df.if_bytes == 0); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; + +error_bmap_cancel: + xfs_bmap_cancel(&free_list); +error_trans_cancel: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); +error_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * xfs_inactive_symlink - free a symlink + */ +int +xfs_inactive_symlink( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + int pathlen; + + trace_xfs_inactive_symlink(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * Zero length symlinks _can_ exist. + */ + pathlen = (int)ip->i_d.di_size; + if (!pathlen) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; + } + + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)", + __func__, (unsigned long long)ip->i_ino, pathlen); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + ASSERT(0); + return XFS_ERROR(EFSCORRUPTED); + } + + if (ip->i_df.if_flags & XFS_IFINLINE) { + if (ip->i_df.if_bytes > 0) + xfs_idata_realloc(ip, -(ip->i_df.if_bytes), + XFS_DATA_FORK); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + ASSERT(ip->i_df.if_bytes == 0); + return 0; + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + /* remove the remote symlink */ + return xfs_inactive_symlink_rmt(ip); +} diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h new file mode 100644 index 00000000000..e75245d0911 --- /dev/null +++ b/fs/xfs/xfs_symlink.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2012 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SYMLINK_H +#define __XFS_SYMLINK_H 1 + +/* Kernel only symlink defintions */ + +int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, + const char *target_path, umode_t mode, struct xfs_inode **ipp); +int xfs_readlink(struct xfs_inode *ip, char *link); +int xfs_inactive_symlink(struct xfs_inode *ip); + +#endif /* __XFS_SYMLINK_H */ diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c new file mode 100644 index 00000000000..23c2f2577c8 --- /dev/null +++ b/fs/xfs/xfs_symlink_remote.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2012-2013 Red Hat, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_ag.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_symlink.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" + + +/* + * Each contiguous block has a header, so it is not just a simple pathlen + * to FSB conversion. + */ +int +xfs_symlink_blocks( + struct xfs_mount *mp, + int pathlen) +{ + int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); + + return (pathlen + buflen - 1) / buflen; +} + +int +xfs_symlink_hdr_set( + struct xfs_mount *mp, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + struct xfs_buf *bp) +{ + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return 0; + + dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC); + dsl->sl_offset = cpu_to_be32(offset); + dsl->sl_bytes = cpu_to_be32(size); + uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid); + dsl->sl_owner = cpu_to_be64(ino); + dsl->sl_blkno = cpu_to_be64(bp->b_bn); + bp->b_ops = &xfs_symlink_buf_ops; + + return sizeof(struct xfs_dsymlink_hdr); +} + +/* + * Checking of the symlink header is split into two parts. the verifier does + * CRC, location and bounds checking, the unpacking function checks the path + * parameters and owner. + */ +bool +xfs_symlink_hdr_ok( + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + struct xfs_buf *bp) +{ + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (offset != be32_to_cpu(dsl->sl_offset)) + return false; + if (size != be32_to_cpu(dsl->sl_bytes)) + return false; + if (ino != be64_to_cpu(dsl->sl_owner)) + return false; + + /* ok */ + return true; +} + +static bool +xfs_symlink_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) + return false; + if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid)) + return false; + if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) + return false; + if (be32_to_cpu(dsl->sl_offset) + + be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN) + return false; + if (dsl->sl_owner == 0) + return false; + + return true; +} + +static void +xfs_symlink_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_symlink_verify(bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_symlink_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_symlink_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (bip) { + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF); +} + +const struct xfs_buf_ops xfs_symlink_buf_ops = { + .verify_read = xfs_symlink_read_verify, + .verify_write = xfs_symlink_write_verify, +}; + +void +xfs_symlink_local_to_remote( + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp) +{ + struct xfs_mount *mp = ip->i_mount; + char *buf; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) { + bp->b_ops = NULL; + memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + return; + } + + /* + * As this symlink fits in an inode literal area, it must also fit in + * the smallest buffer the filesystem supports. + */ + ASSERT(BBTOB(bp->b_length) >= + ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr)); + + bp->b_ops = &xfs_symlink_buf_ops; + + buf = bp->b_addr; + buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp); + memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes); +} diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 2801b5ce6cd..1743b9f8e23 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -25,11 +25,11 @@ static struct ctl_table_header *xfs_table_header; #ifdef CONFIG_PROC_FS STATIC int xfs_stats_clear_proc_handler( - ctl_table *ctl, - int write, - void __user *buffer, - size_t *lenp, - loff_t *ppos) + struct ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) { int c, ret, *valp = ctl->data; __uint32_t vn_active; @@ -55,11 +55,11 @@ xfs_stats_clear_proc_handler( STATIC int xfs_panic_mask_proc_handler( - ctl_table *ctl, - int write, - void __user *buffer, - size_t *lenp, - loff_t *ppos) + struct ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) { int ret, *valp = ctl->data; @@ -74,7 +74,7 @@ xfs_panic_mask_proc_handler( } #endif /* CONFIG_PROC_FS */ -static ctl_table xfs_table[] = { +static struct ctl_table xfs_table[] = { { .procname = "irix_sgid_inherit", .data = &xfs_params.sgid_inherit.val, @@ -227,7 +227,7 @@ static ctl_table xfs_table[] = { {} }; -static ctl_table xfs_dir_table[] = { +static struct ctl_table xfs_dir_table[] = { { .procname = "xfs", .mode = 0555, @@ -236,7 +236,7 @@ static ctl_table xfs_dir_table[] = { {} }; -static ctl_table xfs_root_table[] = { +static struct ctl_table xfs_root_table[] = { { .procname = "fs", .mode = 0555, diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 624bedd8135..1e85bcd0e41 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -17,25 +17,25 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_mount.h" +#include "xfs_da_btree.h" #include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" +#include "xfs_trans.h" +#include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_buf_item.h" #include "xfs_quota.h" @@ -45,6 +45,8 @@ #include "xfs_dquot.h" #include "xfs_log_recover.h" #include "xfs_inode_item.h" +#include "xfs_bmap_btree.h" +#include "xfs_filestream.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 2e137d4a85a..152f8278263 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -31,8 +31,8 @@ struct xfs_da_args; struct xfs_da_node_entry; struct xfs_dquot; struct xfs_log_item; -struct xlog_ticket; struct xlog; +struct xlog_ticket; struct xlog_recover; struct xlog_recover_item; struct xfs_buf_log_format; @@ -135,6 +135,31 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks); +DECLARE_EVENT_CLASS(xfs_ag_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), + TP_ARGS(mp, agno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + ), + TP_printk("dev %d:%d agno %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno) +); +#define DEFINE_AG_EVENT(name) \ +DEFINE_EVENT(xfs_ag_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), \ + TP_ARGS(mp, agno)) + +DEFINE_AG_EVENT(xfs_read_agf); +DEFINE_AG_EVENT(xfs_alloc_read_agf); +DEFINE_AG_EVENT(xfs_read_agi); +DEFINE_AG_EVENT(xfs_ialloc_read_agi); + TRACE_EVENT(xfs_attr_list_node_descend, TP_PROTO(struct xfs_attr_list_context *ctx, struct xfs_da_node_entry *btree), @@ -341,6 +366,7 @@ DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_item_iodone); DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); +DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); DEFINE_BUF_EVENT(xfs_trans_read_buf_io); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); @@ -485,9 +511,12 @@ DEFINE_EVENT(xfs_buf_item_class, name, \ TP_PROTO(struct xfs_buf_log_item *bip), \ TP_ARGS(bip)) DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); @@ -507,6 +536,65 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); +DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered); + +DECLARE_EVENT_CLASS(xfs_filestream_class, + TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), + TP_ARGS(ip, agno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_agnumber_t, agno) + __field(int, streams) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->agno = agno; + __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); + ), + TP_printk("dev %d:%d ino 0x%llx agno %u streams %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->agno, + __entry->streams) +) +#define DEFINE_FILESTREAM_EVENT(name) \ +DEFINE_EVENT(xfs_filestream_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), \ + TP_ARGS(ip, agno)) +DEFINE_FILESTREAM_EVENT(xfs_filestream_free); +DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup); +DEFINE_FILESTREAM_EVENT(xfs_filestream_scan); + +TRACE_EVENT(xfs_filestream_pick, + TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno, + xfs_extlen_t free, int nscan), + TP_ARGS(ip, agno, free, nscan), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_agnumber_t, agno) + __field(int, streams) + __field(xfs_extlen_t, free) + __field(int, nscan) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->agno = agno; + __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); + __entry->free = free; + __entry->nscan = nscan; + ), + TP_printk("dev %d:%d ino 0x%llx agno %u streams %d free %d nscan %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->agno, + __entry->streams, + __entry->free, + __entry->nscan) +); DECLARE_EVENT_CLASS(xfs_lock_class, TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, @@ -570,8 +658,11 @@ DEFINE_INODE_EVENT(xfs_iget_miss); DEFINE_INODE_EVENT(xfs_getattr); DEFINE_INODE_EVENT(xfs_setattr); DEFINE_INODE_EVENT(xfs_readlink); +DEFINE_INODE_EVENT(xfs_inactive_symlink); DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); +DEFINE_INODE_EVENT(xfs_zero_file_space); +DEFINE_INODE_EVENT(xfs_collapse_file_space); DEFINE_INODE_EVENT(xfs_readdir); #ifdef CONFIG_XFS_POSIX_ACL DEFINE_INODE_EVENT(xfs_get_acl); @@ -618,6 +709,30 @@ DECLARE_EVENT_CLASS(xfs_iref_class, (char *)__entry->caller_ip) ) +TRACE_EVENT(xfs_iomap_prealloc_size, + TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t blocks, int shift, + unsigned int writeio_blocks), + TP_ARGS(ip, blocks, shift, writeio_blocks), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, blocks) + __field(int, shift) + __field(unsigned int, writeio_blocks) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->blocks = blocks; + __entry->shift = shift; + __entry->writeio_blocks = writeio_blocks; + ), + TP_printk("dev %d:%d ino 0x%llx prealloc blocks %llu shift %d " + "m_writeio_blocks %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, + __entry->blocks, __entry->shift, __entry->writeio_blocks) +) + #define DEFINE_IREF_EVENT(name) \ DEFINE_EVENT(xfs_iref_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ @@ -908,6 +1023,63 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing); +DECLARE_EVENT_CLASS(xfs_ail_class, + TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), + TP_ARGS(lip, old_lsn, new_lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(void *, lip) + __field(uint, type) + __field(uint, flags) + __field(xfs_lsn_t, old_lsn) + __field(xfs_lsn_t, new_lsn) + ), + TP_fast_assign( + __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->lip = lip; + __entry->type = lip->li_type; + __entry->flags = lip->li_flags; + __entry->old_lsn = old_lsn; + __entry->new_lsn = new_lsn; + ), + TP_printk("dev %d:%d lip 0x%p old lsn %d/%d new lsn %d/%d type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lip, + CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn), + CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn), + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) +) + +#define DEFINE_AIL_EVENT(name) \ +DEFINE_EVENT(xfs_ail_class, name, \ + TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), \ + TP_ARGS(lip, old_lsn, new_lsn)) +DEFINE_AIL_EVENT(xfs_ail_insert); +DEFINE_AIL_EVENT(xfs_ail_move); +DEFINE_AIL_EVENT(xfs_ail_delete); + +TRACE_EVENT(xfs_log_assign_tail_lsn, + TP_PROTO(struct xlog *log, xfs_lsn_t new_lsn), + TP_ARGS(log, new_lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_lsn_t, new_lsn) + __field(xfs_lsn_t, old_lsn) + __field(xfs_lsn_t, last_sync_lsn) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->new_lsn = new_lsn; + __entry->old_lsn = atomic64_read(&log->l_tail_lsn); + __entry->last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); + ), + TP_printk("dev %d:%d new tail lsn %d/%d, old lsn %d/%d, last sync %d/%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn), + CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn), + CYCLE_LSN(__entry->last_sync_lsn), BLOCK_LSN(__entry->last_sync_lsn)) +) DECLARE_EVENT_CLASS(xfs_file_class, TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), @@ -946,17 +1118,18 @@ DEFINE_RW_EVENT(xfs_file_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_splice_read); -DEFINE_RW_EVENT(xfs_file_splice_write); DECLARE_EVENT_CLASS(xfs_page_class, - TP_PROTO(struct inode *inode, struct page *page, unsigned long off), - TP_ARGS(inode, page, off), + TP_PROTO(struct inode *inode, struct page *page, unsigned long off, + unsigned int len), + TP_ARGS(inode, page, off, len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(pgoff_t, pgoff) __field(loff_t, size) __field(unsigned long, offset) + __field(unsigned int, length) __field(int, delalloc) __field(int, unwritten) ), @@ -970,24 +1143,27 @@ DECLARE_EVENT_CLASS(xfs_page_class, __entry->pgoff = page_offset(page); __entry->size = i_size_read(inode); __entry->offset = off; + __entry->length = len; __entry->delalloc = delalloc; __entry->unwritten = unwritten; ), TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " - "delalloc %d unwritten %d", + "length %x delalloc %d unwritten %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pgoff, __entry->size, __entry->offset, + __entry->length, __entry->delalloc, __entry->unwritten) ) #define DEFINE_PAGE_EVENT(name) \ DEFINE_EVENT(xfs_page_class, name, \ - TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ - TP_ARGS(inode, page, off)) + TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \ + unsigned int len), \ + TP_ARGS(inode, page, off, len)) DEFINE_PAGE_EVENT(xfs_writepage); DEFINE_PAGE_EVENT(xfs_releasepage); DEFINE_PAGE_EVENT(xfs_invalidatepage); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 06ed520a767..d03932564cc 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -18,518 +18,25 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_error.h" -#include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_alloc.h" #include "xfs_extent_busy.h" -#include "xfs_bmap.h" #include "xfs_quota.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" -#include "xfs_trans_space.h" -#include "xfs_inode_item.h" +#include "xfs_log.h" #include "xfs_trace.h" +#include "xfs_error.h" kmem_zone_t *xfs_trans_zone; kmem_zone_t *xfs_log_item_desc_zone; - -/* - * Various log reservation values. - * - * These are based on the size of the file system block because that is what - * most transactions manipulate. Each adds in an additional 128 bytes per - * item logged to try to account for the overhead of the transaction mechanism. - * - * Note: Most of the reservations underestimate the number of allocation - * groups into which they could free extents in the xfs_bmap_finish() call. - * This is because the number in the worst case is quite high and quite - * unusual. In order to fix this we need to change xfs_bmap_finish() to free - * extents in only a single AG at a time. This will require changes to the - * EFI code as well, however, so that the EFI for the extents not freed is - * logged again in each transaction. See SGI PV #261917. - * - * Reservation functions here avoid a huge stack in xfs_trans_init due to - * register overflow from temporaries in the calculations. - */ - - -/* - * In a write transaction we can allocate a maximum of 2 - * extents. This gives: - * the inode getting the new extents: inode size - * the inode's bmap btree: max depth * block size - * the agfs of the ags from which the extents are allocated: 2 * sector - * the superblock free block counter: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - * And the bmap_finish transaction can free bmap blocks in a join: - * the agfs of the ags containing the blocks: 2 * sector size - * the agfls of the ags containing the blocks: 2 * sector size - * the super block free block counter: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - */ -STATIC uint -xfs_calc_write_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + - 2 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 2) + - 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + - XFS_ALLOCFREE_LOG_COUNT(mp, 2))), - (2 * mp->m_sb.sb_sectsize + - 2 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 2) + - 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); -} - -/* - * In truncating a file we free up to two extents at once. We can modify: - * the inode being truncated: inode size - * the inode's bmap btree: (max depth + 1) * block size - * And the bmap_finish transaction can free the blocks and bmap blocks: - * the agf for each of the ags: 4 * sector size - * the agfl for each of the ags: 4 * sector size - * the super block to reflect the freed blocks: sector size - * worst case split in allocation btrees per extent assuming 4 extents: - * 4 exts * 2 trees * (2 * max depth - 1) * block size - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - */ -STATIC uint -xfs_calc_itruncate_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + - 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), - (4 * mp->m_sb.sb_sectsize + - 4 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 4) + - 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) + - 128 * 5 + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); -} - -/* - * In renaming a files we can modify: - * the four inodes involved: 4 * inode size - * the two directory btrees: 2 * (max depth + v2) * dir block size - * the two directory bmap btrees: 2 * max depth * block size - * And the bmap_finish transaction can free dir and bmap blocks (two sets - * of bmap blocks) giving: - * the agf for the ags in which the blocks live: 3 * sector size - * the agfl for the ags in which the blocks live: 3 * sector size - * the superblock for the free block count: sector size - * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size - */ -STATIC uint -xfs_calc_rename_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - MAX((4 * mp->m_sb.sb_inodesize + - 2 * XFS_DIROP_LOG_RES(mp) + - 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))), - (3 * mp->m_sb.sb_sectsize + - 3 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 3) + - 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3)))); -} - -/* - * For creating a link to an inode: - * the parent directory inode: inode size - * the linked inode: inode size - * the directory btree could split: (max depth + v2) * dir block size - * the directory bmap btree could join or split: (max depth + v2) * blocksize - * And the bmap_finish transaction can free some bmap blocks giving: - * the agf for the ag in which the blocks live: sector size - * the agfl for the ag in which the blocks live: sector size - * the superblock for the free block count: sector size - * the allocation btrees: 2 trees * (2 * max depth - 1) * block size - */ -STATIC uint -xfs_calc_link_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - mp->m_sb.sb_inodesize + - XFS_DIROP_LOG_RES(mp) + - 128 * (2 + XFS_DIROP_LOG_COUNT(mp))), - (mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); -} - -/* - * For removing a directory entry we can modify: - * the parent directory inode: inode size - * the removed inode: inode size - * the directory btree could join: (max depth + v2) * dir block size - * the directory bmap btree could join or split: (max depth + v2) * blocksize - * And the bmap_finish transaction can free the dir and bmap blocks giving: - * the agf for the ag in which the blocks live: 2 * sector size - * the agfl for the ag in which the blocks live: 2 * sector size - * the superblock for the free block count: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - */ -STATIC uint -xfs_calc_remove_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - mp->m_sb.sb_inodesize + - XFS_DIROP_LOG_RES(mp) + - 128 * (2 + XFS_DIROP_LOG_COUNT(mp))), - (2 * mp->m_sb.sb_sectsize + - 2 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 2) + - 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); -} - -/* - * For symlink we can modify: - * the parent directory inode: inode size - * the new inode: inode size - * the inode btree entry: 1 block - * the directory btree: (max depth + v2) * dir block size - * the directory inode's bmap btree: (max depth + v2) * block size - * the blocks for the symlink: 1 kB - * Or in the first xact we allocate some inodes giving: - * the agi and agf of the ag getting the new inodes: 2 * sectorsize - * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (2 * max depth - 1) * block size - */ -STATIC uint -xfs_calc_symlink_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, 1) + - XFS_DIROP_LOG_RES(mp) + - 1024 + - 128 * (4 + XFS_DIROP_LOG_COUNT(mp))), - (2 * mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + - XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); -} - -/* - * For create we can modify: - * the parent directory inode: inode size - * the new inode: inode size - * the inode btree entry: block size - * the superblock for the nlink flag: sector size - * the directory btree: (max depth + v2) * dir block size - * the directory inode's bmap btree: (max depth + v2) * block size - * Or in the first xact we allocate some inodes giving: - * the agi and agf of the ag getting the new inodes: 2 * sectorsize - * the superblock for the nlink flag: sector size - * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - */ -STATIC uint -xfs_calc_create_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - mp->m_sb.sb_inodesize + - mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, 1) + - XFS_DIROP_LOG_RES(mp) + - 128 * (3 + XFS_DIROP_LOG_COUNT(mp))), - (3 * mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + - XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); -} - -/* - * Making a new directory is the same as creating a new file. - */ -STATIC uint -xfs_calc_mkdir_reservation( - struct xfs_mount *mp) -{ - return xfs_calc_create_reservation(mp); -} - -/* - * In freeing an inode we can modify: - * the inode being freed: inode size - * the super block free inode counter: sector size - * the agi hash list and counters: sector size - * the inode btree entry: block size - * the on disk inode before ours in the agi hash list: inode cluster size - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - */ -STATIC uint -xfs_calc_ifree_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - mp->m_sb.sb_inodesize + - mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, 1) + - MAX((__uint16_t)XFS_FSB_TO_B(mp, 1), - XFS_INODE_CLUSTER_SIZE(mp)) + - 128 * 5 + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)); -} - -/* - * When only changing the inode we log the inode and possibly the superblock - * We also add a bit of slop for the transaction stuff. - */ -STATIC uint -xfs_calc_ichange_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - mp->m_sb.sb_inodesize + - mp->m_sb.sb_sectsize + - 512; - -} - -/* - * Growing the data section of the filesystem. - * superblock - * agi and agf - * allocation btrees - */ -STATIC uint -xfs_calc_growdata_reservation( - struct xfs_mount *mp) -{ - return mp->m_sb.sb_sectsize * 3 + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); -} - -/* - * Growing the rt section of the filesystem. - * In the first set of transactions (ALLOC) we allocate space to the - * bitmap or summary files. - * superblock: sector size - * agf of the ag from which the extent is allocated: sector size - * bmap btree for bitmap/summary inode: max depth * blocksize - * bitmap/summary inode: inode size - * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize - */ -STATIC uint -xfs_calc_growrtalloc_reservation( - struct xfs_mount *mp) -{ - return 2 * mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + - mp->m_sb.sb_inodesize + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)); -} - -/* - * Growing the rt section of the filesystem. - * In the second set of transactions (ZERO) we zero the new metadata blocks. - * one bitmap/summary block: blocksize - */ -STATIC uint -xfs_calc_growrtzero_reservation( - struct xfs_mount *mp) -{ - return mp->m_sb.sb_blocksize + 128; -} - -/* - * Growing the rt section of the filesystem. - * In the third set of transactions (FREE) we update metadata without - * allocating any new blocks. - * superblock: sector size - * bitmap inode: inode size - * summary inode: inode size - * one bitmap block: blocksize - * summary blocks: new summary size - */ -STATIC uint -xfs_calc_growrtfree_reservation( - struct xfs_mount *mp) -{ - return mp->m_sb.sb_sectsize + - 2 * mp->m_sb.sb_inodesize + - mp->m_sb.sb_blocksize + - mp->m_rsumsize + - 128 * 5; -} - -/* - * Logging the inode modification timestamp on a synchronous write. - * inode - */ -STATIC uint -xfs_calc_swrite_reservation( - struct xfs_mount *mp) -{ - return mp->m_sb.sb_inodesize + 128; -} - -/* - * Logging the inode mode bits when writing a setuid/setgid file - * inode - */ -STATIC uint -xfs_calc_writeid_reservation(xfs_mount_t *mp) -{ - return mp->m_sb.sb_inodesize + 128; -} - -/* - * Converting the inode from non-attributed to attributed. - * the inode being converted: inode size - * agf block and superblock (for block allocation) - * the new block (directory sized) - * bmap blocks for the new directory block - * allocation btrees - */ -STATIC uint -xfs_calc_addafork_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - mp->m_sb.sb_inodesize + - mp->m_sb.sb_sectsize * 2 + - mp->m_dirblksize + - XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)); -} - -/* - * Removing the attribute fork of a file - * the inode being truncated: inode size - * the inode's bmap btree: max depth * block size - * And the bmap_finish transaction can free the blocks and bmap blocks: - * the agf for each of the ags: 4 * sector size - * the agfl for each of the ags: 4 * sector size - * the super block to reflect the freed blocks: sector size - * worst case split in allocation btrees per extent assuming 4 extents: - * 4 exts * 2 trees * (2 * max depth - 1) * block size - */ -STATIC uint -xfs_calc_attrinval_reservation( - struct xfs_mount *mp) -{ - return MAX((mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + - 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))), - (4 * mp->m_sb.sb_sectsize + - 4 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 4) + - 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)))); -} - -/* - * Setting an attribute. - * the inode getting the attribute - * the superblock for allocations - * the agfs extents are allocated from - * the attribute btree * max depth - * the inode allocation btree - * Since attribute transaction space is dependent on the size of the attribute, - * the calculation is done partially at mount time and partially at runtime. - */ -STATIC uint -xfs_calc_attrset_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - mp->m_sb.sb_inodesize + - mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + - 128 * (2 + XFS_DA_NODE_MAXDEPTH); -} - -/* - * Removing an attribute. - * the inode: inode size - * the attribute btree could join: max depth * block size - * the inode bmap btree could join or split: max depth * block size - * And the bmap_finish transaction can free the attr blocks freed giving: - * the agf for the ag in which the blocks live: 2 * sector size - * the agfl for the ag in which the blocks live: 2 * sector size - * the superblock for the free block count: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - */ -STATIC uint -xfs_calc_attrrm_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + - 128 * (1 + XFS_DA_NODE_MAXDEPTH + - XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), - (2 * mp->m_sb.sb_sectsize + - 2 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 2) + - 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); -} - -/* - * Clearing a bad agino number in an agi hash bucket. - */ -STATIC uint -xfs_calc_clear_agi_bucket_reservation( - struct xfs_mount *mp) -{ - return mp->m_sb.sb_sectsize + 128; -} - /* * Initialize the precomputed transaction reservation values * in the mount structure. @@ -538,29 +45,7 @@ void xfs_trans_init( struct xfs_mount *mp) { - struct xfs_trans_reservations *resp = &mp->m_reservations; - - resp->tr_write = xfs_calc_write_reservation(mp); - resp->tr_itruncate = xfs_calc_itruncate_reservation(mp); - resp->tr_rename = xfs_calc_rename_reservation(mp); - resp->tr_link = xfs_calc_link_reservation(mp); - resp->tr_remove = xfs_calc_remove_reservation(mp); - resp->tr_symlink = xfs_calc_symlink_reservation(mp); - resp->tr_create = xfs_calc_create_reservation(mp); - resp->tr_mkdir = xfs_calc_mkdir_reservation(mp); - resp->tr_ifree = xfs_calc_ifree_reservation(mp); - resp->tr_ichange = xfs_calc_ichange_reservation(mp); - resp->tr_growdata = xfs_calc_growdata_reservation(mp); - resp->tr_swrite = xfs_calc_swrite_reservation(mp); - resp->tr_writeid = xfs_calc_writeid_reservation(mp); - resp->tr_addafork = xfs_calc_addafork_reservation(mp); - resp->tr_attrinval = xfs_calc_attrinval_reservation(mp); - resp->tr_attrset = xfs_calc_attrset_reservation(mp); - resp->tr_attrrm = xfs_calc_attrrm_reservation(mp); - resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp); - resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp); - resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp); - resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp); + xfs_trans_resv_calc(mp, M_RES(mp)); } /* @@ -596,7 +81,7 @@ _xfs_trans_alloc( atomic_inc(&mp->m_active_trans); tp = kmem_zone_zalloc(xfs_trans_zone, memflags); - tp->t_magic = XFS_TRANS_MAGIC; + tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_type = type; tp->t_mountp = mp; INIT_LIST_HEAD(&tp->t_items); @@ -641,7 +126,7 @@ xfs_trans_dup( /* * Initialize the new transaction structure. */ - ntp->t_magic = XFS_TRANS_MAGIC; + ntp->t_magic = XFS_TRANS_HEADER_MAGIC; ntp->t_type = tp->t_type; ntp->t_mountp = tp->t_mountp; INIT_LIST_HEAD(&ntp->t_items); @@ -684,12 +169,10 @@ xfs_trans_dup( */ int xfs_trans_reserve( - xfs_trans_t *tp, - uint blocks, - uint logspace, - uint rtextents, - uint flags, - uint logcount) + struct xfs_trans *tp, + struct xfs_trans_res *resp, + uint blocks, + uint rtextents) { int error = 0; int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; @@ -715,13 +198,15 @@ xfs_trans_reserve( /* * Reserve the log space needed for this transaction. */ - if (logspace > 0) { + if (resp->tr_logres > 0) { bool permanent = false; - ASSERT(tp->t_log_res == 0 || tp->t_log_res == logspace); - ASSERT(tp->t_log_count == 0 || tp->t_log_count == logcount); + ASSERT(tp->t_log_res == 0 || + tp->t_log_res == resp->tr_logres); + ASSERT(tp->t_log_count == 0 || + tp->t_log_count == resp->tr_logcount); - if (flags & XFS_TRANS_PERM_LOG_RES) { + if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) { tp->t_flags |= XFS_TRANS_PERM_LOG_RES; permanent = true; } else { @@ -730,20 +215,21 @@ xfs_trans_reserve( } if (tp->t_ticket != NULL) { - ASSERT(flags & XFS_TRANS_PERM_LOG_RES); + ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES); error = xfs_log_regrant(tp->t_mountp, tp->t_ticket); } else { - error = xfs_log_reserve(tp->t_mountp, logspace, - logcount, &tp->t_ticket, - XFS_TRANSACTION, permanent, - tp->t_type); + error = xfs_log_reserve(tp->t_mountp, + resp->tr_logres, + resp->tr_logcount, + &tp->t_ticket, XFS_TRANSACTION, + permanent, tp->t_type); } if (error) goto undo_blocks; - tp->t_log_res = logspace; - tp->t_log_count = logcount; + tp->t_log_res = resp->tr_logres; + tp->t_log_count = resp->tr_logcount; } /* @@ -768,10 +254,10 @@ xfs_trans_reserve( * reservations which have already been performed. */ undo_log: - if (logspace > 0) { + if (resp->tr_logres > 0) { int log_flags; - if (flags & XFS_TRANS_PERM_LOG_RES) { + if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) { log_flags = XFS_LOG_REL_PERM_RESERV; } else { log_flags = 0; @@ -1219,10 +705,10 @@ xfs_trans_free_items( lip->li_desc = NULL; if (commit_lsn != NULLCOMMITLSN) - IOP_COMMITTING(lip, commit_lsn); + lip->li_ops->iop_committing(lip, commit_lsn); if (flags & XFS_TRANS_ABORT) lip->li_flags |= XFS_LI_ABORTED; - IOP_UNLOCK(lip); + lip->li_ops->iop_unlock(lip); xfs_trans_free_item_desc(lidp); } @@ -1242,8 +728,11 @@ xfs_log_item_batch_insert( /* xfs_trans_ail_update_bulk drops ailp->xa_lock */ xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn); - for (i = 0; i < nr_items; i++) - IOP_UNPIN(log_items[i], 0); + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + + lip->li_ops->iop_unpin(lip, 0); + } } /* @@ -1253,11 +742,11 @@ xfs_log_item_batch_insert( * * If we are called with the aborted flag set, it is because a log write during * a CIL checkpoint commit has failed. In this case, all the items in the - * checkpoint have already gone through IOP_COMMITED and IOP_UNLOCK, which + * checkpoint have already gone through iop_commited and iop_unlock, which * means that checkpoint commit abort handling is treated exactly the same * as an iclog write error even though we haven't started any IO yet. Hence in - * this case all we need to do is IOP_COMMITTED processing, followed by an - * IOP_UNPIN(aborted) call. + * this case all we need to do is iop_committed processing, followed by an + * iop_unpin(aborted) call. * * The AIL cursor is used to optimise the insert process. If commit_lsn is not * at the end of the AIL, the insert cursor avoids the need to walk @@ -1290,7 +779,7 @@ xfs_trans_committed_bulk( if (aborted) lip->li_flags |= XFS_LI_ABORTED; - item_lsn = IOP_COMMITTED(lip, commit_lsn); + item_lsn = lip->li_ops->iop_committed(lip, commit_lsn); /* item_lsn of -1 means the item needs no further processing */ if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) @@ -1302,7 +791,7 @@ xfs_trans_committed_bulk( */ if (aborted) { ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount)); - IOP_UNPIN(lip, 1); + lip->li_ops->iop_unpin(lip, 1); continue; } @@ -1320,7 +809,7 @@ xfs_trans_committed_bulk( xfs_trans_ail_update(ailp, lip, item_lsn); else spin_unlock(&ailp->xa_lock); - IOP_UNPIN(lip, 0); + lip->li_ops->iop_unpin(lip, 0); continue; } @@ -1338,7 +827,7 @@ xfs_trans_committed_bulk( xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn); spin_lock(&ailp->xa_lock); - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); } @@ -1398,12 +887,7 @@ xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - error = xfs_log_commit_cil(mp, tp, &commit_lsn, flags); - if (error == ENOMEM) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - error = XFS_ERROR(EIO); - goto out_unreserve; - } + xfs_log_commit_cil(mp, tp, &commit_lsn, flags); current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); xfs_trans_free(tp); @@ -1413,10 +897,7 @@ xfs_trans_commit( * log out now and wait for it. */ if (sync) { - if (!error) { - error = _xfs_log_force_lsn(mp, commit_lsn, - XFS_LOG_SYNC, NULL); - } + error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL); XFS_STATS_INC(xs_trans_sync); } else { XFS_STATS_INC(xs_trans_async); @@ -1518,7 +999,7 @@ xfs_trans_roll( struct xfs_inode *dp) { struct xfs_trans *trans; - unsigned int logres, count; + struct xfs_trans_res tres; int error; /* @@ -1530,8 +1011,8 @@ xfs_trans_roll( /* * Copy the critical parameters from one trans to the next. */ - logres = trans->t_log_res; - count = trans->t_log_count; + tres.tr_logres = trans->t_log_res; + tres.tr_logcount = trans->t_log_count; *tpp = xfs_trans_dup(trans); /* @@ -1562,8 +1043,8 @@ xfs_trans_roll( * across this call, or that anything that is locked be logged in * the prior and the next transactions. */ - error = xfs_trans_reserve(trans, 0, logres, 0, - XFS_TRANS_PERM_LOG_RES, count); + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + error = xfs_trans_reserve(trans, &tres, 0, 0); /* * Ensure that the inode is in the new transaction and locked. */ diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index c6c0601abd7..b5bc1ab3c4d 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -18,288 +18,7 @@ #ifndef __XFS_TRANS_H__ #define __XFS_TRANS_H__ -struct xfs_log_item; - -/* - * This is the structure written in the log at the head of - * every transaction. It identifies the type and id of the - * transaction, and contains the number of items logged by - * the transaction so we know how many to expect during recovery. - * - * Do not change the below structure without redoing the code in - * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans(). - */ -typedef struct xfs_trans_header { - uint th_magic; /* magic number */ - uint th_type; /* transaction type */ - __int32_t th_tid; /* transaction id (unused) */ - uint th_num_items; /* num items logged by trans */ -} xfs_trans_header_t; - -#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ - -/* - * Log item types. - */ -#define XFS_LI_EFI 0x1236 -#define XFS_LI_EFD 0x1237 -#define XFS_LI_IUNLINK 0x1238 -#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */ -#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */ -#define XFS_LI_DQUOT 0x123d -#define XFS_LI_QUOTAOFF 0x123e - -#define XFS_LI_TYPE_DESC \ - { XFS_LI_EFI, "XFS_LI_EFI" }, \ - { XFS_LI_EFD, "XFS_LI_EFD" }, \ - { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \ - { XFS_LI_INODE, "XFS_LI_INODE" }, \ - { XFS_LI_BUF, "XFS_LI_BUF" }, \ - { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \ - { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" } - -/* - * Transaction types. Used to distinguish types of buffers. - */ -#define XFS_TRANS_SETATTR_NOT_SIZE 1 -#define XFS_TRANS_SETATTR_SIZE 2 -#define XFS_TRANS_INACTIVE 3 -#define XFS_TRANS_CREATE 4 -#define XFS_TRANS_CREATE_TRUNC 5 -#define XFS_TRANS_TRUNCATE_FILE 6 -#define XFS_TRANS_REMOVE 7 -#define XFS_TRANS_LINK 8 -#define XFS_TRANS_RENAME 9 -#define XFS_TRANS_MKDIR 10 -#define XFS_TRANS_RMDIR 11 -#define XFS_TRANS_SYMLINK 12 -#define XFS_TRANS_SET_DMATTRS 13 -#define XFS_TRANS_GROWFS 14 -#define XFS_TRANS_STRAT_WRITE 15 -#define XFS_TRANS_DIOSTRAT 16 -/* 17 was XFS_TRANS_WRITE_SYNC */ -#define XFS_TRANS_WRITEID 18 -#define XFS_TRANS_ADDAFORK 19 -#define XFS_TRANS_ATTRINVAL 20 -#define XFS_TRANS_ATRUNCATE 21 -#define XFS_TRANS_ATTR_SET 22 -#define XFS_TRANS_ATTR_RM 23 -#define XFS_TRANS_ATTR_FLAG 24 -#define XFS_TRANS_CLEAR_AGI_BUCKET 25 -#define XFS_TRANS_QM_SBCHANGE 26 -/* - * Dummy entries since we use the transaction type to index into the - * trans_type[] in xlog_recover_print_trans_head() - */ -#define XFS_TRANS_DUMMY1 27 -#define XFS_TRANS_DUMMY2 28 -#define XFS_TRANS_QM_QUOTAOFF 29 -#define XFS_TRANS_QM_DQALLOC 30 -#define XFS_TRANS_QM_SETQLIM 31 -#define XFS_TRANS_QM_DQCLUSTER 32 -#define XFS_TRANS_QM_QINOCREATE 33 -#define XFS_TRANS_QM_QUOTAOFF_END 34 -#define XFS_TRANS_SB_UNIT 35 -#define XFS_TRANS_FSYNC_TS 36 -#define XFS_TRANS_GROWFSRT_ALLOC 37 -#define XFS_TRANS_GROWFSRT_ZERO 38 -#define XFS_TRANS_GROWFSRT_FREE 39 -#define XFS_TRANS_SWAPEXT 40 -#define XFS_TRANS_SB_COUNT 41 -#define XFS_TRANS_CHECKPOINT 42 -#define XFS_TRANS_TYPE_MAX 42 -/* new transaction types need to be reflected in xfs_logprint(8) */ - -#define XFS_TRANS_TYPES \ - { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \ - { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ - { XFS_TRANS_INACTIVE, "INACTIVE" }, \ - { XFS_TRANS_CREATE, "CREATE" }, \ - { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ - { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ - { XFS_TRANS_REMOVE, "REMOVE" }, \ - { XFS_TRANS_LINK, "LINK" }, \ - { XFS_TRANS_RENAME, "RENAME" }, \ - { XFS_TRANS_MKDIR, "MKDIR" }, \ - { XFS_TRANS_RMDIR, "RMDIR" }, \ - { XFS_TRANS_SYMLINK, "SYMLINK" }, \ - { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \ - { XFS_TRANS_GROWFS, "GROWFS" }, \ - { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \ - { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \ - { XFS_TRANS_WRITEID, "WRITEID" }, \ - { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \ - { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \ - { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \ - { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \ - { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ - { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ - { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ - { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \ - { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ - { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ - { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ - { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ - { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ - { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ - { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \ - { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ - { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ - { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ - { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ - { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ - { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ - { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ - { XFS_TRANS_DUMMY1, "DUMMY1" }, \ - { XFS_TRANS_DUMMY2, "DUMMY2" }, \ - { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } - -/* - * This structure is used to track log items associated with - * a transaction. It points to the log item and keeps some - * flags to track the state of the log item. It also tracks - * the amount of space needed to log the item it describes - * once we get to commit processing (see xfs_trans_commit()). - */ -struct xfs_log_item_desc { - struct xfs_log_item *lid_item; - struct list_head lid_trans; - unsigned char lid_flags; -}; - -#define XFS_LID_DIRTY 0x1 - -#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */ -/* - * Values for t_flags. - */ -#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ -#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ -#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ -#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ -#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ -#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ -#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer - count in superblock */ - -/* - * Values for call flags parameter. - */ -#define XFS_TRANS_RELEASE_LOG_RES 0x4 -#define XFS_TRANS_ABORT 0x8 - -/* - * Field values for xfs_trans_mod_sb. - */ -#define XFS_TRANS_SB_ICOUNT 0x00000001 -#define XFS_TRANS_SB_IFREE 0x00000002 -#define XFS_TRANS_SB_FDBLOCKS 0x00000004 -#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008 -#define XFS_TRANS_SB_FREXTENTS 0x00000010 -#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020 -#define XFS_TRANS_SB_DBLOCKS 0x00000040 -#define XFS_TRANS_SB_AGCOUNT 0x00000080 -#define XFS_TRANS_SB_IMAXPCT 0x00000100 -#define XFS_TRANS_SB_REXTSIZE 0x00000200 -#define XFS_TRANS_SB_RBMBLOCKS 0x00000400 -#define XFS_TRANS_SB_RBLOCKS 0x00000800 -#define XFS_TRANS_SB_REXTENTS 0x00001000 -#define XFS_TRANS_SB_REXTSLOG 0x00002000 - - -/* - * Per-extent log reservation for the allocation btree changes - * involved in freeing or allocating an extent. - * 2 trees * (2 blocks/level * max depth - 1) * block size - */ -#define XFS_ALLOCFREE_LOG_RES(mp,nx) \ - ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1))) -#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ - ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) - -/* - * Per-directory log reservation for any directory change. - * dir blocks: (1 btree block per level + data block + free block) * dblock size - * bmap btree: (levels + 2) * max depth * block size - * v2 directory blocks can be fragmented below the dirblksize down to the fsb - * size, so account for that in the DAENTER macros. - */ -#define XFS_DIROP_LOG_RES(mp) \ - (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \ - (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1))) -#define XFS_DIROP_LOG_COUNT(mp) \ - (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ - XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) - - -#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write) -#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate) -#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename) -#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link) -#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove) -#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink) -#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create) -#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir) -#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree) -#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange) -#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata) -#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc) -#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero) -#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree) -#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) -/* - * Logging the inode timestamps on an fsync -- same as SWRITE - * as long as SWRITE logs the entire inode core - */ -#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) -#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) -#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork) -#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval) -#define XFS_ATTRSET_LOG_RES(mp, ext) \ - ((mp)->m_reservations.tr_attrset + \ - (ext * (mp)->m_sb.sb_sectsize) + \ - (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \ - (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))))) -#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) -#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi) - - -/* - * Various log count values. - */ -#define XFS_DEFAULT_LOG_COUNT 1 -#define XFS_DEFAULT_PERM_LOG_COUNT 2 -#define XFS_ITRUNCATE_LOG_COUNT 2 -#define XFS_INACTIVE_LOG_COUNT 2 -#define XFS_CREATE_LOG_COUNT 2 -#define XFS_MKDIR_LOG_COUNT 3 -#define XFS_SYMLINK_LOG_COUNT 3 -#define XFS_REMOVE_LOG_COUNT 2 -#define XFS_LINK_LOG_COUNT 2 -#define XFS_RENAME_LOG_COUNT 2 -#define XFS_WRITE_LOG_COUNT 2 -#define XFS_ADDAFORK_LOG_COUNT 2 -#define XFS_ATTRINVAL_LOG_COUNT 1 -#define XFS_ATTRSET_LOG_COUNT 3 -#define XFS_ATTRRM_LOG_COUNT 3 - -/* - * Here we centralize the specification of XFS meta-data buffer - * reference count values. This determine how hard the buffer - * cache tries to hold onto the buffer. - */ -#define XFS_AGF_REF 4 -#define XFS_AGI_REF 4 -#define XFS_AGFL_REF 3 -#define XFS_INO_BTREE_REF 3 -#define XFS_ALLOC_BTREE_REF 2 -#define XFS_BMAP_BTREE_REF 2 -#define XFS_DIR_BTREE_REF 2 -#define XFS_INO_REF 2 -#define XFS_ATTR_BTREE_REF 1 -#define XFS_DQUOT_REF 1 - -#ifdef __KERNEL__ +/* kernel only transaction subsystem defines */ struct xfs_buf; struct xfs_buftarg; @@ -311,6 +30,7 @@ struct xfs_log_iovec; struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; +struct xfs_trans_res; struct xfs_dquot_acct; struct xfs_busy_extent; @@ -343,8 +63,8 @@ typedef struct xfs_log_item { { XFS_LI_ABORTED, "ABORTED" } struct xfs_item_ops { - uint (*iop_size)(xfs_log_item_t *); - void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); + void (*iop_size)(xfs_log_item_t *, int *, int *); + void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *); void (*iop_pin)(xfs_log_item_t *); void (*iop_unpin)(xfs_log_item_t *, int remove); uint (*iop_push)(struct xfs_log_item *, struct list_head *); @@ -353,35 +73,23 @@ struct xfs_item_ops { void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); }; -#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) -#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) -#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) -#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove) -#define IOP_PUSH(ip, list) (*(ip)->li_ops->iop_push)(ip, list) -#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) -#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) -#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn) +void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, + int type, const struct xfs_item_ops *ops); /* - * Return values for the IOP_PUSH() routines. + * Return values for the iop_push() routines. */ #define XFS_ITEM_SUCCESS 0 #define XFS_ITEM_PINNED 1 #define XFS_ITEM_LOCKED 2 #define XFS_ITEM_FLUSHING 3 -/* - * This is the type of function which can be given to xfs_trans_callback() - * to be called upon the transaction's commit to disk. - */ -typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *); /* * This is the structure maintained for every active transaction. */ typedef struct xfs_trans { unsigned int t_magic; /* magic number */ - xfs_log_callback_t t_logcb; /* log callback struct */ unsigned int t_type; /* transaction type */ unsigned int t_log_res; /* amt of log space resvd */ unsigned int t_log_count; /* count for perm log res */ @@ -403,7 +111,7 @@ typedef struct xfs_trans { int64_t t_res_fdblocks_delta; /* on-disk only chg */ int64_t t_frextents_delta;/* superblock freextents chg*/ int64_t t_res_frextents_delta; /* on-disk only chg */ -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) int64_t t_ag_freeblks_delta; /* debugging counter */ int64_t t_ag_flist_delta; /* debugging counter */ int64_t t_ag_btree_delta; /* debugging counter */ @@ -417,7 +125,6 @@ typedef struct xfs_trans { int64_t t_rextents_delta;/* superblocks rextents chg */ int64_t t_rextslog_delta;/* superblocks rextslog chg */ struct list_head t_items; /* log item descriptors */ - xfs_trans_header_t t_header; /* header for in-log trans */ struct list_head t_busy; /* list of busy extents */ unsigned long t_pflags; /* saved process flags state */ } xfs_trans_t; @@ -431,7 +138,7 @@ typedef struct xfs_trans { #define xfs_trans_get_block_res(tp) ((tp)->t_blk_res) #define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) #define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d) #define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (int64_t)d) #define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (int64_t)d) @@ -447,7 +154,7 @@ typedef struct xfs_trans { xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t); xfs_trans_t *xfs_trans_dup(xfs_trans_t *); -int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, +int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *, uint, uint); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); @@ -501,6 +208,7 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); @@ -521,16 +229,17 @@ void xfs_trans_log_efd_extent(xfs_trans_t *, xfs_fsblock_t, xfs_extlen_t); int xfs_trans_commit(xfs_trans_t *, uint flags); +int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); void xfs_trans_cancel(xfs_trans_t *, int); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); +void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *, + enum xfs_blft); +void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, + struct xfs_buf *src_bp); + extern kmem_zone_t *xfs_trans_zone; extern kmem_zone_t *xfs_log_item_desc_zone; -#endif /* __KERNEL__ */ - -void xfs_trans_init(struct xfs_mount *); -int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); - #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 6011ee66133..cb0f3a84cc6 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -18,15 +18,16 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_trace.h" #include "xfs_error.h" +#include "xfs_log.h" #ifdef DEBUG /* @@ -55,40 +56,12 @@ xfs_ail_check( ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); -#ifdef XFS_TRANS_DEBUG - /* - * Walk the list checking lsn ordering, and that every entry has the - * XFS_LI_IN_AIL flag set. This is really expensive, so only do it - * when specifically debugging the transaction subsystem. - */ - prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); - list_for_each_entry(lip, &ailp->xa_ail, li_ail) { - if (&prev_lip->li_ail != &ailp->xa_ail) - ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); - ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); - prev_lip = lip; - } -#endif /* XFS_TRANS_DEBUG */ } #else /* !DEBUG */ #define xfs_ail_check(a,l) #endif /* DEBUG */ /* - * Return a pointer to the first item in the AIL. If the AIL is empty, then - * return NULL. - */ -xfs_log_item_t * -xfs_ail_min( - struct xfs_ail *ailp) -{ - if (list_empty(&ailp->xa_ail)) - return NULL; - - return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); -} - - /* * Return a pointer to the last item in the AIL. If the AIL is empty, then * return NULL. */ @@ -200,7 +173,6 @@ xfs_trans_ail_cursor_next( */ void xfs_trans_ail_cursor_done( - struct xfs_ail *ailp, struct xfs_ail_cursor *cur) { cur->item = NULL; @@ -395,7 +367,7 @@ xfsaild_push( * If the AIL is empty or our push has reached the end we are * done now. */ - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); goto out_done; } @@ -407,11 +379,11 @@ xfsaild_push( int lock_result; /* - * Note that IOP_PUSH may unlock and reacquire the AIL lock. We + * Note that iop_push may unlock and reacquire the AIL lock. We * rely on the AIL cursor implementation to be able to deal with * the dropped lock. */ - lock_result = IOP_PUSH(lip, &ailp->xa_buf_list); + lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list); switch (lock_result) { case XFS_ITEM_SUCCESS: XFS_STATS_INC(xs_push_ail_success); @@ -480,7 +452,7 @@ xfsaild_push( break; lsn = lip->li_lsn; } - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list)) @@ -686,11 +658,13 @@ xfs_trans_ail_update_bulk( if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0) continue; + trace_xfs_ail_move(lip, lip->li_lsn, lsn); xfs_ail_delete(ailp, lip); if (mlip == lip) mlip_changed = 1; } else { lip->li_flags |= XFS_LI_IN_AIL; + trace_xfs_ail_insert(lip, 0, lsn); } lip->li_lsn = lsn; list_add(&lip->li_ail, &tmp); @@ -759,6 +733,7 @@ xfs_trans_ail_delete_bulk( return; } + trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); lip->li_flags &= ~XFS_LI_IN_AIL; lip->li_lsn = 0; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 3edf5dbee00..b8eef0549f3 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -17,17 +17,15 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_error.h" @@ -277,6 +275,10 @@ xfs_trans_read_buf_map( XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; return error; } #ifdef DEBUG @@ -316,7 +318,18 @@ xfs_trans_read_buf_map( ASSERT(bp->b_iodone == NULL); XFS_BUF_READ(bp); bp->b_ops = ops; - xfsbdstrat(tp->t_mountp, bp); + + /* + * XXX(hch): clean up the error handling here to be less + * of a mess.. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + trace_xfs_bdstrat_shut(bp, _RET_IP_); + xfs_bioerror_relse(bp); + } else { + xfs_buf_iorequest(bp); + } + error = xfs_buf_iowait(bp); if (error) { xfs_buf_ioerror_alert(bp, __func__); @@ -329,6 +342,9 @@ xfs_trans_read_buf_map( if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; return error; } } @@ -366,6 +382,10 @@ xfs_trans_read_buf_map( if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; return error; } #ifdef DEBUG @@ -397,7 +417,6 @@ shutdown_abort: return XFS_ERROR(EIO); } - /* * Release the buffer bp which was previously acquired with one of the * xfs_trans_... buffer allocation routines if the buffer has not @@ -506,7 +525,7 @@ xfs_trans_brelse(xfs_trans_t *tp, /* * Mark the buffer as not needing to be unlocked when the buf item's - * IOP_UNLOCK() routine is called. The buffer must already be locked + * iop_unlock() routine is called. The buffer must already be locked * and associated with the given transaction. */ /* ARGSUSED */ @@ -603,8 +622,14 @@ xfs_trans_log_buf(xfs_trans_t *tp, tp->t_flags |= XFS_TRANS_DIRTY; bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; - bip->bli_flags |= XFS_BLI_LOGGED; - xfs_buf_item_log(bip, first, last); + + /* + * If we have an ordered buffer we are not logging any dirty range but + * it still needs to be marked dirty and that it has been logged. + */ + bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; + if (!(bip->bli_flags & XFS_BLI_ORDERED)) + xfs_buf_item_log(bip, first, last); } @@ -659,6 +684,7 @@ xfs_trans_binval( ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK)); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); @@ -671,6 +697,7 @@ xfs_trans_binval( bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; bip->__bli_format.blf_flags |= XFS_BLF_CANCEL; + bip->__bli_format.blf_flags &= ~XFS_BLFT_MASK; for (i = 0; i < bip->bli_format_count; i++) { memset(bip->bli_formats[i].blf_data_map, 0, (bip->bli_formats[i].blf_map_size * sizeof(uint))); @@ -702,12 +729,13 @@ xfs_trans_inode_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_BUF; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } /* * This call is used to indicate that the buffer is going to * be staled and was an inode buffer. This means it gets - * special processing during unpin - where any inodes + * special processing during unpin - where any inodes * associated with the buffer should be removed from ail. * There is also special processing during recovery, * any replay of the inodes in the buffer needs to be @@ -726,6 +754,7 @@ xfs_trans_stale_inode_buf( bip->bli_flags |= XFS_BLI_STALE_INODE; bip->bli_item.li_cb = xfs_buf_iodone; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } /* @@ -749,8 +778,66 @@ xfs_trans_inode_alloc_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } +/* + * Mark the buffer as ordered for this transaction. This means + * that the contents of the buffer are not recorded in the transaction + * but it is tracked in the AIL as though it was. This allows us + * to record logical changes in transactions rather than the physical + * changes we make to the buffer without changing writeback ordering + * constraints of metadata buffers. + */ +void +xfs_trans_ordered_buf( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_ORDERED; + trace_xfs_buf_item_ordered(bip); +} + +/* + * Set the type of the buffer for log recovery so that it can correctly identify + * and hence attach the correct buffer ops to the buffer after replay. + */ +void +xfs_trans_buf_set_type( + struct xfs_trans *tp, + struct xfs_buf *bp, + enum xfs_blft type) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!tp) + return; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + xfs_blft_to_flags(&bip->__bli_format, type); +} + +void +xfs_trans_buf_copy_type( + struct xfs_buf *dst_bp, + struct xfs_buf *src_bp) +{ + struct xfs_buf_log_item *sbip = src_bp->b_fspriv; + struct xfs_buf_log_item *dbip = dst_bp->b_fspriv; + enum xfs_blft type; + + type = xfs_blft_from_flags(&sbip->__bli_format); + xfs_blft_to_flags(&dbip->__bli_format, type); +} /* * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of @@ -769,14 +856,28 @@ xfs_trans_dquot_buf( xfs_buf_t *bp, uint type) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_fspriv; - ASSERT(bp->b_transp == tp); - ASSERT(bip != NULL); ASSERT(type == XFS_BLF_UDQUOT_BUF || type == XFS_BLF_PDQUOT_BUF || type == XFS_BLF_GDQUOT_BUF); - ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->__bli_format.blf_flags |= type; + + switch (type) { + case XFS_BLF_UDQUOT_BUF: + type = XFS_BLFT_UDQUOT_BUF; + break; + case XFS_BLF_PDQUOT_BUF: + type = XFS_BLFT_PDQUOT_BUF; + break; + case XFS_BLF_GDQUOT_BUF: + type = XFS_BLFT_GDQUOT_BUF; + break; + default: + type = XFS_BLFT_UNKNOWN_BUF; + break; + } + + xfs_trans_buf_set_type(tp, bp, type); } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 0c7fa54f309..41172861e85 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -17,22 +17,18 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_alloc.h" -#include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" #include "xfs_inode.h" -#include "xfs_itable.h" -#include "xfs_bmap.h" -#include "xfs_rtalloc.h" #include "xfs_error.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" +#include "xfs_quota.h" #include "xfs_qm.h" STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *); @@ -103,8 +99,6 @@ xfs_trans_dup_dqinfo( return; xfs_trans_alloc_dqinfo(ntp); - oqa = otp->t_dqinfo->dqa_usrdquots; - nqa = ntp->t_dqinfo->dqa_usrdquots; /* * Because the quota blk reservation is carried forward, @@ -113,7 +107,9 @@ xfs_trans_dup_dqinfo( if(otp->t_flags & XFS_TRANS_DQ_DIRTY) ntp->t_flags |= XFS_TRANS_DQ_DIRTY; - for (j = 0; j < 2; j++) { + for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { + oqa = otp->t_dqinfo->dqs[j]; + nqa = ntp->t_dqinfo->dqs[j]; for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { if (oqa[i].qt_dquot == NULL) break; @@ -138,8 +134,6 @@ xfs_trans_dup_dqinfo( oq->qt_ino_res = oq->qt_ino_res_used; } - oqa = otp->t_dqinfo->dqa_grpdquots; - nqa = ntp->t_dqinfo->dqa_grpdquots; } } @@ -157,8 +151,7 @@ xfs_trans_mod_dquot_byino( if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp) || - ip->i_ino == mp->m_sb.sb_uquotino || - ip->i_ino == mp->m_sb.sb_gquotino) + xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) return; if (tp->t_dqinfo == NULL) @@ -166,20 +159,28 @@ xfs_trans_mod_dquot_byino( if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot) (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta); - if (XFS_IS_OQUOTA_ON(mp) && ip->i_gdquot) + if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot) (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta); + if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot) + (void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta); } -STATIC xfs_dqtrx_t * +STATIC struct xfs_dqtrx * xfs_trans_get_dqtrx( - xfs_trans_t *tp, - xfs_dquot_t *dqp) + struct xfs_trans *tp, + struct xfs_dquot *dqp) { - int i; - xfs_dqtrx_t *qa; - - qa = XFS_QM_ISUDQ(dqp) ? - tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots; + int i; + struct xfs_dqtrx *qa; + + if (XFS_QM_ISUDQ(dqp)) + qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR]; + else if (XFS_QM_ISGDQ(dqp)) + qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP]; + else if (XFS_QM_ISPDQ(dqp)) + qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ]; + else + return NULL; for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { if (qa[i].qt_dquot == NULL || @@ -292,11 +293,10 @@ xfs_trans_mod_dquot( /* - * Given an array of dqtrx structures, lock all the dquots associated - * and join them to the transaction, provided they have been modified. - * We know that the highest number of dquots (of one type - usr OR grp), - * involved in a transaction is 2 and that both usr and grp combined - 3. - * So, we don't attempt to make this very generic. + * Given an array of dqtrx structures, lock all the dquots associated and join + * them to the transaction, provided they have been modified. We know that the + * highest number of dquots of one type - usr, grp and prj - involved in a + * transaction is 3 so we don't need to make this very generic. */ STATIC void xfs_trans_dqlockedjoin( @@ -326,12 +326,12 @@ xfs_trans_dqlockedjoin( */ void xfs_trans_apply_dquot_deltas( - xfs_trans_t *tp) + struct xfs_trans *tp) { int i, j; - xfs_dquot_t *dqp; - xfs_dqtrx_t *qtrx, *qa; - xfs_disk_dquot_t *d; + struct xfs_dquot *dqp; + struct xfs_dqtrx *qtrx, *qa; + struct xfs_disk_dquot *d; long totalbdelta; long totalrtbdelta; @@ -339,12 +339,10 @@ xfs_trans_apply_dquot_deltas( return; ASSERT(tp->t_dqinfo); - qa = tp->t_dqinfo->dqa_usrdquots; - for (j = 0; j < 2; j++) { - if (qa[0].qt_dquot == NULL) { - qa = tp->t_dqinfo->dqa_grpdquots; + for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { + qa = tp->t_dqinfo->dqs[j]; + if (qa[0].qt_dquot == NULL) continue; - } /* * Lock all of the dquots and join them to the transaction. @@ -412,7 +410,7 @@ xfs_trans_apply_dquot_deltas( * Start/reset the timer(s) if needed. */ if (d->d_id) { - xfs_qm_adjust_dqlimits(tp->t_mountp, d); + xfs_qm_adjust_dqlimits(tp->t_mountp, dqp); xfs_qm_adjust_dqtimers(tp->t_mountp, d); } @@ -495,10 +493,6 @@ xfs_trans_apply_dquot_deltas( ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount)); } - /* - * Do the group quotas next - */ - qa = tp->t_dqinfo->dqa_grpdquots; } } @@ -516,14 +510,14 @@ xfs_trans_unreserve_and_mod_dquots( int i, j; xfs_dquot_t *dqp; xfs_dqtrx_t *qtrx, *qa; - boolean_t locked; + bool locked; if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) return; - qa = tp->t_dqinfo->dqa_usrdquots; + for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { + qa = tp->t_dqinfo->dqs[j]; - for (j = 0; j < 2; j++) { for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { qtrx = &qa[i]; /* @@ -537,17 +531,17 @@ xfs_trans_unreserve_and_mod_dquots( * about the number of blocks used field, or deltas. * Also we don't bother to zero the fields. */ - locked = B_FALSE; + locked = false; if (qtrx->qt_blk_res) { xfs_dqlock(dqp); - locked = B_TRUE; + locked = true; dqp->q_res_bcount -= (xfs_qcnt_t)qtrx->qt_blk_res; } if (qtrx->qt_ino_res) { if (!locked) { xfs_dqlock(dqp); - locked = B_TRUE; + locked = true; } dqp->q_res_icount -= (xfs_qcnt_t)qtrx->qt_ino_res; @@ -556,7 +550,7 @@ xfs_trans_unreserve_and_mod_dquots( if (qtrx->qt_rtblk_res) { if (!locked) { xfs_dqlock(dqp); - locked = B_TRUE; + locked = true; } dqp->q_res_rtbcount -= (xfs_qcnt_t)qtrx->qt_rtblk_res; @@ -565,7 +559,6 @@ xfs_trans_unreserve_and_mod_dquots( xfs_dqunlock(dqp); } - qa = tp->t_dqinfo->dqa_grpdquots; } } @@ -640,8 +633,8 @@ xfs_trans_dqresv( if ((flags & XFS_QMOPT_FORCE_RES) == 0 && dqp->q_core.d_id && ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) || - (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) && - (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) { + (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) || + (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) { if (nblks > 0) { /* * dquot is locked already. See if we'd go over the @@ -736,8 +729,8 @@ error_return: /* * Given dquot(s), make disk block and/or inode reservations against them. - * The fact that this does the reservation against both the usr and - * grp/prj quotas is important, because this follows a both-or-nothing + * The fact that this does the reservation against user, group and + * project quotas is important, because this follows a all-or-nothing * approach. * * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown. @@ -748,15 +741,16 @@ error_return: */ int xfs_trans_reserve_quota_bydquots( - xfs_trans_t *tp, - xfs_mount_t *mp, - xfs_dquot_t *udqp, - xfs_dquot_t *gdqp, - long nblks, - long ninos, - uint flags) + struct xfs_trans *tp, + struct xfs_mount *mp, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, + long nblks, + long ninos, + uint flags) { - int resvd = 0, error; + int error; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; @@ -771,28 +765,34 @@ xfs_trans_reserve_quota_bydquots( (flags & ~XFS_QMOPT_ENOSPC)); if (error) return error; - resvd = 1; } if (gdqp) { error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags); - if (error) { - /* - * can't do it, so backout previous reservation - */ - if (resvd) { - flags |= XFS_QMOPT_FORCE_RES; - xfs_trans_dqresv(tp, mp, udqp, - -nblks, -ninos, flags); - } - return error; - } + if (error) + goto unwind_usr; + } + + if (pdqp) { + error = xfs_trans_dqresv(tp, mp, pdqp, nblks, ninos, flags); + if (error) + goto unwind_grp; } /* * Didn't change anything critical, so, no need to log */ return 0; + +unwind_grp: + flags |= XFS_QMOPT_FORCE_RES; + if (gdqp) + xfs_trans_dqresv(tp, mp, gdqp, -nblks, -ninos, flags); +unwind_usr: + flags |= XFS_QMOPT_FORCE_RES; + if (udqp) + xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags); + return error; } @@ -816,8 +816,7 @@ xfs_trans_reserve_quota_nblks( if (XFS_IS_PQUOTA_ON(mp)) flags |= XFS_QMOPT_ENOSPC; - ASSERT(ip->i_ino != mp->m_sb.sb_uquotino); - ASSERT(ip->i_ino != mp->m_sb.sb_gquotino); + ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == @@ -830,6 +829,7 @@ xfs_trans_reserve_quota_nblks( */ return xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot, ip->i_gdquot, + ip->i_pdquot, nblks, ninos, flags); } diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 8d71b16ecca..47978ba89da 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -17,12 +17,13 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_extfree_item.h" diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index d2eee20d5f5..50c3f561428 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -17,30 +17,19 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_btree.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_inode_item.h" #include "xfs_trace.h" -#ifdef XFS_TRANS_DEBUG -STATIC void -xfs_trans_inode_broot_debug( - xfs_inode_t *ip); -#else -#define xfs_trans_inode_broot_debug(ip) -#endif - /* * Add a locked inode to the transaction. * @@ -67,8 +56,6 @@ xfs_trans_ijoin( * Get a log_item_desc to point at the new item. */ xfs_trans_add_item(tp, &iip->ili_item); - - xfs_trans_inode_broot_debug(ip); } /* @@ -122,6 +109,19 @@ xfs_trans_log_inode( ASSERT(ip->i_itemp != NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + /* + * First time we log the inode in a transaction, bump the inode change + * counter if it is configured for this to occur. We don't use + * inode_inc_version() because there is no need for extra locking around + * i_version as we already hold the inode locked exclusively for + * metadata modification. + */ + if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) && + IS_I_VERSION(VFS_I(ip))) { + ip->i_d.di_changecount = ++VFS_I(ip)->i_version; + flags |= XFS_ILOG_CORE; + } + tp->t_flags |= XFS_TRANS_DIRTY; ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY; @@ -135,34 +135,3 @@ xfs_trans_log_inode( flags |= ip->i_itemp->ili_last_fields; ip->i_itemp->ili_fields |= flags; } - -#ifdef XFS_TRANS_DEBUG -/* - * Keep track of the state of the inode btree root to make sure we - * log it properly. - */ -STATIC void -xfs_trans_inode_broot_debug( - xfs_inode_t *ip) -{ - xfs_inode_log_item_t *iip; - - ASSERT(ip->i_itemp != NULL); - iip = ip->i_itemp; - if (iip->ili_root_size != 0) { - ASSERT(iip->ili_orig_root != NULL); - kmem_free(iip->ili_orig_root); - iip->ili_root_size = 0; - iip->ili_orig_root = NULL; - } - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - ASSERT((ip->i_df.if_broot != NULL) && - (ip->i_df.if_broot_bytes > 0)); - iip->ili_root_size = ip->i_df.if_broot_bytes; - iip->ili_orig_root = - (char*)kmem_alloc(iip->ili_root_size, KM_SLEEP); - memcpy(iip->ili_orig_root, (char*)(ip->i_df.if_broot), - iip->ili_root_size); - } -} -#endif diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 53b7c9b0f8f..bd1281862ad 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -25,6 +25,8 @@ struct xfs_trans; struct xfs_ail; struct xfs_log_vec; + +void xfs_trans_init(struct xfs_mount *); void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, @@ -83,6 +85,18 @@ void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, xfs_lsn_t lsn) __releases(ailp->xa_lock); +/* + * Return a pointer to the first item in the AIL. If the AIL is empty, then + * return NULL. + */ +static inline struct xfs_log_item * +xfs_ail_min( + struct xfs_ail *ailp) +{ + return list_first_entry_or_null(&ailp->xa_ail, struct xfs_log_item, + li_ail); +} + static inline void xfs_trans_ail_update( struct xfs_ail *ailp, @@ -119,8 +133,7 @@ struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp, xfs_lsn_t lsn); struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp, struct xfs_ail_cursor *cur); -void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, - struct xfs_ail_cursor *cur); +void xfs_trans_ail_cursor_done(struct xfs_ail_cursor *cur); #if BITS_PER_LONG != 64 static inline void diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c new file mode 100644 index 00000000000..f2bda7c76b8 --- /dev/null +++ b/fs/xfs/xfs_trans_resv.c @@ -0,0 +1,894 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_qm.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" + +/* + * A buffer has a format structure overhead in the log in addition + * to the data, so we need to take this into account when reserving + * space in a transaction for a buffer. Round the space required up + * to a multiple of 128 bytes so that we don't change the historical + * reservation that has been used for this overhead. + */ +STATIC uint +xfs_buf_log_overhead(void) +{ + return round_up(sizeof(struct xlog_op_header) + + sizeof(struct xfs_buf_log_format), 128); +} + +/* + * Calculate out transaction log reservation per item in bytes. + * + * The nbufs argument is used to indicate the number of items that + * will be changed in a transaction. size is used to tell how many + * bytes should be reserved per item. + */ +STATIC uint +xfs_calc_buf_res( + uint nbufs, + uint size) +{ + return nbufs * (size + xfs_buf_log_overhead()); +} + +/* + * Logging inodes is really tricksy. They are logged in memory format, + * which means that what we write into the log doesn't directly translate into + * the amount of space they use on disk. + * + * Case in point - btree format forks in memory format use more space than the + * on-disk format. In memory, the buffer contains a normal btree block header so + * the btree code can treat it as though it is just another generic buffer. + * However, when we write it to the inode fork, we don't write all of this + * header as it isn't needed. e.g. the root is only ever in the inode, so + * there's no need for sibling pointers which would waste 16 bytes of space. + * + * Hence when we have an inode with a maximally sized btree format fork, then + * amount of information we actually log is greater than the size of the inode + * on disk. Hence we need an inode reservation function that calculates all this + * correctly. So, we log: + * + * - 4 log op headers for object + * - for the ilf, the inode core and 2 forks + * - inode log format object + * - the inode core + * - two inode forks containing bmap btree root blocks. + * - the btree data contained by both forks will fit into the inode size, + * hence when combined with the inode core above, we have a total of the + * actual inode size. + * - the BMBT headers need to be accounted separately, as they are + * additional to the records and pointers that fit inside the inode + * forks. + */ +STATIC uint +xfs_calc_inode_res( + struct xfs_mount *mp, + uint ninodes) +{ + return ninodes * + (4 * sizeof(struct xlog_op_header) + + sizeof(struct xfs_inode_log_format) + + mp->m_sb.sb_inodesize + + 2 * XFS_BMBT_BLOCK_LEN(mp)); +} + +/* + * The free inode btree is a conditional feature and the log reservation + * requirements differ slightly from that of the traditional inode allocation + * btree. The finobt tracks records for inode chunks with at least one free + * inode. A record can be removed from the tree for an inode allocation + * or free and thus the finobt reservation is unconditional across: + * + * - inode allocation + * - inode free + * - inode chunk allocation + * + * The 'modify' param indicates to include the record modification scenario. The + * 'alloc' param indicates to include the reservation for free space btree + * modifications on behalf of finobt modifications. This is required only for + * transactions that do not already account for free space btree modifications. + * + * the free inode btree: max depth * block size + * the allocation btrees: 2 trees * (max depth - 1) * block size + * the free inode btree entry: block size + */ +STATIC uint +xfs_calc_finobt_res( + struct xfs_mount *mp, + int alloc, + int modify) +{ + uint res; + + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + return 0; + + res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)); + if (alloc) + res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); + if (modify) + res += (uint)XFS_FSB_TO_B(mp, 1); + + return res; +} + +/* + * Various log reservation values. + * + * These are based on the size of the file system block because that is what + * most transactions manipulate. Each adds in an additional 128 bytes per + * item logged to try to account for the overhead of the transaction mechanism. + * + * Note: Most of the reservations underestimate the number of allocation + * groups into which they could free extents in the xfs_bmap_finish() call. + * This is because the number in the worst case is quite high and quite + * unusual. In order to fix this we need to change xfs_bmap_finish() to free + * extents in only a single AG at a time. This will require changes to the + * EFI code as well, however, so that the EFI for the extents not freed is + * logged again in each transaction. See SGI PV #261917. + * + * Reservation functions here avoid a huge stack in xfs_trans_init due to + * register overflow from temporaries in the calculations. + */ + + +/* + * In a write transaction we can allocate a maximum of 2 + * extents. This gives: + * the inode getting the new extents: inode size + * the inode's bmap btree: max depth * block size + * the agfs of the ags from which the extents are allocated: 2 * sector + * the superblock free block counter: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + * And the bmap_finish transaction can free bmap blocks in a join: + * the agfs of the ags containing the blocks: 2 * sector size + * the agfls of the ags containing the blocks: 2 * sector size + * the super block free block counter: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_write_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * In truncating a file we free up to two extents at once. We can modify: + * the inode being truncated: inode size + * the inode's bmap btree: (max depth + 1) * block size + * And the bmap_finish transaction can free the blocks and bmap blocks: + * the agf for each of the ags: 4 * sector size + * the agfl for each of the ags: 4 * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming 4 extents: + * 4 exts * 2 trees * (2 * max depth - 1) * block size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ +STATIC uint +xfs_calc_itruncate_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(5, 0) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(2 + mp->m_ialloc_blks + + mp->m_in_maxlevels, 0))); +} + +/* + * In renaming a files we can modify: + * the four inodes involved: 4 * inode size + * the two directory btrees: 2 * (max depth + v2) * dir block size + * the two directory bmap btrees: 2 * max depth * block size + * And the bmap_finish transaction can free dir and bmap blocks (two sets + * of bmap blocks) giving: + * the agf for the ags in which the blocks live: 3 * sector size + * the agfl for the ags in which the blocks live: 3 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_rename_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((xfs_calc_inode_res(mp, 4) + + xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * For removing an inode from unlinked list at first, we can modify: + * the agi hash list and counters: sector size + * the on disk inode before ours in the agi hash list: inode cluster size + */ +STATIC uint +xfs_calc_iunlink_remove_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size); +} + +/* + * For creating a link to an inode: + * the parent directory inode: inode size + * the linked inode: inode size + * the directory btree could split: (max depth + v2) * dir block size + * the directory bmap btree could join or split: (max depth + v2) * blocksize + * And the bmap_finish transaction can free some bmap blocks giving: + * the agf for the ag in which the blocks live: sector size + * the agfl for the ag in which the blocks live: sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_link_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_iunlink_remove_reservation(mp) + + MAX((xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * For adding an inode to unlinked list we can modify: + * the agi hash list: sector size + * the unlinked inode: inode size + */ +STATIC uint +xfs_calc_iunlink_add_reservation(xfs_mount_t *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_inode_res(mp, 1); +} + +/* + * For removing a directory entry we can modify: + * the parent directory inode: inode size + * the removed inode: inode size + * the directory btree could join: (max depth + v2) * dir block size + * the directory bmap btree could join or split: (max depth + v2) * blocksize + * And the bmap_finish transaction can free the dir and bmap blocks giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_remove_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_iunlink_add_reservation(mp) + + MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * For create, break it in to the two cases that the transaction + * covers. We start with the modify case - allocation done by modification + * of the state of existing inodes - and the allocation case. + */ + +/* + * For create we can modify: + * the parent directory inode: inode size + * the new inode: inode size + * the inode btree entry: block size + * the superblock for the nlink flag: sector size + * the directory btree: (max depth + v2) * dir block size + * the directory inode's bmap btree: (max depth + v2) * block size + * the finobt (record modification and allocation btrees) + */ +STATIC uint +xfs_calc_create_resv_modify( + struct xfs_mount *mp) +{ + return xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + (uint)XFS_FSB_TO_B(mp, 1) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) + + xfs_calc_finobt_res(mp, 1, 1); +} + +/* + * For create we can allocate some inodes giving: + * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the superblock for the nlink flag: sector size + * the inode blocks allocated: mp->m_ialloc_blks * blocksize + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ +STATIC uint +xfs_calc_create_resv_alloc( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + mp->m_sb.sb_sectsize + + xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +STATIC uint +__xfs_calc_create_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX(xfs_calc_create_resv_alloc(mp), + xfs_calc_create_resv_modify(mp)); +} + +/* + * For icreate we can allocate some inodes giving: + * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the superblock for the nlink flag: sector size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + * the finobt (record insertion) + */ +STATIC uint +xfs_calc_icreate_resv_alloc( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + mp->m_sb.sb_sectsize + + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_finobt_res(mp, 0, 0); +} + +STATIC uint +xfs_calc_icreate_reservation(xfs_mount_t *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX(xfs_calc_icreate_resv_alloc(mp), + xfs_calc_create_resv_modify(mp)); +} + +STATIC uint +xfs_calc_create_reservation( + struct xfs_mount *mp) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return xfs_calc_icreate_reservation(mp); + return __xfs_calc_create_reservation(mp); + +} + +STATIC uint +xfs_calc_create_tmpfile_reservation( + struct xfs_mount *mp) +{ + uint res = XFS_DQUOT_LOGRES(mp); + + if (xfs_sb_version_hascrc(&mp->m_sb)) + res += xfs_calc_icreate_resv_alloc(mp); + else + res += xfs_calc_create_resv_alloc(mp); + + return res + xfs_calc_iunlink_add_reservation(mp); +} + +/* + * Making a new directory is the same as creating a new file. + */ +STATIC uint +xfs_calc_mkdir_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_create_reservation(mp); +} + + +/* + * Making a new symplink is the same as creating a new file, but + * with the added blocks for remote symlink data which can be up to 1kB in + * length (MAXPATHLEN). + */ +STATIC uint +xfs_calc_symlink_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_create_reservation(mp) + + xfs_calc_buf_res(1, MAXPATHLEN); +} + +/* + * In freeing an inode we can modify: + * the inode being freed: inode size + * the super block free inode counter: sector size + * the agi hash list and counters: sector size + * the inode btree entry: block size + * the on disk inode before ours in the agi hash list: inode cluster size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + * the finobt (record insertion, removal or modification) + */ +STATIC uint +xfs_calc_ifree_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_iunlink_remove_reservation(mp) + + xfs_calc_buf_res(1, 0) + + xfs_calc_buf_res(2 + mp->m_ialloc_blks + + mp->m_in_maxlevels, 0) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_finobt_res(mp, 0, 1); +} + +/* + * When only changing the inode we log the inode and possibly the superblock + * We also add a bit of slop for the transaction stuff. + */ +STATIC uint +xfs_calc_ichange_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); + +} + +/* + * Growing the data section of the filesystem. + * superblock + * agi and agf + * allocation btrees + */ +STATIC uint +xfs_calc_growdata_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +/* + * Growing the rt section of the filesystem. + * In the first set of transactions (ALLOC) we allocate space to the + * bitmap or summary files. + * superblock: sector size + * agf of the ag from which the extent is allocated: sector size + * bmap btree for bitmap/summary inode: max depth * blocksize + * bitmap/summary inode: inode size + * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize + */ +STATIC uint +xfs_calc_growrtalloc_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +/* + * Growing the rt section of the filesystem. + * In the second set of transactions (ZERO) we zero the new metadata blocks. + * one bitmap/summary block: blocksize + */ +STATIC uint +xfs_calc_growrtzero_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize); +} + +/* + * Growing the rt section of the filesystem. + * In the third set of transactions (FREE) we update metadata without + * allocating any new blocks. + * superblock: sector size + * bitmap inode: inode size + * summary inode: inode size + * one bitmap block: blocksize + * summary blocks: new summary size + */ +STATIC uint +xfs_calc_growrtfree_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) + + xfs_calc_buf_res(1, mp->m_rsumsize); +} + +/* + * Logging the inode modification timestamp on a synchronous write. + * inode + */ +STATIC uint +xfs_calc_swrite_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_inode_res(mp, 1); +} + +/* + * Logging the inode mode bits when writing a setuid/setgid file + * inode + */ +STATIC uint +xfs_calc_writeid_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_inode_res(mp, 1); +} + +/* + * Converting the inode from non-attributed to attributed. + * the inode being converted: inode size + * agf block and superblock (for block allocation) + * the new block (directory sized) + * bmap blocks for the new directory block + * allocation btrees + */ +STATIC uint +xfs_calc_addafork_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(1, mp->m_dir_geo->blksize) + + xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +/* + * Removing the attribute fork of a file + * the inode being truncated: inode size + * the inode's bmap btree: max depth * block size + * And the bmap_finish transaction can free the blocks and bmap blocks: + * the agf for each of the ags: 4 * sector size + * the agfl for each of the ags: 4 * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming 4 extents: + * 4 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_attrinval_reservation( + struct xfs_mount *mp) +{ + return MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * Setting an attribute at mount time. + * the inode getting the attribute + * the superblock for allocations + * the agfs extents are allocated from + * the attribute btree * max depth + * the inode allocation btree + * Since attribute transaction space is dependent on the size of the attribute, + * the calculation is done partially at mount time and partially at runtime(see + * below). + */ +STATIC uint +xfs_calc_attrsetm_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1)); +} + +/* + * Setting an attribute at runtime, transaction space unit per block. + * the superblock for allocations: sector size + * the inode bmap btree could join or split: max depth * block size + * Since the runtime attribute transaction space is dependent on the total + * blocks needed for the 1st bmap, here we calculate out the space unit for + * one block so that the caller could figure out the total space according + * to the attibute extent length in blocks by: + * ext * M_RES(mp)->tr_attrsetrt.tr_logres + */ +STATIC uint +xfs_calc_attrsetrt_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), + XFS_FSB_TO_B(mp, 1)); +} + +/* + * Removing an attribute. + * the inode: inode size + * the attribute btree could join: max depth * block size + * the inode bmap btree could join or split: max depth * block size + * And the bmap_finish transaction can free the attr blocks freed giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_attrrm_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, + XFS_FSB_TO_B(mp, 1)) + + (uint)XFS_FSB_TO_B(mp, + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), + (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * Clearing a bad agino number in an agi hash bucket. + */ +STATIC uint +xfs_calc_clear_agi_bucket_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * Clearing the quotaflags in the superblock. + * the super block for changing quota flags: sector size + */ +STATIC uint +xfs_calc_qm_sbchange_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * Adjusting quota limits. + * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) + */ +STATIC uint +xfs_calc_qm_setqlim_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot)); +} + +/* + * Allocating quota on disk if needed. + * the write transaction log space for quota file extent allocation + * the unit of quota allocation: one system block size + */ +STATIC uint +xfs_calc_qm_dqalloc_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_write_reservation(mp) + + xfs_calc_buf_res(1, + XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); +} + +/* + * Turning off quotas. + * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + * the superblock for the quota flags: sector size + */ +STATIC uint +xfs_calc_qm_quotaoff_reservation( + struct xfs_mount *mp) +{ + return sizeof(struct xfs_qoff_logitem) * 2 + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * End of turning off quotas. + * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + */ +STATIC uint +xfs_calc_qm_quotaoff_end_reservation( + struct xfs_mount *mp) +{ + return sizeof(struct xfs_qoff_logitem) * 2; +} + +/* + * Syncing the incore super block changes to disk. + * the super block to reflect the changes: sector size + */ +STATIC uint +xfs_calc_sb_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +void +xfs_trans_resv_calc( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + /* + * The following transactions are logged in physical format and + * require a permanent reservation on space. + */ + resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); + resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT; + resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_link.tr_logres = xfs_calc_link_reservation(mp); + resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT; + resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp); + resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT; + resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp); + resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT; + resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_create.tr_logres = xfs_calc_create_reservation(mp); + resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT; + resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_create_tmpfile.tr_logres = + xfs_calc_create_tmpfile_reservation(mp); + resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT; + resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp); + resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT; + resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp); + resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT; + resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_addafork.tr_logres = xfs_calc_addafork_reservation(mp); + resp->tr_addafork.tr_logcount = XFS_ADDAFORK_LOG_COUNT; + resp->tr_addafork.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_attrinval.tr_logres = xfs_calc_attrinval_reservation(mp); + resp->tr_attrinval.tr_logcount = XFS_ATTRINVAL_LOG_COUNT; + resp->tr_attrinval.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_attrsetm.tr_logres = xfs_calc_attrsetm_reservation(mp); + resp->tr_attrsetm.tr_logcount = XFS_ATTRSET_LOG_COUNT; + resp->tr_attrsetm.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_attrrm.tr_logres = xfs_calc_attrrm_reservation(mp); + resp->tr_attrrm.tr_logcount = XFS_ATTRRM_LOG_COUNT; + resp->tr_attrrm.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_growrtalloc.tr_logres = xfs_calc_growrtalloc_reservation(mp); + resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT; + resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + /* + * The following transactions are logged in logical format with + * a default log count. + */ + resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp); + resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT; + + resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp); + resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT; + + resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp); + resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; + + resp->tr_qm_equotaoff.tr_logres = + xfs_calc_qm_quotaoff_end_reservation(mp); + resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; + + resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp); + resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT; + + /* The following transaction are logged in logical format */ + resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp); + resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp); + resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp); + resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp); + resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp); + resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp); + resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp); + resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp); +} diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/xfs_trans_resv.h new file mode 100644 index 00000000000..1097d14cd58 --- /dev/null +++ b/fs/xfs/xfs_trans_resv.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TRANS_RESV_H__ +#define __XFS_TRANS_RESV_H__ + +struct xfs_mount; + +/* + * structure for maintaining pre-calculated transaction reservations. + */ +struct xfs_trans_res { + uint tr_logres; /* log space unit in bytes per log ticket */ + int tr_logcount; /* number of log operations per log ticket */ + int tr_logflags; /* log flags, currently only used for indicating + * a reservation request is permanent or not */ +}; + +struct xfs_trans_resv { + struct xfs_trans_res tr_write; /* extent alloc trans */ + struct xfs_trans_res tr_itruncate; /* truncate trans */ + struct xfs_trans_res tr_rename; /* rename trans */ + struct xfs_trans_res tr_link; /* link trans */ + struct xfs_trans_res tr_remove; /* unlink trans */ + struct xfs_trans_res tr_symlink; /* symlink trans */ + struct xfs_trans_res tr_create; /* create trans */ + struct xfs_trans_res tr_create_tmpfile; /* create O_TMPFILE trans */ + struct xfs_trans_res tr_mkdir; /* mkdir trans */ + struct xfs_trans_res tr_ifree; /* inode free trans */ + struct xfs_trans_res tr_ichange; /* inode update trans */ + struct xfs_trans_res tr_growdata; /* fs data section grow trans */ + struct xfs_trans_res tr_addafork; /* add inode attr fork trans */ + struct xfs_trans_res tr_writeid; /* write setuid/setgid file */ + struct xfs_trans_res tr_attrinval; /* attr fork buffer + * invalidation */ + struct xfs_trans_res tr_attrsetm; /* set/create an attribute at + * mount time */ + struct xfs_trans_res tr_attrsetrt; /* set/create an attribute at + * runtime */ + struct xfs_trans_res tr_attrrm; /* remove an attribute */ + struct xfs_trans_res tr_clearagi; /* clear agi unlinked bucket */ + struct xfs_trans_res tr_growrtalloc; /* grow realtime allocations */ + struct xfs_trans_res tr_growrtzero; /* grow realtime zeroing */ + struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */ + struct xfs_trans_res tr_qm_sbchange; /* change quota flags */ + struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */ + struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ + struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */ + struct xfs_trans_res tr_qm_equotaoff;/* end of turn quota off */ + struct xfs_trans_res tr_sb; /* modify superblock */ + struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */ +}; + +/* shorthand way of accessing reservation structure */ +#define M_RES(mp) (&(mp)->m_resv) + +/* + * Per-extent log reservation for the allocation btree changes + * involved in freeing or allocating an extent. + * 2 trees * (2 blocks/level * max depth - 1) * block size + */ +#define XFS_ALLOCFREE_LOG_RES(mp,nx) \ + ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1))) +#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ + ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) + +/* + * Per-directory log reservation for any directory change. + * dir blocks: (1 btree block per level + data block + free block) * dblock size + * bmap btree: (levels + 2) * max depth * block size + * v2 directory blocks can be fragmented below the dirblksize down to the fsb + * size, so account for that in the DAENTER macros. + */ +#define XFS_DIROP_LOG_RES(mp) \ + (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \ + (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1))) +#define XFS_DIROP_LOG_COUNT(mp) \ + (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ + XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) + +/* + * Various log count values. + */ +#define XFS_DEFAULT_LOG_COUNT 1 +#define XFS_DEFAULT_PERM_LOG_COUNT 2 +#define XFS_ITRUNCATE_LOG_COUNT 2 +#define XFS_INACTIVE_LOG_COUNT 2 +#define XFS_CREATE_LOG_COUNT 2 +#define XFS_CREATE_TMPFILE_LOG_COUNT 2 +#define XFS_MKDIR_LOG_COUNT 3 +#define XFS_SYMLINK_LOG_COUNT 3 +#define XFS_REMOVE_LOG_COUNT 2 +#define XFS_LINK_LOG_COUNT 2 +#define XFS_RENAME_LOG_COUNT 2 +#define XFS_WRITE_LOG_COUNT 2 +#define XFS_ADDAFORK_LOG_COUNT 2 +#define XFS_ATTRINVAL_LOG_COUNT 1 +#define XFS_ATTRSET_LOG_COUNT 3 +#define XFS_ATTRRM_LOG_COUNT 3 + +void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); + +#endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h index 7d2c920dfb9..bf9c4579334 100644 --- a/fs/xfs/xfs_trans_space.h +++ b/fs/xfs/xfs_trans_space.h @@ -28,7 +28,8 @@ (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ XFS_EXTENTADD_SPACE_RES(mp,w)) -#define XFS_DAENTER_1B(mp,w) ((w) == XFS_DATA_FORK ? (mp)->m_dirblkfsbs : 1) +#define XFS_DAENTER_1B(mp,w) \ + ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1) #define XFS_DAENTER_DBS(mp,w) \ (XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0)) #define XFS_DAENTER_BLOCKS(mp,w) \ @@ -47,13 +48,15 @@ #define XFS_DIRREMOVE_SPACE_RES(mp) \ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) #define XFS_IALLOC_SPACE_RES(mp) \ - (XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1) + ((mp)->m_ialloc_blks + \ + (xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \ + ((mp)->m_in_maxlevels - 1))) /* * Space reservation values for various transactions. */ #define XFS_ADDAFORK_SPACE_RES(mp) \ - ((mp)->m_dirblkfsbs + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)) + ((mp)->m_dir_geo->fsbcount + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)) #define XFS_ATTRRM_SPACE_RES(mp) \ XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK) /* This macro is not used - see inline code in xfs_attr_set */ @@ -82,5 +85,8 @@ (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) #define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) +#define XFS_IFREE_SPACE_RES(mp) \ + (xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0) + #endif /* __XFS_TRANS_SPACE_H__ */ diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 7a41874f4c2..65c6e6650b1 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -18,43 +18,7 @@ #ifndef __XFS_TYPES_H__ #define __XFS_TYPES_H__ -#ifdef __KERNEL__ - -/* - * Additional type declarations for XFS - */ -typedef signed char __int8_t; -typedef unsigned char __uint8_t; -typedef signed short int __int16_t; -typedef unsigned short int __uint16_t; -typedef signed int __int32_t; -typedef unsigned int __uint32_t; -typedef signed long long int __int64_t; -typedef unsigned long long int __uint64_t; - -typedef enum { B_FALSE,B_TRUE } boolean_t; -typedef __uint32_t prid_t; /* project ID */ -typedef __uint32_t inst_t; /* an instruction */ - -typedef __s64 xfs_off_t; /* <file offset> type */ -typedef unsigned long long xfs_ino_t; /* <inode> type */ -typedef __s64 xfs_daddr_t; /* <disk address> type */ -typedef char * xfs_caddr_t; /* <core address> type */ -typedef __u32 xfs_dev_t; -typedef __u32 xfs_nlink_t; - -/* __psint_t is the same size as a pointer */ -#if (BITS_PER_LONG == 32) -typedef __int32_t __psint_t; -typedef __uint32_t __psunsigned_t; -#elif (BITS_PER_LONG == 64) -typedef __int64_t __psint_t; -typedef __uint64_t __psunsigned_t; -#else -#error BITS_PER_LONG must be 32 or 64 -#endif - -#endif /* __KERNEL__ */ +typedef __uint32_t prid_t; /* project ID */ typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */ typedef __uint32_t xfs_agino_t; /* inode # within allocation grp */ @@ -147,6 +111,12 @@ typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ #define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG) /* + * Inode fork identifiers. + */ +#define XFS_DATA_FORK 0 +#define XFS_ATTR_FORK 1 + +/* * Min numbers of data/attr fork btree root pointers. */ #define MINDBTPTRS 3 @@ -164,12 +134,29 @@ typedef enum { typedef enum { XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi, - XFS_BTNUM_MAX + XFS_BTNUM_FINOi, XFS_BTNUM_MAX } xfs_btnum_t; struct xfs_name { const unsigned char *name; int len; + int type; }; +/* + * uid_t and gid_t are hard-coded to 32 bits in the inode. + * Hence, an 'id' in a dquot is 32 bits.. + */ +typedef __uint32_t xfs_dqid_t; + +/* + * Constants for bit manipulations. + */ +#define XFS_NBBYLOG 3 /* log2(NBBY) */ +#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */ +#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG) +#define XFS_NBWORD (1 << XFS_NBWORDLOG) +#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1) + + #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c deleted file mode 100644 index 0025c78ac03..00000000000 --- a/fs/xfs/xfs_utils.c +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_bmap.h" -#include "xfs_error.h" -#include "xfs_quota.h" -#include "xfs_itable.h" -#include "xfs_utils.h" - - -/* - * Allocates a new inode from disk and return a pointer to the - * incore copy. This routine will internally commit the current - * transaction and allocate a new one if the Space Manager needed - * to do an allocation to replenish the inode free-list. - * - * This routine is designed to be called from xfs_create and - * xfs_create_dir. - * - */ -int -xfs_dir_ialloc( - xfs_trans_t **tpp, /* input: current transaction; - output: may be a new transaction. */ - xfs_inode_t *dp, /* directory within whose allocate - the inode. */ - umode_t mode, - xfs_nlink_t nlink, - xfs_dev_t rdev, - prid_t prid, /* project id */ - int okalloc, /* ok to allocate new space */ - xfs_inode_t **ipp, /* pointer to inode; it will be - locked. */ - int *committed) - -{ - xfs_trans_t *tp; - xfs_trans_t *ntp; - xfs_inode_t *ip; - xfs_buf_t *ialloc_context = NULL; - int code; - uint log_res; - uint log_count; - void *dqinfo; - uint tflags; - - tp = *tpp; - ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - - /* - * xfs_ialloc will return a pointer to an incore inode if - * the Space Manager has an available inode on the free - * list. Otherwise, it will do an allocation and replenish - * the freelist. Since we can only do one allocation per - * transaction without deadlocks, we will need to commit the - * current transaction and start a new one. We will then - * need to call xfs_ialloc again to get the inode. - * - * If xfs_ialloc did an allocation to replenish the freelist, - * it returns the bp containing the head of the freelist as - * ialloc_context. We will hold a lock on it across the - * transaction commit so that no other process can steal - * the inode(s) that we've just allocated. - */ - code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, - &ialloc_context, &ip); - - /* - * Return an error if we were unable to allocate a new inode. - * This should only happen if we run out of space on disk or - * encounter a disk error. - */ - if (code) { - *ipp = NULL; - return code; - } - if (!ialloc_context && !ip) { - *ipp = NULL; - return XFS_ERROR(ENOSPC); - } - - /* - * If the AGI buffer is non-NULL, then we were unable to get an - * inode in one operation. We need to commit the current - * transaction and call xfs_ialloc() again. It is guaranteed - * to succeed the second time. - */ - if (ialloc_context) { - /* - * Normally, xfs_trans_commit releases all the locks. - * We call bhold to hang on to the ialloc_context across - * the commit. Holding this buffer prevents any other - * processes from doing any allocations in this - * allocation group. - */ - xfs_trans_bhold(tp, ialloc_context); - /* - * Save the log reservation so we can use - * them in the next transaction. - */ - log_res = xfs_trans_get_log_res(tp); - log_count = xfs_trans_get_log_count(tp); - - /* - * We want the quota changes to be associated with the next - * transaction, NOT this one. So, detach the dqinfo from this - * and attach it to the next transaction. - */ - dqinfo = NULL; - tflags = 0; - if (tp->t_dqinfo) { - dqinfo = (void *)tp->t_dqinfo; - tp->t_dqinfo = NULL; - tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY; - tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); - } - - ntp = xfs_trans_dup(tp); - code = xfs_trans_commit(tp, 0); - tp = ntp; - if (committed != NULL) { - *committed = 1; - } - /* - * If we get an error during the commit processing, - * release the buffer that is still held and return - * to the caller. - */ - if (code) { - xfs_buf_relse(ialloc_context); - if (dqinfo) { - tp->t_dqinfo = dqinfo; - xfs_trans_free_dqinfo(tp); - } - *tpp = ntp; - *ipp = NULL; - return code; - } - - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(tp->t_ticket); - code = xfs_trans_reserve(tp, 0, log_res, 0, - XFS_TRANS_PERM_LOG_RES, log_count); - /* - * Re-attach the quota info that we detached from prev trx. - */ - if (dqinfo) { - tp->t_dqinfo = dqinfo; - tp->t_flags |= tflags; - } - - if (code) { - xfs_buf_relse(ialloc_context); - *tpp = ntp; - *ipp = NULL; - return code; - } - xfs_trans_bjoin(tp, ialloc_context); - - /* - * Call ialloc again. Since we've locked out all - * other allocations in this allocation group, - * this call should always succeed. - */ - code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, - okalloc, &ialloc_context, &ip); - - /* - * If we get an error at this point, return to the caller - * so that the current transaction can be aborted. - */ - if (code) { - *tpp = tp; - *ipp = NULL; - return code; - } - ASSERT(!ialloc_context && ip); - - } else { - if (committed != NULL) - *committed = 0; - } - - *ipp = ip; - *tpp = tp; - - return 0; -} - -/* - * Decrement the link count on an inode & log the change. - * If this causes the link count to go to zero, initiate the - * logging activity required to truncate a file. - */ -int /* error */ -xfs_droplink( - xfs_trans_t *tp, - xfs_inode_t *ip) -{ - int error; - - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - - ASSERT (ip->i_d.di_nlink > 0); - ip->i_d.di_nlink--; - drop_nlink(VFS_I(ip)); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - error = 0; - if (ip->i_d.di_nlink == 0) { - /* - * We're dropping the last link to this file. - * Move the on-disk inode to the AGI unlinked list. - * From xfs_inactive() we will pull the inode from - * the list and free it. - */ - error = xfs_iunlink(tp, ip); - } - return error; -} - -/* - * This gets called when the inode's version needs to be changed from 1 to 2. - * Currently this happens when the nlink field overflows the old 16-bit value - * or when chproj is called to change the project for the first time. - * As a side effect the superblock version will also get rev'd - * to contain the NLINK bit. - */ -void -xfs_bump_ino_vers2( - xfs_trans_t *tp, - xfs_inode_t *ip) -{ - xfs_mount_t *mp; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(ip->i_d.di_version == 1); - - ip->i_d.di_version = 2; - ip->i_d.di_onlink = 0; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - mp = tp->t_mountp; - if (!xfs_sb_version_hasnlink(&mp->m_sb)) { - spin_lock(&mp->m_sb_lock); - if (!xfs_sb_version_hasnlink(&mp->m_sb)) { - xfs_sb_version_addnlink(&mp->m_sb); - spin_unlock(&mp->m_sb_lock); - xfs_mod_sb(tp, XFS_SB_VERSIONNUM); - } else { - spin_unlock(&mp->m_sb_lock); - } - } - /* Caller must log the inode */ -} - -/* - * Increment the link count on an inode & log the change. - */ -int -xfs_bumplink( - xfs_trans_t *tp, - xfs_inode_t *ip) -{ - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - - ASSERT(ip->i_d.di_nlink > 0); - ip->i_d.di_nlink++; - inc_nlink(VFS_I(ip)); - if ((ip->i_d.di_version == 1) && - (ip->i_d.di_nlink > XFS_MAXLINK_1)) { - /* - * The inode has increased its number of links beyond - * what can fit in an old format inode. It now needs - * to be converted to a version 2 inode with a 32 bit - * link count. If this is the first inode in the file - * system to do this, then we need to bump the superblock - * version number as well. - */ - xfs_bump_ino_vers2(tp, ip); - } - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return 0; -} diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h index db14d0c0868..e8a77383c0d 100644 --- a/fs/xfs/xfs_vnode.h +++ b/fs/xfs/xfs_vnode.h @@ -25,14 +25,6 @@ struct xfs_inode; struct attrlist_cursor_kern; /* - * Return values for xfs_inactive. A return value of - * VN_INACTIVE_NOCACHE implies that the file system behavior - * has disassociated its state and bhv_desc_t from the vnode. - */ -#define VN_INACTIVE_CACHE 0 -#define VN_INACTIVE_NOCACHE 1 - -/* * Flags for read/write calls - same values as IRIX */ #define IO_ISDIRECT 0x00004 /* bypass page cache */ @@ -43,15 +35,6 @@ struct attrlist_cursor_kern; { IO_INVIS, "INVIS"} /* - * Flush/Invalidate options for vop_toss/flush/flushinval_pages. - */ -#define FI_NONE 0 /* none */ -#define FI_REMAPF 1 /* Do a remapf prior to the operation */ -#define FI_REMAPF_LOCKED 2 /* Do a remapf prior to the operation. - Prevent VM access to the pages until - the operation completes. */ - -/* * Some useful predicates. */ #define VN_MAPPED(vp) mapping_mapped(vp->i_mapping) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c deleted file mode 100644 index d95f565a390..00000000000 --- a/fs/xfs/xfs_vnodeops.c +++ /dev/null @@ -1,2348 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_mount.h" -#include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_itable.h" -#include "xfs_ialloc.h" -#include "xfs_alloc.h" -#include "xfs_bmap.h" -#include "xfs_acl.h" -#include "xfs_attr.h" -#include "xfs_error.h" -#include "xfs_quota.h" -#include "xfs_utils.h" -#include "xfs_rtalloc.h" -#include "xfs_trans_space.h" -#include "xfs_log_priv.h" -#include "xfs_filestream.h" -#include "xfs_vnodeops.h" -#include "xfs_trace.h" -#include "xfs_icache.h" - -/* - * The maximum pathlen is 1024 bytes. Since the minimum file system - * blocksize is 512 bytes, we can get a max of 2 extents back from - * bmapi. - */ -#define SYMLINK_MAPS 2 - -STATIC int -xfs_readlink_bmap( - xfs_inode_t *ip, - char *link) -{ - xfs_mount_t *mp = ip->i_mount; - int pathlen = ip->i_d.di_size; - int nmaps = SYMLINK_MAPS; - xfs_bmbt_irec_t mval[SYMLINK_MAPS]; - xfs_daddr_t d; - int byte_cnt; - int n; - xfs_buf_t *bp; - int error = 0; - - error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps, - 0); - if (error) - goto out; - - for (n = 0; n < nmaps; n++) { - d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); - byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL); - if (!bp) - return XFS_ERROR(ENOMEM); - error = bp->b_error; - if (error) { - xfs_buf_ioerror_alert(bp, __func__); - xfs_buf_relse(bp); - goto out; - } - if (pathlen < byte_cnt) - byte_cnt = pathlen; - pathlen -= byte_cnt; - - memcpy(link, bp->b_addr, byte_cnt); - xfs_buf_relse(bp); - } - - link[ip->i_d.di_size] = '\0'; - error = 0; - - out: - return error; -} - -int -xfs_readlink( - xfs_inode_t *ip, - char *link) -{ - xfs_mount_t *mp = ip->i_mount; - xfs_fsize_t pathlen; - int error = 0; - - trace_xfs_readlink(ip); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - xfs_ilock(ip, XFS_ILOCK_SHARED); - - pathlen = ip->i_d.di_size; - if (!pathlen) - goto out; - - if (pathlen < 0 || pathlen > MAXPATHLEN) { - xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", - __func__, (unsigned long long) ip->i_ino, - (long long) pathlen); - ASSERT(0); - error = XFS_ERROR(EFSCORRUPTED); - goto out; - } - - - if (ip->i_df.if_flags & XFS_IFINLINE) { - memcpy(link, ip->i_df.if_u1.if_data, pathlen); - link[pathlen] = '\0'; - } else { - error = xfs_readlink_bmap(ip, link); - } - - out: - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; -} - -/* - * This is called by xfs_inactive to free any blocks beyond eof - * when the link count isn't zero and by xfs_dm_punch_hole() when - * punching a hole to EOF. - */ -int -xfs_free_eofblocks( - xfs_mount_t *mp, - xfs_inode_t *ip, - bool need_iolock) -{ - xfs_trans_t *tp; - int error; - xfs_fileoff_t end_fsb; - xfs_fileoff_t last_fsb; - xfs_filblks_t map_len; - int nimaps; - xfs_bmbt_irec_t imap; - - /* - * Figure out if there are any blocks beyond the end - * of the file. If not, then there is nothing to do. - */ - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); - last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - if (last_fsb <= end_fsb) - return 0; - map_len = last_fsb - end_fsb; - - nimaps = 1; - xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!error && (nimaps != 0) && - (imap.br_startblock != HOLESTARTBLOCK || - ip->i_delayed_blks)) { - /* - * Attach the dquots to the inode up front. - */ - error = xfs_qm_dqattach(ip, 0); - if (error) - return error; - - /* - * There are blocks after the end of file. - * Free them up now by truncating the file to - * its current size. - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - - if (need_iolock) { - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { - xfs_trans_cancel(tp, 0); - return EAGAIN; - } - } - - error = xfs_trans_reserve(tp, 0, - XFS_ITRUNCATE_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); - if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - if (need_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - - /* - * Do not update the on-disk file size. If we update the - * on-disk file size and then the system crashes before the - * contents of the file are flushed to disk then the files - * may be full of holes (ie NULL files bug). - */ - error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, - XFS_ISIZE(ip)); - if (error) { - /* - * If we get an error at this point we simply don't - * bother truncating the file. - */ - xfs_trans_cancel(tp, - (XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT)); - } else { - error = xfs_trans_commit(tp, - XFS_TRANS_RELEASE_LOG_RES); - if (!error) - xfs_inode_clear_eofblocks_tag(ip); - } - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (need_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - } - return error; -} - -/* - * Free a symlink that has blocks associated with it. - */ -STATIC int -xfs_inactive_symlink_rmt( - xfs_inode_t *ip, - xfs_trans_t **tpp) -{ - xfs_buf_t *bp; - int committed; - int done; - int error; - xfs_fsblock_t first_block; - xfs_bmap_free_t free_list; - int i; - xfs_mount_t *mp; - xfs_bmbt_irec_t mval[SYMLINK_MAPS]; - int nmaps; - xfs_trans_t *ntp; - int size; - xfs_trans_t *tp; - - tp = *tpp; - mp = ip->i_mount; - ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip)); - /* - * We're freeing a symlink that has some - * blocks allocated to it. Free the - * blocks here. We know that we've got - * either 1 or 2 extents and that we can - * free them all in one bunmapi call. - */ - ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); - - /* - * Lock the inode, fix the size, and join it to the transaction. - * Hold it so in the normal path, we still have it locked for - * the second transaction. In the error paths we need it - * held so the cancel won't rele it, see below. - */ - size = (int)ip->i_d.di_size; - ip->i_d.di_size = 0; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - /* - * Find the block(s) so we can inval and unmap them. - */ - done = 0; - xfs_bmap_init(&free_list, &first_block); - nmaps = ARRAY_SIZE(mval); - error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size), - mval, &nmaps, 0); - if (error) - goto error0; - /* - * Invalidate the block(s). - */ - for (i = 0; i < nmaps; i++) { - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), - XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); - if (!bp) { - error = ENOMEM; - goto error1; - } - xfs_trans_binval(tp, bp); - } - /* - * Unmap the dead block(s) to the free_list. - */ - if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, - &first_block, &free_list, &done))) - goto error1; - ASSERT(done); - /* - * Commit the first transaction. This logs the EFI and the inode. - */ - if ((error = xfs_bmap_finish(&tp, &free_list, &committed))) - goto error1; - /* - * The transaction must have been committed, since there were - * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. - * The new tp has the extent freeing and EFDs. - */ - ASSERT(committed); - /* - * The first xact was committed, so add the inode to the new one. - * Mark it dirty so it will be logged and moved forward in the log as - * part of every commit. - */ - xfs_trans_ijoin(tp, ip, 0); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - /* - * Get a new, empty transaction to return to our caller. - */ - ntp = xfs_trans_dup(tp); - /* - * Commit the transaction containing extent freeing and EFDs. - * If we get an error on the commit here or on the reserve below, - * we need to unlock the inode since the new transaction doesn't - * have the inode attached. - */ - error = xfs_trans_commit(tp, 0); - tp = ntp; - if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - goto error0; - } - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(tp->t_ticket); - - /* - * Remove the memory for extent descriptions (just bookkeeping). - */ - if (ip->i_df.if_bytes) - xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK); - ASSERT(ip->i_df.if_bytes == 0); - /* - * Put an itruncate log reservation in the new transaction - * for our caller. - */ - if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - goto error0; - } - - xfs_trans_ijoin(tp, ip, 0); - *tpp = tp; - return 0; - - error1: - xfs_bmap_cancel(&free_list); - error0: - return error; -} - -int -xfs_release( - xfs_inode_t *ip) -{ - xfs_mount_t *mp = ip->i_mount; - int error; - - if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0)) - return 0; - - /* If this is a read-only mount, don't do this (would generate I/O) */ - if (mp->m_flags & XFS_MOUNT_RDONLY) - return 0; - - if (!XFS_FORCED_SHUTDOWN(mp)) { - int truncated; - - /* - * If we are using filestreams, and we have an unlinked - * file that we are processing the last close on, then nothing - * will be able to reopen and write to this file. Purge this - * inode from the filestreams cache so that it doesn't delay - * teardown of the inode. - */ - if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip)) - xfs_filestream_deassociate(ip); - - /* - * If we previously truncated this file and removed old data - * in the process, we want to initiate "early" writeout on - * the last close. This is an attempt to combat the notorious - * NULL files problem which is particularly noticeable from a - * truncate down, buffered (re-)write (delalloc), followed by - * a crash. What we are effectively doing here is - * significantly reducing the time window where we'd otherwise - * be exposed to that problem. - */ - truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); - if (truncated) { - xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); - if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) { - error = -filemap_flush(VFS_I(ip)->i_mapping); - if (error) - return error; - } - } - } - - if (ip->i_d.di_nlink == 0) - return 0; - - if (xfs_can_free_eofblocks(ip, false)) { - - /* - * If we can't get the iolock just skip truncating the blocks - * past EOF because we could deadlock with the mmap_sem - * otherwise. We'll get another chance to drop them once the - * last reference to the inode is dropped, so we'll never leak - * blocks permanently. - * - * Further, check if the inode is being opened, written and - * closed frequently and we have delayed allocation blocks - * outstanding (e.g. streaming writes from the NFS server), - * truncating the blocks past EOF will cause fragmentation to - * occur. - * - * In this case don't do the truncation, either, but we have to - * be careful how we detect this case. Blocks beyond EOF show - * up as i_delayed_blks even when the inode is clean, so we - * need to truncate them away first before checking for a dirty - * release. Hence on the first dirty close we will still remove - * the speculative allocation, but after that we will leave it - * in place. - */ - if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) - return 0; - - error = xfs_free_eofblocks(mp, ip, true); - if (error && error != EAGAIN) - return error; - - /* delalloc blocks after truncation means it really is dirty */ - if (ip->i_delayed_blks) - xfs_iflags_set(ip, XFS_IDIRTY_RELEASE); - } - return 0; -} - -/* - * xfs_inactive - * - * This is called when the vnode reference count for the vnode - * goes to zero. If the file has been unlinked, then it must - * now be truncated. Also, we clear all of the read-ahead state - * kept for the inode here since the file is now closed. - */ -int -xfs_inactive( - xfs_inode_t *ip) -{ - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - int committed; - xfs_trans_t *tp; - xfs_mount_t *mp; - int error; - int truncate = 0; - - /* - * If the inode is already free, then there can be nothing - * to clean up here. - */ - if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) { - ASSERT(ip->i_df.if_real_bytes == 0); - ASSERT(ip->i_df.if_broot_bytes == 0); - return VN_INACTIVE_CACHE; - } - - mp = ip->i_mount; - - error = 0; - - /* If this is a read-only mount, don't do this (would generate I/O) */ - if (mp->m_flags & XFS_MOUNT_RDONLY) - goto out; - - if (ip->i_d.di_nlink != 0) { - /* - * force is true because we are evicting an inode from the - * cache. Post-eof blocks must be freed, lest we end up with - * broken free space accounting. - */ - if (xfs_can_free_eofblocks(ip, true)) { - error = xfs_free_eofblocks(mp, ip, false); - if (error) - return VN_INACTIVE_CACHE; - } - goto out; - } - - if (S_ISREG(ip->i_d.di_mode) && - (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || - ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) - truncate = 1; - - error = xfs_qm_dqattach(ip, 0); - if (error) - return VN_INACTIVE_CACHE; - - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - error = xfs_trans_reserve(tp, 0, - (truncate || S_ISLNK(ip->i_d.di_mode)) ? - XFS_ITRUNCATE_LOG_RES(mp) : - XFS_IFREE_LOG_RES(mp), - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); - if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - return VN_INACTIVE_CACHE; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - - if (S_ISLNK(ip->i_d.di_mode)) { - /* - * Zero length symlinks _can_ exist. - */ - if (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) { - error = xfs_inactive_symlink_rmt(ip, &tp); - if (error) - goto out_cancel; - } else if (ip->i_df.if_bytes > 0) { - xfs_idata_realloc(ip, -(ip->i_df.if_bytes), - XFS_DATA_FORK); - ASSERT(ip->i_df.if_bytes == 0); - } - } else if (truncate) { - ip->i_d.di_size = 0; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); - if (error) - goto out_cancel; - - ASSERT(ip->i_d.di_nextents == 0); - } - - /* - * If there are attributes associated with the file then blow them away - * now. The code calls a routine that recursively deconstructs the - * attribute fork. We need to just commit the current transaction - * because we can't use it for xfs_attr_inactive(). - */ - if (ip->i_d.di_anextents > 0) { - ASSERT(ip->i_d.di_forkoff != 0); - - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - if (error) - goto out_unlock; - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - error = xfs_attr_inactive(ip); - if (error) - goto out; - - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - error = xfs_trans_reserve(tp, 0, - XFS_IFREE_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_INACTIVE_LOG_COUNT); - if (error) { - xfs_trans_cancel(tp, 0); - goto out; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - } - - if (ip->i_afp) - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - - ASSERT(ip->i_d.di_anextents == 0); - - /* - * Free the inode. - */ - xfs_bmap_init(&free_list, &first_block); - error = xfs_ifree(tp, ip, &free_list); - if (error) { - /* - * If we fail to free the inode, shut down. The cancel - * might do that, we need to make sure. Otherwise the - * inode might be lost for a long time or forever. - */ - if (!XFS_FORCED_SHUTDOWN(mp)) { - xfs_notice(mp, "%s: xfs_ifree returned error %d", - __func__, error); - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - } - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - } else { - /* - * Credit the quota account(s). The inode is gone. - */ - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); - - /* - * Just ignore errors at this point. There is nothing we can - * do except to try to keep going. Make sure it's not a silent - * error. - */ - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) - xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", - __func__, error); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - if (error) - xfs_notice(mp, "%s: xfs_trans_commit returned error %d", - __func__, error); - } - - /* - * Release the dquots held by inode, if any. - */ - xfs_qm_dqdetach(ip); -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); -out: - return VN_INACTIVE_CACHE; -out_cancel: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - goto out_unlock; -} - -/* - * Lookups up an inode from "name". If ci_name is not NULL, then a CI match - * is allowed, otherwise it has to be an exact match. If a CI match is found, - * ci_name->name will point to a the actual name (caller must free) or - * will be set to NULL if an exact match is found. - */ -int -xfs_lookup( - xfs_inode_t *dp, - struct xfs_name *name, - xfs_inode_t **ipp, - struct xfs_name *ci_name) -{ - xfs_ino_t inum; - int error; - uint lock_mode; - - trace_xfs_lookup(dp, name); - - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return XFS_ERROR(EIO); - - lock_mode = xfs_ilock_map_shared(dp); - error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); - xfs_iunlock_map_shared(dp, lock_mode); - - if (error) - goto out; - - error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp); - if (error) - goto out_free_name; - - return 0; - -out_free_name: - if (ci_name) - kmem_free(ci_name->name); -out: - *ipp = NULL; - return error; -} - -int -xfs_create( - xfs_inode_t *dp, - struct xfs_name *name, - umode_t mode, - xfs_dev_t rdev, - xfs_inode_t **ipp) -{ - int is_dir = S_ISDIR(mode); - struct xfs_mount *mp = dp->i_mount; - struct xfs_inode *ip = NULL; - struct xfs_trans *tp = NULL; - int error; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - boolean_t unlock_dp_on_error = B_FALSE; - uint cancel_flags; - int committed; - prid_t prid; - struct xfs_dquot *udqp = NULL; - struct xfs_dquot *gdqp = NULL; - uint resblks; - uint log_res; - uint log_count; - - trace_xfs_create(dp, name); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - prid = xfs_get_projid(dp); - else - prid = XFS_PROJID_DEFAULT; - - /* - * Make sure that we have allocated dquot(s) on disk. - */ - error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, - XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); - if (error) - return error; - - if (is_dir) { - rdev = 0; - resblks = XFS_MKDIR_SPACE_RES(mp, name->len); - log_res = XFS_MKDIR_LOG_RES(mp); - log_count = XFS_MKDIR_LOG_COUNT; - tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); - } else { - resblks = XFS_CREATE_SPACE_RES(mp, name->len); - log_res = XFS_CREATE_LOG_RES(mp); - log_count = XFS_CREATE_LOG_COUNT; - tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); - } - - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - - /* - * Initially assume that the file does not exist and - * reserve the resources for that case. If that is not - * the case we'll drop the one we have and get a more - * appropriate transaction later. - */ - error = xfs_trans_reserve(tp, resblks, log_res, 0, - XFS_TRANS_PERM_LOG_RES, log_count); - if (error == ENOSPC) { - /* flush outstanding delalloc blocks and retry */ - xfs_flush_inodes(mp); - error = xfs_trans_reserve(tp, resblks, log_res, 0, - XFS_TRANS_PERM_LOG_RES, log_count); - } - if (error == ENOSPC) { - /* No space at all so try a "no-allocation" reservation */ - resblks = 0; - error = xfs_trans_reserve(tp, 0, log_res, 0, - XFS_TRANS_PERM_LOG_RES, log_count); - } - if (error) { - cancel_flags = 0; - goto out_trans_cancel; - } - - xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); - unlock_dp_on_error = B_TRUE; - - xfs_bmap_init(&free_list, &first_block); - - /* - * Reserve disk quota and the inode. - */ - error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0); - if (error) - goto out_trans_cancel; - - error = xfs_dir_canenter(tp, dp, name, resblks); - if (error) - goto out_trans_cancel; - - /* - * A newly created regular or special file just has one directory - * entry pointing to them, but a directory also the "." entry - * pointing to itself. - */ - error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, - prid, resblks > 0, &ip, &committed); - if (error) { - if (error == ENOSPC) - goto out_trans_cancel; - goto out_trans_abort; - } - - /* - * Now we join the directory inode to the transaction. We do not do it - * earlier because xfs_dir_ialloc might commit the previous transaction - * (and release all the locks). An error from here on will result in - * the transaction cancel unlocking dp so don't do it explicitly in the - * error path. - */ - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - unlock_dp_on_error = B_FALSE; - - error = xfs_dir_createname(tp, dp, name, ip->i_ino, - &first_block, &free_list, resblks ? - resblks - XFS_IALLOC_SPACE_RES(mp) : 0); - if (error) { - ASSERT(error != ENOSPC); - goto out_trans_abort; - } - xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); - - if (is_dir) { - error = xfs_dir_init(tp, ip, dp); - if (error) - goto out_bmap_cancel; - - error = xfs_bumplink(tp, dp); - if (error) - goto out_bmap_cancel; - } - - /* - * If this is a synchronous mount, make sure that the - * create transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) - xfs_trans_set_sync(tp); - - /* - * Attach the dquot(s) to the inodes and modify them incore. - * These ids of the inode couldn't have changed since the new - * inode has been locked ever since it was created. - */ - xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); - - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) - goto out_bmap_cancel; - - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - if (error) - goto out_release_inode; - - xfs_qm_dqrele(udqp); - xfs_qm_dqrele(gdqp); - - *ipp = ip; - return 0; - - out_bmap_cancel: - xfs_bmap_cancel(&free_list); - out_trans_abort: - cancel_flags |= XFS_TRANS_ABORT; - out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); - out_release_inode: - /* - * Wait until after the current transaction is aborted to - * release the inode. This prevents recursive transactions - * and deadlocks from xfs_inactive. - */ - if (ip) - IRELE(ip); - - xfs_qm_dqrele(udqp); - xfs_qm_dqrele(gdqp); - - if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; -} - -#ifdef DEBUG -int xfs_locked_n; -int xfs_small_retries; -int xfs_middle_retries; -int xfs_lots_retries; -int xfs_lock_delays; -#endif - -/* - * Bump the subclass so xfs_lock_inodes() acquires each lock with - * a different value - */ -static inline int -xfs_lock_inumorder(int lock_mode, int subclass) -{ - if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) - lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; - if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) - lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; - - return lock_mode; -} - -/* - * The following routine will lock n inodes in exclusive mode. - * We assume the caller calls us with the inodes in i_ino order. - * - * We need to detect deadlock where an inode that we lock - * is in the AIL and we start waiting for another inode that is locked - * by a thread in a long running transaction (such as truncate). This can - * result in deadlock since the long running trans might need to wait - * for the inode we just locked in order to push the tail and free space - * in the log. - */ -void -xfs_lock_inodes( - xfs_inode_t **ips, - int inodes, - uint lock_mode) -{ - int attempts = 0, i, j, try_lock; - xfs_log_item_t *lp; - - ASSERT(ips && (inodes >= 2)); /* we need at least two */ - - try_lock = 0; - i = 0; - -again: - for (; i < inodes; i++) { - ASSERT(ips[i]); - - if (i && (ips[i] == ips[i-1])) /* Already locked */ - continue; - - /* - * If try_lock is not set yet, make sure all locked inodes - * are not in the AIL. - * If any are, set try_lock to be used later. - */ - - if (!try_lock) { - for (j = (i - 1); j >= 0 && !try_lock; j--) { - lp = (xfs_log_item_t *)ips[j]->i_itemp; - if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { - try_lock++; - } - } - } - - /* - * If any of the previous locks we have locked is in the AIL, - * we must TRY to get the second and subsequent locks. If - * we can't get any, we must release all we have - * and try again. - */ - - if (try_lock) { - /* try_lock must be 0 if i is 0. */ - /* - * try_lock means we have an inode locked - * that is in the AIL. - */ - ASSERT(i != 0); - if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) { - attempts++; - - /* - * Unlock all previous guys and try again. - * xfs_iunlock will try to push the tail - * if the inode is in the AIL. - */ - - for(j = i - 1; j >= 0; j--) { - - /* - * Check to see if we've already - * unlocked this one. - * Not the first one going back, - * and the inode ptr is the same. - */ - if ((j != (i - 1)) && ips[j] == - ips[j+1]) - continue; - - xfs_iunlock(ips[j], lock_mode); - } - - if ((attempts % 5) == 0) { - delay(1); /* Don't just spin the CPU */ -#ifdef DEBUG - xfs_lock_delays++; -#endif - } - i = 0; - try_lock = 0; - goto again; - } - } else { - xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); - } - } - -#ifdef DEBUG - if (attempts) { - if (attempts < 5) xfs_small_retries++; - else if (attempts < 100) xfs_middle_retries++; - else xfs_lots_retries++; - } else { - xfs_locked_n++; - } -#endif -} - -/* - * xfs_lock_two_inodes() can only be used to lock one type of lock - * at a time - the iolock or the ilock, but not both at once. If - * we lock both at once, lockdep will report false positives saying - * we have violated locking orders. - */ -void -xfs_lock_two_inodes( - xfs_inode_t *ip0, - xfs_inode_t *ip1, - uint lock_mode) -{ - xfs_inode_t *temp; - int attempts = 0; - xfs_log_item_t *lp; - - if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) - ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); - ASSERT(ip0->i_ino != ip1->i_ino); - - if (ip0->i_ino > ip1->i_ino) { - temp = ip0; - ip0 = ip1; - ip1 = temp; - } - - again: - xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0)); - - /* - * If the first lock we have locked is in the AIL, we must TRY to get - * the second lock. If we can't get it, we must release the first one - * and try again. - */ - lp = (xfs_log_item_t *)ip0->i_itemp; - if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { - if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) { - xfs_iunlock(ip0, lock_mode); - if ((++attempts % 5) == 0) - delay(1); /* Don't just spin the CPU */ - goto again; - } - } else { - xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1)); - } -} - -int -xfs_remove( - xfs_inode_t *dp, - struct xfs_name *name, - xfs_inode_t *ip) -{ - xfs_mount_t *mp = dp->i_mount; - xfs_trans_t *tp = NULL; - int is_dir = S_ISDIR(ip->i_d.di_mode); - int error = 0; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - int cancel_flags; - int committed; - int link_zero; - uint resblks; - uint log_count; - - trace_xfs_remove(dp, name); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - error = xfs_qm_dqattach(dp, 0); - if (error) - goto std_return; - - error = xfs_qm_dqattach(ip, 0); - if (error) - goto std_return; - - if (is_dir) { - tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); - log_count = XFS_DEFAULT_LOG_COUNT; - } else { - tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); - log_count = XFS_REMOVE_LOG_COUNT; - } - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - - /* - * We try to get the real space reservation first, - * allowing for directory btree deletion(s) implying - * possible bmap insert(s). If we can't get the space - * reservation then we use 0 instead, and avoid the bmap - * btree insert(s) in the directory code by, if the bmap - * insert tries to happen, instead trimming the LAST - * block from the directory. - */ - resblks = XFS_REMOVE_SPACE_RES(mp); - error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, log_count); - if (error == ENOSPC) { - resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, log_count); - } - if (error) { - ASSERT(error != ENOSPC); - cancel_flags = 0; - goto out_trans_cancel; - } - - xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - - /* - * If we're removing a directory perform some additional validation. - */ - if (is_dir) { - ASSERT(ip->i_d.di_nlink >= 2); - if (ip->i_d.di_nlink != 2) { - error = XFS_ERROR(ENOTEMPTY); - goto out_trans_cancel; - } - if (!xfs_dir_isempty(ip)) { - error = XFS_ERROR(ENOTEMPTY); - goto out_trans_cancel; - } - } - - xfs_bmap_init(&free_list, &first_block); - error = xfs_dir_removename(tp, dp, name, ip->i_ino, - &first_block, &free_list, resblks); - if (error) { - ASSERT(error != ENOENT); - goto out_bmap_cancel; - } - xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - if (is_dir) { - /* - * Drop the link from ip's "..". - */ - error = xfs_droplink(tp, dp); - if (error) - goto out_bmap_cancel; - - /* - * Drop the "." link from ip to self. - */ - error = xfs_droplink(tp, ip); - if (error) - goto out_bmap_cancel; - } else { - /* - * When removing a non-directory we need to log the parent - * inode here. For a directory this is done implicitly - * by the xfs_droplink call for the ".." entry. - */ - xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); - } - - /* - * Drop the link from dp to ip. - */ - error = xfs_droplink(tp, ip); - if (error) - goto out_bmap_cancel; - - /* - * Determine if this is the last link while - * we are in the transaction. - */ - link_zero = (ip->i_d.di_nlink == 0); - - /* - * If this is a synchronous mount, make sure that the - * remove transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) - xfs_trans_set_sync(tp); - - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) - goto out_bmap_cancel; - - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - if (error) - goto std_return; - - /* - * If we are using filestreams, kill the stream association. - * If the file is still open it may get a new one but that - * will get killed on last close in xfs_close() so we don't - * have to worry about that. - */ - if (!is_dir && link_zero && xfs_inode_is_filestream(ip)) - xfs_filestream_deassociate(ip); - - return 0; - - out_bmap_cancel: - xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; - out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); - std_return: - return error; -} - -int -xfs_link( - xfs_inode_t *tdp, - xfs_inode_t *sip, - struct xfs_name *target_name) -{ - xfs_mount_t *mp = tdp->i_mount; - xfs_trans_t *tp; - int error; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - int cancel_flags; - int committed; - int resblks; - - trace_xfs_link(tdp, target_name); - - ASSERT(!S_ISDIR(sip->i_d.di_mode)); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - error = xfs_qm_dqattach(sip, 0); - if (error) - goto std_return; - - error = xfs_qm_dqattach(tdp, 0); - if (error) - goto std_return; - - tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - resblks = XFS_LINK_SPACE_RES(mp, target_name->len); - error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT); - if (error == ENOSPC) { - resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT); - } - if (error) { - cancel_flags = 0; - goto error_return; - } - - xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); - - /* - * If we are using project inheritance, we only allow hard link - * creation in our tree when the project IDs are the same; else - * the tree quota mechanism could be circumvented. - */ - if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (xfs_get_projid(tdp) != xfs_get_projid(sip)))) { - error = XFS_ERROR(EXDEV); - goto error_return; - } - - error = xfs_dir_canenter(tp, tdp, target_name, resblks); - if (error) - goto error_return; - - xfs_bmap_init(&free_list, &first_block); - - error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, - &first_block, &free_list, resblks); - if (error) - goto abort_return; - xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); - - error = xfs_bumplink(tp, sip); - if (error) - goto abort_return; - - /* - * If this is a synchronous mount, make sure that the - * link transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - error = xfs_bmap_finish (&tp, &free_list, &committed); - if (error) { - xfs_bmap_cancel(&free_list); - goto abort_return; - } - - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - - abort_return: - cancel_flags |= XFS_TRANS_ABORT; - error_return: - xfs_trans_cancel(tp, cancel_flags); - std_return: - return error; -} - -int -xfs_symlink( - xfs_inode_t *dp, - struct xfs_name *link_name, - const char *target_path, - umode_t mode, - xfs_inode_t **ipp) -{ - xfs_mount_t *mp = dp->i_mount; - xfs_trans_t *tp; - xfs_inode_t *ip; - int error; - int pathlen; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - boolean_t unlock_dp_on_error = B_FALSE; - uint cancel_flags; - int committed; - xfs_fileoff_t first_fsb; - xfs_filblks_t fs_blocks; - int nmaps; - xfs_bmbt_irec_t mval[SYMLINK_MAPS]; - xfs_daddr_t d; - const char *cur_chunk; - int byte_cnt; - int n; - xfs_buf_t *bp; - prid_t prid; - struct xfs_dquot *udqp, *gdqp; - uint resblks; - - *ipp = NULL; - error = 0; - ip = NULL; - tp = NULL; - - trace_xfs_symlink(dp, link_name); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - /* - * Check component lengths of the target path name. - */ - pathlen = strlen(target_path); - if (pathlen >= MAXPATHLEN) /* total string too long */ - return XFS_ERROR(ENAMETOOLONG); - - udqp = gdqp = NULL; - if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - prid = xfs_get_projid(dp); - else - prid = XFS_PROJID_DEFAULT; - - /* - * Make sure that we have allocated dquot(s) on disk. - */ - error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, - XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); - if (error) - goto std_return; - - tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - /* - * The symlink will fit into the inode data fork? - * There can't be any attributes so we get the whole variable part. - */ - if (pathlen <= XFS_LITINO(mp)) - fs_blocks = 0; - else - fs_blocks = XFS_B_TO_FSB(mp, pathlen); - resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); - error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); - if (error == ENOSPC && fs_blocks == 0) { - resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); - } - if (error) { - cancel_flags = 0; - goto error_return; - } - - xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); - unlock_dp_on_error = B_TRUE; - - /* - * Check whether the directory allows new symlinks or not. - */ - if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { - error = XFS_ERROR(EPERM); - goto error_return; - } - - /* - * Reserve disk quota : blocks and inode. - */ - error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0); - if (error) - goto error_return; - - /* - * Check for ability to enter directory entry, if no space reserved. - */ - error = xfs_dir_canenter(tp, dp, link_name, resblks); - if (error) - goto error_return; - /* - * Initialize the bmap freelist prior to calling either - * bmapi or the directory create code. - */ - xfs_bmap_init(&free_list, &first_block); - - /* - * Allocate an inode for the symlink. - */ - error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, - prid, resblks > 0, &ip, NULL); - if (error) { - if (error == ENOSPC) - goto error_return; - goto error1; - } - - /* - * An error after we've joined dp to the transaction will result in the - * transaction cancel unlocking dp so don't do it explicitly in the - * error path. - */ - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - unlock_dp_on_error = B_FALSE; - - /* - * Also attach the dquot(s) to it, if applicable. - */ - xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); - - if (resblks) - resblks -= XFS_IALLOC_SPACE_RES(mp); - /* - * If the symlink will fit into the inode, write it inline. - */ - if (pathlen <= XFS_IFORK_DSIZE(ip)) { - xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); - memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); - ip->i_d.di_size = pathlen; - - /* - * The inode was initially created in extent format. - */ - ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); - ip->i_df.if_flags |= XFS_IFINLINE; - - ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; - xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); - - } else { - first_fsb = 0; - nmaps = SYMLINK_MAPS; - - error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, - XFS_BMAPI_METADATA, &first_block, resblks, - mval, &nmaps, &free_list); - if (error) - goto error2; - - if (resblks) - resblks -= fs_blocks; - ip->i_d.di_size = pathlen; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - cur_chunk = target_path; - for (n = 0; n < nmaps; n++) { - d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); - byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - BTOBB(byte_cnt), 0); - if (!bp) { - error = ENOMEM; - goto error2; - } - if (pathlen < byte_cnt) { - byte_cnt = pathlen; - } - pathlen -= byte_cnt; - - memcpy(bp->b_addr, cur_chunk, byte_cnt); - cur_chunk += byte_cnt; - - xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1); - } - } - - /* - * Create the directory entry for the symlink. - */ - error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, - &first_block, &free_list, resblks); - if (error) - goto error2; - xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); - - /* - * If this is a synchronous mount, make sure that the - * symlink transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - goto error2; - } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - xfs_qm_dqrele(udqp); - xfs_qm_dqrele(gdqp); - - *ipp = ip; - return 0; - - error2: - IRELE(ip); - error1: - xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; - error_return: - xfs_trans_cancel(tp, cancel_flags); - xfs_qm_dqrele(udqp); - xfs_qm_dqrele(gdqp); - - if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_ILOCK_EXCL); - std_return: - return error; -} - -int -xfs_set_dmattrs( - xfs_inode_t *ip, - u_int evmask, - u_int16_t state) -{ - xfs_mount_t *mp = ip->i_mount; - xfs_trans_t *tp; - int error; - - if (!capable(CAP_SYS_ADMIN)) - return XFS_ERROR(EPERM); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); - error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - - ip->i_d.di_dmevmask = evmask; - ip->i_d.di_dmstate = state; - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_trans_commit(tp, 0); - - return error; -} - -/* - * xfs_alloc_file_space() - * This routine allocates disk space for the given file. - * - * If alloc_type == 0, this request is for an ALLOCSP type - * request which will change the file size. In this case, no - * DMAPI event will be generated by the call. A TRUNCATE event - * will be generated later by xfs_setattr. - * - * If alloc_type != 0, this request is for a RESVSP type - * request, and a DMAPI DM_EVENT_WRITE will be generated if the - * lower block boundary byte address is less than the file's - * length. - * - * RETURNS: - * 0 on success - * errno on error - * - */ -STATIC int -xfs_alloc_file_space( - xfs_inode_t *ip, - xfs_off_t offset, - xfs_off_t len, - int alloc_type, - int attr_flags) -{ - xfs_mount_t *mp = ip->i_mount; - xfs_off_t count; - xfs_filblks_t allocated_fsb; - xfs_filblks_t allocatesize_fsb; - xfs_extlen_t extsz, temp; - xfs_fileoff_t startoffset_fsb; - xfs_fsblock_t firstfsb; - int nimaps; - int quota_flag; - int rt; - xfs_trans_t *tp; - xfs_bmbt_irec_t imaps[1], *imapp; - xfs_bmap_free_t free_list; - uint qblocks, resblks, resrtextents; - int committed; - int error; - - trace_xfs_alloc_file_space(ip); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - error = xfs_qm_dqattach(ip, 0); - if (error) - return error; - - if (len <= 0) - return XFS_ERROR(EINVAL); - - rt = XFS_IS_REALTIME_INODE(ip); - extsz = xfs_get_extsz_hint(ip); - - count = len; - imapp = &imaps[0]; - nimaps = 1; - startoffset_fsb = XFS_B_TO_FSBT(mp, offset); - allocatesize_fsb = XFS_B_TO_FSB(mp, count); - - /* - * Allocate file space until done or until there is an error - */ - while (allocatesize_fsb && !error) { - xfs_fileoff_t s, e; - - /* - * Determine space reservations for data/realtime. - */ - if (unlikely(extsz)) { - s = startoffset_fsb; - do_div(s, extsz); - s *= extsz; - e = startoffset_fsb + allocatesize_fsb; - if ((temp = do_mod(startoffset_fsb, extsz))) - e += temp; - if ((temp = do_mod(e, extsz))) - e += extsz - temp; - } else { - s = 0; - e = allocatesize_fsb; - } - - /* - * The transaction reservation is limited to a 32-bit block - * count, hence we need to limit the number of blocks we are - * trying to reserve to avoid an overflow. We can't allocate - * more than @nimaps extents, and an extent is limited on disk - * to MAXEXTLEN (21 bits), so use that to enforce the limit. - */ - resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); - if (unlikely(rt)) { - resrtextents = qblocks = resblks; - resrtextents /= mp->m_sb.sb_rextsize; - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - quota_flag = XFS_QMOPT_RES_RTBLKS; - } else { - resrtextents = 0; - resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); - quota_flag = XFS_QMOPT_RES_REGBLKS; - } - - /* - * Allocate and setup the transaction. - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - error = xfs_trans_reserve(tp, resblks, - XFS_WRITE_LOG_RES(mp), resrtextents, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); - /* - * Check for running out of space - */ - if (error) { - /* - * Free the transaction structure. - */ - ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - break; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, - 0, quota_flag); - if (error) - goto error1; - - xfs_trans_ijoin(tp, ip, 0); - - xfs_bmap_init(&free_list, &firstfsb); - error = xfs_bmapi_write(tp, ip, startoffset_fsb, - allocatesize_fsb, alloc_type, &firstfsb, - 0, imapp, &nimaps, &free_list); - if (error) { - goto error0; - } - - /* - * Complete the transaction - */ - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - goto error0; - } - - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) { - break; - } - - allocated_fsb = imapp->br_blockcount; - - if (nimaps == 0) { - error = XFS_ERROR(ENOSPC); - break; - } - - startoffset_fsb += allocated_fsb; - allocatesize_fsb -= allocated_fsb; - } - - return error; - -error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ - xfs_bmap_cancel(&free_list); - xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); - -error1: /* Just cancel transaction */ - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; -} - -/* - * Zero file bytes between startoff and endoff inclusive. - * The iolock is held exclusive and no blocks are buffered. - * - * This function is used by xfs_free_file_space() to zero - * partial blocks when the range to free is not block aligned. - * When unreserving space with boundaries that are not block - * aligned we round up the start and round down the end - * boundaries and then use this function to zero the parts of - * the blocks that got dropped during the rounding. - */ -STATIC int -xfs_zero_remaining_bytes( - xfs_inode_t *ip, - xfs_off_t startoff, - xfs_off_t endoff) -{ - xfs_bmbt_irec_t imap; - xfs_fileoff_t offset_fsb; - xfs_off_t lastoffset; - xfs_off_t offset; - xfs_buf_t *bp; - xfs_mount_t *mp = ip->i_mount; - int nimap; - int error = 0; - - /* - * Avoid doing I/O beyond eof - it's not necessary - * since nothing can read beyond eof. The space will - * be zeroed when the file is extended anyway. - */ - if (startoff >= XFS_ISIZE(ip)) - return 0; - - if (endoff > XFS_ISIZE(ip)) - endoff = XFS_ISIZE(ip); - - bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp, - BTOBB(mp->m_sb.sb_blocksize), 0); - if (!bp) - return XFS_ERROR(ENOMEM); - - xfs_buf_unlock(bp); - - for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { - offset_fsb = XFS_B_TO_FSBT(mp, offset); - nimap = 1; - error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); - if (error || nimap < 1) - break; - ASSERT(imap.br_blockcount >= 1); - ASSERT(imap.br_startoff == offset_fsb); - lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; - if (lastoffset > endoff) - lastoffset = endoff; - if (imap.br_startblock == HOLESTARTBLOCK) - continue; - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - if (imap.br_state == XFS_EXT_UNWRITTEN) - continue; - XFS_BUF_UNDONE(bp); - XFS_BUF_UNWRITE(bp); - XFS_BUF_READ(bp); - XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); - xfsbdstrat(mp, bp); - error = xfs_buf_iowait(bp); - if (error) { - xfs_buf_ioerror_alert(bp, - "xfs_zero_remaining_bytes(read)"); - break; - } - memset(bp->b_addr + - (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), - 0, lastoffset - offset + 1); - XFS_BUF_UNDONE(bp); - XFS_BUF_UNREAD(bp); - XFS_BUF_WRITE(bp); - xfsbdstrat(mp, bp); - error = xfs_buf_iowait(bp); - if (error) { - xfs_buf_ioerror_alert(bp, - "xfs_zero_remaining_bytes(write)"); - break; - } - } - xfs_buf_free(bp); - return error; -} - -/* - * xfs_free_file_space() - * This routine frees disk space for the given file. - * - * This routine is only called by xfs_change_file_space - * for an UNRESVSP type call. - * - * RETURNS: - * 0 on success - * errno on error - * - */ -STATIC int -xfs_free_file_space( - xfs_inode_t *ip, - xfs_off_t offset, - xfs_off_t len, - int attr_flags) -{ - int committed; - int done; - xfs_fileoff_t endoffset_fsb; - int error; - xfs_fsblock_t firstfsb; - xfs_bmap_free_t free_list; - xfs_bmbt_irec_t imap; - xfs_off_t ioffset; - xfs_extlen_t mod=0; - xfs_mount_t *mp; - int nimap; - uint resblks; - uint rounding; - int rt; - xfs_fileoff_t startoffset_fsb; - xfs_trans_t *tp; - int need_iolock = 1; - - mp = ip->i_mount; - - trace_xfs_free_file_space(ip); - - error = xfs_qm_dqattach(ip, 0); - if (error) - return error; - - error = 0; - if (len <= 0) /* if nothing being freed */ - return error; - rt = XFS_IS_REALTIME_INODE(ip); - startoffset_fsb = XFS_B_TO_FSB(mp, offset); - endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); - - if (attr_flags & XFS_ATTR_NOLOCK) - need_iolock = 0; - if (need_iolock) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); - /* wait for the completion of any pending DIOs */ - inode_dio_wait(VFS_I(ip)); - } - - rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); - ioffset = offset & ~(rounding - 1); - error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - ioffset, -1); - if (error) - goto out_unlock_iolock; - truncate_pagecache_range(VFS_I(ip), ioffset, -1); - - /* - * Need to zero the stuff we're not freeing, on disk. - * If it's a realtime file & can't use unwritten extents then we - * actually need to zero the extent edges. Otherwise xfs_bunmapi - * will take care of it for us. - */ - if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { - nimap = 1; - error = xfs_bmapi_read(ip, startoffset_fsb, 1, - &imap, &nimap, 0); - if (error) - goto out_unlock_iolock; - ASSERT(nimap == 0 || nimap == 1); - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - xfs_daddr_t block; - - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - block = imap.br_startblock; - mod = do_div(block, mp->m_sb.sb_rextsize); - if (mod) - startoffset_fsb += mp->m_sb.sb_rextsize - mod; - } - nimap = 1; - error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1, - &imap, &nimap, 0); - if (error) - goto out_unlock_iolock; - ASSERT(nimap == 0 || nimap == 1); - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - mod++; - if (mod && (mod != mp->m_sb.sb_rextsize)) - endoffset_fsb -= mod; - } - } - if ((done = (endoffset_fsb <= startoffset_fsb))) - /* - * One contiguous piece to clear - */ - error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1); - else { - /* - * Some full blocks, possibly two pieces to clear - */ - if (offset < XFS_FSB_TO_B(mp, startoffset_fsb)) - error = xfs_zero_remaining_bytes(ip, offset, - XFS_FSB_TO_B(mp, startoffset_fsb) - 1); - if (!error && - XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len) - error = xfs_zero_remaining_bytes(ip, - XFS_FSB_TO_B(mp, endoffset_fsb), - offset + len - 1); - } - - /* - * free file space until done or until there is an error - */ - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - while (!error && !done) { - - /* - * allocate and setup the transaction. Allow this - * transaction to dip into the reserve blocks to ensure - * the freeing of the space succeeds at ENOSPC. - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - tp->t_flags |= XFS_TRANS_RESERVE; - error = xfs_trans_reserve(tp, - resblks, - XFS_WRITE_LOG_RES(mp), - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); - - /* - * check for running out of space - */ - if (error) { - /* - * Free the transaction structure. - */ - ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - break; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota(tp, mp, - ip->i_udquot, ip->i_gdquot, - resblks, 0, XFS_QMOPT_RES_REGBLKS); - if (error) - goto error1; - - xfs_trans_ijoin(tp, ip, 0); - - /* - * issue the bunmapi() call to free the blocks - */ - xfs_bmap_init(&free_list, &firstfsb); - error = xfs_bunmapi(tp, ip, startoffset_fsb, - endoffset_fsb - startoffset_fsb, - 0, 2, &firstfsb, &free_list, &done); - if (error) { - goto error0; - } - - /* - * complete the transaction - */ - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - goto error0; - } - - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - - out_unlock_iolock: - if (need_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; - - error0: - xfs_bmap_cancel(&free_list); - error1: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) : - XFS_ILOCK_EXCL); - return error; -} - - -STATIC int -xfs_zero_file_space( - struct xfs_inode *ip, - xfs_off_t offset, - xfs_off_t len, - int attr_flags) -{ - struct xfs_mount *mp = ip->i_mount; - uint granularity; - xfs_off_t start_boundary; - xfs_off_t end_boundary; - int error; - - granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); - - /* - * Round the range of extents we are going to convert inwards. If the - * offset is aligned, then it doesn't get changed so we zero from the - * start of the block offset points to. - */ - start_boundary = round_up(offset, granularity); - end_boundary = round_down(offset + len, granularity); - - ASSERT(start_boundary >= offset); - ASSERT(end_boundary <= offset + len); - - if (!(attr_flags & XFS_ATTR_NOLOCK)) - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - if (start_boundary < end_boundary - 1) { - /* punch out the page cache over the conversion range */ - truncate_pagecache_range(VFS_I(ip), start_boundary, - end_boundary - 1); - /* convert the blocks */ - error = xfs_alloc_file_space(ip, start_boundary, - end_boundary - start_boundary - 1, - XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT, - attr_flags); - if (error) - goto out_unlock; - - /* We've handled the interior of the range, now for the edges */ - if (start_boundary != offset) - error = xfs_iozero(ip, offset, start_boundary - offset); - if (error) - goto out_unlock; - - if (end_boundary != offset + len) - error = xfs_iozero(ip, end_boundary, - offset + len - end_boundary); - - } else { - /* - * It's either a sub-granularity range or the range spanned lies - * partially across two adjacent blocks. - */ - error = xfs_iozero(ip, offset, len); - } - -out_unlock: - if (!(attr_flags & XFS_ATTR_NOLOCK)) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; - -} - -/* - * xfs_change_file_space() - * This routine allocates or frees disk space for the given file. - * The user specified parameters are checked for alignment and size - * limitations. - * - * RETURNS: - * 0 on success - * errno on error - * - */ -int -xfs_change_file_space( - xfs_inode_t *ip, - int cmd, - xfs_flock64_t *bf, - xfs_off_t offset, - int attr_flags) -{ - xfs_mount_t *mp = ip->i_mount; - int clrprealloc; - int error; - xfs_fsize_t fsize; - int setprealloc; - xfs_off_t startoffset; - xfs_trans_t *tp; - struct iattr iattr; - - if (!S_ISREG(ip->i_d.di_mode)) - return XFS_ERROR(EINVAL); - - switch (bf->l_whence) { - case 0: /*SEEK_SET*/ - break; - case 1: /*SEEK_CUR*/ - bf->l_start += offset; - break; - case 2: /*SEEK_END*/ - bf->l_start += XFS_ISIZE(ip); - break; - default: - return XFS_ERROR(EINVAL); - } - - /* - * length of <= 0 for resv/unresv/zero is invalid. length for - * alloc/free is ignored completely and we have no idea what userspace - * might have set it to, so set it to zero to allow range - * checks to pass. - */ - switch (cmd) { - case XFS_IOC_ZERO_RANGE: - case XFS_IOC_RESVSP: - case XFS_IOC_RESVSP64: - case XFS_IOC_UNRESVSP: - case XFS_IOC_UNRESVSP64: - if (bf->l_len <= 0) - return XFS_ERROR(EINVAL); - break; - default: - bf->l_len = 0; - break; - } - - if (bf->l_start < 0 || - bf->l_start > mp->m_super->s_maxbytes || - bf->l_start + bf->l_len < 0 || - bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) - return XFS_ERROR(EINVAL); - - bf->l_whence = 0; - - startoffset = bf->l_start; - fsize = XFS_ISIZE(ip); - - setprealloc = clrprealloc = 0; - switch (cmd) { - case XFS_IOC_ZERO_RANGE: - error = xfs_zero_file_space(ip, startoffset, bf->l_len, - attr_flags); - if (error) - return error; - setprealloc = 1; - break; - - case XFS_IOC_RESVSP: - case XFS_IOC_RESVSP64: - error = xfs_alloc_file_space(ip, startoffset, bf->l_len, - XFS_BMAPI_PREALLOC, attr_flags); - if (error) - return error; - setprealloc = 1; - break; - - case XFS_IOC_UNRESVSP: - case XFS_IOC_UNRESVSP64: - if ((error = xfs_free_file_space(ip, startoffset, bf->l_len, - attr_flags))) - return error; - break; - - case XFS_IOC_ALLOCSP: - case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP: - case XFS_IOC_FREESP64: - /* - * These operations actually do IO when extending the file, but - * the allocation is done seperately to the zeroing that is - * done. This set of operations need to be serialised against - * other IO operations, such as truncate and buffered IO. We - * need to take the IOLOCK here to serialise the allocation and - * zeroing IO to prevent other IOLOCK holders (e.g. getbmap, - * truncate, direct IO) from racing against the transient - * allocated but not written state we can have here. - */ - xfs_ilock(ip, XFS_IOLOCK_EXCL); - if (startoffset > fsize) { - error = xfs_alloc_file_space(ip, fsize, - startoffset - fsize, 0, - attr_flags | XFS_ATTR_NOLOCK); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - break; - } - } - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = startoffset; - - error = xfs_setattr_size(ip, &iattr, - attr_flags | XFS_ATTR_NOLOCK); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - - if (error) - return error; - - clrprealloc = 1; - break; - - default: - ASSERT(0); - return XFS_ERROR(EINVAL); - } - - /* - * update the inode timestamp, mode, and prealloc flag bits - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID); - - if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp), - 0, 0, 0))) { - /* ASSERT(0); */ - xfs_trans_cancel(tp, 0); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - - if ((attr_flags & XFS_ATTR_DMI) == 0) { - ip->i_d.di_mode &= ~S_ISUID; - - /* - * Note that we don't have to worry about mandatory - * file locking being disabled here because we only - * clear the S_ISGID bit if the Group execute bit is - * on, but if it was on then mandatory locking wouldn't - * have been enabled. - */ - if (ip->i_d.di_mode & S_IXGRP) - ip->i_d.di_mode &= ~S_ISGID; - - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - } - if (setprealloc) - ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; - else if (clrprealloc) - ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (attr_flags & XFS_ATTR_SYNC) - xfs_trans_set_sync(tp); - return xfs_trans_commit(tp, 0); -} diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h deleted file mode 100644 index 5163022d980..00000000000 --- a/fs/xfs/xfs_vnodeops.h +++ /dev/null @@ -1,56 +0,0 @@ -#ifndef _XFS_VNODEOPS_H -#define _XFS_VNODEOPS_H 1 - -struct attrlist_cursor_kern; -struct file; -struct iattr; -struct inode; -struct iovec; -struct kiocb; -struct pipe_inode_info; -struct uio; -struct xfs_inode; - - -int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags); -int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap, int flags); -#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */ -#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ -#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ -#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ -#define XFS_ATTR_SYNC 0x10 /* synchronous operation required */ - -int xfs_readlink(struct xfs_inode *ip, char *link); -int xfs_release(struct xfs_inode *ip); -int xfs_inactive(struct xfs_inode *ip); -int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, - struct xfs_inode **ipp, struct xfs_name *ci_name); -int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode, - xfs_dev_t rdev, struct xfs_inode **ipp); -int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, - struct xfs_inode *ip); -int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, - struct xfs_name *target_name); -int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, - xfs_off_t *offset, filldir_t filldir); -int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, - const char *target_path, umode_t mode, struct xfs_inode **ipp); -int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); -int xfs_change_file_space(struct xfs_inode *ip, int cmd, - xfs_flock64_t *bf, xfs_off_t offset, int attr_flags); -int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, - struct xfs_inode *src_ip, struct xfs_inode *target_dp, - struct xfs_name *target_name, struct xfs_inode *target_ip); -int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, - unsigned char *value, int *valuelenp, int flags); -int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, - unsigned char *value, int valuelen, int flags); -int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); -int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, - int flags, struct attrlist_cursor_kern *cursor); - -int xfs_iozero(struct xfs_inode *, loff_t, size_t); -int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); -int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool); - -#endif /* _XFS_VNODEOPS_H */ diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 87d3e03878c..78ed92a46fd 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -17,13 +17,17 @@ */ #include "xfs.h" -#include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_inode.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" #include "xfs_acl.h" -#include "xfs_vnodeops.h" #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> @@ -98,8 +102,8 @@ const struct xattr_handler *xfs_xattr_handlers[] = { &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, #ifdef CONFIG_XFS_POSIX_ACL - &xfs_xattr_acl_access_handler, - &xfs_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL }; |
